test/CodeGen/X86/avx2-intrinsics-fast-isel.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
   3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
   4
   5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
   6
   7 define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
   8 ; CHECK-LABEL: test_mm256_abs_epi8:
   9 ; CHECK:       # %bb.0:
  10 ; CHECK-NEXT:    vpabsb %ymm0, %ymm0
  11 ; CHECK-NEXT:    ret{{[l|q]}}
  12   %arg = bitcast <4 x i64> %a0 to <32 x i8>
  13   %sub = sub <32 x i8> zeroinitializer, %arg
  14   %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  15   %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  16   %res = bitcast <32 x i8> %sel to <4 x i64>
  17   ret <4 x i64> %res
  18 }
  19 declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
  20
  21 define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
  22 ; CHECK-LABEL: test_mm256_abs_epi16:
  23 ; CHECK:       # %bb.0:
  24 ; CHECK-NEXT:    vpabsw %ymm0, %ymm0
  25 ; CHECK-NEXT:    ret{{[l|q]}}
  26   %arg = bitcast <4 x i64> %a0 to <16 x i16>
  27   %sub = sub <16 x i16> zeroinitializer, %arg
  28   %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  29   %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  30   %res = bitcast <16 x i16> %sel to <4 x i64>
  31   ret <4 x i64> %res
  32 }
  33 declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
  34
  35 define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
  36 ; CHECK-LABEL: test_mm256_abs_epi32:
  37 ; CHECK:       # %bb.0:
  38 ; CHECK-NEXT:    vpabsd %ymm0, %ymm0
  39 ; CHECK-NEXT:    ret{{[l|q]}}
  40   %arg = bitcast <4 x i64> %a0 to <8 x i32>
  41   %sub = sub <8 x i32> zeroinitializer, %arg
  42   %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  43   %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  44   %res = bitcast <8 x i32> %sel to <4 x i64>
  45   ret <4 x i64> %res
  46 }
  47 declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
  48
  49 define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  50 ; CHECK-LABEL: test_mm256_add_epi8:
  51 ; CHECK:       # %bb.0:
  52 ; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
  53 ; CHECK-NEXT:    ret{{[l|q]}}
  54   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  55   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  56   %res = add <32 x i8> %arg0, %arg1
  57   %bc = bitcast <32 x i8> %res to <4 x i64>
  58   ret <4 x i64> %bc
  59 }
  60
  61 define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  62 ; CHECK-LABEL: test_mm256_add_epi16:
  63 ; CHECK:       # %bb.0:
  64 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
  65 ; CHECK-NEXT:    ret{{[l|q]}}
  66   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  67   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  68   %res = add <16 x i16> %arg0, %arg1
  69   %bc = bitcast <16 x i16> %res to <4 x i64>
  70   ret <4 x i64> %bc
  71 }
  72
  73 define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  74 ; CHECK-LABEL: test_mm256_add_epi32:
  75 ; CHECK:       # %bb.0:
  76 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
  77 ; CHECK-NEXT:    ret{{[l|q]}}
  78   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  79   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  80   %res = add <8 x i32> %arg0, %arg1
  81   %bc = bitcast <8 x i32> %res to <4 x i64>
  82   ret <4 x i64> %bc
  83 }
  84
  85 define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  86 ; CHECK-LABEL: test_mm256_add_epi64:
  87 ; CHECK:       # %bb.0:
  88 ; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
  89 ; CHECK-NEXT:    ret{{[l|q]}}
  90   %res = add <4 x i64> %a0, %a1
  91   ret <4 x i64> %res
  92 }
  93
  94 define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
  95 ; CHECK-LABEL: test_mm256_adds_epi8:
  96 ; CHECK:       # %bb.0:
  97 ; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
  98 ; CHECK-NEXT:    ret{{[l|q]}}
  99   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 100   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 101   %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
 102   %bc = bitcast <32 x i8> %res to <4 x i64>
 103   ret <4 x i64> %bc
 104 }
 105 declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
 106
 107 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 108 ; CHECK-LABEL: test_mm256_adds_epi16:
 109 ; CHECK:       # %bb.0:
 110 ; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
 111 ; CHECK-NEXT:    ret{{[l|q]}}
 112   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 113   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 114   %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
 115   %bc = bitcast <16 x i16> %res to <4 x i64>
 116   ret <4 x i64> %bc
 117 }
 118 declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
 119
 120 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 121 ; CHECK-LABEL: test_mm256_adds_epu8:
 122 ; CHECK:       # %bb.0:
 123 ; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
 124 ; CHECK-NEXT:    ret{{[l|q]}}
 125   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 126   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 127   %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
 128   %bc = bitcast <32 x i8> %res to <4 x i64>
 129   ret <4 x i64> %bc
 130 }
 131 declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
 132
 133 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 134 ; CHECK-LABEL: test_mm256_adds_epu16:
 135 ; CHECK:       # %bb.0:
 136 ; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
 137 ; CHECK-NEXT:    ret{{[l|q]}}
 138   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 139   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 140   %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
 141   %bc = bitcast <16 x i16> %res to <4 x i64>
 142   ret <4 x i64> %bc
 143 }
 144 declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
 145
 146 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 147 ; CHECK-LABEL: test_mm256_alignr_epi8:
 148 ; CHECK:       # %bb.0:
 149 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
 150 ; CHECK-NEXT:    ret{{[l|q]}}
 151   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 152   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 153   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
 154   %res = bitcast <32 x i8> %shuf to <4 x i64>
 155   ret <4 x i64> %res
 156 }
 157
 158 define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 159 ; CHECK-LABEL: test2_mm256_alignr_epi8:
 160 ; CHECK:       # %bb.0:
 161 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
 162 ; CHECK-NEXT:    ret{{[l|q]}}
 163   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 164   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 165   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
 166   %res = bitcast <32 x i8> %shuf to <4 x i64>
 167   ret <4 x i64> %res
 168 }
 169
 170 define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 171 ; CHECK-LABEL: test_mm256_and_si256:
 172 ; CHECK:       # %bb.0:
 173 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
 174 ; CHECK-NEXT:    ret{{[l|q]}}
 175   %res = and <4 x i64> %a0, %a1
 176   ret <4 x i64> %res
 177 }
 178
 179 define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 180 ; CHECK-LABEL: test_mm256_andnot_si256:
 181 ; CHECK:       # %bb.0:
 182 ; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 183 ; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 184 ; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
 185 ; CHECK-NEXT:    ret{{[l|q]}}
 186   %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
 187   %res = and <4 x i64> %not, %a1
 188   ret <4 x i64> %res
 189 }
 190
 191 define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 192 ; CHECK-LABEL: test_mm256_avg_epu8:
 193 ; CHECK:       # %bb.0:
 194 ; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
 195 ; CHECK-NEXT:    ret{{[l|q]}}
 196   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 197   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 198   %zext0 = zext <32 x i8> %arg0 to <32 x i16>
 199   %zext1 = zext <32 x i8> %arg1 to <32 x i16>
 200   %add = add <32 x i16> %zext0, %zext1
 201   %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 202   %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
 203   %res = trunc <32 x i16> %lshr to <32 x i8>
 204   %bc = bitcast <32 x i8> %res to <4 x i64>
 205   ret <4 x i64> %bc
 206 }
 207
 208 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 209 ; CHECK-LABEL: test_mm256_avg_epu16:
 210 ; CHECK:       # %bb.0:
 211 ; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
 212 ; CHECK-NEXT:    ret{{[l|q]}}
 213   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 214   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 215   %zext0 = zext <16 x i16> %arg0 to <16 x i32>
 216   %zext1 = zext <16 x i16> %arg1 to <16 x i32>
 217   %add = add <16 x i32> %zext0, %zext1
 218   %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 219   %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 220   %res = trunc <16 x i32> %lshr to <16 x i16>
 221   %bc = bitcast <16 x i16> %res to <4 x i64>
 222   ret <4 x i64> %bc
 223 }
 224
 225 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 226 ; CHECK-LABEL: test_mm256_blend_epi16:
 227 ; CHECK:       # %bb.0:
 228 ; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
 229 ; CHECK-NEXT:    ret{{[l|q]}}
 230   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 231   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 232   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 233   %res = bitcast <16 x i16> %shuf to <4 x i64>
 234   ret <4 x i64> %res
 235 }
 236
 237 define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 238 ; CHECK-LABEL: test_mm_blend_epi32:
 239 ; CHECK:       # %bb.0:
 240 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 241 ; CHECK-NEXT:    ret{{[l|q]}}
 242   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 243   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 244   %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
 245   %res = bitcast <4 x i32> %shuf to <2 x i64>
 246   ret <2 x i64> %res
 247 }
 248
 249 define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 250 ; CHECK-LABEL: test_mm256_blend_epi32:
 251 ; CHECK:       # %bb.0:
 252 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 253 ; CHECK-NEXT:    ret{{[l|q]}}
 254   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 255   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 256   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
 257   %res = bitcast <8 x i32> %shuf to <4 x i64>
 258   ret <4 x i64> %res
 259 }
 260
 261 define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 262 ; CHECK-LABEL: test_mm256_blendv_epi8:
 263 ; CHECK:       # %bb.0:
 264 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 265 ; CHECK-NEXT:    ret{{[l|q]}}
 266   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 267   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 268   %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
 269   %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
 270   %res = bitcast <32 x i8> %call to <4 x i64>
 271   ret <4 x i64> %res
 272 }
 273 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
 274
 275 define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
 276 ; CHECK-LABEL: test_mm_broadcastb_epi8:
 277 ; CHECK:       # %bb.0:
 278 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
 279 ; CHECK-NEXT:    ret{{[l|q]}}
 280   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 281   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
 282   %res = bitcast <16 x i8> %shuf to <2 x i64>
 283   ret <2 x i64> %res
 284 }
 285
 286 define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
 287 ; CHECK-LABEL: test_mm256_broadcastb_epi8:
 288 ; CHECK:       # %bb.0:
 289 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
 290 ; CHECK-NEXT:    ret{{[l|q]}}
 291   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 292   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
 293   %res = bitcast <32 x i8> %shuf to <4 x i64>
 294   ret <4 x i64> %res
 295 }
 296
 297 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
 298 ; CHECK-LABEL: test_mm_broadcastd_epi32:
 299 ; CHECK:       # %bb.0:
 300 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 301 ; CHECK-NEXT:    ret{{[l|q]}}
 302   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 303   %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
 304   %res = bitcast <4 x i32> %shuf to <2 x i64>
 305   ret <2 x i64> %res
 306 }
 307
 308 define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
 309 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
 310 ; CHECK:       # %bb.0:
 311 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 312 ; CHECK-NEXT:    ret{{[l|q]}}
 313   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 314   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
 315   %res = bitcast <8 x i32> %shuf to <4 x i64>
 316   ret <4 x i64> %res
 317 }
 318
 319 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
 320 ; CHECK-LABEL: test_mm_broadcastq_epi64:
 321 ; CHECK:       # %bb.0:
 322 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 323 ; CHECK-NEXT:    ret{{[l|q]}}
 324   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
 325   ret <2 x i64> %res
 326 }
 327
 328 define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
 329 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
 330 ; CHECK:       # %bb.0:
 331 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 332 ; CHECK-NEXT:    ret{{[l|q]}}
 333   %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
 334   ret <4 x i64> %res
 335 }
 336
 337 define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
 338 ; CHECK-LABEL: test_mm_broadcastsd_pd:
 339 ; CHECK:       # %bb.0:
 340 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 341 ; CHECK-NEXT:    ret{{[l|q]}}
 342   %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
 343   ret <2 x double> %res
 344 }
 345
 346 define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
 347 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
 348 ; CHECK:       # %bb.0:
 349 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 350 ; CHECK-NEXT:    ret{{[l|q]}}
 351   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
 352   ret <4 x double> %res
 353 }
 354
 355 define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
 356 ; CHECK-LABEL: test_mm256_broadcastsi128_si256:
 357 ; CHECK:       # %bb.0:
 358 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 359 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 360 ; CHECK-NEXT:    ret{{[l|q]}}
 361   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 362   ret <4 x i64> %res
 363 }
 364
 365 define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
 366 ; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
 367 ; X86:       # %bb.0:
 368 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 369 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 370 ; X86-NEXT:    retl
 371 ;
 372 ; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
 373 ; X64:       # %bb.0:
 374 ; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 375 ; X64-NEXT:    retq
 376   %a0 = load <2 x i64>, <2 x i64>* %p0
 377   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 378   ret <4 x i64> %res
 379 }
 380
 381 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
 382 ; CHECK-LABEL: test_mm_broadcastss_ps:
 383 ; CHECK:       # %bb.0:
 384 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 385 ; CHECK-NEXT:    ret{{[l|q]}}
 386   %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
 387   ret <4 x float> %res
 388 }
 389
 390 define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
 391 ; CHECK-LABEL: test_mm256_broadcastss_ps:
 392 ; CHECK:       # %bb.0:
 393 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 394 ; CHECK-NEXT:    ret{{[l|q]}}
 395   %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
 396   ret <8 x float> %res
 397 }
 398
 399 define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
 400 ; CHECK-LABEL: test_mm_broadcastw_epi16:
 401 ; CHECK:       # %bb.0:
 402 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
 403 ; CHECK-NEXT:    ret{{[l|q]}}
 404   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 405   %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
 406   %res = bitcast <8 x i16> %shuf to <2 x i64>
 407   ret <2 x i64> %res
 408 }
 409
 410 define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
 411 ; CHECK-LABEL: test_mm256_broadcastw_epi16:
 412 ; CHECK:       # %bb.0:
 413 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
 414 ; CHECK-NEXT:    ret{{[l|q]}}
 415   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 416   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
 417   %res = bitcast <16 x i16> %shuf to <4 x i64>
 418   ret <4 x i64> %res
 419 }
 420
 421 define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
 422 ; CHECK-LABEL: test_mm256_bslli_epi128:
 423 ; CHECK:       # %bb.0:
 424 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
 425 ; CHECK-NEXT:    ret{{[l|q]}}
 426   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 427   %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
 428   %res = bitcast <32 x i8> %shuf to <4 x i64>
 429   ret <4 x i64> %res
 430 }
 431
 432 define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
 433 ; CHECK-LABEL: test_mm256_bsrli_epi128:
 434 ; CHECK:       # %bb.0:
 435 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
 436 ; CHECK-NEXT:    ret{{[l|q]}}
 437   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 438   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
 439   %res = bitcast <32 x i8> %shuf to <4 x i64>
 440   ret <4 x i64> %res
 441 }
 442
 443 define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 444 ; CHECK-LABEL: test_mm256_cmpeq_epi8:
 445 ; CHECK:       # %bb.0:
 446 ; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 447 ; CHECK-NEXT:    ret{{[l|q]}}
 448   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 449   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 450   %cmp = icmp eq <32 x i8> %arg0, %arg1
 451   %res = sext <32 x i1> %cmp to <32 x i8>
 452   %bc = bitcast <32 x i8> %res to <4 x i64>
 453   ret <4 x i64> %bc
 454 }
 455
 456 define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 457 ; CHECK-LABEL: test_mm256_cmpeq_epi16:
 458 ; CHECK:       # %bb.0:
 459 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 460 ; CHECK-NEXT:    ret{{[l|q]}}
 461   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 462   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 463   %cmp = icmp eq <16 x i16> %arg0, %arg1
 464   %res = sext <16 x i1> %cmp to <16 x i16>
 465   %bc = bitcast <16 x i16> %res to <4 x i64>
 466   ret <4 x i64> %bc
 467 }
 468
 469 define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 470 ; CHECK-LABEL: test_mm256_cmpeq_epi32:
 471 ; CHECK:       # %bb.0:
 472 ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
 473 ; CHECK-NEXT:    ret{{[l|q]}}
 474   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 475   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 476   %cmp = icmp eq <8 x i32> %arg0, %arg1
 477   %res = sext <8 x i1> %cmp to <8 x i32>
 478   %bc = bitcast <8 x i32> %res to <4 x i64>
 479   ret <4 x i64> %bc
 480 }
 481
 482 define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 483 ; CHECK-LABEL: test_mm256_cmpeq_epi64:
 484 ; CHECK:       # %bb.0:
 485 ; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 486 ; CHECK-NEXT:    ret{{[l|q]}}
 487   %cmp = icmp eq <4 x i64> %a0, %a1
 488   %res = sext <4 x i1> %cmp to <4 x i64>
 489   ret <4 x i64> %res
 490 }
 491
 492 define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 493 ; CHECK-LABEL: test_mm256_cmpgt_epi8:
 494 ; CHECK:       # %bb.0:
 495 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 496 ; CHECK-NEXT:    ret{{[l|q]}}
 497   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 498   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 499   %cmp = icmp sgt <32 x i8> %arg0, %arg1
 500   %res = sext <32 x i1> %cmp to <32 x i8>
 501   %bc = bitcast <32 x i8> %res to <4 x i64>
 502   ret <4 x i64> %bc
 503 }
 504
 505 define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 506 ; CHECK-LABEL: test_mm256_cmpgt_epi16:
 507 ; CHECK:       # %bb.0:
 508 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 509 ; CHECK-NEXT:    ret{{[l|q]}}
 510   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 511   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 512   %cmp = icmp sgt <16 x i16> %arg0, %arg1
 513   %res = sext <16 x i1> %cmp to <16 x i16>
 514   %bc = bitcast <16 x i16> %res to <4 x i64>
 515   ret <4 x i64> %bc
 516 }
 517
 518 define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 519 ; CHECK-LABEL: test_mm256_cmpgt_epi32:
 520 ; CHECK:       # %bb.0:
 521 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 522 ; CHECK-NEXT:    ret{{[l|q]}}
 523   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 524   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 525   %cmp = icmp sgt <8 x i32> %arg0, %arg1
 526   %res = sext <8 x i1> %cmp to <8 x i32>
 527   %bc = bitcast <8 x i32> %res to <4 x i64>
 528   ret <4 x i64> %bc
 529 }
 530
 531 define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 532 ; CHECK-LABEL: test_mm256_cmpgt_epi64:
 533 ; CHECK:       # %bb.0:
 534 ; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 535 ; CHECK-NEXT:    ret{{[l|q]}}
 536   %cmp = icmp sgt <4 x i64> %a0, %a1
 537   %res = sext <4 x i1> %cmp to <4 x i64>
 538   ret <4 x i64> %res
 539 }
 540
 541 define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
 542 ; CHECK-LABEL: test_mm256_cvtepi8_epi16:
 543 ; CHECK:       # %bb.0:
 544 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
 545 ; CHECK-NEXT:    ret{{[l|q]}}
 546   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 547   %ext = sext <16 x i8> %arg0 to <16 x i16>
 548   %res = bitcast <16 x i16> %ext to <4 x i64>
 549   ret <4 x i64> %res
 550 }
 551
 552 define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
 553 ; CHECK-LABEL: test_mm256_cvtepi8_epi32:
 554 ; CHECK:       # %bb.0:
 555 ; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
 556 ; CHECK-NEXT:    ret{{[l|q]}}
 557   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 558   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 559   %ext = sext <8 x i8> %shuf to <8 x i32>
 560   %res = bitcast <8 x i32> %ext to <4 x i64>
 561   ret <4 x i64> %res
 562 }
 563
 564 define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
 565 ; CHECK-LABEL: test_mm256_cvtepi8_epi64:
 566 ; CHECK:       # %bb.0:
 567 ; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
 568 ; CHECK-NEXT:    ret{{[l|q]}}
 569   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 570   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 571   %ext = sext <4 x i8> %shuf to <4 x i64>
 572   ret <4 x i64> %ext
 573 }
 574
 575 define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
 576 ; CHECK-LABEL: test_mm256_cvtepi16_epi32:
 577 ; CHECK:       # %bb.0:
 578 ; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
 579 ; CHECK-NEXT:    ret{{[l|q]}}
 580   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 581   %ext = sext <8 x i16> %arg0 to <8 x i32>
 582   %res = bitcast <8 x i32> %ext to <4 x i64>
 583   ret <4 x i64> %res
 584 }
 585
 586 define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
 587 ; CHECK-LABEL: test_mm256_cvtepi16_epi64:
 588 ; CHECK:       # %bb.0:
 589 ; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
 590 ; CHECK-NEXT:    ret{{[l|q]}}
 591   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 592   %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 593   %ext = sext <4 x i16> %shuf to <4 x i64>
 594   ret <4 x i64> %ext
 595 }
 596
 597 define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
 598 ; CHECK-LABEL: test_mm256_cvtepi32_epi64:
 599 ; CHECK:       # %bb.0:
 600 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
 601 ; CHECK-NEXT:    ret{{[l|q]}}
 602   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 603   %ext = sext <4 x i32> %arg0 to <4 x i64>
 604   ret <4 x i64> %ext
 605 }
 606
 607 define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
 608 ; CHECK-LABEL: test_mm256_cvtepu8_epi16:
 609 ; CHECK:       # %bb.0:
 610 ; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 611 ; CHECK-NEXT:    ret{{[l|q]}}
 612   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 613   %ext = zext <16 x i8> %arg0 to <16 x i16>
 614   %res = bitcast <16 x i16> %ext to <4 x i64>
 615   ret <4 x i64> %res
 616 }
 617
 618 define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
 619 ; CHECK-LABEL: test_mm256_cvtepu8_epi32:
 620 ; CHECK:       # %bb.0:
 621 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 622 ; CHECK-NEXT:    ret{{[l|q]}}
 623   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 624   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 625   %ext = zext <8 x i8> %shuf to <8 x i32>
 626   %res = bitcast <8 x i32> %ext to <4 x i64>
 627   ret <4 x i64> %res
 628 }
 629
 630 define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
 631 ; CHECK-LABEL: test_mm256_cvtepu8_epi64:
 632 ; CHECK:       # %bb.0:
 633 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 634 ; CHECK-NEXT:    ret{{[l|q]}}
 635   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 636   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 637   %ext = zext <4 x i8> %shuf to <4 x i64>
 638   ret <4 x i64> %ext
 639 }
 640
 641 define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
 642 ; CHECK-LABEL: test_mm256_cvtepu16_epi32:
 643 ; CHECK:       # %bb.0:
 644 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 645 ; CHECK-NEXT:    ret{{[l|q]}}
 646   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 647   %ext = zext <8 x i16> %arg0 to <8 x i32>
 648   %res = bitcast <8 x i32> %ext to <4 x i64>
 649   ret <4 x i64> %res
 650 }
 651
 652 define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
 653 ; CHECK-LABEL: test_mm256_cvtepu16_epi64:
 654 ; CHECK:       # %bb.0:
 655 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 656 ; CHECK-NEXT:    ret{{[l|q]}}
 657   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 658   %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 659   %ext = zext <4 x i16> %shuf to <4 x i64>
 660   ret <4 x i64> %ext
 661 }
 662
 663 define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
 664 ; CHECK-LABEL: test_mm256_cvtepu32_epi64:
 665 ; CHECK:       # %bb.0:
 666 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 667 ; CHECK-NEXT:    ret{{[l|q]}}
 668   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 669   %ext = zext <4 x i32> %arg0 to <4 x i64>
 670   ret <4 x i64> %ext
 671 }
 672
 673 define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
 674 ; CHECK-LABEL: test_mm256_extracti128_si256:
 675 ; CHECK:       # %bb.0:
 676 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 677 ; CHECK-NEXT:    vzeroupper
 678 ; CHECK-NEXT:    ret{{[l|q]}}
 679   %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
 680   ret <2 x i64> %res
 681 }
 682
 683 define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 684 ; CHECK-LABEL: test_mm256_hadd_epi16:
 685 ; CHECK:       # %bb.0:
 686 ; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 687 ; CHECK-NEXT:    ret{{[l|q]}}
 688   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 689   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 690   %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
 691   %bc = bitcast <16 x i16> %res to <4 x i64>
 692   ret <4 x i64> %bc
 693 }
 694 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
 695
 696 define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 697 ; CHECK-LABEL: test_mm256_hadd_epi32:
 698 ; CHECK:       # %bb.0:
 699 ; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 700 ; CHECK-NEXT:    ret{{[l|q]}}
 701   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 702   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 703   %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
 704   %bc = bitcast <8 x i32> %res to <4 x i64>
 705   ret <4 x i64> %bc
 706 }
 707 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
 708
 709 define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 710 ; CHECK-LABEL: test_mm256_hadds_epi16:
 711 ; CHECK:       # %bb.0:
 712 ; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
 713 ; CHECK-NEXT:    ret{{[l|q]}}
 714   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 715   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 716   %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
 717   %bc = bitcast <16 x i16> %res to <4 x i64>
 718   ret <4 x i64> %bc
 719 }
 720 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
 721
 722 define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 723 ; CHECK-LABEL: test_mm256_hsub_epi16:
 724 ; CHECK:       # %bb.0:
 725 ; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
 726 ; CHECK-NEXT:    ret{{[l|q]}}
 727   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 728   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 729   %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
 730   %bc = bitcast <16 x i16> %res to <4 x i64>
 731   ret <4 x i64> %bc
 732 }
 733 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
 734
 735 define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 736 ; CHECK-LABEL: test_mm256_hsub_epi32:
 737 ; CHECK:       # %bb.0:
 738 ; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
 739 ; CHECK-NEXT:    ret{{[l|q]}}
 740   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 741   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 742   %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
 743   %bc = bitcast <8 x i32> %res to <4 x i64>
 744   ret <4 x i64> %bc
 745 }
 746 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
 747
 748 define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 749 ; CHECK-LABEL: test_mm256_hsubs_epi16:
 750 ; CHECK:       # %bb.0:
 751 ; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
 752 ; CHECK-NEXT:    ret{{[l|q]}}
 753   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 754   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 755   %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
 756   %bc = bitcast <16 x i16> %res to <4 x i64>
 757   ret <4 x i64> %bc
 758 }
 759 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
 760
 761 define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
 762 ; X86-LABEL: test_mm_i32gather_epi32:
 763 ; X86:       # %bb.0:
 764 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 765 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 766 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 767 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 768 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 769 ; X86-NEXT:    retl
 770 ;
 771 ; X64-LABEL: test_mm_i32gather_epi32:
 772 ; X64:       # %bb.0:
 773 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 774 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 775 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 776 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 777 ; X64-NEXT:    retq
 778   %arg0 = bitcast i32 *%a0 to i8*
 779   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 780   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
 781   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
 782   %bc = bitcast <4 x i32> %call to <2 x i64>
 783   ret <2 x i64> %bc
 784 }
 785 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
 786
 787 define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
 788 ; X86-LABEL: test_mm_mask_i32gather_epi32:
 789 ; X86:       # %bb.0:
 790 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 791 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
 792 ; X86-NEXT:    retl
 793 ;
 794 ; X64-LABEL: test_mm_mask_i32gather_epi32:
 795 ; X64:       # %bb.0:
 796 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
 797 ; X64-NEXT:    retq
 798   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 799   %arg1 = bitcast i32 *%a1 to i8*
 800   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 801   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
 802   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
 803   %bc = bitcast <4 x i32> %call to <2 x i64>
 804   ret <2 x i64> %bc
 805 }
 806
 807 define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
 808 ; X86-LABEL: test_mm256_i32gather_epi32:
 809 ; X86:       # %bb.0:
 810 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 811 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 812 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 813 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
 814 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 815 ; X86-NEXT:    retl
 816 ;
 817 ; X64-LABEL: test_mm256_i32gather_epi32:
 818 ; X64:       # %bb.0:
 819 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 820 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 821 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
 822 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 823 ; X64-NEXT:    retq
 824   %arg0 = bitcast i32 *%a0 to i8*
 825   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 826   %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
 827   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
 828   %bc = bitcast <8 x i32> %call to <4 x i64>
 829   ret <4 x i64> %bc
 830 }
 831 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
 832
 833 define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
 834 ; X86-LABEL: test_mm256_mask_i32gather_epi32:
 835 ; X86:       # %bb.0:
 836 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 837 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
 838 ; X86-NEXT:    retl
 839 ;
 840 ; X64-LABEL: test_mm256_mask_i32gather_epi32:
 841 ; X64:       # %bb.0:
 842 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
 843 ; X64-NEXT:    retq
 844   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 845   %arg1 = bitcast i32 *%a1 to i8*
 846   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
 847   %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
 848   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
 849   %bc = bitcast <8 x i32> %call to <4 x i64>
 850   ret <4 x i64> %bc
 851 }
 852
 853 define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 854 ; X86-LABEL: test_mm_i32gather_epi64:
 855 ; X86:       # %bb.0:
 856 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 857 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 858 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 859 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
 860 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 861 ; X86-NEXT:    retl
 862 ;
 863 ; X64-LABEL: test_mm_i32gather_epi64:
 864 ; X64:       # %bb.0:
 865 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 866 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 867 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
 868 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 869 ; X64-NEXT:    retq
 870   %arg0 = bitcast i64 *%a0 to i8*
 871   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 872   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
 873   ret <2 x i64> %res
 874 }
 875 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
 876
 877 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
 878 ; X86-LABEL: test_mm_mask_i32gather_epi64:
 879 ; X86:       # %bb.0:
 880 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 881 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
 882 ; X86-NEXT:    retl
 883 ;
 884 ; X64-LABEL: test_mm_mask_i32gather_epi64:
 885 ; X64:       # %bb.0:
 886 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
 887 ; X64-NEXT:    retq
 888   %arg1 = bitcast i64 *%a1 to i8*
 889   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 890   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
 891   ret <2 x i64> %res
 892 }
 893
 894 define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 895 ; X86-LABEL: test_mm256_i32gather_epi64:
 896 ; X86:       # %bb.0:
 897 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 898 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 899 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 900 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
 901 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 902 ; X86-NEXT:    retl
 903 ;
 904 ; X64-LABEL: test_mm256_i32gather_epi64:
 905 ; X64:       # %bb.0:
 906 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 907 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 908 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
 909 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 910 ; X64-NEXT:    retq
 911   %arg0 = bitcast i64 *%a0 to i8*
 912   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 913   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
 914   ret <4 x i64> %res
 915 }
 916 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
 917
 918 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
 919 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
 920 ; X86:       # %bb.0:
 921 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 922 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
 923 ; X86-NEXT:    retl
 924 ;
 925 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
 926 ; X64:       # %bb.0:
 927 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
 928 ; X64-NEXT:    retq
 929   %arg1 = bitcast i64 *%a1 to i8*
 930   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 931   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
 932   ret <4 x i64> %res
 933 }
 934
 935 define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
 936 ; X86-LABEL: test_mm_i32gather_pd:
 937 ; X86:       # %bb.0:
 938 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 939 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 940 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 941 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 942 ; X86-NEXT:    vmovapd %xmm1, %xmm0
 943 ; X86-NEXT:    retl
 944 ;
 945 ; X64-LABEL: test_mm_i32gather_pd:
 946 ; X64:       # %bb.0:
 947 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 948 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 949 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 950 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 951 ; X64-NEXT:    retq
 952   %arg0 = bitcast double *%a0 to i8*
 953   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 954   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
 955   %sext = sext <2 x i1> %cmp to <2 x i64>
 956   %mask = bitcast <2 x i64> %sext to <2 x double>
 957   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
 958   ret <2 x double> %res
 959 }
 960 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
 961
 962 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
 963 ; X86-LABEL: test_mm_mask_i32gather_pd:
 964 ; X86:       # %bb.0:
 965 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 966 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
 967 ; X86-NEXT:    retl
 968 ;
 969 ; X64-LABEL: test_mm_mask_i32gather_pd:
 970 ; X64:       # %bb.0:
 971 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
 972 ; X64-NEXT:    retq
 973   %arg1 = bitcast double *%a1 to i8*
 974   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 975   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
 976   ret <2 x double> %res
 977 }
 978
 979 define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
 980 ; X86-LABEL: test_mm256_i32gather_pd:
 981 ; X86:       # %bb.0:
 982 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 983 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 984 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 985 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
 986 ; X86-NEXT:    vmovapd %ymm1, %ymm0
 987 ; X86-NEXT:    retl
 988 ;
 989 ; X64-LABEL: test_mm256_i32gather_pd:
 990 ; X64:       # %bb.0:
 991 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 992 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 993 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
 994 ; X64-NEXT:    vmovapd %ymm1, %ymm0
 995 ; X64-NEXT:    retq
 996   %arg0 = bitcast double *%a0 to i8*
 997   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 998   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
 999   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
1000   ret <4 x double> %res
1001 }
1002 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
1003
1004 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
1005 ; X86-LABEL: test_mm256_mask_i32gather_pd:
1006 ; X86:       # %bb.0:
1007 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1008 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
1009 ; X86-NEXT:    retl
1010 ;
1011 ; X64-LABEL: test_mm256_mask_i32gather_pd:
1012 ; X64:       # %bb.0:
1013 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1014 ; X64-NEXT:    retq
1015   %arg1 = bitcast double *%a1 to i8*
1016   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1017   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1018   ret <4 x double> %res
1019 }
1020
1021 define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1022 ; X86-LABEL: test_mm_i32gather_ps:
1023 ; X86:       # %bb.0:
1024 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1025 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1026 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1027 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1028 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1029 ; X86-NEXT:    retl
1030 ;
1031 ; X64-LABEL: test_mm_i32gather_ps:
1032 ; X64:       # %bb.0:
1033 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1034 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1035 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1036 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1037 ; X64-NEXT:    retq
1038   %arg0 = bitcast float *%a0 to i8*
1039   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1040   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1041   %sext = sext <4 x i1> %cmp to <4 x i32>
1042   %mask = bitcast <4 x i32> %sext to <4 x float>
1043   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1044   ret <4 x float> %call
1045 }
1046 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1047
1048 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1049 ; X86-LABEL: test_mm_mask_i32gather_ps:
1050 ; X86:       # %bb.0:
1051 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1052 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1053 ; X86-NEXT:    retl
1054 ;
1055 ; X64-LABEL: test_mm_mask_i32gather_ps:
1056 ; X64:       # %bb.0:
1057 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1058 ; X64-NEXT:    retq
1059   %arg1 = bitcast float *%a1 to i8*
1060   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1061   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1062   ret <4 x float> %call
1063 }
1064
1065 define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1066 ; X86-LABEL: test_mm256_i32gather_ps:
1067 ; X86:       # %bb.0:
1068 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1069 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1070 ; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1071 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1072 ; X86-NEXT:    vmovaps %ymm1, %ymm0
1073 ; X86-NEXT:    retl
1074 ;
1075 ; X64-LABEL: test_mm256_i32gather_ps:
1076 ; X64:       # %bb.0:
1077 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1078 ; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1079 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1080 ; X64-NEXT:    vmovaps %ymm1, %ymm0
1081 ; X64-NEXT:    retq
1082   %arg0 = bitcast float *%a0 to i8*
1083   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1084   %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1085   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1086   ret <8 x float> %call
1087 }
1088 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1089
1090 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1091 ; X86-LABEL: test_mm256_mask_i32gather_ps:
1092 ; X86:       # %bb.0:
1093 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1094 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1095 ; X86-NEXT:    retl
1096 ;
1097 ; X64-LABEL: test_mm256_mask_i32gather_ps:
1098 ; X64:       # %bb.0:
1099 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1100 ; X64-NEXT:    retq
1101   %arg1 = bitcast float *%a1 to i8*
1102   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1103   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1104   ret <8 x float> %call
1105 }
1106
1107 define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1108 ; X86-LABEL: test_mm_i64gather_epi32:
1109 ; X86:       # %bb.0:
1110 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1111 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1112 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1113 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1114 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1115 ; X86-NEXT:    retl
1116 ;
1117 ; X64-LABEL: test_mm_i64gather_epi32:
1118 ; X64:       # %bb.0:
1119 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1120 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1121 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1122 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1123 ; X64-NEXT:    retq
1124   %arg0 = bitcast i32 *%a0 to i8*
1125   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1126   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1127   %bc = bitcast <4 x i32> %call to <2 x i64>
1128   ret <2 x i64> %bc
1129 }
1130 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1131
1132 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1133 ; X86-LABEL: test_mm_mask_i64gather_epi32:
1134 ; X86:       # %bb.0:
1135 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1136 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1137 ; X86-NEXT:    retl
1138 ;
1139 ; X64-LABEL: test_mm_mask_i64gather_epi32:
1140 ; X64:       # %bb.0:
1141 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1142 ; X64-NEXT:    retq
1143   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1144   %arg1 = bitcast i32 *%a1 to i8*
1145   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1146   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1147   %bc = bitcast <4 x i32> %call to <2 x i64>
1148   ret <2 x i64> %bc
1149 }
1150
1151 define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1152 ; X86-LABEL: test_mm256_i64gather_epi32:
1153 ; X86:       # %bb.0:
1154 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1155 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1156 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1157 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1158 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1159 ; X86-NEXT:    vzeroupper
1160 ; X86-NEXT:    retl
1161 ;
1162 ; X64-LABEL: test_mm256_i64gather_epi32:
1163 ; X64:       # %bb.0:
1164 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1165 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1166 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1167 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1168 ; X64-NEXT:    vzeroupper
1169 ; X64-NEXT:    retq
1170   %arg0 = bitcast i32 *%a0 to i8*
1171   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1172   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1173   %bc = bitcast <4 x i32> %call to <2 x i64>
1174   ret <2 x i64> %bc
1175 }
1176 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1177
1178 define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1179 ; X86-LABEL: test_mm256_mask_i64gather_epi32:
1180 ; X86:       # %bb.0:
1181 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1182 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1183 ; X86-NEXT:    vzeroupper
1184 ; X86-NEXT:    retl
1185 ;
1186 ; X64-LABEL: test_mm256_mask_i64gather_epi32:
1187 ; X64:       # %bb.0:
1188 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1189 ; X64-NEXT:    vzeroupper
1190 ; X64-NEXT:    retq
1191   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1192   %arg1 = bitcast i32 *%a1 to i8*
1193   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1194   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1195   %bc = bitcast <4 x i32> %call to <2 x i64>
1196   ret <2 x i64> %bc
1197 }
1198
1199 define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1200 ; X86-LABEL: test_mm_i64gather_epi64:
1201 ; X86:       # %bb.0:
1202 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1203 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1204 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1205 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1206 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1207 ; X86-NEXT:    retl
1208 ;
1209 ; X64-LABEL: test_mm_i64gather_epi64:
1210 ; X64:       # %bb.0:
1211 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1212 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1213 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1214 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1215 ; X64-NEXT:    retq
1216   %arg0 = bitcast i64 *%a0 to i8*
1217   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1218   ret <2 x i64> %call
1219 }
1220 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1221
1222 define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1223 ; X86-LABEL: test_mm_mask_i64gather_epi64:
1224 ; X86:       # %bb.0:
1225 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1226 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1227 ; X86-NEXT:    retl
1228 ;
1229 ; X64-LABEL: test_mm_mask_i64gather_epi64:
1230 ; X64:       # %bb.0:
1231 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1232 ; X64-NEXT:    retq
1233   %arg1 = bitcast i64 *%a1 to i8*
1234   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1235   ret <2 x i64> %call
1236 }
1237
1238 define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1239 ; X86-LABEL: test_mm256_i64gather_epi64:
1240 ; X86:       # %bb.0:
1241 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1242 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1243 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1244 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1245 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
1246 ; X86-NEXT:    retl
1247 ;
1248 ; X64-LABEL: test_mm256_i64gather_epi64:
1249 ; X64:       # %bb.0:
1250 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1251 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1252 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1253 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
1254 ; X64-NEXT:    retq
1255   %arg0 = bitcast i64 *%a0 to i8*
1256   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1257   ret <4 x i64> %call
1258 }
1259 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1260
1261 define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1262 ; X86-LABEL: test_mm256_mask_i64gather_epi64:
1263 ; X86:       # %bb.0:
1264 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1265 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1266 ; X86-NEXT:    retl
1267 ;
1268 ; X64-LABEL: test_mm256_mask_i64gather_epi64:
1269 ; X64:       # %bb.0:
1270 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1271 ; X64-NEXT:    retq
1272   %arg1 = bitcast i64 *%a1 to i8*
1273   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1274   ret <4 x i64> %call
1275 }
1276
1277 define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1278 ; X86-LABEL: test_mm_i64gather_pd:
1279 ; X86:       # %bb.0:
1280 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1281 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1282 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1283 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1284 ; X86-NEXT:    vmovapd %xmm1, %xmm0
1285 ; X86-NEXT:    retl
1286 ;
1287 ; X64-LABEL: test_mm_i64gather_pd:
1288 ; X64:       # %bb.0:
1289 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1290 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1291 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1292 ; X64-NEXT:    vmovapd %xmm1, %xmm0
1293 ; X64-NEXT:    retq
1294   %arg0 = bitcast double *%a0 to i8*
1295   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1296   %sext = sext <2 x i1> %cmp to <2 x i64>
1297   %mask = bitcast <2 x i64> %sext to <2 x double>
1298   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1299   ret <2 x double> %call
1300 }
1301 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1302
1303 define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1304 ; X86-LABEL: test_mm_mask_i64gather_pd:
1305 ; X86:       # %bb.0:
1306 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1307 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1308 ; X86-NEXT:    retl
1309 ;
1310 ; X64-LABEL: test_mm_mask_i64gather_pd:
1311 ; X64:       # %bb.0:
1312 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1313 ; X64-NEXT:    retq
1314   %arg1 = bitcast double *%a1 to i8*
1315   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1316   ret <2 x double> %call
1317 }
1318
1319 define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
1320 ; X86-LABEL: test_mm256_i64gather_pd:
1321 ; X86:       # %bb.0:
1322 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1323 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1324 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1325 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1326 ; X86-NEXT:    vmovapd %ymm1, %ymm0
1327 ; X86-NEXT:    retl
1328 ;
1329 ; X64-LABEL: test_mm256_i64gather_pd:
1330 ; X64:       # %bb.0:
1331 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1332 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1333 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1334 ; X64-NEXT:    vmovapd %ymm1, %ymm0
1335 ; X64-NEXT:    retq
1336   %arg0 = bitcast double *%a0 to i8*
1337   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1338   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1339   ret <4 x double> %call
1340 }
1341 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1342
1343 define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
1344 ; X86-LABEL: test_mm256_mask_i64gather_pd:
1345 ; X86:       # %bb.0:
1346 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1347 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1348 ; X86-NEXT:    retl
1349 ;
1350 ; X64-LABEL: test_mm256_mask_i64gather_pd:
1351 ; X64:       # %bb.0:
1352 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1353 ; X64-NEXT:    retq
1354   %arg1 = bitcast i64 *%a1 to i8*
1355   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1356   ret <4 x double> %call
1357 }
1358
1359 define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
1360 ; X86-LABEL: test_mm_i64gather_ps:
1361 ; X86:       # %bb.0:
1362 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1363 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1364 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1365 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1366 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1367 ; X86-NEXT:    retl
1368 ;
1369 ; X64-LABEL: test_mm_i64gather_ps:
1370 ; X64:       # %bb.0:
1371 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1372 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1373 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1374 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1375 ; X64-NEXT:    retq
1376   %arg0 = bitcast float *%a0 to i8*
1377   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1378   %sext = sext <4 x i1> %cmp to <4 x i32>
1379   %mask = bitcast <4 x i32> %sext to <4 x float>
1380   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1381   ret <4 x float> %call
1382 }
1383 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
1384
1385 define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1386 ; X86-LABEL: test_mm_mask_i64gather_ps:
1387 ; X86:       # %bb.0:
1388 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1389 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1390 ; X86-NEXT:    retl
1391 ;
1392 ; X64-LABEL: test_mm_mask_i64gather_ps:
1393 ; X64:       # %bb.0:
1394 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1395 ; X64-NEXT:    retq
1396   %arg1 = bitcast float *%a1 to i8*
1397   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1398   ret <4 x float> %call
1399 }
1400
1401 define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
1402 ; X86-LABEL: test_mm256_i64gather_ps:
1403 ; X86:       # %bb.0:
1404 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1405 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1406 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1407 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1408 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1409 ; X86-NEXT:    vzeroupper
1410 ; X86-NEXT:    retl
1411 ;
1412 ; X64-LABEL: test_mm256_i64gather_ps:
1413 ; X64:       # %bb.0:
1414 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1415 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1416 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1417 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1418 ; X64-NEXT:    vzeroupper
1419 ; X64-NEXT:    retq
1420   %arg0 = bitcast float *%a0 to i8*
1421   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1422   %sext = sext <4 x i1> %cmp to <4 x i32>
1423   %mask = bitcast <4 x i32> %sext to <4 x float>
1424   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1425   ret <4 x float> %call
1426 }
1427 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
1428
1429 define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
1430 ; X86-LABEL: test_mm256_mask_i64gather_ps:
1431 ; X86:       # %bb.0:
1432 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1433 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1434 ; X86-NEXT:    vzeroupper
1435 ; X86-NEXT:    retl
1436 ;
1437 ; X64-LABEL: test_mm256_mask_i64gather_ps:
1438 ; X64:       # %bb.0:
1439 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1440 ; X64-NEXT:    vzeroupper
1441 ; X64-NEXT:    retq
1442   %arg1 = bitcast float *%a1 to i8*
1443   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1444   ret <4 x float> %call
1445 }
1446
1447 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1448 ; CHECK-LABEL: test0_mm256_inserti128_si256:
1449 ; CHECK:       # %bb.0:
1450 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1451 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1452 ; CHECK-NEXT:    ret{{[l|q]}}
1453   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1454   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1455   ret <4 x i64> %res
1456 }
1457
1458 define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1459 ; CHECK-LABEL: test1_mm256_inserti128_si256:
1460 ; CHECK:       # %bb.0:
1461 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1462 ; CHECK-NEXT:    ret{{[l|q]}}
1463   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1464   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1465   ret <4 x i64> %res
1466 }
1467
1468 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1469 ; CHECK-LABEL: test_mm256_madd_epi16:
1470 ; CHECK:       # %bb.0:
1471 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1472 ; CHECK-NEXT:    ret{{[l|q]}}
1473   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1474   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1475   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1476   %bc = bitcast <8 x i32> %res to <4 x i64>
1477   ret <4 x i64> %bc
1478 }
1479 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1480
1481 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1482 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1483 ; CHECK:       # %bb.0:
1484 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
1485 ; CHECK-NEXT:    ret{{[l|q]}}
1486   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1487   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1488   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1489   %bc = bitcast <16 x i16> %res to <4 x i64>
1490   ret <4 x i64> %bc
1491 }
1492 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1493
1494 define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
1495 ; X86-LABEL: test_mm_maskload_epi32:
1496 ; X86:       # %bb.0:
1497 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1498 ; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
1499 ; X86-NEXT:    retl
1500 ;
1501 ; X64-LABEL: test_mm_maskload_epi32:
1502 ; X64:       # %bb.0:
1503 ; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
1504 ; X64-NEXT:    retq
1505   %arg0 = bitcast i32* %a0 to i8*
1506   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1507   %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
1508   %bc = bitcast <4 x i32> %call to <2 x i64>
1509   ret <2 x i64> %bc
1510 }
1511 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
1512
1513 define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
1514 ; X86-LABEL: test_mm256_maskload_epi32:
1515 ; X86:       # %bb.0:
1516 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1517 ; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
1518 ; X86-NEXT:    retl
1519 ;
1520 ; X64-LABEL: test_mm256_maskload_epi32:
1521 ; X64:       # %bb.0:
1522 ; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
1523 ; X64-NEXT:    retq
1524   %arg0 = bitcast i32* %a0 to i8*
1525   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1526   %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
1527   %bc = bitcast <8 x i32> %call to <4 x i64>
1528   ret <4 x i64> %bc
1529 }
1530 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
1531
1532 define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
1533 ; X86-LABEL: test_mm_maskload_epi64:
1534 ; X86:       # %bb.0:
1535 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1536 ; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
1537 ; X86-NEXT:    retl
1538 ;
1539 ; X64-LABEL: test_mm_maskload_epi64:
1540 ; X64:       # %bb.0:
1541 ; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
1542 ; X64-NEXT:    retq
1543   %arg0 = bitcast i64* %a0 to i8*
1544   %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
1545   ret <2 x i64> %res
1546 }
1547 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
1548
1549 define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
1550 ; X86-LABEL: test_mm256_maskload_epi64:
1551 ; X86:       # %bb.0:
1552 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1553 ; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
1554 ; X86-NEXT:    retl
1555 ;
1556 ; X64-LABEL: test_mm256_maskload_epi64:
1557 ; X64:       # %bb.0:
1558 ; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
1559 ; X64-NEXT:    retq
1560   %arg0 = bitcast i64* %a0 to i8*
1561   %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
1562   ret <4 x i64> %res
1563 }
1564 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
1565
1566 define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1567 ; X86-LABEL: test_mm_maskstore_epi32:
1568 ; X86:       # %bb.0:
1569 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1570 ; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
1571 ; X86-NEXT:    retl
1572 ;
1573 ; X64-LABEL: test_mm_maskstore_epi32:
1574 ; X64:       # %bb.0:
1575 ; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
1576 ; X64-NEXT:    retq
1577   %arg0 = bitcast float* %a0 to i8*
1578   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1579   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1580   call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
1581   ret void
1582 }
1583 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
1584
1585 define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1586 ; X86-LABEL: test_mm256_maskstore_epi32:
1587 ; X86:       # %bb.0:
1588 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1589 ; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
1590 ; X86-NEXT:    vzeroupper
1591 ; X86-NEXT:    retl
1592 ;
1593 ; X64-LABEL: test_mm256_maskstore_epi32:
1594 ; X64:       # %bb.0:
1595 ; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
1596 ; X64-NEXT:    vzeroupper
1597 ; X64-NEXT:    retq
1598   %arg0 = bitcast float* %a0 to i8*
1599   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1600   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1601   call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
1602   ret void
1603 }
1604 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
1605
1606 define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1607 ; X86-LABEL: test_mm_maskstore_epi64:
1608 ; X86:       # %bb.0:
1609 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1610 ; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
1611 ; X86-NEXT:    retl
1612 ;
1613 ; X64-LABEL: test_mm_maskstore_epi64:
1614 ; X64:       # %bb.0:
1615 ; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
1616 ; X64-NEXT:    retq
1617   %arg0 = bitcast i64* %a0 to i8*
1618   call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
1619   ret void
1620 }
1621 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
1622
1623 define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1624 ; X86-LABEL: test_mm256_maskstore_epi64:
1625 ; X86:       # %bb.0:
1626 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1627 ; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
1628 ; X86-NEXT:    vzeroupper
1629 ; X86-NEXT:    retl
1630 ;
1631 ; X64-LABEL: test_mm256_maskstore_epi64:
1632 ; X64:       # %bb.0:
1633 ; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
1634 ; X64-NEXT:    vzeroupper
1635 ; X64-NEXT:    retq
1636   %arg0 = bitcast i64* %a0 to i8*
1637   call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
1638   ret void
1639 }
1640 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
1641
1642 define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1643 ; CHECK-LABEL: test_mm256_max_epi8:
1644 ; CHECK:       # %bb.0:
1645 ; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
1646 ; CHECK-NEXT:    ret{{[l|q]}}
1647   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1648   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1649   %cmp = icmp sgt <32 x i8> %arg0, %arg1
1650   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1651   %bc = bitcast <32 x i8> %sel to <4 x i64>
1652   ret <4 x i64> %bc
1653 }
1654
1655 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1656 ; CHECK-LABEL: test_mm256_max_epi16:
1657 ; CHECK:       # %bb.0:
1658 ; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
1659 ; CHECK-NEXT:    ret{{[l|q]}}
1660   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1661   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1662   %cmp = icmp sgt <16 x i16> %arg0, %arg1
1663   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1664   %bc = bitcast <16 x i16> %sel to <4 x i64>
1665   ret <4 x i64> %bc
1666 }
1667
1668 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1669 ; CHECK-LABEL: test_mm256_max_epi32:
1670 ; CHECK:       # %bb.0:
1671 ; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
1672 ; CHECK-NEXT:    ret{{[l|q]}}
1673   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1674   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1675   %cmp = icmp sgt <8 x i32> %arg0, %arg1
1676   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1677   %bc = bitcast <8 x i32> %sel to <4 x i64>
1678   ret <4 x i64> %bc
1679 }
1680
1681 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1682 ; CHECK-LABEL: test_mm256_max_epu8:
1683 ; CHECK:       # %bb.0:
1684 ; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
1685 ; CHECK-NEXT:    ret{{[l|q]}}
1686   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1687   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1688   %cmp = icmp ugt <32 x i8> %arg0, %arg1
1689   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1690   %bc = bitcast <32 x i8> %sel to <4 x i64>
1691   ret <4 x i64> %bc
1692 }
1693
1694 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1695 ; CHECK-LABEL: test_mm256_max_epu16:
1696 ; CHECK:       # %bb.0:
1697 ; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
1698 ; CHECK-NEXT:    ret{{[l|q]}}
1699   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1700   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1701   %cmp = icmp ugt <16 x i16> %arg0, %arg1
1702   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1703   %bc = bitcast <16 x i16> %sel to <4 x i64>
1704   ret <4 x i64> %bc
1705 }
1706
1707 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1708 ; CHECK-LABEL: test_mm256_max_epu32:
1709 ; CHECK:       # %bb.0:
1710 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
1711 ; CHECK-NEXT:    ret{{[l|q]}}
1712   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1713   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1714   %cmp = icmp ugt <8 x i32> %arg0, %arg1
1715   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1716   %bc = bitcast <8 x i32> %sel to <4 x i64>
1717   ret <4 x i64> %bc
1718 }
1719
1720 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1721 ; CHECK-LABEL: test_mm256_min_epi8:
1722 ; CHECK:       # %bb.0:
1723 ; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
1724 ; CHECK-NEXT:    ret{{[l|q]}}
1725   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1726   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1727   %cmp = icmp slt <32 x i8> %arg0, %arg1
1728   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1729   %bc = bitcast <32 x i8> %sel to <4 x i64>
1730   ret <4 x i64> %bc
1731 }
1732
1733 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1734 ; CHECK-LABEL: test_mm256_min_epi16:
1735 ; CHECK:       # %bb.0:
1736 ; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
1737 ; CHECK-NEXT:    ret{{[l|q]}}
1738   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1739   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1740   %cmp = icmp slt <16 x i16> %arg0, %arg1
1741   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1742   %bc = bitcast <16 x i16> %sel to <4 x i64>
1743   ret <4 x i64> %bc
1744 }
1745
1746 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1747 ; CHECK-LABEL: test_mm256_min_epi32:
1748 ; CHECK:       # %bb.0:
1749 ; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
1750 ; CHECK-NEXT:    ret{{[l|q]}}
1751   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1752   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1753   %cmp = icmp slt <8 x i32> %arg0, %arg1
1754   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1755   %bc = bitcast <8 x i32> %sel to <4 x i64>
1756   ret <4 x i64> %bc
1757 }
1758
1759 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1760 ; CHECK-LABEL: test_mm256_min_epu8:
1761 ; CHECK:       # %bb.0:
1762 ; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
1763 ; CHECK-NEXT:    ret{{[l|q]}}
1764   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1765   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1766   %cmp = icmp ult <32 x i8> %arg0, %arg1
1767   %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
1768   %bc = bitcast <32 x i8> %sel to <4 x i64>
1769   ret <4 x i64> %bc
1770 }
1771
1772 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1773 ; CHECK-LABEL: test_mm256_min_epu16:
1774 ; CHECK:       # %bb.0:
1775 ; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
1776 ; CHECK-NEXT:    ret{{[l|q]}}
1777   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1778   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1779   %cmp = icmp ult <16 x i16> %arg0, %arg1
1780   %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
1781   %bc = bitcast <16 x i16> %sel to <4 x i64>
1782   ret <4 x i64> %bc
1783 }
1784
1785 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1786 ; CHECK-LABEL: test_mm256_min_epu32:
1787 ; CHECK:       # %bb.0:
1788 ; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
1789 ; CHECK-NEXT:    ret{{[l|q]}}
1790   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1791   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1792   %cmp = icmp ult <8 x i32> %arg0, %arg1
1793   %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
1794   %bc = bitcast <8 x i32> %sel to <4 x i64>
1795   ret <4 x i64> %bc
1796 }
1797
1798 define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1799 ; CHECK-LABEL: test_mm256_movemask_epi8:
1800 ; CHECK:       # %bb.0:
1801 ; CHECK-NEXT:    vpmovmskb %ymm0, %eax
1802 ; CHECK-NEXT:    vzeroupper
1803 ; CHECK-NEXT:    ret{{[l|q]}}
1804   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1805   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1806   ret i32 %res
1807 }
1808 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
1809
1810 define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1811 ; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1812 ; CHECK:       # %bb.0:
1813 ; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
1814 ; CHECK-NEXT:    ret{{[l|q]}}
1815   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1816   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1817   %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1818   %bc = bitcast <16 x i16>  %call to <4 x i64>
1819   ret <4 x i64> %bc
1820 }
1821 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
1822
1823 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1824 ; CHECK-LABEL: test_mm256_mul_epi32:
1825 ; CHECK:       # %bb.0:
1826 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
1827 ; CHECK-NEXT:    ret{{[l|q]}}
1828   %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1829   %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1830   %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1831   %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1832   %res = mul nsw <4 x i64> %A1, %B1
1833   ret <4 x i64> %res
1834 }
1835 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
1836
1837 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1838 ; CHECK-LABEL: test_mm256_mul_epu32:
1839 ; CHECK:       # %bb.0:
1840 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1841 ; CHECK-NEXT:    ret{{[l|q]}}
1842   %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1843   %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1844   %res = mul nuw <4 x i64> %A, %B
1845   ret <4 x i64> %res
1846 }
1847 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
1848
1849 define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1850 ; CHECK-LABEL: test_mm256_mulhi_epi16:
1851 ; CHECK:       # %bb.0:
1852 ; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1853 ; CHECK-NEXT:    ret{{[l|q]}}
1854   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1855   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1856   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1857   %bc = bitcast <16 x i16> %res to <4 x i64>
1858   ret <4 x i64> %bc
1859 }
1860 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1861
1862 define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1863 ; CHECK-LABEL: test_mm256_mulhi_epu16:
1864 ; CHECK:       # %bb.0:
1865 ; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1866 ; CHECK-NEXT:    ret{{[l|q]}}
1867   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1868   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1869   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1870   %bc = bitcast <16 x i16> %res to <4 x i64>
1871   ret <4 x i64> %bc
1872 }
1873 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1874
1875 define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1876 ; CHECK-LABEL: test_mm256_mulhrs_epi16:
1877 ; CHECK:       # %bb.0:
1878 ; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
1879 ; CHECK-NEXT:    ret{{[l|q]}}
1880   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1881   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1882   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1883   %bc = bitcast <16 x i16> %res to <4 x i64>
1884   ret <4 x i64> %bc
1885 }
1886 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1887
1888 define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1889 ; CHECK-LABEL: test_mm256_mullo_epi16:
1890 ; CHECK:       # %bb.0:
1891 ; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1892 ; CHECK-NEXT:    ret{{[l|q]}}
1893   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1894   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1895   %res = mul <16 x i16> %arg0, %arg1
1896   %bc = bitcast <16 x i16> %res to <4 x i64>
1897   ret <4 x i64> %bc
1898 }
1899
1900 define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1901 ; CHECK-LABEL: test_mm256_mullo_epi32:
1902 ; CHECK:       # %bb.0:
1903 ; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1904 ; CHECK-NEXT:    ret{{[l|q]}}
1905   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1906   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1907   %res = mul <8 x i32> %arg0, %arg1
1908   %bc = bitcast <8 x i32> %res to <4 x i64>
1909   ret <4 x i64> %bc
1910 }
1911
1912 define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1913 ; CHECK-LABEL: test_mm256_or_si256:
1914 ; CHECK:       # %bb.0:
1915 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1916 ; CHECK-NEXT:    ret{{[l|q]}}
1917   %res = or <4 x i64> %a0, %a1
1918   ret <4 x i64> %res
1919 }
1920
1921 define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1922 ; CHECK-LABEL: test_mm256_packs_epi16:
1923 ; CHECK:       # %bb.0:
1924 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
1925 ; CHECK-NEXT:    ret{{[l|q]}}
1926   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1927   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1928   %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1929   %res = bitcast <32 x i8> %call to <4 x i64>
1930   ret <4 x i64> %res
1931 }
1932 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
1933
1934 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1935 ; CHECK-LABEL: test_mm256_packs_epi32:
1936 ; CHECK:       # %bb.0:
1937 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
1938 ; CHECK-NEXT:    ret{{[l|q]}}
1939   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1940   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1941   %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1942   %res = bitcast <16 x i16> %call to <4 x i64>
1943   ret <4 x i64> %res
1944 }
1945 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
1946
1947 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1948 ; CHECK-LABEL: test_mm256_packus_epi16:
1949 ; CHECK:       # %bb.0:
1950 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1951 ; CHECK-NEXT:    ret{{[l|q]}}
1952   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1953   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1954   %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1955   %res = bitcast <32 x i8> %call to <4 x i64>
1956   ret <4 x i64> %res
1957 }
1958 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
1959
1960 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1961 ; CHECK-LABEL: test_mm256_packus_epi32:
1962 ; CHECK:       # %bb.0:
1963 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1964 ; CHECK-NEXT:    ret{{[l|q]}}
1965   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1966   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1967   %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1968   %res = bitcast <16 x i16> %call to <4 x i64>
1969   ret <4 x i64> %res
1970 }
1971 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
1972
1973 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1974 ; CHECK-LABEL: test_mm256_permute2x128_si256:
1975 ; CHECK:       # %bb.0:
1976 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1977 ; CHECK-NEXT:    ret{{[l|q]}}
1978   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1979   ret <4 x i64> %res
1980 }
1981 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
1982
1983 define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1984 ; CHECK-LABEL: test_mm256_permute4x64_epi64:
1985 ; CHECK:       # %bb.0:
1986 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1987 ; CHECK-NEXT:    ret{{[l|q]}}
1988   %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1989   ret <4 x i64> %res
1990 }
1991
1992 define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1993 ; CHECK-LABEL: test_mm256_permute4x64_pd:
1994 ; CHECK:       # %bb.0:
1995 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1996 ; CHECK-NEXT:    ret{{[l|q]}}
1997   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1998   ret <4 x double> %res
1999 }
2000
2001 define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2002 ; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
2003 ; CHECK:       # %bb.0:
2004 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2005 ; CHECK-NEXT:    ret{{[l|q]}}
2006   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2007   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2008   %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
2009   %res = bitcast <8 x i32> %call to <4 x i64>
2010   ret <4 x i64> %res
2011 }
2012 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
2013
2014 define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
2015 ; CHECK-LABEL: test_mm256_permutevar8x32_ps:
2016 ; CHECK:       # %bb.0:
2017 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2018 ; CHECK-NEXT:    ret{{[l|q]}}
2019   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2020   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
2021   ret <8 x float> %res
2022 }
2023 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2024
2025 define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2026 ; CHECK-LABEL: test_mm256_sad_epu8:
2027 ; CHECK:       # %bb.0:
2028 ; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2029 ; CHECK-NEXT:    ret{{[l|q]}}
2030   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2031   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2032   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2033   ret <4 x i64> %res
2034 }
2035 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2036
2037 define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2038 ; CHECK-LABEL: test_mm256_shuffle_epi32:
2039 ; CHECK:       # %bb.0:
2040 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2041 ; CHECK-NEXT:    ret{{[l|q]}}
2042   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2043   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2044   %res = bitcast <8 x i32> %shuf to <4 x i64>
2045   ret <4 x i64> %res
2046 }
2047
2048 define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2049 ; CHECK-LABEL: test_mm256_shuffle_epi8:
2050 ; CHECK:       # %bb.0:
2051 ; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2052 ; CHECK-NEXT:    ret{{[l|q]}}
2053   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2054   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2055   %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2056   %res = bitcast <32 x i8> %shuf to <4 x i64>
2057   ret <4 x i64> %res
2058 }
2059 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2060
2061 define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2062 ; CHECK-LABEL: test_mm256_shufflehi_epi16:
2063 ; CHECK:       # %bb.0:
2064 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2065 ; CHECK-NEXT:    ret{{[l|q]}}
2066   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2067   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2068   %res = bitcast <16 x i16> %shuf to <4 x i64>
2069   ret <4 x i64> %res
2070 }
2071
2072 define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2073 ; CHECK-LABEL: test_mm256_shufflelo_epi16:
2074 ; CHECK:       # %bb.0:
2075 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2076 ; CHECK-NEXT:    ret{{[l|q]}}
2077   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2078   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2079   %res = bitcast <16 x i16> %shuf to <4 x i64>
2080   ret <4 x i64> %res
2081 }
2082
2083 define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2084 ; CHECK-LABEL: test_mm256_sign_epi8:
2085 ; CHECK:       # %bb.0:
2086 ; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2087 ; CHECK-NEXT:    ret{{[l|q]}}
2088   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2089   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2090   %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2091   %res = bitcast <32 x i8> %call to <4 x i64>
2092   ret <4 x i64> %res
2093 }
2094 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2095
2096 define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2097 ; CHECK-LABEL: test_mm256_sign_epi16:
2098 ; CHECK:       # %bb.0:
2099 ; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2100 ; CHECK-NEXT:    ret{{[l|q]}}
2101   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2102   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2103   %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2104   %res = bitcast <16 x i16> %call to <4 x i64>
2105   ret <4 x i64> %res
2106 }
2107 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2108
2109 define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2110 ; CHECK-LABEL: test_mm256_sign_epi32:
2111 ; CHECK:       # %bb.0:
2112 ; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2113 ; CHECK-NEXT:    ret{{[l|q]}}
2114   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2115   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2116   %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2117   %res = bitcast <8 x i32> %call to <4 x i64>
2118   ret <4 x i64> %res
2119 }
2120 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2121
2122 define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2123 ; CHECK-LABEL: test_mm256_sll_epi16:
2124 ; CHECK:       # %bb.0:
2125 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2126 ; CHECK-NEXT:    ret{{[l|q]}}
2127   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2128   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2129   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2130   %bc = bitcast <16 x i16> %res to <4 x i64>
2131   ret <4 x i64> %bc
2132 }
2133 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2134
2135 define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2136 ; CHECK-LABEL: test_mm256_sll_epi32:
2137 ; CHECK:       # %bb.0:
2138 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2139 ; CHECK-NEXT:    ret{{[l|q]}}
2140   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2141   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2142   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2143   %bc = bitcast <8 x i32> %res to <4 x i64>
2144   ret <4 x i64> %bc
2145 }
2146 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2147
2148 define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2149 ; CHECK-LABEL: test_mm256_sll_epi64:
2150 ; CHECK:       # %bb.0:
2151 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2152 ; CHECK-NEXT:    ret{{[l|q]}}
2153   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
2154   ret <4 x i64> %res
2155 }
2156 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2157
2158 define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2159 ; CHECK-LABEL: test_mm256_slli_epi16:
2160 ; CHECK:       # %bb.0:
2161 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
2162 ; CHECK-NEXT:    ret{{[l|q]}}
2163   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2164   %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
2165   %bc = bitcast <16 x i16> %res to <4 x i64>
2166   ret <4 x i64> %bc
2167 }
2168 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2169
2170 define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2171 ; CHECK-LABEL: test_mm256_slli_epi32:
2172 ; CHECK:       # %bb.0:
2173 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
2174 ; CHECK-NEXT:    ret{{[l|q]}}
2175   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2176   %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
2177   %bc = bitcast <8 x i32> %res to <4 x i64>
2178   ret <4 x i64> %bc
2179 }
2180 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2181
2182 define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2183 ; CHECK-LABEL: test_mm256_slli_epi64:
2184 ; CHECK:       # %bb.0:
2185 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
2186 ; CHECK-NEXT:    ret{{[l|q]}}
2187   %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
2188   ret <4 x i64> %res
2189 }
2190 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2191
2192 define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2193 ; CHECK-LABEL: test_mm256_slli_si256:
2194 ; CHECK:       # %bb.0:
2195 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2196 ; CHECK-NEXT:    ret{{[l|q]}}
2197   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2198   %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
2199   %res = bitcast <32 x i8> %shuf to <4 x i64>
2200   ret <4 x i64> %res
2201 }
2202
2203 define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2204 ; CHECK-LABEL: test_mm_sllv_epi32:
2205 ; CHECK:       # %bb.0:
2206 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2207 ; CHECK-NEXT:    ret{{[l|q]}}
2208   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2209   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2210   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2211   %bc = bitcast <4 x i32> %res to <2 x i64>
2212   ret <2 x i64> %bc
2213 }
2214 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2215
2216 define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2217 ; CHECK-LABEL: test_mm256_sllv_epi32:
2218 ; CHECK:       # %bb.0:
2219 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2220 ; CHECK-NEXT:    ret{{[l|q]}}
2221   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2222   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2223   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2224   %bc = bitcast <8 x i32> %res to <4 x i64>
2225   ret <4 x i64> %bc
2226 }
2227 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2228
2229 define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2230 ; CHECK-LABEL: test_mm_sllv_epi64:
2231 ; CHECK:       # %bb.0:
2232 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2233 ; CHECK-NEXT:    ret{{[l|q]}}
2234   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
2235   ret <2 x i64> %res
2236 }
2237 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2238
2239 define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2240 ; CHECK-LABEL: test_mm256_sllv_epi64:
2241 ; CHECK:       # %bb.0:
2242 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2243 ; CHECK-NEXT:    ret{{[l|q]}}
2244   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2245   ret <4 x i64> %res
2246 }
2247 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2248
2249 define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2250 ; CHECK-LABEL: test_mm256_sra_epi16:
2251 ; CHECK:       # %bb.0:
2252 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2253 ; CHECK-NEXT:    ret{{[l|q]}}
2254   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2255   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2256   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2257   %bc = bitcast <16 x i16> %res to <4 x i64>
2258   ret <4 x i64> %bc
2259 }
2260 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2261
2262 define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2263 ; CHECK-LABEL: test_mm256_sra_epi32:
2264 ; CHECK:       # %bb.0:
2265 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2266 ; CHECK-NEXT:    ret{{[l|q]}}
2267   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2268   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2269   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2270   %bc = bitcast <8 x i32> %res to <4 x i64>
2271   ret <4 x i64> %bc
2272 }
2273 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2274
2275 define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2276 ; CHECK-LABEL: test_mm256_srai_epi16:
2277 ; CHECK:       # %bb.0:
2278 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
2279 ; CHECK-NEXT:    ret{{[l|q]}}
2280   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2281   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
2282   %bc = bitcast <16 x i16> %res to <4 x i64>
2283   ret <4 x i64> %bc
2284 }
2285 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2286
2287 define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2288 ; CHECK-LABEL: test_mm256_srai_epi32:
2289 ; CHECK:       # %bb.0:
2290 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
2291 ; CHECK-NEXT:    ret{{[l|q]}}
2292   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2293   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2294   %bc = bitcast <8 x i32> %res to <4 x i64>
2295   ret <4 x i64> %bc
2296 }
2297 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2298
2299 define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2300 ; CHECK-LABEL: test_mm_srav_epi32:
2301 ; CHECK:       # %bb.0:
2302 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2303 ; CHECK-NEXT:    ret{{[l|q]}}
2304   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2305   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2306   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2307   %bc = bitcast <4 x i32> %res to <2 x i64>
2308   ret <2 x i64> %bc
2309 }
2310 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2311
2312 define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2313 ; CHECK-LABEL: test_mm256_srav_epi32:
2314 ; CHECK:       # %bb.0:
2315 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2316 ; CHECK-NEXT:    ret{{[l|q]}}
2317   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2318   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2319   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2320   %bc = bitcast <8 x i32> %res to <4 x i64>
2321   ret <4 x i64> %bc
2322 }
2323 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2324
2325 define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2326 ; CHECK-LABEL: test_mm256_srl_epi16:
2327 ; CHECK:       # %bb.0:
2328 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2329 ; CHECK-NEXT:    ret{{[l|q]}}
2330   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2331   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2332   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2333   %bc = bitcast <16 x i16> %res to <4 x i64>
2334   ret <4 x i64> %bc
2335 }
2336 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2337
2338 define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2339 ; CHECK-LABEL: test_mm256_srl_epi32:
2340 ; CHECK:       # %bb.0:
2341 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2342 ; CHECK-NEXT:    ret{{[l|q]}}
2343   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2344   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2345   %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2346   %bc = bitcast <8 x i32> %res to <4 x i64>
2347   ret <4 x i64> %bc
2348 }
2349 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2350
2351 define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2352 ; CHECK-LABEL: test_mm256_srl_epi64:
2353 ; CHECK:       # %bb.0:
2354 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2355 ; CHECK-NEXT:    ret{{[l|q]}}
2356   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
2357   ret <4 x i64> %res
2358 }
2359 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2360
2361 define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2362 ; CHECK-LABEL: test_mm256_srli_epi16:
2363 ; CHECK:       # %bb.0:
2364 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
2365 ; CHECK-NEXT:    ret{{[l|q]}}
2366   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2367   %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
2368   %bc = bitcast <16 x i16> %res to <4 x i64>
2369   ret <4 x i64> %bc
2370 }
2371 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2372
2373 define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2374 ; CHECK-LABEL: test_mm256_srli_epi32:
2375 ; CHECK:       # %bb.0:
2376 ; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
2377 ; CHECK-NEXT:    ret{{[l|q]}}
2378   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2379   %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
2380   %bc = bitcast <8 x i32> %res to <4 x i64>
2381   ret <4 x i64> %bc
2382 }
2383 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2384
2385 define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2386 ; CHECK-LABEL: test_mm256_srli_epi64:
2387 ; CHECK:       # %bb.0:
2388 ; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
2389 ; CHECK-NEXT:    ret{{[l|q]}}
2390   %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
2391   ret <4 x i64> %res
2392 }
2393 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2394
2395 define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2396 ; CHECK-LABEL: test_mm256_srli_si256:
2397 ; CHECK:       # %bb.0:
2398 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2399 ; CHECK-NEXT:    ret{{[l|q]}}
2400   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2401   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
2402   %res = bitcast <32 x i8> %shuf to <4 x i64>
2403   ret <4 x i64> %res
2404 }
2405
2406 define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2407 ; CHECK-LABEL: test_mm_srlv_epi32:
2408 ; CHECK:       # %bb.0:
2409 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
2410 ; CHECK-NEXT:    ret{{[l|q]}}
2411   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2412   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2413   %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2414   %bc = bitcast <4 x i32> %res to <2 x i64>
2415   ret <2 x i64> %bc
2416 }
2417 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
2418
2419 define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2420 ; CHECK-LABEL: test_mm256_srlv_epi32:
2421 ; CHECK:       # %bb.0:
2422 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
2423 ; CHECK-NEXT:    ret{{[l|q]}}
2424   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2425   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2426   %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2427   %bc = bitcast <8 x i32> %res to <4 x i64>
2428   ret <4 x i64> %bc
2429 }
2430 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2431
2432 define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2433 ; CHECK-LABEL: test_mm_srlv_epi64:
2434 ; CHECK:       # %bb.0:
2435 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
2436 ; CHECK-NEXT:    ret{{[l|q]}}
2437   %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
2438   ret <2 x i64> %res
2439 }
2440 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
2441
2442 define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2443 ; CHECK-LABEL: test_mm256_srlv_epi64:
2444 ; CHECK:       # %bb.0:
2445 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
2446 ; CHECK-NEXT:    ret{{[l|q]}}
2447   %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2448   ret <4 x i64> %res
2449 }
2450 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2451
2452 define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
2453 ; X86-LABEL: test_mm256_stream_load_si256:
2454 ; X86:       # %bb.0:
2455 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2456 ; X86-NEXT:    vmovntdqa (%eax), %ymm0
2457 ; X86-NEXT:    retl
2458 ;
2459 ; X64-LABEL: test_mm256_stream_load_si256:
2460 ; X64:       # %bb.0:
2461 ; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2462 ; X64-NEXT:    retq
2463   %arg0 = bitcast <4 x i64> *%a0 to i8*
2464   %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
2465   ret <4 x i64> %res
2466 }
2467 declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
2468
2469 define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2470 ; CHECK-LABEL: test_mm256_sub_epi8:
2471 ; CHECK:       # %bb.0:
2472 ; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2473 ; CHECK-NEXT:    ret{{[l|q]}}
2474   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2475   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2476   %res = sub <32 x i8> %arg0, %arg1
2477   %bc = bitcast <32 x i8> %res to <4 x i64>
2478   ret <4 x i64> %bc
2479 }
2480
2481 define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2482 ; CHECK-LABEL: test_mm256_sub_epi16:
2483 ; CHECK:       # %bb.0:
2484 ; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2485 ; CHECK-NEXT:    ret{{[l|q]}}
2486   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2487   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2488   %res = sub <16 x i16> %arg0, %arg1
2489   %bc = bitcast <16 x i16> %res to <4 x i64>
2490   ret <4 x i64> %bc
2491 }
2492
2493 define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2494 ; CHECK-LABEL: test_mm256_sub_epi32:
2495 ; CHECK:       # %bb.0:
2496 ; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2497 ; CHECK-NEXT:    ret{{[l|q]}}
2498   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2499   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2500   %res = sub <8 x i32> %arg0, %arg1
2501   %bc = bitcast <8 x i32> %res to <4 x i64>
2502   ret <4 x i64> %bc
2503 }
2504
2505 define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2506 ; CHECK-LABEL: test_mm256_sub_epi64:
2507 ; CHECK:       # %bb.0:
2508 ; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2509 ; CHECK-NEXT:    ret{{[l|q]}}
2510   %res = sub <4 x i64> %a0, %a1
2511   ret <4 x i64> %res
2512 }
2513
2514 define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2515 ; CHECK-LABEL: test_mm256_subs_epi8:
2516 ; CHECK:       # %bb.0:
2517 ; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2518 ; CHECK-NEXT:    ret{{[l|q]}}
2519   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2520   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2521   %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2522   %bc = bitcast <32 x i8> %res to <4 x i64>
2523   ret <4 x i64> %bc
2524 }
2525 declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2526
2527 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2528 ; CHECK-LABEL: test_mm256_subs_epi16:
2529 ; CHECK:       # %bb.0:
2530 ; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2531 ; CHECK-NEXT:    ret{{[l|q]}}
2532   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2533   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2534   %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2535   %bc = bitcast <16 x i16> %res to <4 x i64>
2536   ret <4 x i64> %bc
2537 }
2538 declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2539
2540 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2541 ; CHECK-LABEL: test_mm256_subs_epu8:
2542 ; CHECK:       # %bb.0:
2543 ; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2544 ; CHECK-NEXT:    ret{{[l|q]}}
2545   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2546   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2547   %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2548   %bc = bitcast <32 x i8> %res to <4 x i64>
2549   ret <4 x i64> %bc
2550 }
2551 declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
2552
2553 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2554 ; CHECK-LABEL: test_mm256_subs_epu16:
2555 ; CHECK:       # %bb.0:
2556 ; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2557 ; CHECK-NEXT:    ret{{[l|q]}}
2558   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2559   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2560   %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2561   %bc = bitcast <16 x i16> %res to <4 x i64>
2562   ret <4 x i64> %bc
2563 }
2564 declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
2565
2566 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2567 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
2568 ; CHECK:       # %bb.0:
2569 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2570 ; CHECK-NEXT:    ret{{[l|q]}}
2571   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2572   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2573   %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2574   %bc = bitcast <32 x i8> %res to <4 x i64>
2575   ret <4 x i64> %bc
2576 }
2577
2578 define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2579 ; CHECK-LABEL: test_mm256_unpackhi_epi16:
2580 ; CHECK:       # %bb.0:
2581 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2582 ; CHECK-NEXT:    ret{{[l|q]}}
2583   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2584   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2585   %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2586   %bc = bitcast <16 x i16> %res to <4 x i64>
2587   ret <4 x i64> %bc
2588 }
2589
2590 define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2591 ; CHECK-LABEL: test_mm256_unpackhi_epi32:
2592 ; CHECK:       # %bb.0:
2593 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2594 ; CHECK-NEXT:    ret{{[l|q]}}
2595   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2596   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2597   %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
2598   %bc = bitcast <8 x i32> %res to <4 x i64>
2599   ret <4 x i64> %bc
2600 }
2601
2602 define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2603 ; CHECK-LABEL: test_mm256_unpackhi_epi64:
2604 ; CHECK:       # %bb.0:
2605 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2606 ; CHECK-NEXT:    ret{{[l|q]}}
2607   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
2608   ret <4 x i64> %res
2609 }
2610
2611 define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2612 ; CHECK-LABEL: test_mm256_unpacklo_epi8:
2613 ; CHECK:       # %bb.0:
2614 ; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2615 ; CHECK-NEXT:    ret{{[l|q]}}
2616   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2617   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2618   %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
2619   %bc = bitcast <32 x i8> %res to <4 x i64>
2620   ret <4 x i64> %bc
2621 }
2622
2623 define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2624 ; CHECK-LABEL: test_mm256_unpacklo_epi16:
2625 ; CHECK:       # %bb.0:
2626 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2627 ; CHECK-NEXT:    ret{{[l|q]}}
2628   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2629   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2630   %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
2631   %bc = bitcast <16 x i16> %res to <4 x i64>
2632   ret <4 x i64> %bc
2633 }
2634
2635 define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2636 ; CHECK-LABEL: test_mm256_unpacklo_epi32:
2637 ; CHECK:       # %bb.0:
2638 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2639 ; CHECK-NEXT:    ret{{[l|q]}}
2640   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2641   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2642   %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
2643   %bc = bitcast <8 x i32> %res to <4 x i64>
2644   ret <4 x i64> %bc
2645 }
2646
2647 define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2648 ; CHECK-LABEL: test_mm256_unpacklo_epi64:
2649 ; CHECK:       # %bb.0:
2650 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2651 ; CHECK-NEXT:    ret{{[l|q]}}
2652   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2653   ret <4 x i64> %res
2654 }
2655
2656 define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2657 ; CHECK-LABEL: test_mm256_xor_si256:
2658 ; CHECK:       # %bb.0:
2659 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
2660 ; CHECK-NEXT:    ret{{[l|q]}}
2661   %res = xor <4 x i64> %a0, %a1
2662   ret <4 x i64> %res
2663 }
2664
2665 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
2666
2667 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone