llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
   3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
   4
   5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
   6
   7 define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
     ; Views %a0 as <32 x i8>, applies @llvm.abs.v32i8 (flag operand false) and
     ; casts the result back; a single vpabsb is expected on both targets.
   8 ; CHECK-LABEL: test_mm256_abs_epi8:
   9 ; CHECK:       # %bb.0:
  10 ; CHECK-NEXT:    vpabsb %ymm0, %ymm0
  11 ; CHECK-NEXT:    ret{{[l|q]}}
  12   %arg = bitcast <4 x i64> %a0 to <32 x i8>
  13   %abs = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %arg, i1 false)
  14   %res = bitcast <32 x i8> %abs to <4 x i64>
  15   ret <4 x i64> %res
  16 }
  17 declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone
  18
  19 define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
     ; Views %a0 as <16 x i16>, applies @llvm.abs.v16i16 (flag operand false) and
     ; casts the result back; a single vpabsw is expected on both targets.
  20 ; CHECK-LABEL: test_mm256_abs_epi16:
  21 ; CHECK:       # %bb.0:
  22 ; CHECK-NEXT:    vpabsw %ymm0, %ymm0
  23 ; CHECK-NEXT:    ret{{[l|q]}}
  24   %arg = bitcast <4 x i64> %a0 to <16 x i16>
  25   %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %arg, i1 false)
  26   %res = bitcast <16 x i16> %abs to <4 x i64>
  27   ret <4 x i64> %res
  28 }
  29 declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone
  30
  31 define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
     ; Views %a0 as <8 x i32>, applies @llvm.abs.v8i32 (flag operand false) and
     ; casts the result back; a single vpabsd is expected on both targets.
  32 ; CHECK-LABEL: test_mm256_abs_epi32:
  33 ; CHECK:       # %bb.0:
  34 ; CHECK-NEXT:    vpabsd %ymm0, %ymm0
  35 ; CHECK-NEXT:    ret{{[l|q]}}
  36   %arg = bitcast <4 x i64> %a0 to <8 x i32>
  37   %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %arg, i1 false)
  38   %res = bitcast <8 x i32> %abs to <4 x i64>
  39   ret <4 x i64> %res
  40 }
  41 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone
  42
  43 define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Plain IR add on both operands viewed as <32 x i8>; expected to select vpaddb.
  44 ; CHECK-LABEL: test_mm256_add_epi8:
  45 ; CHECK:       # %bb.0:
  46 ; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
  47 ; CHECK-NEXT:    ret{{[l|q]}}
  48   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  49   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  50   %res = add <32 x i8> %arg0, %arg1
  51   %bc = bitcast <32 x i8> %res to <4 x i64>
  52   ret <4 x i64> %bc
  53 }
  54
  55 define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Plain IR add on both operands viewed as <16 x i16>; expected to select vpaddw.
  56 ; CHECK-LABEL: test_mm256_add_epi16:
  57 ; CHECK:       # %bb.0:
  58 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
  59 ; CHECK-NEXT:    ret{{[l|q]}}
  60   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  61   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  62   %res = add <16 x i16> %arg0, %arg1
  63   %bc = bitcast <16 x i16> %res to <4 x i64>
  64   ret <4 x i64> %bc
  65 }
  66
  67 define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Plain IR add on both operands viewed as <8 x i32>; expected to select vpaddd.
  68 ; CHECK-LABEL: test_mm256_add_epi32:
  69 ; CHECK:       # %bb.0:
  70 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
  71 ; CHECK-NEXT:    ret{{[l|q]}}
  72   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  73   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  74   %res = add <8 x i32> %arg0, %arg1
  75   %bc = bitcast <8 x i32> %res to <4 x i64>
  76   ret <4 x i64> %bc
  77 }
  78
  79 define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; The add is done directly on <4 x i64>, so no bitcasts are involved;
     ; expected to select vpaddq.
  80 ; CHECK-LABEL: test_mm256_add_epi64:
  81 ; CHECK:       # %bb.0:
  82 ; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
  83 ; CHECK-NEXT:    ret{{[l|q]}}
  84   %res = add <4 x i64> %a0, %a1
  85   ret <4 x i64> %res
  86 }
  87
  88 define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls @llvm.sadd.sat.v32i8 on the operands viewed as <32 x i8>;
     ; expected to select vpaddsb.
  89 ; CHECK-LABEL: test_mm256_adds_epi8:
  90 ; CHECK:       # %bb.0:
  91 ; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
  92 ; CHECK-NEXT:    ret{{[l|q]}}
  93   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  94   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  95   %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
  96   %bc = bitcast <32 x i8> %res to <4 x i64>
  97   ret <4 x i64> %bc
  98 }
  99 declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
 100
 101 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls @llvm.sadd.sat.v16i16 on the operands viewed as <16 x i16>;
     ; expected to select vpaddsw.
 102 ; CHECK-LABEL: test_mm256_adds_epi16:
 103 ; CHECK:       # %bb.0:
 104 ; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
 105 ; CHECK-NEXT:    ret{{[l|q]}}
 106   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 107   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 108   %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
 109   %bc = bitcast <16 x i16> %res to <4 x i64>
 110   ret <4 x i64> %bc
 111 }
 112 declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
 113
 114 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls @llvm.uadd.sat.v32i8 on the operands viewed as <32 x i8>;
     ; expected to select vpaddusb.
 115 ; CHECK-LABEL: test_mm256_adds_epu8:
 116 ; CHECK:       # %bb.0:
 117 ; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
 118 ; CHECK-NEXT:    ret{{[l|q]}}
 119   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 120   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 121   %res = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
 122   %bc = bitcast <32 x i8> %res to <4 x i64>
 123   ret <4 x i64> %bc
 124 }
 125 declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
 126
 127 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls @llvm.uadd.sat.v16i16 on the operands viewed as <16 x i16>;
     ; expected to select vpaddusw.
 128 ; CHECK-LABEL: test_mm256_adds_epu16:
 129 ; CHECK:       # %bb.0:
 130 ; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
 131 ; CHECK-NEXT:    ret{{[l|q]}}
 132   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 133   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 134   %res = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
 135   %bc = bitcast <16 x i16> %res to <4 x i64>
 136   ret <4 x i64> %bc
 137 }
 138 declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
 139
 140 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
     ; Byte shuffle that, for each 16-byte half, takes bytes 2..15 of %a0 followed
     ; by bytes 0..1 of the same half of %a1; expected to be matched as one vpalignr.
 141 ; CHECK-LABEL: test_mm256_alignr_epi8:
 142 ; CHECK:       # %bb.0:
 143 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
 144 ; CHECK-NEXT:    ret{{[l|q]}}
 145   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 146   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 147   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
 148   %res = bitcast <32 x i8> %shuf to <4 x i64>
 149   ret <4 x i64> %res
 150 }
 151
 152 define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
     ; Second alignr shape: for each 16-byte half, bytes 1..15 of %a0 followed by
     ; byte 0 of the same half of %a1; expected to be matched as one vpalignr.
 153 ; CHECK-LABEL: test2_mm256_alignr_epi8:
 154 ; CHECK:       # %bb.0:
 155 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
 156 ; CHECK-NEXT:    ret{{[l|q]}}
 157   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 158   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 159   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
 160   %res = bitcast <32 x i8> %shuf to <4 x i64>
 161   ret <4 x i64> %res
 162 }
 163
 164 define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Bitwise and directly on <4 x i64>; the expected instruction is vandps.
 165 ; CHECK-LABEL: test_mm256_and_si256:
 166 ; CHECK:       # %bb.0:
 167 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
 168 ; CHECK-NEXT:    ret{{[l|q]}}
 169   %res = and <4 x i64> %a0, %a1
 170   ret <4 x i64> %res
 171 }
 172
 173 define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Inverts %a0 by xor with all-ones, then ands the result with %a1.
     ; The expected output keeps these as separate steps: vpcmpeqd of ymm2 with
     ; itself, then vpxor and vpand (no single and-not instruction is expected).
 174 ; CHECK-LABEL: test_mm256_andnot_si256:
 175 ; CHECK:       # %bb.0:
 176 ; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 177 ; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 178 ; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
 179 ; CHECK-NEXT:    ret{{[l|q]}}
 180   %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
 181   %res = and <4 x i64> %not, %a1
 182   ret <4 x i64> %res
 183 }
 184
 185 define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Calls the target intrinsic @llvm.x86.avx2.pavg.b on the operands viewed as
     ; <32 x i8>; expected to select vpavgb.
 186 ; CHECK-LABEL: test_mm256_avg_epu8:
 187 ; CHECK:       # %bb.0:
 188 ; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
 189 ; CHECK-NEXT:    ret{{[l|q]}}
 190   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 191   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 192   %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
 193   %bc = bitcast <32 x i8> %res to <4 x i64>
 194   ret <4 x i64> %bc
 195 }
 196 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 197
 198 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Calls the target intrinsic @llvm.x86.avx2.pavg.w on the operands viewed as
     ; <16 x i16>; expected to select vpavgw.
 199 ; CHECK-LABEL: test_mm256_avg_epu16:
 200 ; CHECK:       # %bb.0:
 201 ; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
 202 ; CHECK-NEXT:    ret{{[l|q]}}
 203   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 204   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 205   %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
 206   %bc = bitcast <16 x i16> %res to <4 x i64>
 207   ret <4 x i64> %bc
 208 }
 209 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 210
 211 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Shuffle that takes 16-bit elements 1 and 9 from %a1 and every other element
     ; from %a0, each staying in its own position; expected to select vpblendw.
 212 ; CHECK-LABEL: test_mm256_blend_epi16:
 213 ; CHECK:       # %bb.0:
 214 ; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
 215 ; CHECK-NEXT:    ret{{[l|q]}}
 216   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 217   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 218   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 219   %res = bitcast <16 x i16> %shuf to <4 x i64>
 220   ret <4 x i64> %res
 221 }
 222
 223 define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
     ; 128-bit blend: 32-bit elements 0 and 2 come from %a1, elements 1 and 3 from
     ; %a0; the expected instruction is vblendps on xmm registers.
 224 ; CHECK-LABEL: test_mm_blend_epi32:
 225 ; CHECK:       # %bb.0:
 226 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 227 ; CHECK-NEXT:    ret{{[l|q]}}
 228   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 229   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 230   %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
 231   %res = bitcast <4 x i32> %shuf to <2 x i64>
 232   ret <2 x i64> %res
 233 }
 234
 235 define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
     ; 256-bit blend: 32-bit elements 0, 2, 4 and 5 come from %a1, elements 1, 3, 6
     ; and 7 from %a0; the expected instruction is vblendps on ymm registers.
 236 ; CHECK-LABEL: test_mm256_blend_epi32:
 237 ; CHECK:       # %bb.0:
 238 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 239 ; CHECK-NEXT:    ret{{[l|q]}}
 240   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 241   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 242   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
 243   %res = bitcast <8 x i32> %shuf to <4 x i64>
 244   ret <4 x i64> %res
 245 }
 246
 247 define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
     ; Passes %a0, %a1 and %a2 (all viewed as <32 x i8>) to the target intrinsic
     ; @llvm.x86.avx2.pblendvb; expected to select vpblendvb.
 248 ; CHECK-LABEL: test_mm256_blendv_epi8:
 249 ; CHECK:       # %bb.0:
 250 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 251 ; CHECK-NEXT:    ret{{[l|q]}}
 252   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 253   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 254   %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
 255   %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
 256   %res = bitcast <32 x i8> %call to <4 x i64>
 257   ret <4 x i64> %res
 258 }
 259 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
 260
 261 define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
     ; Splats byte 0 of %a0 across 16 bytes (all-zero shuffle mask); expected to
     ; select vpbroadcastb with an xmm destination.
 262 ; CHECK-LABEL: test_mm_broadcastb_epi8:
 263 ; CHECK:       # %bb.0:
 264 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
 265 ; CHECK-NEXT:    ret{{[l|q]}}
 266   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 267   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
 268   %res = bitcast <16 x i8> %shuf to <2 x i64>
 269   ret <2 x i64> %res
 270 }
 271
 272 define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
     ; Splats byte 0 of %a0 across 32 bytes (all-zero shuffle mask); expected to
     ; select vpbroadcastb from xmm0 into ymm0.
 273 ; CHECK-LABEL: test_mm256_broadcastb_epi8:
 274 ; CHECK:       # %bb.0:
 275 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
 276 ; CHECK-NEXT:    ret{{[l|q]}}
 277   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 278   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
 279   %res = bitcast <32 x i8> %shuf to <4 x i64>
 280   ret <4 x i64> %res
 281 }
 282
 283 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
     ; Splats 32-bit element 0 of %a0 across four elements; the expected
     ; instruction is vbroadcastss with an xmm destination.
 284 ; CHECK-LABEL: test_mm_broadcastd_epi32:
 285 ; CHECK:       # %bb.0:
 286 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 287 ; CHECK-NEXT:    ret{{[l|q]}}
 288   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 289   %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
 290   %res = bitcast <4 x i32> %shuf to <2 x i64>
 291   ret <2 x i64> %res
 292 }
 293
 294 define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
     ; Splats 32-bit element 0 of %a0 across eight elements; the expected
     ; instruction is vbroadcastss from xmm0 into ymm0.
 295 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
 296 ; CHECK:       # %bb.0:
 297 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 298 ; CHECK-NEXT:    ret{{[l|q]}}
 299   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 300   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
 301   %res = bitcast <8 x i32> %shuf to <4 x i64>
 302   ret <4 x i64> %res
 303 }
 304
 305 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
     ; Splats 64-bit element 0 of %a0 into both elements; the expected
     ; instruction is vmovddup.
 306 ; CHECK-LABEL: test_mm_broadcastq_epi64:
 307 ; CHECK:       # %bb.0:
 308 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 309 ; CHECK-NEXT:    ret{{[l|q]}}
 310   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
 311   ret <2 x i64> %res
 312 }
 313
 314 define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
     ; Splats 64-bit element 0 of %a0 into all four elements; the expected
     ; instruction is vbroadcastsd from xmm0 into ymm0.
 315 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
 316 ; CHECK:       # %bb.0:
 317 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 318 ; CHECK-NEXT:    ret{{[l|q]}}
 319   %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
 320   ret <4 x i64> %res
 321 }
 322
 323 define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
     ; Splats double element 0 of %a0 into both elements; expected to select
     ; vmovddup.
 324 ; CHECK-LABEL: test_mm_broadcastsd_pd:
 325 ; CHECK:       # %bb.0:
 326 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 327 ; CHECK-NEXT:    ret{{[l|q]}}
 328   %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
 329   ret <2 x double> %res
 330 }
 331
 332 define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
     ; Splats double element 0 of %a0 into all four elements; expected to select
     ; vbroadcastsd from xmm0 into ymm0.
 333 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
 334 ; CHECK:       # %bb.0:
 335 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 336 ; CHECK-NEXT:    ret{{[l|q]}}
 337   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
 338   ret <4 x double> %res
 339 }
 340
 341 define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
     ; Repeats the two i64 elements of the register argument %a0 into a four-element
     ; result (mask 0,1,0,1). The expected output is a vinsertf128 with immediate 1
     ; that inserts xmm0 into ymm0, preceded by a register "kill" annotation.
 342 ; CHECK-LABEL: test_mm256_broadcastsi128_si256:
 343 ; CHECK:       # %bb.0:
 344 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 345 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 346 ; CHECK-NEXT:    ret{{[l|q]}}
 347   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 348   ret <4 x i64> %res
 349 }
 350
 351 define <4 x i64> @test_mm256_broadcastsi128_si256_mem(ptr %p0) {
     ; Same 0,1,0,1 shuffle as the register variant, but the <2 x i64> source is
     ; loaded from %p0; a vbroadcastf128 from memory is expected. The assertions
     ; are split per target because the i386 output first loads the pointer
     ; argument from the stack into eax.
 352 ; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
 353 ; X86:       # %bb.0:
 354 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 355 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 356 ; X86-NEXT:    retl
 357 ;
 358 ; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
 359 ; X64:       # %bb.0:
 360 ; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 361 ; X64-NEXT:    retq
 362   %a0 = load <2 x i64>, ptr %p0
 363   %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
 364   ret <4 x i64> %res
 365 }
 366
 367 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
     ; Splats float element 0 of %a0 into all four elements; expected to select
     ; vbroadcastss with an xmm destination.
 368 ; CHECK-LABEL: test_mm_broadcastss_ps:
 369 ; CHECK:       # %bb.0:
 370 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 371 ; CHECK-NEXT:    ret{{[l|q]}}
 372   %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
 373   ret <4 x float> %res
 374 }
 375
 376 define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
     ; Splats float element 0 of %a0 into all eight elements; expected to select
     ; vbroadcastss from xmm0 into ymm0.
 377 ; CHECK-LABEL: test_mm256_broadcastss_ps:
 378 ; CHECK:       # %bb.0:
 379 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 380 ; CHECK-NEXT:    ret{{[l|q]}}
 381   %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
 382   ret <8 x float> %res
 383 }
 384
 385 define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
     ; Splats 16-bit element 0 of %a0 across eight elements; expected to select
     ; vpbroadcastw with an xmm destination.
 386 ; CHECK-LABEL: test_mm_broadcastw_epi16:
 387 ; CHECK:       # %bb.0:
 388 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
 389 ; CHECK-NEXT:    ret{{[l|q]}}
 390   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 391   %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
 392   %res = bitcast <8 x i16> %shuf to <2 x i64>
 393   ret <2 x i64> %res
 394 }
 395
 396 define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
     ; Splats 16-bit element 0 of %a0 across sixteen elements; expected to select
     ; vpbroadcastw from xmm0 into ymm0.
 397 ; CHECK-LABEL: test_mm256_broadcastw_epi16:
 398 ; CHECK:       # %bb.0:
 399 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
 400 ; CHECK-NEXT:    ret{{[l|q]}}
 401   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 402   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
 403   %res = bitcast <16 x i16> %shuf to <4 x i64>
 404   ret <4 x i64> %res
 405 }
 406
 407 define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
     ; Shuffle whose FIRST operand is the zero vector: each 16-byte half of the
     ; result is three zero bytes followed by the low 13 bytes of the matching
     ; half of %a0. Expected to be matched as a single vpslldq.
 408 ; CHECK-LABEL: test_mm256_bslli_epi128:
 409 ; CHECK:       # %bb.0:
 410 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
 411 ; CHECK-NEXT:    ret{{[l|q]}}
 412   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 413   %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
 414   %res = bitcast <32 x i8> %shuf to <4 x i64>
 415   ret <4 x i64> %res
 416 }
 417
 418 define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
     ; Shuffle whose SECOND operand is the zero vector: each 16-byte half of the
     ; result is bytes 3..15 of the matching half of %a0 followed by three zero
     ; bytes. Expected to be matched as a single vpsrldq.
 419 ; CHECK-LABEL: test_mm256_bsrli_epi128:
 420 ; CHECK:       # %bb.0:
 421 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
 422 ; CHECK-NEXT:    ret{{[l|q]}}
 423   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 424   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
 425   %res = bitcast <32 x i8> %shuf to <4 x i64>
 426   ret <4 x i64> %res
 427 }
 428
 429 define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; icmp eq on the <32 x i8> views, then sext of the i1 results back to i8;
     ; the pair is expected to select a single vpcmpeqb.
 430 ; CHECK-LABEL: test_mm256_cmpeq_epi8:
 431 ; CHECK:       # %bb.0:
 432 ; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 433 ; CHECK-NEXT:    ret{{[l|q]}}
 434   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 435   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 436   %cmp = icmp eq <32 x i8> %arg0, %arg1
 437   %res = sext <32 x i1> %cmp to <32 x i8>
 438   %bc = bitcast <32 x i8> %res to <4 x i64>
 439   ret <4 x i64> %bc
 440 }
 441
 442 define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; icmp eq on the <16 x i16> views, then sext of the i1 results back to i16;
     ; the pair is expected to select a single vpcmpeqw.
 443 ; CHECK-LABEL: test_mm256_cmpeq_epi16:
 444 ; CHECK:       # %bb.0:
 445 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 446 ; CHECK-NEXT:    ret{{[l|q]}}
 447   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 448   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 449   %cmp = icmp eq <16 x i16> %arg0, %arg1
 450   %res = sext <16 x i1> %cmp to <16 x i16>
 451   %bc = bitcast <16 x i16> %res to <4 x i64>
 452   ret <4 x i64> %bc
 453 }
 454
 455 define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; icmp eq on the <8 x i32> views, then sext of the i1 results back to i32;
     ; the pair is expected to select a single vpcmpeqd.
 456 ; CHECK-LABEL: test_mm256_cmpeq_epi32:
 457 ; CHECK:       # %bb.0:
 458 ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
 459 ; CHECK-NEXT:    ret{{[l|q]}}
 460   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 461   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 462   %cmp = icmp eq <8 x i32> %arg0, %arg1
 463   %res = sext <8 x i1> %cmp to <8 x i32>
 464   %bc = bitcast <8 x i32> %res to <4 x i64>
 465   ret <4 x i64> %bc
 466 }
 467
 468 define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; icmp eq directly on <4 x i64>, then sext of the i1 results to i64; the
     ; pair is expected to select a single vpcmpeqq.
 469 ; CHECK-LABEL: test_mm256_cmpeq_epi64:
 470 ; CHECK:       # %bb.0:
 471 ; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 472 ; CHECK-NEXT:    ret{{[l|q]}}
 473   %cmp = icmp eq <4 x i64> %a0, %a1
 474   %res = sext <4 x i1> %cmp to <4 x i64>
 475   ret <4 x i64> %res
 476 }
 477
 478 define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Signed greater-than (icmp sgt) on the <32 x i8> views, then sext of the i1
     ; results back to i8; the pair is expected to select a single vpcmpgtb.
 479 ; CHECK-LABEL: test_mm256_cmpgt_epi8:
 480 ; CHECK:       # %bb.0:
 481 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 482 ; CHECK-NEXT:    ret{{[l|q]}}
 483   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 484   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
 485   %cmp = icmp sgt <32 x i8> %arg0, %arg1
 486   %res = sext <32 x i1> %cmp to <32 x i8>
 487   %bc = bitcast <32 x i8> %res to <4 x i64>
 488   ret <4 x i64> %bc
 489 }
 490
 491 define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Signed greater-than (icmp sgt) on the <16 x i16> views, then sext of the i1
     ; results back to i16; the pair is expected to select a single vpcmpgtw.
 492 ; CHECK-LABEL: test_mm256_cmpgt_epi16:
 493 ; CHECK:       # %bb.0:
 494 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 495 ; CHECK-NEXT:    ret{{[l|q]}}
 496   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 497   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 498   %cmp = icmp sgt <16 x i16> %arg0, %arg1
 499   %res = sext <16 x i1> %cmp to <16 x i16>
 500   %bc = bitcast <16 x i16> %res to <4 x i64>
 501   ret <4 x i64> %bc
 502 }
 503
 504 define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Signed greater-than (icmp sgt) on the <8 x i32> views, then sext of the i1
     ; results back to i32; the pair is expected to select a single vpcmpgtd.
 505 ; CHECK-LABEL: test_mm256_cmpgt_epi32:
 506 ; CHECK:       # %bb.0:
 507 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 508 ; CHECK-NEXT:    ret{{[l|q]}}
 509   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 510   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 511   %cmp = icmp sgt <8 x i32> %arg0, %arg1
 512   %res = sext <8 x i1> %cmp to <8 x i32>
 513   %bc = bitcast <8 x i32> %res to <4 x i64>
 514   ret <4 x i64> %bc
 515 }
 516
 517 define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Signed greater-than (icmp sgt) directly on <4 x i64>, then sext of the i1
     ; results to i64; the pair is expected to select a single vpcmpgtq.
 518 ; CHECK-LABEL: test_mm256_cmpgt_epi64:
 519 ; CHECK:       # %bb.0:
 520 ; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 521 ; CHECK-NEXT:    ret{{[l|q]}}
 522   %cmp = icmp sgt <4 x i64> %a0, %a1
 523   %res = sext <4 x i1> %cmp to <4 x i64>
 524   ret <4 x i64> %res
 525 }
 526
 527 define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
     ; Sign-extends all 16 bytes of the 128-bit input to i16; expected to select
     ; vpmovsxbw from xmm0 into ymm0.
 528 ; CHECK-LABEL: test_mm256_cvtepi8_epi16:
 529 ; CHECK:       # %bb.0:
 530 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
 531 ; CHECK-NEXT:    ret{{[l|q]}}
 532   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 533   %ext = sext <16 x i8> %arg0 to <16 x i16>
 534   %res = bitcast <16 x i16> %ext to <4 x i64>
 535   ret <4 x i64> %res
 536 }
 537
 538 define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
     ; Extracts the low 8 bytes with a shuffle, then sign-extends them to i32;
     ; the shuffle + sext pair is expected to select a single vpmovsxbd.
 539 ; CHECK-LABEL: test_mm256_cvtepi8_epi32:
 540 ; CHECK:       # %bb.0:
 541 ; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
 542 ; CHECK-NEXT:    ret{{[l|q]}}
 543   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 544   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 545   %ext = sext <8 x i8> %shuf to <8 x i32>
 546   %res = bitcast <8 x i32> %ext to <4 x i64>
 547   ret <4 x i64> %res
 548 }
 549
 550 define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
     ; Extracts the low 4 bytes with a shuffle, then sign-extends them to i64;
     ; the shuffle + sext pair is expected to select a single vpmovsxbq.
 551 ; CHECK-LABEL: test_mm256_cvtepi8_epi64:
 552 ; CHECK:       # %bb.0:
 553 ; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
 554 ; CHECK-NEXT:    ret{{[l|q]}}
 555   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 556   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 557   %ext = sext <4 x i8> %shuf to <4 x i64>
 558   ret <4 x i64> %ext
 559 }
 560
 561 define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
     ; Sign-extends all eight i16 elements of the 128-bit input to i32; expected
     ; to select vpmovsxwd from xmm0 into ymm0.
 562 ; CHECK-LABEL: test_mm256_cvtepi16_epi32:
 563 ; CHECK:       # %bb.0:
 564 ; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
 565 ; CHECK-NEXT:    ret{{[l|q]}}
 566   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 567   %ext = sext <8 x i16> %arg0 to <8 x i32>
 568   %res = bitcast <8 x i32> %ext to <4 x i64>
 569   ret <4 x i64> %res
 570 }
 571
 572 define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
     ; Extracts the low four i16 elements with a shuffle, then sign-extends them
     ; to i64; the pair is expected to select a single vpmovsxwq.
 573 ; CHECK-LABEL: test_mm256_cvtepi16_epi64:
 574 ; CHECK:       # %bb.0:
 575 ; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
 576 ; CHECK-NEXT:    ret{{[l|q]}}
 577   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 578   %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 579   %ext = sext <4 x i16> %shuf to <4 x i64>
 580   ret <4 x i64> %ext
 581 }
 582
 583 define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
     ; Sign-extends all four i32 elements of the 128-bit input to i64; expected
     ; to select vpmovsxdq from xmm0 into ymm0.
 584 ; CHECK-LABEL: test_mm256_cvtepi32_epi64:
 585 ; CHECK:       # %bb.0:
 586 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
 587 ; CHECK-NEXT:    ret{{[l|q]}}
 588   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 589   %ext = sext <4 x i32> %arg0 to <4 x i64>
 590   ret <4 x i64> %ext
 591 }
 592
 593 define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
     ; Zero-extends all 16 bytes of the 128-bit input to i16; expected to select
     ; vpmovzxbw, whose printed shuffle comment interleaves source bytes with zeros.
 594 ; CHECK-LABEL: test_mm256_cvtepu8_epi16:
 595 ; CHECK:       # %bb.0:
 596 ; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 597 ; CHECK-NEXT:    ret{{[l|q]}}
 598   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 599   %ext = zext <16 x i8> %arg0 to <16 x i16>
 600   %res = bitcast <16 x i16> %ext to <4 x i64>
 601   ret <4 x i64> %res
 602 }
 603
 604 define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
     ; Extracts the low 8 bytes with a shuffle, then zero-extends them to i32;
     ; the shuffle + zext pair is expected to select a single vpmovzxbd.
 605 ; CHECK-LABEL: test_mm256_cvtepu8_epi32:
 606 ; CHECK:       # %bb.0:
 607 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 608 ; CHECK-NEXT:    ret{{[l|q]}}
 609   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 610   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 611   %ext = zext <8 x i8> %shuf to <8 x i32>
 612   %res = bitcast <8 x i32> %ext to <4 x i64>
 613   ret <4 x i64> %res
 614 }
 615
 616 define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
     ; Extracts the low 4 bytes with a shuffle, then zero-extends them to i64;
     ; the shuffle + zext pair is expected to select a single vpmovzxbq.
 617 ; CHECK-LABEL: test_mm256_cvtepu8_epi64:
 618 ; CHECK:       # %bb.0:
 619 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 620 ; CHECK-NEXT:    ret{{[l|q]}}
 621   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
 622   %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 623   %ext = zext <4 x i8> %shuf to <4 x i64>
 624   ret <4 x i64> %ext
 625 }
 626
 627 define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
     ; Zero-extends all eight i16 elements of the 128-bit input to i32; expected
     ; to select vpmovzxwd.
 628 ; CHECK-LABEL: test_mm256_cvtepu16_epi32:
 629 ; CHECK:       # %bb.0:
 630 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 631 ; CHECK-NEXT:    ret{{[l|q]}}
 632   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 633   %ext = zext <8 x i16> %arg0 to <8 x i32>
 634   %res = bitcast <8 x i32> %ext to <4 x i64>
 635   ret <4 x i64> %res
 636 }
 637
 638 define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
     ; Extracts the low four i16 elements with a shuffle, then zero-extends them
     ; to i64; the pair is expected to select a single vpmovzxwq.
 639 ; CHECK-LABEL: test_mm256_cvtepu16_epi64:
 640 ; CHECK:       # %bb.0:
 641 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 642 ; CHECK-NEXT:    ret{{[l|q]}}
 643   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
 644   %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 645   %ext = zext <4 x i16> %shuf to <4 x i64>
 646   ret <4 x i64> %ext
 647 }
 648
 649 define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
     ; Zero-extends all four i32 elements of the 128-bit input to i64; expected
     ; to select vpmovzxdq.
 650 ; CHECK-LABEL: test_mm256_cvtepu32_epi64:
 651 ; CHECK:       # %bb.0:
 652 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 653 ; CHECK-NEXT:    ret{{[l|q]}}
 654   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 655   %ext = zext <4 x i32> %arg0 to <4 x i64>
 656   ret <4 x i64> %ext
 657 }
 658
 659 define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
     ; Selects elements 2 and 3 of %a0 (both shuffle operands are %a0). The
     ; expected output is vextractf128 with immediate 1, followed by a vzeroupper
     ; before the return.
 660 ; CHECK-LABEL: test_mm256_extracti128_si256:
 661 ; CHECK:       # %bb.0:
 662 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 663 ; CHECK-NEXT:    vzeroupper
 664 ; CHECK-NEXT:    ret{{[l|q]}}
 665   %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
 666   ret <2 x i64> %res
 667 }
 668
 669 define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls the target intrinsic @llvm.x86.avx2.phadd.w on the operands viewed as
     ; <16 x i16>; expected to select vphaddw.
 670 ; CHECK-LABEL: test_mm256_hadd_epi16:
 671 ; CHECK:       # %bb.0:
 672 ; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 673 ; CHECK-NEXT:    ret{{[l|q]}}
 674   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 675   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 676   %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
 677   %bc = bitcast <16 x i16> %res to <4 x i64>
 678   ret <4 x i64> %bc
 679 }
 680 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
 681
 682 define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls the target intrinsic @llvm.x86.avx2.phadd.d on the operands viewed as
     ; <8 x i32>; expected to select vphaddd.
 683 ; CHECK-LABEL: test_mm256_hadd_epi32:
 684 ; CHECK:       # %bb.0:
 685 ; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 686 ; CHECK-NEXT:    ret{{[l|q]}}
 687   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 688   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 689   %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
 690   %bc = bitcast <8 x i32> %res to <4 x i64>
 691   ret <4 x i64> %bc
 692 }
 693 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
 694
 695 define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
     ; Calls the target intrinsic @llvm.x86.avx2.phadd.sw on the operands viewed as
     ; <16 x i16>; expected to select vphaddsw.
 696 ; CHECK-LABEL: test_mm256_hadds_epi16:
 697 ; CHECK:       # %bb.0:
 698 ; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
 699 ; CHECK-NEXT:    ret{{[l|q]}}
 700   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 701   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 702   %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
 703   %bc = bitcast <16 x i16> %res to <4 x i64>
 704   ret <4 x i64> %bc
 705 }
 706 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
 707
 708 define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; phsub.w on both inputs viewed as <16 x i16>; expected asm is a single vphsubw
 709 ; CHECK-LABEL: test_mm256_hsub_epi16:
 710 ; CHECK:       # %bb.0:
 711 ; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
 712 ; CHECK-NEXT:    ret{{[l|q]}}
 713   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; reinterpret the 256 bits as sixteen i16 lanes
 714   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 715   %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1) ; intrinsic under test
 716   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the declared return type
 717   ret <4 x i64> %bc
 718 }
 719 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
 720
 721 define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; phsub.d on both inputs viewed as <8 x i32>; expected asm is a single vphsubd
 722 ; CHECK-LABEL: test_mm256_hsub_epi32:
 723 ; CHECK:       # %bb.0:
 724 ; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
 725 ; CHECK-NEXT:    ret{{[l|q]}}
 726   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; reinterpret the 256 bits as eight i32 lanes
 727   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 728   %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1) ; intrinsic under test
 729   %bc = bitcast <8 x i32> %res to <4 x i64> ; back to the declared return type
 730   ret <4 x i64> %bc
 731 }
 732 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
 733
 734 define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; phsub.sw on both inputs viewed as <16 x i16>; expected asm is a single vphsubsw
 735 ; CHECK-LABEL: test_mm256_hsubs_epi16:
 736 ; CHECK:       # %bb.0:
 737 ; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
 738 ; CHECK-NEXT:    ret{{[l|q]}}
 739   %arg0 = bitcast <4 x i64> %a0 to <16 x i16> ; reinterpret the 256 bits as sixteen i16 lanes
 740   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
 741   %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1) ; intrinsic under test
 742   %bc = bitcast <16 x i16> %res to <4 x i64> ; back to the declared return type
 743   ret <4 x i64> %bc
 744 }
 745 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
 746
 747 define <2 x i64> @test_mm_i32gather_epi32(ptr%a0, <2 x i64> %a1) { ; gather.d.d with undef first operand and a constant all-ones mask; scale immediate 2
 748 ; X86-LABEL: test_mm_i32gather_epi32:
 749 ; X86:       # %bb.0:
 750 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 751 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 752 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 753 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 754 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 755 ; X86-NEXT:    retl
 756 ;
 757 ; X64-LABEL: test_mm_i32gather_epi32:
 758 ; X64:       # %bb.0:
 759 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 760 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 761 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 762 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 763 ; X64-NEXT:    retq
 764   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
 765   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; index vector viewed as four i32 lanes
 766   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> ; every bit set in all four lanes
 767   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, ptr %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherdd
 768   %bc = bitcast <4 x i32> %call to <2 x i64> ; back to the declared return type
 769   ret <2 x i64> %bc
 770 }
 771 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, ptr, <4 x i32>, <4 x i32>, i8) nounwind readonly
 772
 773 define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { ; gather.d.d with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
 774 ; X86-LABEL: test_mm_mask_i32gather_epi32:
 775 ; X86:       # %bb.0:
 776 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 777 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
 778 ; X86-NEXT:    retl
 779 ;
 780 ; X64-LABEL: test_mm_mask_i32gather_epi32:
 781 ; X64:       # %bb.0:
 782 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
 783 ; X64-NEXT:    retq
 784   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 785   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
 786   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 787   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
 788   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, ptr %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2) ; expected asm is the vpgatherdd alone, with no mask or destination setup
 789   %bc = bitcast <4 x i32> %call to <2 x i64> ; back to the declared return type
 790   ret <2 x i64> %bc
 791 }
 792
 793 define <4 x i64> @test_mm256_i32gather_epi32(ptr%a0, <4 x i64> %a1) { ; gather.d.d.256 with undef first operand and a constant all-ones mask; scale immediate 2
 794 ; X86-LABEL: test_mm256_i32gather_epi32:
 795 ; X86:       # %bb.0:
 796 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 797 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 798 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 799 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
 800 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 801 ; X86-NEXT:    retl
 802 ;
 803 ; X64-LABEL: test_mm256_i32gather_epi32:
 804 ; X64:       # %bb.0:
 805 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 806 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 807 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
 808 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 809 ; X64-NEXT:    retq
 810   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
 811   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; index vector viewed as eight i32 lanes
 812   %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32> ; every bit set in all eight lanes
 813   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, ptr %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherdd
 814   %bc = bitcast <8 x i32> %call to <4 x i64> ; back to the declared return type
 815   ret <4 x i64> %bc
 816 }
 817 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, ptr, <8 x i32>, <8 x i32>, i8) nounwind readonly
 818
 819 define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, ptr%a1, <4 x i64> %a2, <4 x i64> %a3) { ; gather.d.d.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
 820 ; X86-LABEL: test_mm256_mask_i32gather_epi32:
 821 ; X86:       # %bb.0:
 822 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 823 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
 824 ; X86-NEXT:    retl
 825 ;
 826 ; X64-LABEL: test_mm256_mask_i32gather_epi32:
 827 ; X64:       # %bb.0:
 828 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
 829 ; X64-NEXT:    retq
 830   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 831   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
 832   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
 833   %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
 834   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, ptr %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2) ; expected asm is the vpgatherdd alone, with no mask or destination setup
 835   %bc = bitcast <8 x i32> %call to <4 x i64> ; back to the declared return type
 836   ret <4 x i64> %bc
 837 }
 838
 839 define <2 x i64> @test_mm_i32gather_epi64(ptr%a0, <2 x i64> %a1) { ; gather.d.q with undef first operand and a constant all-ones mask; scale immediate 2
 840 ; X86-LABEL: test_mm_i32gather_epi64:
 841 ; X86:       # %bb.0:
 842 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 843 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 844 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 845 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
 846 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 847 ; X86-NEXT:    retl
 848 ;
 849 ; X64-LABEL: test_mm_i32gather_epi64:
 850 ; X64:       # %bb.0:
 851 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 852 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 853 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
 854 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 855 ; X64-NEXT:    retq
 856   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
 857   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; index vector viewed as four i32 lanes
 858   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, ptr %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherdq
 859   ret <2 x i64> %res
 860 }
 861 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, ptr, <4 x i32>, <2 x i64>, i8) nounwind readonly
 862
 863 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { ; gather.d.q with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
 864 ; X86-LABEL: test_mm_mask_i32gather_epi64:
 865 ; X86:       # %bb.0:
 866 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 867 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
 868 ; X86-NEXT:    retl
 869 ;
 870 ; X64-LABEL: test_mm_mask_i32gather_epi64:
 871 ; X64:       # %bb.0:
 872 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
 873 ; X64-NEXT:    retq
 874   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
 875   %arg2 = bitcast <2 x i64> %a2 to <4 x i32> ; index vector viewed as four i32 lanes
 876   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2) ; expected asm is the vpgatherdq alone, with no mask or destination setup
 877   ret <2 x i64> %res
 878 }
 879
 880 define <4 x i64> @test_mm256_i32gather_epi64(ptr%a0, <2 x i64> %a1) { ; gather.d.q.256 with undef first operand and a constant all-ones mask; 128-bit index, 256-bit result; scale immediate 2
 881 ; X86-LABEL: test_mm256_i32gather_epi64:
 882 ; X86:       # %bb.0:
 883 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 884 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 885 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 886 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
 887 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 888 ; X86-NEXT:    retl
 889 ;
 890 ; X64-LABEL: test_mm256_i32gather_epi64:
 891 ; X64:       # %bb.0:
 892 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 893 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 894 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
 895 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 896 ; X64-NEXT:    retq
 897   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
 898   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; index vector viewed as four i32 lanes
 899   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, ptr %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherdq
 900   ret <4 x i64> %res
 901 }
 902 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, ptr, <4 x i32>, <4 x i64>, i8) nounwind readonly
 903
 904 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, ptr%a1, <2 x i64> %a2, <4 x i64> %a3) { ; gather.d.q.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
 905 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
 906 ; X86:       # %bb.0:
 907 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 908 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
 909 ; X86-NEXT:    retl
 910 ;
 911 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
 912 ; X64:       # %bb.0:
 913 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
 914 ; X64-NEXT:    retq
 915   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
 916   %arg2 = bitcast <2 x i64> %a2 to <4 x i32> ; index vector viewed as four i32 lanes
 917   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, ptr %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2) ; expected asm is the vpgatherdq alone, with no mask or destination setup
 918   ret <4 x i64> %res
 919 }
 920
 921 define <2 x double> @test_mm_i32gather_pd(ptr%a0, <2 x i64> %a1) { ; gather.d.pd with undef first operand and an all-ones mask derived from a compare; scale immediate 2
 922 ; X86-LABEL: test_mm_i32gather_pd:
 923 ; X86:       # %bb.0:
 924 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 925 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 926 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 927 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 928 ; X86-NEXT:    vmovapd %xmm1, %xmm0
 929 ; X86-NEXT:    retl
 930 ;
 931 ; X64-LABEL: test_mm_i32gather_pd:
 932 ; X64:       # %bb.0:
 933 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 934 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 935 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 936 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 937 ; X64-NEXT:    retq
 938   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
 939   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; index vector viewed as four i32 lanes
 940   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer ; 0.0 == 0.0 is true in every lane
 941   %sext = sext <2 x i1> %cmp to <2 x i64> ; true lanes sign-extend to all-ones
 942   %mask = bitcast <2 x i64> %sext to <2 x double> ; same bits, typed as the intrinsic expects
 943   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, ptr %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vxorpd before vgatherdpd
 944   ret <2 x double> %res
 945 }
 946 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, ptr, <4 x i32>, <2 x double>, i8) nounwind readonly
 947
 948 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, ptr%a1, <2 x i64> %a2, <2 x double> %a3) { ; gather.d.pd with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
 949 ; X86-LABEL: test_mm_mask_i32gather_pd:
 950 ; X86:       # %bb.0:
 951 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 952 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
 953 ; X86-NEXT:    retl
 954 ;
 955 ; X64-LABEL: test_mm_mask_i32gather_pd:
 956 ; X64:       # %bb.0:
 957 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
 958 ; X64-NEXT:    retq
 959   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
 960   %arg2 = bitcast <2 x i64> %a2 to <4 x i32> ; index vector viewed as four i32 lanes
 961   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, ptr %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2) ; expected asm is the vgatherdpd alone, with no mask or destination setup
 962   ret <2 x double> %res
 963 }
 964
 965 define <4 x double> @test_mm256_i32gather_pd(ptr%a0, <2 x i64> %a1) { ; gather.d.pd.256 with undef first operand and a mask produced by the avx.cmp.pd.256 intrinsic; scale immediate 2
 966 ; X86-LABEL: test_mm256_i32gather_pd:
 967 ; X86:       # %bb.0:
 968 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 969 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 970 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 971 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
 972 ; X86-NEXT:    vmovapd %ymm1, %ymm0
 973 ; X86-NEXT:    retl
 974 ;
 975 ; X64-LABEL: test_mm256_i32gather_pd:
 976 ; X64:       # %bb.0:
 977 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 978 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 979 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
 980 ; X64-NEXT:    vmovapd %ymm1, %ymm0
 981 ; X64-NEXT:    retq
 982   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
 983   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; index vector viewed as four i32 lanes
 984   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) ; zero compared with zero, predicate immediate 0; appears as vcmpeqpd in the expected asm
 985   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, ptr %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2) ; intrinsic under test
 986   ret <4 x double> %res
 987 }
 988 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, ptr, <4 x i32>, <4 x double>, i8) nounwind readonly
 989
 990 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, ptr%a1, <2 x i64> %a2, <4 x double> %a3) { ; gather.d.pd.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
 991 ; X86-LABEL: test_mm256_mask_i32gather_pd:
 992 ; X86:       # %bb.0:
 993 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 994 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
 995 ; X86-NEXT:    retl
 996 ;
 997 ; X64-LABEL: test_mm256_mask_i32gather_pd:
 998 ; X64:       # %bb.0:
 999 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1000 ; X64-NEXT:    retq
1001   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1002   %arg2 = bitcast <2 x i64> %a2 to <4 x i32> ; index vector viewed as four i32 lanes
1003   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, ptr %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2) ; expected asm is the vgatherdpd alone, with no mask or destination setup
1004   ret <4 x double> %res
1005 }
1006
1007 define <4 x float> @test_mm_i32gather_ps(ptr%a0, <2 x i64> %a1) { ; gather.d.ps with undef first operand and an all-ones mask derived from a compare; scale immediate 2
1008 ; X86-LABEL: test_mm_i32gather_ps:
1009 ; X86:       # %bb.0:
1010 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1011 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1012 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1013 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1014 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1015 ; X86-NEXT:    retl
1016 ;
1017 ; X64-LABEL: test_mm_i32gather_ps:
1018 ; X64:       # %bb.0:
1019 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1020 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1021 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1022 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1023 ; X64-NEXT:    retq
1024   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1025   %arg1 = bitcast <2 x i64> %a1 to <4 x i32> ; index vector viewed as four i32 lanes
1026   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer ; 0.0 == 0.0 is true in every lane
1027   %sext = sext <4 x i1> %cmp to <4 x i32> ; true lanes sign-extend to all-ones
1028   %mask = bitcast <4 x i32> %sext to <4 x float> ; same bits, typed as the intrinsic expects
1029   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, ptr %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vxorps before vgatherdps
1030   ret <4 x float> %call
1031 }
1032 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, ptr, <4 x i32>, <4 x float>, i8) nounwind readonly
1033
1034 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, ptr%a1, <2 x i64> %a2, <4 x float> %a3) { ; gather.d.ps with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1035 ; X86-LABEL: test_mm_mask_i32gather_ps:
1036 ; X86:       # %bb.0:
1037 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1038 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1039 ; X86-NEXT:    retl
1040 ;
1041 ; X64-LABEL: test_mm_mask_i32gather_ps:
1042 ; X64:       # %bb.0:
1043 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1044 ; X64-NEXT:    retq
1045   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1046   %arg2 = bitcast <2 x i64> %a2 to <4 x i32> ; index vector viewed as four i32 lanes
1047   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, ptr %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2) ; expected asm is the vgatherdps alone, with no mask or destination setup
1048   ret <4 x float> %call
1049 }
1050
1051 define <8 x float> @test_mm256_i32gather_ps(ptr%a0, <4 x i64> %a1) { ; gather.d.ps.256 with undef first operand and a mask produced by the avx.cmp.ps.256 intrinsic; scale immediate 2
1052 ; X86-LABEL: test_mm256_i32gather_ps:
1053 ; X86:       # %bb.0:
1054 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1055 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1056 ; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1057 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1058 ; X86-NEXT:    vmovaps %ymm1, %ymm0
1059 ; X86-NEXT:    retl
1060 ;
1061 ; X64-LABEL: test_mm256_i32gather_ps:
1062 ; X64:       # %bb.0:
1063 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1064 ; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1065 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1066 ; X64-NEXT:    vmovaps %ymm1, %ymm0
1067 ; X64-NEXT:    retq
1068   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1069   %arg1 = bitcast <4 x i64> %a1 to <8 x i32> ; index vector viewed as eight i32 lanes
1070   %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0) ; zero compared with zero, predicate immediate 0; appears as vcmpeqps in the expected asm
1071   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2) ; intrinsic under test
1072   ret <8 x float> %call
1073 }
1074 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, <8 x i32>, <8 x float>, i8) nounwind readonly
1075
1076 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, ptr%a1, <4 x i64> %a2, <8 x float> %a3) { ; gather.d.ps.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1077 ; X86-LABEL: test_mm256_mask_i32gather_ps:
1078 ; X86:       # %bb.0:
1079 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1080 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1081 ; X86-NEXT:    retl
1082 ;
1083 ; X64-LABEL: test_mm256_mask_i32gather_ps:
1084 ; X64:       # %bb.0:
1085 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1086 ; X64-NEXT:    retq
1087   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1088   %arg2 = bitcast <4 x i64> %a2 to <8 x i32> ; index vector viewed as eight i32 lanes
1089   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, ptr %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2) ; expected asm is the vgatherdps alone, with no mask or destination setup
1090   ret <8 x float> %call
1091 }
1092
1093 define <2 x i64> @test_mm_i64gather_epi32(ptr%a0, <2 x i64> %a1) { ; gather.q.d with undef first operand and a constant all-ones mask; %a1 is used directly as the i64 index vector; scale immediate 2
1094 ; X86-LABEL: test_mm_i64gather_epi32:
1095 ; X86:       # %bb.0:
1096 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1097 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1098 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1099 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1100 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1101 ; X86-NEXT:    retl
1102 ;
1103 ; X64-LABEL: test_mm_i64gather_epi32:
1104 ; X64:       # %bb.0:
1105 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1106 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1107 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1108 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1109 ; X64-NEXT:    retq
1110   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1111   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> ; every bit set in all four lanes
1112   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, ptr %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherqd
1113   %bc = bitcast <4 x i32> %call to <2 x i64> ; back to the declared return type
1114   ret <2 x i64> %bc
1115 }
1116 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, ptr, <2 x i64>, <4 x i32>, i8) nounwind readonly
1117
1118 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { ; gather.q.d with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1119 ; X86-LABEL: test_mm_mask_i64gather_epi32:
1120 ; X86:       # %bb.0:
1121 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1122 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1123 ; X86-NEXT:    retl
1124 ;
1125 ; X64-LABEL: test_mm_mask_i64gather_epi32:
1126 ; X64:       # %bb.0:
1127 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1128 ; X64-NEXT:    retq
1129   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1130   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1131   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1132   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, ptr %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2) ; expected asm is the vpgatherqd alone, with no mask or destination setup
1133   %bc = bitcast <4 x i32> %call to <2 x i64> ; back to the declared return type
1134   ret <2 x i64> %bc
1135 }
1136
1137 define <2 x i64> @test_mm256_i64gather_epi32(ptr%a0, <4 x i64> %a1) { ; gather.q.d.256 with undef first operand and a constant all-ones mask; 256-bit index, 128-bit result; scale immediate 2
1138 ; X86-LABEL: test_mm256_i64gather_epi32:
1139 ; X86:       # %bb.0:
1140 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1141 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1142 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1143 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1144 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1145 ; X86-NEXT:    vzeroupper
1146 ; X86-NEXT:    retl
1147 ;
1148 ; X64-LABEL: test_mm256_i64gather_epi32:
1149 ; X64:       # %bb.0:
1150 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1151 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1152 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1153 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1154 ; X64-NEXT:    vzeroupper
1155 ; X64-NEXT:    retq
1156   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1157   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32> ; every bit set in all four lanes
1158   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, ptr %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherqd
1159   %bc = bitcast <4 x i32> %call to <2 x i64> ; back to the declared return type
1160   ret <2 x i64> %bc ; expected asm issues vzeroupper before returning
1161 }
1162 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, ptr, <4 x i64>, <4 x i32>, i8) nounwind readonly
1163
1164 define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, ptr%a1, <4 x i64> %a2, <2 x i64> %a3) { ; gather.q.d.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1165 ; X86-LABEL: test_mm256_mask_i64gather_epi32:
1166 ; X86:       # %bb.0:
1167 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1168 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1169 ; X86-NEXT:    vzeroupper
1170 ; X86-NEXT:    retl
1171 ;
1172 ; X64-LABEL: test_mm256_mask_i64gather_epi32:
1173 ; X64:       # %bb.0:
1174 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1175 ; X64-NEXT:    vzeroupper
1176 ; X64-NEXT:    retq
1177   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1178   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1179   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1180   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, ptr %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2) ; expected asm has no mask or destination setup ahead of the vpgatherqd
1181   %bc = bitcast <4 x i32> %call to <2 x i64> ; back to the declared return type
1182   ret <2 x i64> %bc ; expected asm issues vzeroupper before returning
1183 }
1184
1185 define <2 x i64> @test_mm_i64gather_epi64(ptr%a0, <2 x i64> %a1) { ; gather.q.q with undef first operand and a constant all-ones mask; scale immediate 2
1186 ; X86-LABEL: test_mm_i64gather_epi64:
1187 ; X86:       # %bb.0:
1188 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1189 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1190 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1191 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1192 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1193 ; X86-NEXT:    retl
1194 ;
1195 ; X64-LABEL: test_mm_i64gather_epi64:
1196 ; X64:       # %bb.0:
1197 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1198 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1199 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1200 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1201 ; X64-NEXT:    retq
1202   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1203   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, ptr %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherqq
1204   ret <2 x i64> %call
1205 }
1206 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, ptr, <2 x i64>, <2 x i64>, i8) nounwind readonly
1207
1208 define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, ptr%a1, <2 x i64> %a2, <2 x i64> %a3) { ; gather.q.q with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1209 ; X86-LABEL: test_mm_mask_i64gather_epi64:
1210 ; X86:       # %bb.0:
1211 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1212 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1213 ; X86-NEXT:    retl
1214 ;
1215 ; X64-LABEL: test_mm_mask_i64gather_epi64:
1216 ; X64:       # %bb.0:
1217 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1218 ; X64-NEXT:    retq
1219   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1220   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, ptr %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2) ; expected asm is the vpgatherqq alone, with no mask or destination setup
1221   ret <2 x i64> %call
1222 }
1223
1224 define <4 x i64> @test_mm256_i64gather_epi64(ptr%a0, <4 x i64> %a1) { ; gather.q.q.256 with undef first operand and a constant all-ones mask; scale immediate 2
1225 ; X86-LABEL: test_mm256_i64gather_epi64:
1226 ; X86:       # %bb.0:
1227 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1228 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1229 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1230 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1231 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
1232 ; X86-NEXT:    retl
1233 ;
1234 ; X64-LABEL: test_mm256_i64gather_epi64:
1235 ; X64:       # %bb.0:
1236 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1237 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1238 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1239 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
1240 ; X64-NEXT:    retq
1241   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1242   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, ptr %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vpxor before vpgatherqq
1243   ret <4 x i64> %call
1244 }
1245 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, ptr, <4 x i64>, <4 x i64>, i8) nounwind readonly
1246
1247 define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, ptr%a1, <4 x i64> %a2, <4 x i64> %a3) { ; gather.q.q.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1248 ; X86-LABEL: test_mm256_mask_i64gather_epi64:
1249 ; X86:       # %bb.0:
1250 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1251 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1252 ; X86-NEXT:    retl
1253 ;
1254 ; X64-LABEL: test_mm256_mask_i64gather_epi64:
1255 ; X64:       # %bb.0:
1256 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1257 ; X64-NEXT:    retq
1258   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1259   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, ptr %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2) ; expected asm is the vpgatherqq alone, with no mask or destination setup
1260   ret <4 x i64> %call
1261 }
1262
1263 define <2 x double> @test_mm_i64gather_pd(ptr%a0, <2 x i64> %a1) { ; gather.q.pd with undef first operand and an all-ones mask derived from a compare; scale immediate 2
1264 ; X86-LABEL: test_mm_i64gather_pd:
1265 ; X86:       # %bb.0:
1266 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1267 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1268 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1269 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1270 ; X86-NEXT:    vmovapd %xmm1, %xmm0
1271 ; X86-NEXT:    retl
1272 ;
1273 ; X64-LABEL: test_mm_i64gather_pd:
1274 ; X64:       # %bb.0:
1275 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1276 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1277 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1278 ; X64-NEXT:    vmovapd %xmm1, %xmm0
1279 ; X64-NEXT:    retq
1280   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1281   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer ; 0.0 == 0.0 is true in every lane
1282   %sext = sext <2 x i1> %cmp to <2 x i64> ; true lanes sign-extend to all-ones
1283   %mask = bitcast <2 x i64> %sext to <2 x double> ; same bits, typed as the intrinsic expects
1284   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, ptr %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vxorpd before vgatherqpd
1285   ret <2 x double> %call
1286 }
1287 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, ptr, <2 x i64>, <2 x double>, i8) nounwind readonly
1288
1289 define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, ptr%a1, <2 x i64> %a2, <2 x double> %a3) { ; gather.q.pd with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1290 ; X86-LABEL: test_mm_mask_i64gather_pd:
1291 ; X86:       # %bb.0:
1292 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1293 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1294 ; X86-NEXT:    retl
1295 ;
1296 ; X64-LABEL: test_mm_mask_i64gather_pd:
1297 ; X64:       # %bb.0:
1298 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1299 ; X64-NEXT:    retq
1300   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1301   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, ptr %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2) ; expected asm is the vgatherqpd alone, with no mask or destination setup
1302   ret <2 x double> %call
1303 }
1304
1305 define <4 x double> @test_mm256_i64gather_pd(ptr%a0, <4 x i64> %a1) { ; gather.q.pd.256 with undef first operand and a mask produced by the avx.cmp.pd.256 intrinsic; scale immediate 2
1306 ; X86-LABEL: test_mm256_i64gather_pd:
1307 ; X86:       # %bb.0:
1308 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1309 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1310 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1311 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1312 ; X86-NEXT:    vmovapd %ymm1, %ymm0
1313 ; X86-NEXT:    retl
1314 ;
1315 ; X64-LABEL: test_mm256_i64gather_pd:
1316 ; X64:       # %bb.0:
1317 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1318 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1319 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1320 ; X64-NEXT:    vmovapd %ymm1, %ymm0
1321 ; X64-NEXT:    retq
1322   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1323   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0) ; zero compared with zero, predicate immediate 0; appears as vcmpeqpd in the expected asm
1324   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, ptr %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2) ; intrinsic under test
1325   ret <4 x double> %call
1326 }
1327 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, ptr, <4 x i64>, <4 x double>, i8) nounwind readonly
1328
1329 define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, ptr%a1, <4 x i64> %a2, <4 x double> %a3) { ; gather.q.pd.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1330 ; X86-LABEL: test_mm256_mask_i64gather_pd:
1331 ; X86:       # %bb.0:
1332 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1333 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1334 ; X86-NEXT:    retl
1335 ;
1336 ; X64-LABEL: test_mm256_mask_i64gather_pd:
1337 ; X64:       # %bb.0:
1338 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1339 ; X64-NEXT:    retq
1340   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1341   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, ptr %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2) ; expected asm is the vgatherqpd alone, with no mask or destination setup
1342   ret <4 x double> %call
1343 }
1344
1345 define <4 x float> @test_mm_i64gather_ps(ptr%a0, <2 x i64> %a1) { ; gather.q.ps with undef first operand and an all-ones mask derived from a compare; scale immediate 2
1346 ; X86-LABEL: test_mm_i64gather_ps:
1347 ; X86:       # %bb.0:
1348 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1349 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1350 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1351 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1352 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1353 ; X86-NEXT:    retl
1354 ;
1355 ; X64-LABEL: test_mm_i64gather_ps:
1356 ; X64:       # %bb.0:
1357 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1358 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1359 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1360 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1361 ; X64-NEXT:    retq
1362   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1363   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer ; 0.0 == 0.0 is true in every lane
1364   %sext = sext <4 x i1> %cmp to <4 x i32> ; true lanes sign-extend to all-ones
1365   %mask = bitcast <4 x i32> %sext to <4 x float> ; same bits, typed as the intrinsic expects
1366   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, ptr %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vxorps before vgatherqps
1367   ret <4 x float> %call
1368 }
1369 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, ptr, <2 x i64>, <4 x float>, i8) nounwind readonly
1370
1371 define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, ptr%a1, <2 x i64> %a2, <4 x float> %a3) { ; gather.q.ps with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1372 ; X86-LABEL: test_mm_mask_i64gather_ps:
1373 ; X86:       # %bb.0:
1374 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1375 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1376 ; X86-NEXT:    retl
1377 ;
1378 ; X64-LABEL: test_mm_mask_i64gather_ps:
1379 ; X64:       # %bb.0:
1380 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1381 ; X64-NEXT:    retq
1382   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1383   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, ptr %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2) ; expected asm is the vgatherqps alone, with no mask or destination setup
1384   ret <4 x float> %call
1385 }
1386
1387 define <4 x float> @test_mm256_i64gather_ps(ptr%a0, <4 x i64> %a1) { ; gather.q.ps.256 with undef first operand and an all-ones mask derived from a compare; 256-bit index, 128-bit result; scale immediate 2
1388 ; X86-LABEL: test_mm256_i64gather_ps:
1389 ; X86:       # %bb.0:
1390 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1391 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1392 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1393 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1394 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1395 ; X86-NEXT:    vzeroupper
1396 ; X86-NEXT:    retl
1397 ;
1398 ; X64-LABEL: test_mm256_i64gather_ps:
1399 ; X64:       # %bb.0:
1400 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1401 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1402 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1403 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1404 ; X64-NEXT:    vzeroupper
1405 ; X64-NEXT:    retq
1406   %arg0 = bitcast ptr%a0 to ptr ; both types are ptr, so this cast is a no-op
1407   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer ; 0.0 == 0.0 is true in every lane
1408   %sext = sext <4 x i1> %cmp to <4 x i32> ; true lanes sign-extend to all-ones
1409   %mask = bitcast <4 x i32> %sext to <4 x float> ; same bits, typed as the intrinsic expects
1410   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, ptr %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2) ; expected asm sets up the mask with vpcmpeqd and the destination with vxorps before vgatherqps
1411   ret <4 x float> %call ; expected asm issues vzeroupper before returning
1412 }
1413 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, ptr, <4 x i64>, <4 x float>, i8) nounwind readonly
1414
1415 define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, ptr%a1, <4 x i64> %a2, <4 x float> %a3) { ; gather.q.ps.256 with first operand (%a0), index (%a2) and mask (%a3) all supplied by the caller; scale immediate 2
1416 ; X86-LABEL: test_mm256_mask_i64gather_ps:
1417 ; X86:       # %bb.0:
1418 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1419 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1420 ; X86-NEXT:    vzeroupper
1421 ; X86-NEXT:    retl
1422 ;
1423 ; X64-LABEL: test_mm256_mask_i64gather_ps:
1424 ; X64:       # %bb.0:
1425 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1426 ; X64-NEXT:    vzeroupper
1427 ; X64-NEXT:    retq
1428   %arg1 = bitcast ptr%a1 to ptr ; both types are ptr, so this cast is a no-op
1429   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, ptr %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2) ; expected asm has no mask or destination setup ahead of the vgatherqps
1430   ret <4 x float> %call ; expected asm issues vzeroupper before returning
1431 }
1432
; Replaces the low two i64 elements of %a0 with %a1. %a1 is first widened to
; <4 x i64> (upper elements undef); the second shuffle then takes elements 0-1
; from the widened value and elements 2-3 from %a0. Expected as one vblendps.
1433 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1434 ; CHECK-LABEL: test0_mm256_inserti128_si256:
1435 ; CHECK:       # %bb.0:
1436 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1437 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1438 ; CHECK-NEXT:    ret{{[l|q]}}
1439   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1440   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1441   ret <4 x i64> %res
1442 }
1443
; Replaces the high two i64 elements of %a0 with %a1. The second shuffle keeps
; elements 0-1 of %a0 and takes elements 0-1 of the widened %a1. Expected as a
; single vinsertf128 with immediate 1.
1444 define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1445 ; CHECK-LABEL: test1_mm256_inserti128_si256:
1446 ; CHECK:       # %bb.0:
1447 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1448 ; CHECK-NEXT:    ret{{[l|q]}}
1449   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1450   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1451   ret <4 x i64> %res
1452 }
1453
; Calls @llvm.x86.avx2.pmadd.wd on both operands viewed as <16 x i16>; the
; <8 x i32> result is returned as <4 x i64>. Expected as a single vpmaddwd.
1454 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1455 ; CHECK-LABEL: test_mm256_madd_epi16:
1456 ; CHECK:       # %bb.0:
1457 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1458 ; CHECK-NEXT:    ret{{[l|q]}}
1459   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1460   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1461   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1462   %bc = bitcast <8 x i32> %res to <4 x i64>
1463   ret <4 x i64> %bc
1464 }
1465 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1466
; Calls @llvm.x86.avx2.pmadd.ub.sw on both operands viewed as <32 x i8>; the
; <16 x i16> result is returned as <4 x i64>. Expected as a single vpmaddubsw.
1467 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1468 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1469 ; CHECK:       # %bb.0:
1470 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
1471 ; CHECK-NEXT:    ret{{[l|q]}}
1472   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1473   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1474   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1475   %bc = bitcast <16 x i16> %res to <4 x i64>
1476   ret <4 x i64> %bc
1477 }
1478 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1479
; 128-bit masked load of i32 elements: %a1, viewed as <4 x i32>, is the second
; operand of @llvm.x86.avx2.maskload.d and %a0 is the address. Expected as a
; single vpmaskmovd from memory (i386 loads the pointer from the stack first).
1480 define <2 x i64> @test_mm_maskload_epi32(ptr %a0, <2 x i64> %a1) nounwind {
1481 ; X86-LABEL: test_mm_maskload_epi32:
1482 ; X86:       # %bb.0:
1483 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1484 ; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
1485 ; X86-NEXT:    retl
1486 ;
1487 ; X64-LABEL: test_mm_maskload_epi32:
1488 ; X64:       # %bb.0:
1489 ; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
1490 ; X64-NEXT:    retq
1491   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1492   %call = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %arg1)
1493   %bc = bitcast <4 x i32> %call to <2 x i64>
1494   ret <2 x i64> %bc
1495 }
1496 declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly
1497
; 256-bit masked load of i32 elements through @llvm.x86.avx2.maskload.d.256,
; with %a1 viewed as <8 x i32>. Expected as a single ymm vpmaskmovd from memory.
1498 define <4 x i64> @test_mm256_maskload_epi32(ptr %a0, <4 x i64> %a1) nounwind {
1499 ; X86-LABEL: test_mm256_maskload_epi32:
1500 ; X86:       # %bb.0:
1501 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1502 ; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
1503 ; X86-NEXT:    retl
1504 ;
1505 ; X64-LABEL: test_mm256_maskload_epi32:
1506 ; X64:       # %bb.0:
1507 ; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
1508 ; X64-NEXT:    retq
1509   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1510   %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %arg1)
1511   %bc = bitcast <8 x i32> %call to <4 x i64>
1512   ret <4 x i64> %bc
1513 }
1514 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonly
1515
; 128-bit masked load of i64 elements through @llvm.x86.avx2.maskload.q; no
; bitcasts are needed because the operands are already <2 x i64>. Expected as
; a single vpmaskmovq from memory.
1516 define <2 x i64> @test_mm_maskload_epi64(ptr %a0, <2 x i64> %a1) nounwind {
1517 ; X86-LABEL: test_mm_maskload_epi64:
1518 ; X86:       # %bb.0:
1519 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1520 ; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
1521 ; X86-NEXT:    retl
1522 ;
1523 ; X64-LABEL: test_mm_maskload_epi64:
1524 ; X64:       # %bb.0:
1525 ; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
1526 ; X64-NEXT:    retq
1527   %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1)
1528   ret <2 x i64> %res
1529 }
1530 declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly
1531
; 256-bit masked load of i64 elements through @llvm.x86.avx2.maskload.q.256.
; Expected as a single ymm vpmaskmovq from memory.
1532 define <4 x i64> @test_mm256_maskload_epi64(ptr %a0, <4 x i64> %a1) nounwind {
1533 ; X86-LABEL: test_mm256_maskload_epi64:
1534 ; X86:       # %bb.0:
1535 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1536 ; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
1537 ; X86-NEXT:    retl
1538 ;
1539 ; X64-LABEL: test_mm256_maskload_epi64:
1540 ; X64:       # %bb.0:
1541 ; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
1542 ; X64-NEXT:    retq
1543   %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1)
1544   ret <4 x i64> %res
1545 }
1546 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonly
1547
; 128-bit masked store of i32 elements: %a1 and %a2, viewed as <4 x i32>, are
; the second and third operands of @llvm.x86.avx2.maskstore.d and %a0 is the
; destination address. Expected as a single vpmaskmovd to memory.
; The intrinsic writes memory, so it is declared with nounwind only; a
; readnone attribute would describe the call as having no memory effects.
1548 define void @test_mm_maskstore_epi32(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1549 ; X86-LABEL: test_mm_maskstore_epi32:
1550 ; X86:       # %bb.0:
1551 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1552 ; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
1553 ; X86-NEXT:    retl
1554 ;
1555 ; X64-LABEL: test_mm_maskstore_epi32:
1556 ; X64:       # %bb.0:
1557 ; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
1558 ; X64-NEXT:    retq
1559   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1560   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1561   call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %arg1, <4 x i32> %arg2)
1562   ret void
1563 }
1564 declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind
1565
; 256-bit masked store of i32 elements through @llvm.x86.avx2.maskstore.d.256,
; with %a1 and %a2 viewed as <8 x i32>. Expected as a single ymm vpmaskmovd to
; memory followed by vzeroupper.
; The intrinsic writes memory, so it is declared with nounwind only; a
; readnone attribute would describe the call as having no memory effects.
1566 define void @test_mm256_maskstore_epi32(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1567 ; X86-LABEL: test_mm256_maskstore_epi32:
1568 ; X86:       # %bb.0:
1569 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1570 ; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
1571 ; X86-NEXT:    vzeroupper
1572 ; X86-NEXT:    retl
1573 ;
1574 ; X64-LABEL: test_mm256_maskstore_epi32:
1575 ; X64:       # %bb.0:
1576 ; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
1577 ; X64-NEXT:    vzeroupper
1578 ; X64-NEXT:    retq
1579   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1580   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1581   call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %arg1, <8 x i32> %arg2)
1582   ret void
1583 }
1584 declare void @llvm.x86.avx2.maskstore.d.256(ptr, <8 x i32>, <8 x i32>) nounwind
1585
; 128-bit masked store of i64 elements through @llvm.x86.avx2.maskstore.q.
; Expected as a single vpmaskmovq to memory.
; The intrinsic writes memory, so it is declared with nounwind only; a
; readnone attribute would describe the call as having no memory effects.
1586 define void @test_mm_maskstore_epi64(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1587 ; X86-LABEL: test_mm_maskstore_epi64:
1588 ; X86:       # %bb.0:
1589 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1590 ; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
1591 ; X86-NEXT:    retl
1592 ;
1593 ; X64-LABEL: test_mm_maskstore_epi64:
1594 ; X64:       # %bb.0:
1595 ; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
1596 ; X64-NEXT:    retq
1597   call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2)
1598   ret void
1599 }
1600 declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind
1601
; 256-bit masked store of i64 elements through @llvm.x86.avx2.maskstore.q.256.
; Expected as a single ymm vpmaskmovq to memory followed by vzeroupper.
; The intrinsic writes memory, so it is declared with nounwind only; a
; readnone attribute would describe the call as having no memory effects.
1602 define void @test_mm256_maskstore_epi64(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1603 ; X86-LABEL: test_mm256_maskstore_epi64:
1604 ; X86:       # %bb.0:
1605 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1606 ; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
1607 ; X86-NEXT:    vzeroupper
1608 ; X86-NEXT:    retl
1609 ;
1610 ; X64-LABEL: test_mm256_maskstore_epi64:
1611 ; X64:       # %bb.0:
1612 ; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
1613 ; X64-NEXT:    vzeroupper
1614 ; X64-NEXT:    retq
1615   call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2)
1616   ret void
1617 }
1618 declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind
1619
; Signed maximum of the operands viewed as <32 x i8>, via the generic
; @llvm.smax.v32i8 intrinsic. Expected as a single vpmaxsb.
1620 define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1621 ; CHECK-LABEL: test_mm256_max_epi8:
1622 ; CHECK:       # %bb.0:
1623 ; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
1624 ; CHECK-NEXT:    ret{{[l|q]}}
1625   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1626   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1627   %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1628   %bc = bitcast <32 x i8> %sel to <4 x i64>
1629   ret <4 x i64> %bc
1630 }
1631 declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
1632
; Signed maximum of the operands viewed as <16 x i16>, via @llvm.smax.v16i16.
; Expected as a single vpmaxsw.
1633 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1634 ; CHECK-LABEL: test_mm256_max_epi16:
1635 ; CHECK:       # %bb.0:
1636 ; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
1637 ; CHECK-NEXT:    ret{{[l|q]}}
1638   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1639   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1640   %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1641   %bc = bitcast <16 x i16> %sel to <4 x i64>
1642   ret <4 x i64> %bc
1643 }
1644 declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
1645
; Signed maximum of the operands viewed as <8 x i32>, via @llvm.smax.v8i32.
; Expected as a single vpmaxsd.
1646 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1647 ; CHECK-LABEL: test_mm256_max_epi32:
1648 ; CHECK:       # %bb.0:
1649 ; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
1650 ; CHECK-NEXT:    ret{{[l|q]}}
1651   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1652   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1653   %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1654   %bc = bitcast <8 x i32> %sel to <4 x i64>
1655   ret <4 x i64> %bc
1656 }
1657 declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
1658
; Unsigned maximum of the operands viewed as <32 x i8>, via @llvm.umax.v32i8.
; Expected as a single vpmaxub.
1659 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1660 ; CHECK-LABEL: test_mm256_max_epu8:
1661 ; CHECK:       # %bb.0:
1662 ; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
1663 ; CHECK-NEXT:    ret{{[l|q]}}
1664   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1665   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1666   %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1667   %bc = bitcast <32 x i8> %sel to <4 x i64>
1668   ret <4 x i64> %bc
1669 }
1670 declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
1671
; Unsigned maximum of the operands viewed as <16 x i16>, via
; @llvm.umax.v16i16. Expected as a single vpmaxuw.
1672 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1673 ; CHECK-LABEL: test_mm256_max_epu16:
1674 ; CHECK:       # %bb.0:
1675 ; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
1676 ; CHECK-NEXT:    ret{{[l|q]}}
1677   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1678   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1679   %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1680   %bc = bitcast <16 x i16> %sel to <4 x i64>
1681   ret <4 x i64> %bc
1682 }
1683 declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
1684
; Unsigned maximum of the operands viewed as <8 x i32>, via @llvm.umax.v8i32.
; Expected as a single vpmaxud.
1685 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1686 ; CHECK-LABEL: test_mm256_max_epu32:
1687 ; CHECK:       # %bb.0:
1688 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
1689 ; CHECK-NEXT:    ret{{[l|q]}}
1690   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1691   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1692   %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1693   %bc = bitcast <8 x i32> %sel to <4 x i64>
1694   ret <4 x i64> %bc
1695 }
1696 declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
1697
; Signed minimum of the operands viewed as <32 x i8>, via @llvm.smin.v32i8.
; Expected as a single vpminsb.
1698 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1699 ; CHECK-LABEL: test_mm256_min_epi8:
1700 ; CHECK:       # %bb.0:
1701 ; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
1702 ; CHECK-NEXT:    ret{{[l|q]}}
1703   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1704   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1705   %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1706   %bc = bitcast <32 x i8> %sel to <4 x i64>
1707   ret <4 x i64> %bc
1708 }
1709 declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
1710
; Signed minimum of the operands viewed as <16 x i16>, via @llvm.smin.v16i16.
; Expected as a single vpminsw.
1711 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1712 ; CHECK-LABEL: test_mm256_min_epi16:
1713 ; CHECK:       # %bb.0:
1714 ; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
1715 ; CHECK-NEXT:    ret{{[l|q]}}
1716   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1717   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1718   %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1719   %bc = bitcast <16 x i16> %sel to <4 x i64>
1720   ret <4 x i64> %bc
1721 }
1722 declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
1723
; Signed minimum of the operands viewed as <8 x i32>, via @llvm.smin.v8i32.
; Expected as a single vpminsd.
1724 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1725 ; CHECK-LABEL: test_mm256_min_epi32:
1726 ; CHECK:       # %bb.0:
1727 ; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
1728 ; CHECK-NEXT:    ret{{[l|q]}}
1729   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1730   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1731   %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1732   %bc = bitcast <8 x i32> %sel to <4 x i64>
1733   ret <4 x i64> %bc
1734 }
1735 declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
1736
; Unsigned minimum of the operands viewed as <32 x i8>, via @llvm.umin.v32i8.
; Expected as a single vpminub.
1737 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1738 ; CHECK-LABEL: test_mm256_min_epu8:
1739 ; CHECK:       # %bb.0:
1740 ; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
1741 ; CHECK-NEXT:    ret{{[l|q]}}
1742   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1743   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1744   %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1745   %bc = bitcast <32 x i8> %sel to <4 x i64>
1746   ret <4 x i64> %bc
1747 }
1748 declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
1749
; Unsigned minimum of the operands viewed as <16 x i16>, via
; @llvm.umin.v16i16. Expected as a single vpminuw.
1750 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1751 ; CHECK-LABEL: test_mm256_min_epu16:
1752 ; CHECK:       # %bb.0:
1753 ; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
1754 ; CHECK-NEXT:    ret{{[l|q]}}
1755   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1756   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1757   %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1758   %bc = bitcast <16 x i16> %sel to <4 x i64>
1759   ret <4 x i64> %bc
1760 }
1761 declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
1762
; Unsigned minimum of the operands viewed as <8 x i32>, via @llvm.umin.v8i32.
; Expected as a single vpminud.
1763 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1764 ; CHECK-LABEL: test_mm256_min_epu32:
1765 ; CHECK:       # %bb.0:
1766 ; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
1767 ; CHECK-NEXT:    ret{{[l|q]}}
1768   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1769   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1770   %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1771   %bc = bitcast <8 x i32> %sel to <4 x i64>
1772   ret <4 x i64> %bc
1773 }
1774 declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
1775
; Calls @llvm.x86.avx2.pmovmskb on %a0 viewed as <32 x i8> and returns the i32
; result. Expected as vpmovmskb into eax followed by vzeroupper.
1776 define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1777 ; CHECK-LABEL: test_mm256_movemask_epi8:
1778 ; CHECK:       # %bb.0:
1779 ; CHECK-NEXT:    vpmovmskb %ymm0, %eax
1780 ; CHECK-NEXT:    vzeroupper
1781 ; CHECK-NEXT:    ret{{[l|q]}}
1782   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1783   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1784   ret i32 %res
1785 }
1786 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
1787
; Calls @llvm.x86.avx2.mpsadbw on both operands viewed as <32 x i8> with the
; constant control byte 3. Expected as a single vmpsadbw carrying immediate 3.
1788 define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1789 ; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1790 ; CHECK:       # %bb.0:
1791 ; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
1792 ; CHECK-NEXT:    ret{{[l|q]}}
1793   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1794   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1795   %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1796   %bc = bitcast <16 x i16>  %call to <4 x i64>
1797   ret <4 x i64> %bc
1798 }
1799 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
1800
; Sign-extends the low 32 bits of every i64 element of both operands (shl by
; 32 followed by ashr by 32) and multiplies the results. The whole sequence is
; expected to be selected as a single vpmuldq.
; The trailing declaration of @llvm.x86.avx2.pmul.dq is not called by this
; function.
1801 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1802 ; CHECK-LABEL: test_mm256_mul_epi32:
1803 ; CHECK:       # %bb.0:
1804 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
1805 ; CHECK-NEXT:    ret{{[l|q]}}
1806   %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1807   %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1808   %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1809   %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1810   %res = mul nsw <4 x i64> %A1, %B1
1811   ret <4 x i64> %res
1812 }
1813 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
1814
; Masks every i64 element of both operands down to its low 32 bits (and with
; 4294967295) and multiplies the results. The whole sequence is expected to be
; selected as a single vpmuludq.
; The trailing declaration of @llvm.x86.avx2.pmulu.dq is not called by this
; function.
1815 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1816 ; CHECK-LABEL: test_mm256_mul_epu32:
1817 ; CHECK:       # %bb.0:
1818 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1819 ; CHECK-NEXT:    ret{{[l|q]}}
1820   %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1821   %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1822   %res = mul nuw <4 x i64> %A, %B
1823   ret <4 x i64> %res
1824 }
1825 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
1826
; Calls @llvm.x86.avx2.pmulh.w on both operands viewed as <16 x i16>.
; Expected as a single vpmulhw.
1827 define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1828 ; CHECK-LABEL: test_mm256_mulhi_epi16:
1829 ; CHECK:       # %bb.0:
1830 ; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1831 ; CHECK-NEXT:    ret{{[l|q]}}
1832   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1833   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1834   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1835   %bc = bitcast <16 x i16> %res to <4 x i64>
1836   ret <4 x i64> %bc
1837 }
1838 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1839
; Calls @llvm.x86.avx2.pmulhu.w on both operands viewed as <16 x i16>.
; Expected as a single vpmulhuw.
1840 define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1841 ; CHECK-LABEL: test_mm256_mulhi_epu16:
1842 ; CHECK:       # %bb.0:
1843 ; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1844 ; CHECK-NEXT:    ret{{[l|q]}}
1845   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1846   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1847   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1848   %bc = bitcast <16 x i16> %res to <4 x i64>
1849   ret <4 x i64> %bc
1850 }
1851 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1852
; Calls @llvm.x86.avx2.pmul.hr.sw on both operands viewed as <16 x i16>.
; Expected as a single vpmulhrsw.
1853 define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1854 ; CHECK-LABEL: test_mm256_mulhrs_epi16:
1855 ; CHECK:       # %bb.0:
1856 ; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
1857 ; CHECK-NEXT:    ret{{[l|q]}}
1858   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1859   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1860   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1861   %bc = bitcast <16 x i16> %res to <4 x i64>
1862   ret <4 x i64> %bc
1863 }
1864 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1865
; Plain IR mul of the operands viewed as <16 x i16>. Expected as a single
; vpmullw.
1866 define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1867 ; CHECK-LABEL: test_mm256_mullo_epi16:
1868 ; CHECK:       # %bb.0:
1869 ; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1870 ; CHECK-NEXT:    ret{{[l|q]}}
1871   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1872   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1873   %res = mul <16 x i16> %arg0, %arg1
1874   %bc = bitcast <16 x i16> %res to <4 x i64>
1875   ret <4 x i64> %bc
1876 }
1877
; Plain IR mul of the operands viewed as <8 x i32>. Expected as a single
; vpmulld.
1878 define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1879 ; CHECK-LABEL: test_mm256_mullo_epi32:
1880 ; CHECK:       # %bb.0:
1881 ; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1882 ; CHECK-NEXT:    ret{{[l|q]}}
1883   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1884   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1885   %res = mul <8 x i32> %arg0, %arg1
1886   %bc = bitcast <8 x i32> %res to <4 x i64>
1887   ret <4 x i64> %bc
1888 }
1889
; Bitwise or of two <4 x i64> values. The expected instruction is the
; floating-point form vorps rather than an integer-domain or.
1890 define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1891 ; CHECK-LABEL: test_mm256_or_si256:
1892 ; CHECK:       # %bb.0:
1893 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1894 ; CHECK-NEXT:    ret{{[l|q]}}
1895   %res = or <4 x i64> %a0, %a1
1896   ret <4 x i64> %res
1897 }
1898
; Calls @llvm.x86.avx2.packsswb on both operands viewed as <16 x i16>; the
; <32 x i8> result is returned as <4 x i64>. Expected as a single vpacksswb.
1899 define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1900 ; CHECK-LABEL: test_mm256_packs_epi16:
1901 ; CHECK:       # %bb.0:
1902 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
1903 ; CHECK-NEXT:    ret{{[l|q]}}
1904   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1905   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1906   %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1907   %res = bitcast <32 x i8> %call to <4 x i64>
1908   ret <4 x i64> %res
1909 }
1910 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
1911
; Calls @llvm.x86.avx2.packssdw on both operands viewed as <8 x i32>; the
; <16 x i16> result is returned as <4 x i64>. Expected as a single vpackssdw.
1912 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1913 ; CHECK-LABEL: test_mm256_packs_epi32:
1914 ; CHECK:       # %bb.0:
1915 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
1916 ; CHECK-NEXT:    ret{{[l|q]}}
1917   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1918   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1919   %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1920   %res = bitcast <16 x i16> %call to <4 x i64>
1921   ret <4 x i64> %res
1922 }
1923 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
1924
; Calls @llvm.x86.avx2.packuswb on both operands viewed as <16 x i16>; the
; <32 x i8> result is returned as <4 x i64>. Expected as a single vpackuswb.
1925 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1926 ; CHECK-LABEL: test_mm256_packus_epi16:
1927 ; CHECK:       # %bb.0:
1928 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1929 ; CHECK-NEXT:    ret{{[l|q]}}
1930   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1931   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1932   %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1933   %res = bitcast <32 x i8> %call to <4 x i64>
1934   ret <4 x i64> %res
1935 }
1936 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
1937
; Calls @llvm.x86.avx2.packusdw on both operands viewed as <8 x i32>; the
; <16 x i16> result is returned as <4 x i64>. Expected as a single vpackusdw.
1938 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1939 ; CHECK-LABEL: test_mm256_packus_epi32:
1940 ; CHECK:       # %bb.0:
1941 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1942 ; CHECK-NEXT:    ret{{[l|q]}}
1943   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1944   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1945   %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1946   %res = bitcast <16 x i16> %call to <4 x i64>
1947   ret <4 x i64> %res
1948 }
1949 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
1950
; Builds a vector from elements 2-3 of %a0 followed by elements 2-3 of %a1
; using a plain shufflevector. Expected as a single vperm2f128.
; The trailing declaration of @llvm.x86.avx2.vperm2i128 is not called by this
; function.
1951 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1952 ; CHECK-LABEL: test_mm256_permute2x128_si256:
1953 ; CHECK:       # %bb.0:
1954 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1955 ; CHECK-NEXT:    ret{{[l|q]}}
1956   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1957   ret <4 x i64> %res
1958 }
1959 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
1960
; Single-source shuffle of <4 x i64> with element order 3,0,2,0. The expected
; instruction is vpermpd with the same element order.
1961 define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1962 ; CHECK-LABEL: test_mm256_permute4x64_epi64:
1963 ; CHECK:       # %bb.0:
1964 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1965 ; CHECK-NEXT:    ret{{[l|q]}}
1966   %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1967   ret <4 x i64> %res
1968 }
1969
; Single-source shuffle of <4 x double> with element order 1,2,1,0. Expected
; as a single vpermpd with the same element order.
1970 define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1971 ; CHECK-LABEL: test_mm256_permute4x64_pd:
1972 ; CHECK:       # %bb.0:
1973 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1974 ; CHECK-NEXT:    ret{{[l|q]}}
1975   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1976   ret <4 x double> %res
1977 }
1978
; Calls @llvm.x86.avx2.permd with %a0 (data) and %a1 (second operand), both
; viewed as <8 x i32>. The expected instruction is vpermps, with ymm1 and ymm0
; in the operand order shown in the assertion.
1979 define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1980 ; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
1981 ; CHECK:       # %bb.0:
1982 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1983 ; CHECK-NEXT:    ret{{[l|q]}}
1984   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1985   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1986   %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
1987   %res = bitcast <8 x i32> %call to <4 x i64>
1988   ret <4 x i64> %res
1989 }
1990 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
1991
; Calls @llvm.x86.avx2.permps with the <8 x float> data %a0 and %a1 viewed as
; <8 x i32>. Expected as a single vpermps.
1992 define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
1993 ; CHECK-LABEL: test_mm256_permutevar8x32_ps:
1994 ; CHECK:       # %bb.0:
1995 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1996 ; CHECK-NEXT:    ret{{[l|q]}}
1997   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1998   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
1999   ret <8 x float> %res
2000 }
2001 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2002
; Calls @llvm.x86.avx2.psad.bw on both operands viewed as <32 x i8>; the
; intrinsic already returns <4 x i64>. Expected as a single vpsadbw.
2003 define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2004 ; CHECK-LABEL: test_mm256_sad_epu8:
2005 ; CHECK:       # %bb.0:
2006 ; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2007 ; CHECK-NEXT:    ret{{[l|q]}}
2008   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2009   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2010   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2011   ret <4 x i64> %res
2012 }
2013 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2014
; Single-source shuffle of %a0 viewed as <8 x i32>, applying the order 3,3,0,0
; within each group of four elements. The expected instruction is vshufps with
; the same element order.
2015 define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2016 ; CHECK-LABEL: test_mm256_shuffle_epi32:
2017 ; CHECK:       # %bb.0:
2018 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2019 ; CHECK-NEXT:    ret{{[l|q]}}
2020   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2021   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2022   %res = bitcast <8 x i32> %shuf to <4 x i64>
2023   ret <4 x i64> %res
2024 }
2025
; Calls @llvm.x86.avx2.pshuf.b on both operands viewed as <32 x i8>.
; Expected as a single vpshufb.
2026 define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2027 ; CHECK-LABEL: test_mm256_shuffle_epi8:
2028 ; CHECK:       # %bb.0:
2029 ; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2030 ; CHECK-NEXT:    ret{{[l|q]}}
2031   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2032   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2033   %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2034   %res = bitcast <32 x i8> %shuf to <4 x i64>
2035   ret <4 x i64> %res
2036 }
2037 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2038
; Single-source shuffle of %a0 viewed as <16 x i16>: in each group of eight
; elements the low four stay in place and the high four are reordered as
; 7,6,6,5. Expected as a single vpshufhw.
2039 define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2040 ; CHECK-LABEL: test_mm256_shufflehi_epi16:
2041 ; CHECK:       # %bb.0:
2042 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2043 ; CHECK-NEXT:    ret{{[l|q]}}
2044   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2045   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2046   %res = bitcast <16 x i16> %shuf to <4 x i64>
2047   ret <4 x i64> %res
2048 }
2049
; Single-source shuffle of %a0 viewed as <16 x i16>: in each group of eight
; elements the low four are reordered as 3,0,1,1 and the high four stay in
; place. Expected as a single vpshuflw.
2050 define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2051 ; CHECK-LABEL: test_mm256_shufflelo_epi16:
2052 ; CHECK:       # %bb.0:
2053 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2054 ; CHECK-NEXT:    ret{{[l|q]}}
2055   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2056   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2057   %res = bitcast <16 x i16> %shuf to <4 x i64>
2058   ret <4 x i64> %res
2059 }
2060
; Calls @llvm.x86.avx2.psign.b on both operands viewed as <32 x i8>.
; Expected as a single vpsignb.
2061 define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2062 ; CHECK-LABEL: test_mm256_sign_epi8:
2063 ; CHECK:       # %bb.0:
2064 ; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2065 ; CHECK-NEXT:    ret{{[l|q]}}
2066   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2067   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2068   %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2069   %res = bitcast <32 x i8> %call to <4 x i64>
2070   ret <4 x i64> %res
2071 }
2072 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2073
; Calls @llvm.x86.avx2.psign.w on both operands viewed as <16 x i16>.
; Expected as a single vpsignw.
2074 define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2075 ; CHECK-LABEL: test_mm256_sign_epi16:
2076 ; CHECK:       # %bb.0:
2077 ; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2078 ; CHECK-NEXT:    ret{{[l|q]}}
2079   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2080   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2081   %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2082   %res = bitcast <16 x i16> %call to <4 x i64>
2083   ret <4 x i64> %res
2084 }
2085 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2086
; Calls @llvm.x86.avx2.psign.d on both operands viewed as <8 x i32>.
; Expected as a single vpsignd.
2087 define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2088 ; CHECK-LABEL: test_mm256_sign_epi32:
2089 ; CHECK:       # %bb.0:
2090 ; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2091 ; CHECK-NEXT:    ret{{[l|q]}}
2092   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2093   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2094   %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2095   %res = bitcast <8 x i32> %call to <4 x i64>
2096   ret <4 x i64> %res
2097 }
2098 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2099
; Calls @llvm.x86.avx2.psll.w with %a0 viewed as <16 x i16> and the 128-bit
; %a1 viewed as <8 x i16> as the second operand. Expected as a single vpsllw
; taking an xmm register as its first source.
2100 define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2101 ; CHECK-LABEL: test_mm256_sll_epi16:
2102 ; CHECK:       # %bb.0:
2103 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2104 ; CHECK-NEXT:    ret{{[l|q]}}
2105   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2106   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2107   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2108   %bc = bitcast <16 x i16> %res to <4 x i64>
2109   ret <4 x i64> %bc
2110 }
2111 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2112
; Calls @llvm.x86.avx2.psll.d with %a0 viewed as <8 x i32> and the 128-bit %a1
; viewed as <4 x i32> as the second operand. Expected as a single vpslld
; taking an xmm register as its first source.
2113 define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2114 ; CHECK-LABEL: test_mm256_sll_epi32:
2115 ; CHECK:       # %bb.0:
2116 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2117 ; CHECK-NEXT:    ret{{[l|q]}}
2118   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2119   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2120   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2121   %bc = bitcast <8 x i32> %res to <4 x i64>
2122   ret <4 x i64> %bc
2123 }
2124 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2125
; Calls @llvm.x86.avx2.psll.q directly on the <4 x i64> value and the
; <2 x i64> second operand. Expected as a single vpsllq taking an xmm register
; as its first source.
2126 define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2127 ; CHECK-LABEL: test_mm256_sll_epi64:
2128 ; CHECK:       # %bb.0:
2129 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2130 ; CHECK-NEXT:    ret{{[l|q]}}
2131   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
2132   ret <4 x i64> %res
2133 }
2134 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2135
; Calls @llvm.x86.avx2.pslli.w on %a0 viewed as <16 x i16> with the constant
; i32 3. Expected as a single vpsllw with immediate 3.
2136 define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2137 ; CHECK-LABEL: test_mm256_slli_epi16:
2138 ; CHECK:       # %bb.0:
2139 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
2140 ; CHECK-NEXT:    ret{{[l|q]}}
2141   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2142   %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
2143   %bc = bitcast <16 x i16> %res to <4 x i64>
2144   ret <4 x i64> %bc
2145 }
2146 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2147
2148 define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2149 ; CHECK-LABEL: test_mm256_slli_epi32:
2150 ; CHECK:       # %bb.0:
2151 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
2152 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls pslli.d on %a0 viewed as <8 x i32> with the constant i32 3; the
       ; constant is expected to appear as the $3 immediate of one vpslld.
2153   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2154   %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
2155   %bc = bitcast <8 x i32> %res to <4 x i64>
2156   ret <4 x i64> %bc
2157 }
2158 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2159
2160 define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2161 ; CHECK-LABEL: test_mm256_slli_epi64:
2162 ; CHECK:       # %bb.0:
2163 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
2164 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls pslli.q directly on %a0 with the constant i32 3 (no bitcasts are
       ; needed); expected output is one vpsllq with a $3 immediate.
2165   %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
2166   ret <4 x i64> %res
2167 }
2168 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2169
2170 define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2171 ; CHECK-LABEL: test_mm256_slli_si256:
2172 ; CHECK:       # %bb.0:
2173 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2174 ; CHECK-NEXT:    ret{{[l|q]}}
       ; No intrinsic is involved: the byte shift is written as a shufflevector
       ; whose first operand is zeroinitializer. Mask indices below 32 select zero
       ; bytes and indices 32..63 select bytes of %arg0, so each 16-byte half
       ; becomes three zero bytes followed by that half's low thirteen bytes.
       ; The shuffle is expected to be matched to a single vpslldq.
2175   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2176   %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
2177   %res = bitcast <32 x i8> %shuf to <4 x i64>
2178   ret <4 x i64> %res
2179 }
2180
2181 define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2182 ; CHECK-LABEL: test_mm_sllv_epi32:
2183 ; CHECK:       # %bb.0:
2184 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2185 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 128-bit case: both arguments are viewed as <4 x i32> and passed to the
       ; psllv.d intrinsic; expected output is one vpsllvd on xmm registers.
2186   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2187   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2188   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2189   %bc = bitcast <4 x i32> %res to <2 x i64>
2190   ret <2 x i64> %bc
2191 }
2192 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2193
2194 define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2195 ; CHECK-LABEL: test_mm256_sllv_epi32:
2196 ; CHECK:       # %bb.0:
2197 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2198 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 256-bit case: both arguments are viewed as <8 x i32> and passed to the
       ; psllv.d.256 intrinsic; expected output is one vpsllvd on ymm registers.
2199   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2200   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2201   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2202   %bc = bitcast <8 x i32> %res to <4 x i64>
2203   ret <4 x i64> %bc
2204 }
2205 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2206
2207 define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2208 ; CHECK-LABEL: test_mm_sllv_epi64:
2209 ; CHECK:       # %bb.0:
2210 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2211 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 128-bit case: the <2 x i64> arguments go straight into psllv.q; expected
       ; output is one vpsllvq on xmm registers.
2212   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
2213   ret <2 x i64> %res
2214 }
2215 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2216
2217 define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2218 ; CHECK-LABEL: test_mm256_sllv_epi64:
2219 ; CHECK:       # %bb.0:
2220 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2221 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 256-bit case: the <4 x i64> arguments go straight into psllv.q.256;
       ; expected output is one vpsllvq on ymm registers.
2222   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2223   ret <4 x i64> %res
2224 }
2225 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2226
2227 define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2228 ; CHECK-LABEL: test_mm256_sra_epi16:
2229 ; CHECK:       # %bb.0:
2230 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2231 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Views %a0 as <16 x i16> and %a1 as <8 x i16>, calls the psra.w intrinsic
       ; and returns the result as <4 x i64>. Expected output is one vpsraw whose
       ; 128-bit operand is %xmm1.
2232   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2233   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2234   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
2235   %bc = bitcast <16 x i16> %res to <4 x i64>
2236   ret <4 x i64> %bc
2237 }
2238 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2239
2240 define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2241 ; CHECK-LABEL: test_mm256_sra_epi32:
2242 ; CHECK:       # %bb.0:
2243 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2244 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Views %a0 as <8 x i32> and %a1 as <4 x i32>, calls the psra.d intrinsic
       ; and returns the result as <4 x i64>. Expected output is one vpsrad whose
       ; 128-bit operand is %xmm1.
2245   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2246   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2247   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
2248   %bc = bitcast <8 x i32> %res to <4 x i64>
2249   ret <4 x i64> %bc
2250 }
2251 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2252
2253 define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2254 ; CHECK-LABEL: test_mm256_srai_epi16:
2255 ; CHECK:       # %bb.0:
2256 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
2257 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls psrai.w on %a0 viewed as <16 x i16> with the constant i32 3; the
       ; constant is expected to appear as the $3 immediate of one vpsraw.
2258   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2259   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
2260   %bc = bitcast <16 x i16> %res to <4 x i64>
2261   ret <4 x i64> %bc
2262 }
2263 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2264
2265 define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2266 ; CHECK-LABEL: test_mm256_srai_epi32:
2267 ; CHECK:       # %bb.0:
2268 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
2269 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls psrai.d on %a0 viewed as <8 x i32> with the constant i32 3; the
       ; constant is expected to appear as the $3 immediate of one vpsrad.
2270   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2271   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
2272   %bc = bitcast <8 x i32> %res to <4 x i64>
2273   ret <4 x i64> %bc
2274 }
2275 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2276
2277 define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2278 ; CHECK-LABEL: test_mm_srav_epi32:
2279 ; CHECK:       # %bb.0:
2280 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2281 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 128-bit case: both arguments are viewed as <4 x i32> and passed to the
       ; psrav.d intrinsic; expected output is one vpsravd on xmm registers.
2282   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2283   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2284   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
2285   %bc = bitcast <4 x i32> %res to <2 x i64>
2286   ret <2 x i64> %bc
2287 }
2288 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2289
2290 define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2291 ; CHECK-LABEL: test_mm256_srav_epi32:
2292 ; CHECK:       # %bb.0:
2293 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2294 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 256-bit case: both arguments are viewed as <8 x i32> and passed to the
       ; psrav.d.256 intrinsic; expected output is one vpsravd on ymm registers.
2295   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2296   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2297   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2298   %bc = bitcast <8 x i32> %res to <4 x i64>
2299   ret <4 x i64> %bc
2300 }
2301 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2302
2303 define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2304 ; CHECK-LABEL: test_mm256_srl_epi16:
2305 ; CHECK:       # %bb.0:
2306 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2307 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Views %a0 as <16 x i16> and %a1 as <8 x i16>, calls the psrl.w intrinsic
       ; and returns the result as <4 x i64>. Expected output is one vpsrlw whose
       ; 128-bit operand is %xmm1.
2308   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2309   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2310   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
2311   %bc = bitcast <16 x i16> %res to <4 x i64>
2312   ret <4 x i64> %bc
2313 }
2314 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2315
2316 define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2317 ; CHECK-LABEL: test_mm256_srl_epi32:
2318 ; CHECK:       # %bb.0:
2319 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2320 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Views %a0 as <8 x i32> and %a1 as <4 x i32>, calls the psrl.d intrinsic
       ; and returns the result as <4 x i64>. Expected output is one vpsrld whose
       ; 128-bit operand is %xmm1.
2321   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2322   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2323   %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
2324   %bc = bitcast <8 x i32> %res to <4 x i64>
2325   ret <4 x i64> %bc
2326 }
2327 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2328
2329 define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2330 ; CHECK-LABEL: test_mm256_srl_epi64:
2331 ; CHECK:       # %bb.0:
2332 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2333 ; CHECK-NEXT:    ret{{[l|q]}}
       ; The arguments already have the types psrl.q takes, so the intrinsic is
       ; called without any bitcasts. Expected output is a single vpsrlq.
2334   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
2335   ret <4 x i64> %res
2336 }
2337 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2338
2339 define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2340 ; CHECK-LABEL: test_mm256_srli_epi16:
2341 ; CHECK:       # %bb.0:
2342 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
2343 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls psrli.w on %a0 viewed as <16 x i16> with the constant i32 3; the
       ; constant is expected to appear as the $3 immediate of one vpsrlw.
2344   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2345   %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
2346   %bc = bitcast <16 x i16> %res to <4 x i64>
2347   ret <4 x i64> %bc
2348 }
2349 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2350
2351 define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2352 ; CHECK-LABEL: test_mm256_srli_epi32:
2353 ; CHECK:       # %bb.0:
2354 ; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
2355 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls psrli.d on %a0 viewed as <8 x i32> with the constant i32 3; the
       ; constant is expected to appear as the $3 immediate of one vpsrld.
2356   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2357   %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
2358   %bc = bitcast <8 x i32> %res to <4 x i64>
2359   ret <4 x i64> %bc
2360 }
2361 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2362
2363 define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2364 ; CHECK-LABEL: test_mm256_srli_epi64:
2365 ; CHECK:       # %bb.0:
2366 ; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
2367 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls psrli.q directly on %a0 with the constant i32 3 (no bitcasts are
       ; needed); expected output is one vpsrlq with a $3 immediate.
2368   %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
2369   ret <4 x i64> %res
2370 }
2371 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2372
2373 define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2374 ; CHECK-LABEL: test_mm256_srli_si256:
2375 ; CHECK:       # %bb.0:
2376 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2377 ; CHECK-NEXT:    ret{{[l|q]}}
       ; No intrinsic is involved: the byte shift is written as a shufflevector
       ; whose second operand is zeroinitializer. Mask indices below 32 select
       ; bytes of %arg0 and indices 32..63 select zero bytes, so each 16-byte half
       ; keeps its upper thirteen bytes and is padded with three zero bytes.
       ; The shuffle is expected to be matched to a single vpsrldq.
2378   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2379   %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
2380   %res = bitcast <32 x i8> %shuf to <4 x i64>
2381   ret <4 x i64> %res
2382 }
2383
2384 define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2385 ; CHECK-LABEL: test_mm_srlv_epi32:
2386 ; CHECK:       # %bb.0:
2387 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
2388 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 128-bit case: both arguments are viewed as <4 x i32> and passed to the
       ; psrlv.d intrinsic; expected output is one vpsrlvd on xmm registers.
2389   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
2390   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2391   %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
2392   %bc = bitcast <4 x i32> %res to <2 x i64>
2393   ret <2 x i64> %bc
2394 }
2395 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
2396
2397 define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2398 ; CHECK-LABEL: test_mm256_srlv_epi32:
2399 ; CHECK:       # %bb.0:
2400 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
2401 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 256-bit case: both arguments are viewed as <8 x i32> and passed to the
       ; psrlv.d.256 intrinsic; expected output is one vpsrlvd on ymm registers.
2402   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2403   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2404   %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
2405   %bc = bitcast <8 x i32> %res to <4 x i64>
2406   ret <4 x i64> %bc
2407 }
2408 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2409
2410 define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2411 ; CHECK-LABEL: test_mm_srlv_epi64:
2412 ; CHECK:       # %bb.0:
2413 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
2414 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 128-bit case: the <2 x i64> arguments go straight into psrlv.q; expected
       ; output is one vpsrlvq on xmm registers.
2415   %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
2416   ret <2 x i64> %res
2417 }
2418 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
2419
2420 define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2421 ; CHECK-LABEL: test_mm256_srlv_epi64:
2422 ; CHECK:       # %bb.0:
2423 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
2424 ; CHECK-NEXT:    ret{{[l|q]}}
       ; 256-bit case: the <4 x i64> arguments go straight into psrlv.q.256;
       ; expected output is one vpsrlvq on ymm registers.
2425   %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
2426   ret <4 x i64> %res
2427 }
2428 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2429
2430 define <4 x i64> @test_mm256_stream_load_si256(ptr %a0) {
2431 ; X86-LABEL: test_mm256_stream_load_si256:
2432 ; X86:       # %bb.0:
2433 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2434 ; X86-NEXT:    vmovntdqa (%eax), %ymm0
2435 ; X86-NEXT:    retl
2436 ;
2437 ; X64-LABEL: test_mm256_stream_load_si256:
2438 ; X64:       # %bb.0:
2439 ; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2440 ; X64-NEXT:    retq
       ; Loads <4 x i64> through the movntdqa intrinsic straight from the pointer
       ; argument. The two targets have separate assertion blocks because the
       ; expected i386 code first reloads the pointer from the stack, while the
       ; x86_64 code addresses through %rdi directly.
       ; With opaque pointers a ptr-to-ptr bitcast is a no-op, so %a0 is passed
       ; to the intrinsic as-is.
2442   %res = call <4 x i64> @llvm.x86.avx2.movntdqa(ptr %a0)
2443   ret <4 x i64> %res
2444 }
2445 declare <4 x i64> @llvm.x86.avx2.movntdqa(ptr) nounwind readonly
2446
2447 define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2448 ; CHECK-LABEL: test_mm256_sub_epi8:
2449 ; CHECK:       # %bb.0:
2450 ; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2451 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Uses the plain IR sub instruction on <32 x i8> views of the arguments
       ; (no target intrinsic); expected output is a single vpsubb.
2452   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2453   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2454   %res = sub <32 x i8> %arg0, %arg1
2455   %bc = bitcast <32 x i8> %res to <4 x i64>
2456   ret <4 x i64> %bc
2457 }
2458
2459 define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2460 ; CHECK-LABEL: test_mm256_sub_epi16:
2461 ; CHECK:       # %bb.0:
2462 ; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2463 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Uses the plain IR sub instruction on <16 x i16> views of the arguments
       ; (no target intrinsic); expected output is a single vpsubw.
2464   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2465   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2466   %res = sub <16 x i16> %arg0, %arg1
2467   %bc = bitcast <16 x i16> %res to <4 x i64>
2468   ret <4 x i64> %bc
2469 }
2470
2471 define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2472 ; CHECK-LABEL: test_mm256_sub_epi32:
2473 ; CHECK:       # %bb.0:
2474 ; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2475 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Uses the plain IR sub instruction on <8 x i32> views of the arguments
       ; (no target intrinsic); expected output is a single vpsubd.
2476   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2477   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2478   %res = sub <8 x i32> %arg0, %arg1
2479   %bc = bitcast <8 x i32> %res to <4 x i64>
2480   ret <4 x i64> %bc
2481 }
2482
2483 define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2484 ; CHECK-LABEL: test_mm256_sub_epi64:
2485 ; CHECK:       # %bb.0:
2486 ; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2487 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Subtracts the <4 x i64> arguments with the plain IR sub instruction, no
       ; bitcasts needed; expected output is a single vpsubq.
2488   %res = sub <4 x i64> %a0, %a1
2489   ret <4 x i64> %res
2490 }
2491
2492 define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2493 ; CHECK-LABEL: test_mm256_subs_epi8:
2494 ; CHECK:       # %bb.0:
2495 ; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2496 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls the target-independent llvm.ssub.sat intrinsic on <32 x i8> views
       ; of the arguments; it is expected to be selected as a single vpsubsb.
2497   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2498   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2499   %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2500   %bc = bitcast <32 x i8> %res to <4 x i64>
2501   ret <4 x i64> %bc
2502 }
2503 declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2504
2505 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2506 ; CHECK-LABEL: test_mm256_subs_epi16:
2507 ; CHECK:       # %bb.0:
2508 ; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2509 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls the target-independent llvm.ssub.sat intrinsic on <16 x i16> views
       ; of the arguments; it is expected to be selected as a single vpsubsw.
2510   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2511   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2512   %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2513   %bc = bitcast <16 x i16> %res to <4 x i64>
2514   ret <4 x i64> %bc
2515 }
2516 declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2517
2518 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2519 ; CHECK-LABEL: test_mm256_subs_epu8:
2520 ; CHECK:       # %bb.0:
2521 ; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2522 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls the target-independent llvm.usub.sat intrinsic on <32 x i8> views
       ; of the arguments; it is expected to be selected as a single vpsubusb.
2523   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2524   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2525   %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
2526   %bc = bitcast <32 x i8> %res to <4 x i64>
2527   ret <4 x i64> %bc
2528 }
2529 declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>)
2530
2531 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2532 ; CHECK-LABEL: test_mm256_subs_epu16:
2533 ; CHECK:       # %bb.0:
2534 ; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2535 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Calls the target-independent llvm.usub.sat intrinsic on <16 x i16> views
       ; of the arguments; it is expected to be selected as a single vpsubusw.
2536   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2537   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2538   %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
2539   %bc = bitcast <16 x i16> %res to <4 x i64>
2540   ret <4 x i64> %bc
2541 }
2542 declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>)
2543
2544 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2545 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
2546 ; CHECK:       # %bb.0:
2547 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2548 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Written as a shufflevector rather than an intrinsic call. Mask indices
       ; 32 and up refer to %arg1, so the mask interleaves bytes 8..15 of both
       ; inputs and then bytes 24..31 of both inputs. Expected output is a single
       ; vpunpckhbw.
2549   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2550   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2551   %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
2552   %bc = bitcast <32 x i8> %res to <4 x i64>
2553   ret <4 x i64> %bc
2554 }
2555
2556 define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2557 ; CHECK-LABEL: test_mm256_unpackhi_epi16:
2558 ; CHECK:       # %bb.0:
2559 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2560 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Written as a shufflevector rather than an intrinsic call. Mask indices
       ; 16 and up refer to %arg1, so the mask interleaves elements 4..7 of both
       ; inputs and then elements 12..15 of both inputs. Expected output is a
       ; single vpunpckhwd.
2561   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2562   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2563   %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
2564   %bc = bitcast <16 x i16> %res to <4 x i64>
2565   ret <4 x i64> %bc
2566 }
2567
2568 define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2569 ; CHECK-LABEL: test_mm256_unpackhi_epi32:
2570 ; CHECK:       # %bb.0:
2571 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2572 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Written as a shufflevector rather than an intrinsic call. Mask indices
       ; 8 and up refer to %arg1, so the mask interleaves elements 2,3 of both
       ; inputs and then elements 6,7 of both inputs. Although the IR shuffles
       ; <8 x i32>, the expected instruction is vunpckhps.
2573   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2574   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2575   %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
2576   %bc = bitcast <8 x i32> %res to <4 x i64>
2577   ret <4 x i64> %bc
2578 }
2579
2580 define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2581 ; CHECK-LABEL: test_mm256_unpackhi_epi64:
2582 ; CHECK:       # %bb.0:
2583 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2584 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Shuffles the <4 x i64> arguments directly. Mask indices 4 and up refer
       ; to %a1, so the result is elements 1 and 3 of %a0 interleaved with
       ; elements 1 and 3 of %a1. The expected instruction is vunpckhpd.
2585   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
2586   ret <4 x i64> %res
2587 }
2588
2589 define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2590 ; CHECK-LABEL: test_mm256_unpacklo_epi8:
2591 ; CHECK:       # %bb.0:
2592 ; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2593 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Written as a shufflevector rather than an intrinsic call. Mask indices
       ; 32 and up refer to %arg1, so the mask interleaves bytes 0..7 of both
       ; inputs and then bytes 16..23 of both inputs. Expected output is a single
       ; vpunpcklbw.
2594   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2595   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2596   %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
2597   %bc = bitcast <32 x i8> %res to <4 x i64>
2598   ret <4 x i64> %bc
2599 }
2600
2601 define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2602 ; CHECK-LABEL: test_mm256_unpacklo_epi16:
2603 ; CHECK:       # %bb.0:
2604 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2605 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Written as a shufflevector rather than an intrinsic call. Mask indices
       ; 16 and up refer to %arg1, so the mask interleaves elements 0..3 of both
       ; inputs and then elements 8..11 of both inputs. Expected output is a
       ; single vpunpcklwd.
2606   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2607   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2608   %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
2609   %bc = bitcast <16 x i16> %res to <4 x i64>
2610   ret <4 x i64> %bc
2611 }
2612
2613 define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2614 ; CHECK-LABEL: test_mm256_unpacklo_epi32:
2615 ; CHECK:       # %bb.0:
2616 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2617 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Written as a shufflevector rather than an intrinsic call. Mask indices
       ; 8 and up refer to %arg1, so the mask interleaves elements 0,1 of both
       ; inputs and then elements 4,5 of both inputs. Although the IR shuffles
       ; <8 x i32>, the expected instruction is vunpcklps.
2618   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2619   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2620   %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
2621   %bc = bitcast <8 x i32> %res to <4 x i64>
2622   ret <4 x i64> %bc
2623 }
2624
2625 define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2626 ; CHECK-LABEL: test_mm256_unpacklo_epi64:
2627 ; CHECK:       # %bb.0:
2628 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2629 ; CHECK-NEXT:    ret{{[l|q]}}
       ; Shuffles the <4 x i64> arguments directly. Mask indices 4 and up refer
       ; to %a1, so the result is elements 0 and 2 of %a0 interleaved with
       ; elements 0 and 2 of %a1. The expected instruction is vunpcklpd.
2630   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2631   ret <4 x i64> %res
2632 }
2633
2634 define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2635 ; CHECK-LABEL: test_mm256_xor_si256:
2636 ; CHECK:       # %bb.0:
2637 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
2638 ; CHECK-NEXT:    ret{{[l|q]}}
       ; XORs the <4 x i64> arguments with the plain IR xor instruction. The
       ; expected instruction for this integer operation is vxorps.
2639   %res = xor <4 x i64> %a0, %a1
2640   ret <4 x i64> %res
2641 }
2642
2643 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
     ; NOTE(review): neither of these two AVX compare intrinsics is called by the
     ; functions immediately above; presumably they are used by tests earlier in
     ; the file. Confirm before removing either declaration.
2644
2645 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone