llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
   3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64
   4
   5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c
   6
   7 define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
   8 ; CHECK-LABEL: test_mm256_abs_epi8:
   9 ; CHECK:       # %bb.0:
  10 ; CHECK-NEXT:    vpabsb %ymm0, %ymm0
  11 ; CHECK-NEXT:    ret{{[l|q]}}
  12   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
  13   %mag = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %bytes, i1 false) ; i1 false: an INT_MIN lane is not poison
  14   %packed = bitcast <32 x i8> %mag to <4 x i64>
  15   ret <4 x i64> %packed
  16 }
  17 declare <32 x i8> @llvm.abs.v32i8(<32 x i8>, i1) nounwind readnone
  18
  19 define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
  20 ; CHECK-LABEL: test_mm256_abs_epi16:
  21 ; CHECK:       # %bb.0:
  22 ; CHECK-NEXT:    vpabsw %ymm0, %ymm0
  23 ; CHECK-NEXT:    ret{{[l|q]}}
  24   %words = bitcast <4 x i64> %a0 to <16 x i16>
  25   %mag = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %words, i1 false) ; i1 false: an INT_MIN lane is not poison
  26   %packed = bitcast <16 x i16> %mag to <4 x i64>
  27   ret <4 x i64> %packed
  28 }
  29 declare <16 x i16> @llvm.abs.v16i16(<16 x i16>, i1) nounwind readnone
  30
  31 define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
  32 ; CHECK-LABEL: test_mm256_abs_epi32:
  33 ; CHECK:       # %bb.0:
  34 ; CHECK-NEXT:    vpabsd %ymm0, %ymm0
  35 ; CHECK-NEXT:    ret{{[l|q]}}
  36   %dwords = bitcast <4 x i64> %a0 to <8 x i32>
  37   %mag = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %dwords, i1 false) ; i1 false: an INT_MIN lane is not poison
  38   %packed = bitcast <8 x i32> %mag to <4 x i64>
  39   ret <4 x i64> %packed
  40 }
  41 declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1) nounwind readnone
  42
  43 define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  44 ; CHECK-LABEL: test_mm256_add_epi8:
  45 ; CHECK:       # %bb.0:
  46 ; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
  47 ; CHECK-NEXT:    ret{{[l|q]}}
  48   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  49   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  50   %sum = add <32 x i8> %lhs, %rhs ; lane-wise wrapping add on i8 lanes
  51   %packed = bitcast <32 x i8> %sum to <4 x i64>
  52   ret <4 x i64> %packed
  53 }
  54
  55 define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  56 ; CHECK-LABEL: test_mm256_add_epi16:
  57 ; CHECK:       # %bb.0:
  58 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
  59 ; CHECK-NEXT:    ret{{[l|q]}}
  60   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
  61   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
  62   %sum = add <16 x i16> %lhs, %rhs ; lane-wise wrapping add on i16 lanes
  63   %packed = bitcast <16 x i16> %sum to <4 x i64>
  64   ret <4 x i64> %packed
  65 }
  66
  67 define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  68 ; CHECK-LABEL: test_mm256_add_epi32:
  69 ; CHECK:       # %bb.0:
  70 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
  71 ; CHECK-NEXT:    ret{{[l|q]}}
  72   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
  73   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
  74   %sum = add <8 x i32> %lhs, %rhs ; lane-wise wrapping add on i32 lanes
  75   %packed = bitcast <8 x i32> %sum to <4 x i64>
  76   ret <4 x i64> %packed
  77 }
  78
  79 define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
  80 ; CHECK-LABEL: test_mm256_add_epi64:
  81 ; CHECK:       # %bb.0:
  82 ; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
  83 ; CHECK-NEXT:    ret{{[l|q]}}
  84   %sum = add <4 x i64> %a0, %a1 ; operands are already <4 x i64>, so no bitcasts are needed
  85   ret <4 x i64> %sum
  86 }
  87
  88 define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
  89 ; CHECK-LABEL: test_mm256_adds_epi8:
  90 ; CHECK:       # %bb.0:
  91 ; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
  92 ; CHECK-NEXT:    ret{{[l|q]}}
  93   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
  94   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
  95   %sat = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %lhs, <32 x i8> %rhs) ; signed saturating add per i8 lane
  96   %packed = bitcast <32 x i8> %sat to <4 x i64>
  97   ret <4 x i64> %packed
  98 }
  99 declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
 100
 101 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 102 ; CHECK-LABEL: test_mm256_adds_epi16:
 103 ; CHECK:       # %bb.0:
 104 ; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
 105 ; CHECK-NEXT:    ret{{[l|q]}}
 106   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 107   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 108   %sat = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %lhs, <16 x i16> %rhs) ; signed saturating add per i16 lane
 109   %packed = bitcast <16 x i16> %sat to <4 x i64>
 110   ret <4 x i64> %packed
 111 }
 112 declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
 113
 114 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 115 ; CHECK-LABEL: test_mm256_adds_epu8:
 116 ; CHECK:       # %bb.0:
 117 ; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
 118 ; CHECK-NEXT:    ret{{[l|q]}}
 119   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 120   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 121   %sat = call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %lhs, <32 x i8> %rhs) ; unsigned saturating add per i8 lane
 122   %packed = bitcast <32 x i8> %sat to <4 x i64>
 123   ret <4 x i64> %packed
 124 }
 125 declare <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8>, <32 x i8>)
 126
 127 define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
 128 ; CHECK-LABEL: test_mm256_adds_epu16:
 129 ; CHECK:       # %bb.0:
 130 ; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
 131 ; CHECK-NEXT:    ret{{[l|q]}}
 132   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 133   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 134   %sat = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %lhs, <16 x i16> %rhs) ; unsigned saturating add per i16 lane
 135   %packed = bitcast <16 x i16> %sat to <4 x i64>
 136   ret <4 x i64> %packed
 137 }
 138 declare <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16>, <16 x i16>)
 139
 140 define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 141 ; CHECK-LABEL: test_mm256_alignr_epi8:
 142 ; CHECK:       # %bb.0:
 143 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
 144 ; CHECK-NEXT:    ret{{[l|q]}}
 145   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 146   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 147   %aligned = shufflevector <32 x i8> %lhs, <32 x i8> %rhs, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49> ; mask indices 32 and up pick bytes from the second operand
 148   %packed = bitcast <32 x i8> %aligned to <4 x i64>
 149   ret <4 x i64> %packed
 150 }
 151
 152 define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
 153 ; CHECK-LABEL: test2_mm256_alignr_epi8:
 154 ; CHECK:       # %bb.0:
 155 ; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
 156 ; CHECK-NEXT:    ret{{[l|q]}}
 157   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 158   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 159   %aligned = shufflevector <32 x i8> %lhs, <32 x i8> %rhs, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> ; same shape as the test above, shifted by one byte instead of two
 160   %packed = bitcast <32 x i8> %aligned to <4 x i64>
 161   ret <4 x i64> %packed
 162 }
 163
 164 define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 165 ; CHECK-LABEL: test_mm256_and_si256:
 166 ; CHECK:       # %bb.0:
 167 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
 168 ; CHECK-NEXT:    ret{{[l|q]}}
 169   %masked = and <4 x i64> %a0, %a1 ; bitwise AND across the whole 256-bit value
 170   ret <4 x i64> %masked
 171 }
 172
 173 define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 174 ; CHECK-LABEL: test_mm256_andnot_si256:
 175 ; CHECK:       # %bb.0:
 176 ; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 177 ; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 178 ; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
 179 ; CHECK-NEXT:    ret{{[l|q]}}
 180   %inv = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1> ; xor with all-ones is a bitwise NOT of %a0
 181   %masked = and <4 x i64> %inv, %a1
 182   ret <4 x i64> %masked
 183 }
 184
 185 define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 186 ; CHECK-LABEL: test_mm256_avg_epu8:
 187 ; CHECK:       # %bb.0:
 188 ; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
 189 ; CHECK-NEXT:    ret{{[l|q]}}
 190   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 191   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 192   %avg = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %lhs, <32 x i8> %rhs) ; x86 AVX2 intrinsic, expected above as vpavgb
 193   %packed = bitcast <32 x i8> %avg to <4 x i64>
 194   ret <4 x i64> %packed
 195 }
 196 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
 197
 198 define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 199 ; CHECK-LABEL: test_mm256_avg_epu16:
 200 ; CHECK:       # %bb.0:
 201 ; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
 202 ; CHECK-NEXT:    ret{{[l|q]}}
 203   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 204   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 205   %avg = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %lhs, <16 x i16> %rhs) ; x86 AVX2 intrinsic, expected above as vpavgw
 206   %packed = bitcast <16 x i16> %avg to <4 x i64>
 207   ret <4 x i64> %packed
 208 }
 209 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
 210
 211 define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 212 ; CHECK-LABEL: test_mm256_blend_epi16:
 213 ; CHECK:       # %bb.0:
 214 ; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
 215 ; CHECK-NEXT:    ret{{[l|q]}}
 216   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 217   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 218   %mix = shufflevector <16 x i16> %lhs, <16 x i16> %rhs, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; indices 16 and up pick from the second operand (lanes 1 and 9 here)
 219   %packed = bitcast <16 x i16> %mix to <4 x i64>
 220   ret <4 x i64> %packed
 221 }
 222
 223 define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
 224 ; CHECK-LABEL: test_mm_blend_epi32:
 225 ; CHECK:       # %bb.0:
 226 ; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 227 ; CHECK-NEXT:    ret{{[l|q]}}
 228   %lhs = bitcast <2 x i64> %a0 to <4 x i32>
 229   %rhs = bitcast <2 x i64> %a1 to <4 x i32>
 230   %mix = shufflevector <4 x i32> %lhs, <4 x i32> %rhs, <4 x i32> <i32 4, i32 1, i32 6, i32 3> ; indices 4 and up pick from the second operand (lanes 0 and 2 here)
 231   %packed = bitcast <4 x i32> %mix to <2 x i64>
 232   ret <2 x i64> %packed
 233 }
 234
 235 define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 236 ; CHECK-LABEL: test_mm256_blend_epi32:
 237 ; CHECK:       # %bb.0:
 238 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
 239 ; CHECK-NEXT:    ret{{[l|q]}}
 240   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 241   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 242   %mix = shufflevector <8 x i32> %lhs, <8 x i32> %rhs, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7> ; indices 8 and up pick from the second operand (lanes 0, 2, 4 and 5 here)
 243   %packed = bitcast <8 x i32> %mix to <4 x i64>
 244   ret <4 x i64> %packed
 245 }
 246
 247 define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 248 ; CHECK-LABEL: test_mm256_blendv_epi8:
 249 ; CHECK:       # %bb.0:
 250 ; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 251 ; CHECK-NEXT:    ret{{[l|q]}}
 252   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 253   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 254   %sel = bitcast <4 x i64> %a2 to <32 x i8>
 255   %mix = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %lhs, <32 x i8> %rhs, <32 x i8> %sel) ; x86 AVX2 intrinsic, expected above as vpblendvb
 256   %packed = bitcast <32 x i8> %mix to <4 x i64>
 257   ret <4 x i64> %packed
 258 }
 259 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
 260
 261 define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
 262 ; CHECK-LABEL: test_mm_broadcastb_epi8:
 263 ; CHECK:       # %bb.0:
 264 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
 265 ; CHECK-NEXT:    ret{{[l|q]}}
 266   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 267   %splat = shufflevector <16 x i8> %lanes, <16 x i8> undef, <16 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 268   %packed = bitcast <16 x i8> %splat to <2 x i64>
 269   ret <2 x i64> %packed
 270 }
 271
 272 define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
 273 ; CHECK-LABEL: test_mm256_broadcastb_epi8:
 274 ; CHECK:       # %bb.0:
 275 ; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
 276 ; CHECK-NEXT:    ret{{[l|q]}}
 277   %lanes = bitcast <4 x i64> %a0 to <32 x i8>
 278   %splat = shufflevector <32 x i8> %lanes, <32 x i8> undef, <32 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 279   %packed = bitcast <32 x i8> %splat to <4 x i64>
 280   ret <4 x i64> %packed
 281 }
 282
 283 define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
 284 ; CHECK-LABEL: test_mm_broadcastd_epi32:
 285 ; CHECK:       # %bb.0:
 286 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 287 ; CHECK-NEXT:    ret{{[l|q]}}
 288   %lanes = bitcast <2 x i64> %a0 to <4 x i32>
 289   %splat = shufflevector <4 x i32> %lanes, <4 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 290   %packed = bitcast <4 x i32> %splat to <2 x i64>
 291   ret <2 x i64> %packed
 292 }
 293
 294 define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
 295 ; CHECK-LABEL: test_mm256_broadcastd_epi32:
 296 ; CHECK:       # %bb.0:
 297 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 298 ; CHECK-NEXT:    ret{{[l|q]}}
 299   %lanes = bitcast <4 x i64> %a0 to <8 x i32>
 300   %splat = shufflevector <8 x i32> %lanes, <8 x i32> undef, <8 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 301   %packed = bitcast <8 x i32> %splat to <4 x i64>
 302   ret <4 x i64> %packed
 303 }
 304
 305 define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
 306 ; CHECK-LABEL: test_mm_broadcastq_epi64:
 307 ; CHECK:       # %bb.0:
 308 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 309 ; CHECK-NEXT:    ret{{[l|q]}}
 310   %splat = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer ; all-zero mask copies element 0 into both lanes
 311   ret <2 x i64> %splat
 312 }
 313
 314 define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
 315 ; CHECK-LABEL: test_mm256_broadcastq_epi64:
 316 ; CHECK:       # %bb.0:
 317 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 318 ; CHECK-NEXT:    ret{{[l|q]}}
 319   %splat = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 320   ret <4 x i64> %splat
 321 }
 322
 323 define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
 324 ; CHECK-LABEL: test_mm_broadcastsd_pd:
 325 ; CHECK:       # %bb.0:
 326 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 327 ; CHECK-NEXT:    ret{{[l|q]}}
 328   %splat = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer ; all-zero mask copies element 0 into both lanes
 329   ret <2 x double> %splat
 330 }
 331
 332 define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
 333 ; CHECK-LABEL: test_mm256_broadcastsd_pd:
 334 ; CHECK:       # %bb.0:
 335 ; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
 336 ; CHECK-NEXT:    ret{{[l|q]}}
 337   %splat = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 338   ret <4 x double> %splat
 339 }
 340
 341 define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
 342 ; CHECK-LABEL: test_mm256_broadcastsi128_si256:
 343 ; CHECK:       # %bb.0:
 344 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 345 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 346 ; CHECK-NEXT:    ret{{[l|q]}}
 347   %dup = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> ; mask <0,1,0,1> repeats the 128-bit input in both halves
 348   ret <4 x i64> %dup
 349 }
 350
 351 define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
 352 ; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
 353 ; X86:       # %bb.0:
 354 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 355 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 356 ; X86-NEXT:    retl
 357 ;
 358 ; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
 359 ; X64:       # %bb.0:
 360 ; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 361 ; X64-NEXT:    retq
 362   %vec = load <2 x i64>, <2 x i64>* %p0
 363   %dup = shufflevector <2 x i64> %vec, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1> ; same <0,1,0,1> mask as the register form, fed from a load
 364   ret <4 x i64> %dup
 365 }
 366
 367 define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
 368 ; CHECK-LABEL: test_mm_broadcastss_ps:
 369 ; CHECK:       # %bb.0:
 370 ; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
 371 ; CHECK-NEXT:    ret{{[l|q]}}
 372   %splat = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 373   ret <4 x float> %splat
 374 }
 375
 376 define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
 377 ; CHECK-LABEL: test_mm256_broadcastss_ps:
 378 ; CHECK:       # %bb.0:
 379 ; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
 380 ; CHECK-NEXT:    ret{{[l|q]}}
 381   %splat = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 382   ret <8 x float> %splat
 383 }
 384
 385 define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
 386 ; CHECK-LABEL: test_mm_broadcastw_epi16:
 387 ; CHECK:       # %bb.0:
 388 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
 389 ; CHECK-NEXT:    ret{{[l|q]}}
 390   %lanes = bitcast <2 x i64> %a0 to <8 x i16>
 391   %splat = shufflevector <8 x i16> %lanes, <8 x i16> undef, <8 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 392   %packed = bitcast <8 x i16> %splat to <2 x i64>
 393   ret <2 x i64> %packed
 394 }
 395
 396 define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
 397 ; CHECK-LABEL: test_mm256_broadcastw_epi16:
 398 ; CHECK:       # %bb.0:
 399 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
 400 ; CHECK-NEXT:    ret{{[l|q]}}
 401   %lanes = bitcast <4 x i64> %a0 to <16 x i16>
 402   %splat = shufflevector <16 x i16> %lanes, <16 x i16> undef, <16 x i32> zeroinitializer ; all-zero mask copies element 0 into every lane
 403   %packed = bitcast <16 x i16> %splat to <4 x i64>
 404   ret <4 x i64> %packed
 405 }
 406
 407 define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
 408 ; CHECK-LABEL: test_mm256_bslli_epi128:
 409 ; CHECK:       # %bb.0:
 410 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
 411 ; CHECK-NEXT:    ret{{[l|q]}}
 412   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
 413   %shifted = shufflevector <32 x i8> zeroinitializer, <32 x i8> %bytes, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60> ; first operand is all zeros, so indices below 32 insert zero bytes
 414   %packed = bitcast <32 x i8> %shifted to <4 x i64>
 415   ret <4 x i64> %packed
 416 }
 417
 418 define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
 419 ; CHECK-LABEL: test_mm256_bsrli_epi128:
 420 ; CHECK:       # %bb.0:
 421 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
 422 ; CHECK-NEXT:    ret{{[l|q]}}
 423   %bytes = bitcast <4 x i64> %a0 to <32 x i8>
 424   %shifted = shufflevector <32 x i8> %bytes, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50> ; second operand is all zeros, so indices 32 and up insert zero bytes
 425   %packed = bitcast <32 x i8> %shifted to <4 x i64>
 426   ret <4 x i64> %packed
 427 }
 428
 429 define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 430 ; CHECK-LABEL: test_mm256_cmpeq_epi8:
 431 ; CHECK:       # %bb.0:
 432 ; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
 433 ; CHECK-NEXT:    ret{{[l|q]}}
 434   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 435   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 436   %mask = icmp eq <32 x i8> %lhs, %rhs
 437   %wide = sext <32 x i1> %mask to <32 x i8> ; sext of i1: true lanes become all-ones, false lanes zero
 438   %packed = bitcast <32 x i8> %wide to <4 x i64>
 439   ret <4 x i64> %packed
 440 }
 441
 442 define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 443 ; CHECK-LABEL: test_mm256_cmpeq_epi16:
 444 ; CHECK:       # %bb.0:
 445 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
 446 ; CHECK-NEXT:    ret{{[l|q]}}
 447   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 448   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 449   %mask = icmp eq <16 x i16> %lhs, %rhs
 450   %wide = sext <16 x i1> %mask to <16 x i16> ; sext of i1: true lanes become all-ones, false lanes zero
 451   %packed = bitcast <16 x i16> %wide to <4 x i64>
 452   ret <4 x i64> %packed
 453 }
 454
 455 define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 456 ; CHECK-LABEL: test_mm256_cmpeq_epi32:
 457 ; CHECK:       # %bb.0:
 458 ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
 459 ; CHECK-NEXT:    ret{{[l|q]}}
 460   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 461   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 462   %mask = icmp eq <8 x i32> %lhs, %rhs
 463   %wide = sext <8 x i1> %mask to <8 x i32> ; sext of i1: true lanes become all-ones, false lanes zero
 464   %packed = bitcast <8 x i32> %wide to <4 x i64>
 465   ret <4 x i64> %packed
 466 }
 467
 468 define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 469 ; CHECK-LABEL: test_mm256_cmpeq_epi64:
 470 ; CHECK:       # %bb.0:
 471 ; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
 472 ; CHECK-NEXT:    ret{{[l|q]}}
 473   %mask = icmp eq <4 x i64> %a0, %a1
 474   %wide = sext <4 x i1> %mask to <4 x i64> ; sext of i1: true lanes become all-ones, false lanes zero
 475   ret <4 x i64> %wide
 476 }
 477
 478 define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 479 ; CHECK-LABEL: test_mm256_cmpgt_epi8:
 480 ; CHECK:       # %bb.0:
 481 ; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
 482 ; CHECK-NEXT:    ret{{[l|q]}}
 483   %lhs = bitcast <4 x i64> %a0 to <32 x i8>
 484   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
 485   %mask = icmp sgt <32 x i8> %lhs, %rhs ; signed greater-than per i8 lane
 486   %wide = sext <32 x i1> %mask to <32 x i8>
 487   %packed = bitcast <32 x i8> %wide to <4 x i64>
 488   ret <4 x i64> %packed
 489 }
 490
 491 define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 492 ; CHECK-LABEL: test_mm256_cmpgt_epi16:
 493 ; CHECK:       # %bb.0:
 494 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 495 ; CHECK-NEXT:    ret{{[l|q]}}
 496   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 497   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 498   %mask = icmp sgt <16 x i16> %lhs, %rhs ; signed greater-than per i16 lane
 499   %wide = sext <16 x i1> %mask to <16 x i16>
 500   %packed = bitcast <16 x i16> %wide to <4 x i64>
 501   ret <4 x i64> %packed
 502 }
 503
 504 define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 505 ; CHECK-LABEL: test_mm256_cmpgt_epi32:
 506 ; CHECK:       # %bb.0:
 507 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 508 ; CHECK-NEXT:    ret{{[l|q]}}
 509   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 510   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 511   %mask = icmp sgt <8 x i32> %lhs, %rhs ; signed greater-than per i32 lane
 512   %wide = sext <8 x i1> %mask to <8 x i32>
 513   %packed = bitcast <8 x i32> %wide to <4 x i64>
 514   ret <4 x i64> %packed
 515 }
 516
 517 define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 518 ; CHECK-LABEL: test_mm256_cmpgt_epi64:
 519 ; CHECK:       # %bb.0:
 520 ; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 521 ; CHECK-NEXT:    ret{{[l|q]}}
 522   %mask = icmp sgt <4 x i64> %a0, %a1 ; signed greater-than per i64 lane
 523   %wide = sext <4 x i1> %mask to <4 x i64>
 524   ret <4 x i64> %wide
 525 }
 526
 527 define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
 528 ; CHECK-LABEL: test_mm256_cvtepi8_epi16:
 529 ; CHECK:       # %bb.0:
 530 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
 531 ; CHECK-NEXT:    ret{{[l|q]}}
 532   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 533   %wide = sext <16 x i8> %lanes to <16 x i16> ; sign-extend all 16 bytes
 534   %packed = bitcast <16 x i16> %wide to <4 x i64>
 535   ret <4 x i64> %packed
 536 }
 537
 538 define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
 539 ; CHECK-LABEL: test_mm256_cvtepi8_epi32:
 540 ; CHECK:       # %bb.0:
 541 ; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
 542 ; CHECK-NEXT:    ret{{[l|q]}}
 543   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 544   %low = shufflevector <16 x i8> %lanes, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; keep only the low 8 bytes
 545   %wide = sext <8 x i8> %low to <8 x i32>
 546   %packed = bitcast <8 x i32> %wide to <4 x i64>
 547   ret <4 x i64> %packed
 548 }
 549
 550 define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
 551 ; CHECK-LABEL: test_mm256_cvtepi8_epi64:
 552 ; CHECK:       # %bb.0:
 553 ; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
 554 ; CHECK-NEXT:    ret{{[l|q]}}
 555   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 556   %low = shufflevector <16 x i8> %lanes, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only the low 4 bytes
 557   %wide = sext <4 x i8> %low to <4 x i64>
 558   ret <4 x i64> %wide
 559 }
 560
 561 define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
 562 ; CHECK-LABEL: test_mm256_cvtepi16_epi32:
 563 ; CHECK:       # %bb.0:
 564 ; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
 565 ; CHECK-NEXT:    ret{{[l|q]}}
 566   %lanes = bitcast <2 x i64> %a0 to <8 x i16>
 567   %wide = sext <8 x i16> %lanes to <8 x i32> ; sign-extend all 8 words
 568   %packed = bitcast <8 x i32> %wide to <4 x i64>
 569   ret <4 x i64> %packed
 570 }
 571
 572 define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
 573 ; CHECK-LABEL: test_mm256_cvtepi16_epi64:
 574 ; CHECK:       # %bb.0:
 575 ; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
 576 ; CHECK-NEXT:    ret{{[l|q]}}
 577   %lanes = bitcast <2 x i64> %a0 to <8 x i16>
 578   %low = shufflevector <8 x i16> %lanes, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only the low 4 words
 579   %wide = sext <4 x i16> %low to <4 x i64>
 580   ret <4 x i64> %wide
 581 }
 582
 583 define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
 584 ; CHECK-LABEL: test_mm256_cvtepi32_epi64:
 585 ; CHECK:       # %bb.0:
 586 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
 587 ; CHECK-NEXT:    ret{{[l|q]}}
 588   %lanes = bitcast <2 x i64> %a0 to <4 x i32>
 589   %wide = sext <4 x i32> %lanes to <4 x i64> ; sign-extend all 4 dwords
 590   ret <4 x i64> %wide
 591 }
 592
 593 define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
 594 ; CHECK-LABEL: test_mm256_cvtepu8_epi16:
 595 ; CHECK:       # %bb.0:
 596 ; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 597 ; CHECK-NEXT:    ret{{[l|q]}}
 598   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 599   %wide = zext <16 x i8> %lanes to <16 x i16> ; zero-extend all 16 bytes
 600   %packed = bitcast <16 x i16> %wide to <4 x i64>
 601   ret <4 x i64> %packed
 602 }
 603
 604 define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
 605 ; CHECK-LABEL: test_mm256_cvtepu8_epi32:
 606 ; CHECK:       # %bb.0:
 607 ; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
 608 ; CHECK-NEXT:    ret{{[l|q]}}
 609   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 610   %low = shufflevector <16 x i8> %lanes, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; keep only the low 8 bytes
 611   %wide = zext <8 x i8> %low to <8 x i32>
 612   %packed = bitcast <8 x i32> %wide to <4 x i64>
 613   ret <4 x i64> %packed
 614 }
 615
 616 define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
 617 ; CHECK-LABEL: test_mm256_cvtepu8_epi64:
 618 ; CHECK:       # %bb.0:
 619 ; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 620 ; CHECK-NEXT:    ret{{[l|q]}}
 621   %lanes = bitcast <2 x i64> %a0 to <16 x i8>
 622   %low = shufflevector <16 x i8> %lanes, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only the low 4 bytes
 623   %wide = zext <4 x i8> %low to <4 x i64>
 624   ret <4 x i64> %wide
 625 }
 626
 627 define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
 628 ; CHECK-LABEL: test_mm256_cvtepu16_epi32:
 629 ; CHECK:       # %bb.0:
 630 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 631 ; CHECK-NEXT:    ret{{[l|q]}}
 632   %lanes = bitcast <2 x i64> %a0 to <8 x i16>
 633   %wide = zext <8 x i16> %lanes to <8 x i32> ; zero-extend all 8 words
 634   %packed = bitcast <8 x i32> %wide to <4 x i64>
 635   ret <4 x i64> %packed
 636 }
 637
 638 define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
 639 ; CHECK-LABEL: test_mm256_cvtepu16_epi64:
 640 ; CHECK:       # %bb.0:
 641 ; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 642 ; CHECK-NEXT:    ret{{[l|q]}}
 643   %lanes = bitcast <2 x i64> %a0 to <8 x i16>
 644   %low = shufflevector <8 x i16> %lanes, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; keep only the low 4 words
 645   %wide = zext <4 x i16> %low to <4 x i64>
 646   ret <4 x i64> %wide
 647 }
 648
 649 define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
 650 ; CHECK-LABEL: test_mm256_cvtepu32_epi64:
 651 ; CHECK:       # %bb.0:
 652 ; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 653 ; CHECK-NEXT:    ret{{[l|q]}}
 654   %lanes = bitcast <2 x i64> %a0 to <4 x i32>
 655   %wide = zext <4 x i32> %lanes to <4 x i64> ; zero-extend all 4 dwords
 656   ret <4 x i64> %wide
 657 }
 658
 659 define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
 660 ; CHECK-LABEL: test_mm256_extracti128_si256:
 661 ; CHECK:       # %bb.0:
 662 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 663 ; CHECK-NEXT:    vzeroupper
 664 ; CHECK-NEXT:    ret{{[l|q]}}
 665   %hi = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3> ; elements 2 and 3 are the upper 128 bits of %a0
 666   ret <2 x i64> %hi
 667 }
 668
 669 define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 670 ; CHECK-LABEL: test_mm256_hadd_epi16:
 671 ; CHECK:       # %bb.0:
 672 ; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
 673 ; CHECK-NEXT:    ret{{[l|q]}}
 674   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 675   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 676   %hsum = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %lhs, <16 x i16> %rhs) ; x86 AVX2 intrinsic, expected above as vphaddw
 677   %packed = bitcast <16 x i16> %hsum to <4 x i64>
 678   ret <4 x i64> %packed
 679 }
 680 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
 681
 682 define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 683 ; CHECK-LABEL: test_mm256_hadd_epi32:
 684 ; CHECK:       # %bb.0:
 685 ; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
 686 ; CHECK-NEXT:    ret{{[l|q]}}
 687   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 688   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 689   %hsum = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %lhs, <8 x i32> %rhs) ; x86 AVX2 intrinsic, expected above as vphaddd
 690   %packed = bitcast <8 x i32> %hsum to <4 x i64>
 691   ret <4 x i64> %packed
 692 }
 693 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
 694
 695 define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 696 ; CHECK-LABEL: test_mm256_hadds_epi16:
 697 ; CHECK:       # %bb.0:
 698 ; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
 699 ; CHECK-NEXT:    ret{{[l|q]}}
 700   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 701   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 702   %hsum = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %lhs, <16 x i16> %rhs) ; x86 AVX2 intrinsic, expected above as vphaddsw
 703   %packed = bitcast <16 x i16> %hsum to <4 x i64>
 704   ret <4 x i64> %packed
 705 }
 706 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
 707
 708 define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 709 ; CHECK-LABEL: test_mm256_hsub_epi16:
 710 ; CHECK:       # %bb.0:
 711 ; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
 712 ; CHECK-NEXT:    ret{{[l|q]}}
       ; View both operands as <16 x i16>, apply llvm.x86.avx2.phsub.w, and view
       ; the result as <4 x i64> again.
 713   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 714   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 715   %hsub = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %lhs, <16 x i16> %rhs)
 716   %cast = bitcast <16 x i16> %hsub to <4 x i64>
 717   ret <4 x i64> %cast
 718 }
 719 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
 720
 721 define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 722 ; CHECK-LABEL: test_mm256_hsub_epi32:
 723 ; CHECK:       # %bb.0:
 724 ; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
 725 ; CHECK-NEXT:    ret{{[l|q]}}
       ; View both operands as <8 x i32>, apply llvm.x86.avx2.phsub.d, and view
       ; the result as <4 x i64> again.
 726   %lhs = bitcast <4 x i64> %a0 to <8 x i32>
 727   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
 728   %hsub = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %lhs, <8 x i32> %rhs)
 729   %cast = bitcast <8 x i32> %hsub to <4 x i64>
 730   ret <4 x i64> %cast
 731 }
 732 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
 733
 734 define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 735 ; CHECK-LABEL: test_mm256_hsubs_epi16:
 736 ; CHECK:       # %bb.0:
 737 ; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
 738 ; CHECK-NEXT:    ret{{[l|q]}}
       ; View both operands as <16 x i16>, apply llvm.x86.avx2.phsub.sw, and view
       ; the result as <4 x i64> again.
 739   %lhs = bitcast <4 x i64> %a0 to <16 x i16>
 740   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
 741   %hsubs = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %lhs, <16 x i16> %rhs)
 742   %cast = bitcast <16 x i16> %hsubs to <4 x i64>
 743   ret <4 x i64> %cast
 744 }
 745 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
 746
 747 define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
 748 ; X86-LABEL: test_mm_i32gather_epi32:
 749 ; X86:       # %bb.0:
 750 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 751 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 752 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 753 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 754 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 755 ; X86-NEXT:    retl
 756 ;
 757 ; X64-LABEL: test_mm_i32gather_epi32:
 758 ; X64:       # %bb.0:
 759 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 760 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 761 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 762 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 763 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.d with an undef first operand, <4 x i32>
       ; indices taken from %a1, a constant all-ones mask and an i8 2 scale.
       ; The checks expect vpcmpeqd/vpxor setup followed by vpgatherdd.
 764   %arg0 = bitcast i32 *%a0 to i8*
 765   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 766   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
 767   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
 768   %bc = bitcast <4 x i32> %call to <2 x i64>
 769   ret <2 x i64> %bc
 770 }
 771 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
 772
 773 define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
 774 ; X86-LABEL: test_mm_mask_i32gather_epi32:
 775 ; X86:       # %bb.0:
 776 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 777 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
 778 ; X86-NEXT:    retl
 779 ;
 780 ; X64-LABEL: test_mm_mask_i32gather_epi32:
 781 ; X64:       # %bb.0:
 782 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
 783 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.d with the first operand from %a0, indices
       ; from %a2 and mask from %a3 (all bitcast to <4 x i32>), scale i8 2.
       ; The checks expect a single vpgatherdd with no extra setup.
 784   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
 785   %arg1 = bitcast i32 *%a1 to i8*
 786   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 787   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
 788   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
 789   %bc = bitcast <4 x i32> %call to <2 x i64>
 790   ret <2 x i64> %bc
 791 }
 792
 793 define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
 794 ; X86-LABEL: test_mm256_i32gather_epi32:
 795 ; X86:       # %bb.0:
 796 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 797 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 798 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 799 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
 800 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 801 ; X86-NEXT:    retl
 802 ;
 803 ; X64-LABEL: test_mm256_i32gather_epi32:
 804 ; X64:       # %bb.0:
 805 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 806 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 807 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
 808 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 809 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.d.256 with an undef first operand,
       ; <8 x i32> indices taken from %a1, a constant all-ones mask and an
       ; i8 2 scale. The checks expect vpgatherdd on ymm registers.
 810   %arg0 = bitcast i32 *%a0 to i8*
 811   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
 812   %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
 813   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
 814   %bc = bitcast <8 x i32> %call to <4 x i64>
 815   ret <4 x i64> %bc
 816 }
 817 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
 818
 819 define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
 820 ; X86-LABEL: test_mm256_mask_i32gather_epi32:
 821 ; X86:       # %bb.0:
 822 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 823 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
 824 ; X86-NEXT:    retl
 825 ;
 826 ; X64-LABEL: test_mm256_mask_i32gather_epi32:
 827 ; X64:       # %bb.0:
 828 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
 829 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.d.256 with the first operand from %a0,
       ; indices from %a2 and mask from %a3 (all bitcast to <8 x i32>), scale
       ; i8 2. The checks expect a single vpgatherdd on ymm registers.
 830   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 831   %arg1 = bitcast i32 *%a1 to i8*
 832   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
 833   %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
 834   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
 835   %bc = bitcast <8 x i32> %call to <4 x i64>
 836   ret <4 x i64> %bc
 837 }
 838
 839 define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 840 ; X86-LABEL: test_mm_i32gather_epi64:
 841 ; X86:       # %bb.0:
 842 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 843 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 844 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 845 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
 846 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
 847 ; X86-NEXT:    retl
 848 ;
 849 ; X64-LABEL: test_mm_i32gather_epi64:
 850 ; X64:       # %bb.0:
 851 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 852 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 853 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
 854 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 855 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.q with an undef first operand, <4 x i32>
       ; indices taken from %a1, a constant all-ones <2 x i64> mask and an
       ; i8 2 scale. The checks expect vpgatherdq on xmm registers.
 856   %arg0 = bitcast i64 *%a0 to i8*
 857   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 858   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
 859   ret <2 x i64> %res
 860 }
 861 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
 862
 863 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
 864 ; X86-LABEL: test_mm_mask_i32gather_epi64:
 865 ; X86:       # %bb.0:
 866 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 867 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
 868 ; X86-NEXT:    retl
 869 ;
 870 ; X64-LABEL: test_mm_mask_i32gather_epi64:
 871 ; X64:       # %bb.0:
 872 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
 873 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.q with %a0 as the first operand, %a2
       ; bitcast to <4 x i32> as indices, %a3 as the mask and an i8 2 scale.
       ; The checks expect a single vpgatherdq on xmm registers.
 874   %arg1 = bitcast i64 *%a1 to i8*
 875   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 876   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
 877   ret <2 x i64> %res
 878 }
 879
 880 define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 881 ; X86-LABEL: test_mm256_i32gather_epi64:
 882 ; X86:       # %bb.0:
 883 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 884 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 885 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 886 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
 887 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
 888 ; X86-NEXT:    retl
 889 ;
 890 ; X64-LABEL: test_mm256_i32gather_epi64:
 891 ; X64:       # %bb.0:
 892 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
 893 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 894 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
 895 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 896 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.q.256 with an undef first operand,
       ; 128-bit <4 x i32> indices taken from %a1, a constant all-ones
       ; <4 x i64> mask and an i8 2 scale. The checks expect vpgatherdq with an
       ; xmm index register and ymm data/mask registers.
 897   %arg0 = bitcast i64 *%a0 to i8*
 898   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 899   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
 900   ret <4 x i64> %res
 901 }
 902 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
 903
 904 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
 905 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
 906 ; X86:       # %bb.0:
 907 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 908 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
 909 ; X86-NEXT:    retl
 910 ;
 911 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
 912 ; X64:       # %bb.0:
 913 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
 914 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.q.256 with %a0 as the first operand, %a2
       ; bitcast to <4 x i32> as indices, %a3 as the mask and an i8 2 scale.
       ; The checks expect a single vpgatherdq with an xmm index register.
 915   %arg1 = bitcast i64 *%a1 to i8*
 916   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 917   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
 918   ret <4 x i64> %res
 919 }
 920
 921 define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
 922 ; X86-LABEL: test_mm_i32gather_pd:
 923 ; X86:       # %bb.0:
 924 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 925 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 926 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 927 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 928 ; X86-NEXT:    vmovapd %xmm1, %xmm0
 929 ; X86-NEXT:    retl
 930 ;
 931 ; X64-LABEL: test_mm_i32gather_pd:
 932 ; X64:       # %bb.0:
 933 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
 934 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 935 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 936 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 937 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.pd with an undef first operand, <4 x i32>
       ; indices taken from %a1 and an i8 2 scale. The mask is built as
       ; sext(fcmp oeq zero, zero) bitcast to <2 x double>; the checks expect it
       ; to be materialized with vpcmpeqd ahead of the vgatherdpd.
 938   %arg0 = bitcast double *%a0 to i8*
 939   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 940   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
 941   %sext = sext <2 x i1> %cmp to <2 x i64>
 942   %mask = bitcast <2 x i64> %sext to <2 x double>
 943   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
 944   ret <2 x double> %res
 945 }
 946 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
 947
 948 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
 949 ; X86-LABEL: test_mm_mask_i32gather_pd:
 950 ; X86:       # %bb.0:
 951 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 952 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
 953 ; X86-NEXT:    retl
 954 ;
 955 ; X64-LABEL: test_mm_mask_i32gather_pd:
 956 ; X64:       # %bb.0:
 957 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
 958 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.pd with %a0 as the first operand, %a2
       ; bitcast to <4 x i32> as indices, %a3 as the mask and an i8 2 scale.
       ; The checks expect a single vgatherdpd on xmm registers.
 959   %arg1 = bitcast double *%a1 to i8*
 960   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
 961   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
 962   ret <2 x double> %res
 963 }
 964
 965 define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
 966 ; X86-LABEL: test_mm256_i32gather_pd:
 967 ; X86:       # %bb.0:
 968 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 969 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 970 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 971 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
 972 ; X86-NEXT:    vmovapd %ymm1, %ymm0
 973 ; X86-NEXT:    retl
 974 ;
 975 ; X64-LABEL: test_mm256_i32gather_pd:
 976 ; X64:       # %bb.0:
 977 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 978 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
 979 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
 980 ; X64-NEXT:    vmovapd %ymm1, %ymm0
 981 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.pd.256 with an undef first operand,
       ; <4 x i32> indices taken from %a1 and an i8 2 scale. The mask comes from
       ; llvm.x86.avx.cmp.pd.256 on two zero vectors with immediate 0; the
       ; checks expect that compare to be emitted as vxorpd + vcmpeqpd.
 982   %arg0 = bitcast double *%a0 to i8*
 983   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
 984   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
 985   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
 986   ret <4 x double> %res
 987 }
 988 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
 989
 990 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
 991 ; X86-LABEL: test_mm256_mask_i32gather_pd:
 992 ; X86:       # %bb.0:
 993 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 994 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
 995 ; X86-NEXT:    retl
 996 ;
 997 ; X64-LABEL: test_mm256_mask_i32gather_pd:
 998 ; X64:       # %bb.0:
 999 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
1000 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.pd.256 with %a0 as the first operand, %a2
       ; bitcast to <4 x i32> as indices, %a3 as the mask and an i8 2 scale.
       ; The checks expect a single vgatherdpd with an xmm index register.
1001   %arg1 = bitcast double *%a1 to i8*
1002   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1003   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
1004   ret <4 x double> %res
1005 }
1006
1007 define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
1008 ; X86-LABEL: test_mm_i32gather_ps:
1009 ; X86:       # %bb.0:
1010 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1011 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1012 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1013 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
1014 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1015 ; X86-NEXT:    retl
1016 ;
1017 ; X64-LABEL: test_mm_i32gather_ps:
1018 ; X64:       # %bb.0:
1019 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1020 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1021 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
1022 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1023 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.ps with an undef first operand, <4 x i32>
       ; indices taken from %a1 and an i8 2 scale. The mask is built as
       ; sext(fcmp oeq zero, zero) bitcast to <4 x float>; the checks expect it
       ; to be materialized with vpcmpeqd ahead of the vgatherdps.
1024   %arg0 = bitcast float *%a0 to i8*
1025   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1026   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1027   %sext = sext <4 x i1> %cmp to <4 x i32>
1028   %mask = bitcast <4 x i32> %sext to <4 x float>
1029   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
1030   ret <4 x float> %call
1031 }
1032 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
1033
1034 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1035 ; X86-LABEL: test_mm_mask_i32gather_ps:
1036 ; X86:       # %bb.0:
1037 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1038 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
1039 ; X86-NEXT:    retl
1040 ;
1041 ; X64-LABEL: test_mm_mask_i32gather_ps:
1042 ; X64:       # %bb.0:
1043 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
1044 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.ps with %a0 as the first operand, %a2
       ; bitcast to <4 x i32> as indices, %a3 as the mask and an i8 2 scale.
       ; The checks expect a single vgatherdps on xmm registers.
1045   %arg1 = bitcast float *%a1 to i8*
1046   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1047   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
1048   ret <4 x float> %call
1049 }
1050
1051 define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
1052 ; X86-LABEL: test_mm256_i32gather_ps:
1053 ; X86:       # %bb.0:
1054 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1055 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1056 ; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1057 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
1058 ; X86-NEXT:    vmovaps %ymm1, %ymm0
1059 ; X86-NEXT:    retl
1060 ;
1061 ; X64-LABEL: test_mm256_i32gather_ps:
1062 ; X64:       # %bb.0:
1063 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1064 ; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
1065 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
1066 ; X64-NEXT:    vmovaps %ymm1, %ymm0
1067 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.ps.256 with an undef first operand,
       ; <8 x i32> indices taken from %a1 and an i8 2 scale. The mask comes from
       ; llvm.x86.avx.cmp.ps.256 on two zero vectors with immediate 0; the
       ; checks expect that compare to be emitted as vxorps + vcmpeqps.
1068   %arg0 = bitcast float *%a0 to i8*
1069   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1070   %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
1071   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
1072   ret <8 x float> %call
1073 }
1074 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
1075
1076 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
1077 ; X86-LABEL: test_mm256_mask_i32gather_ps:
1078 ; X86:       # %bb.0:
1079 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1080 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
1081 ; X86-NEXT:    retl
1082 ;
1083 ; X64-LABEL: test_mm256_mask_i32gather_ps:
1084 ; X64:       # %bb.0:
1085 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
1086 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.d.ps.256 with %a0 as the first operand, %a2
       ; bitcast to <8 x i32> as indices, %a3 as the mask and an i8 2 scale.
       ; The checks expect a single vgatherdps on ymm registers.
1087   %arg1 = bitcast float *%a1 to i8*
1088   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1089   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
1090   ret <8 x float> %call
1091 }
1092
1093 define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
1094 ; X86-LABEL: test_mm_i64gather_epi32:
1095 ; X86:       # %bb.0:
1096 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1097 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1098 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1099 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
1100 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1101 ; X86-NEXT:    retl
1102 ;
1103 ; X64-LABEL: test_mm_i64gather_epi32:
1104 ; X64:       # %bb.0:
1105 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1106 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1107 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
1108 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1109 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.d with an undef first operand, %a1 used
       ; directly as <2 x i64> indices, a constant all-ones mask and an i8 2
       ; scale. The checks expect vpgatherqd on xmm registers.
1110   %arg0 = bitcast i32 *%a0 to i8*
1111   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1112   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
1113   %bc = bitcast <4 x i32> %call to <2 x i64>
1114   ret <2 x i64> %bc
1115 }
1116 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
1117
1118 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1119 ; X86-LABEL: test_mm_mask_i64gather_epi32:
1120 ; X86:       # %bb.0:
1121 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1122 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
1123 ; X86-NEXT:    retl
1124 ;
1125 ; X64-LABEL: test_mm_mask_i64gather_epi32:
1126 ; X64:       # %bb.0:
1127 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
1128 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.d with %a0 (as <4 x i32>) as the first
       ; operand, %a2 as <2 x i64> indices, %a3 (as <4 x i32>) as the mask and
       ; an i8 2 scale. The checks expect a single vpgatherqd on xmm registers.
1129   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1130   %arg1 = bitcast i32 *%a1 to i8*
1131   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1132   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
1133   %bc = bitcast <4 x i32> %call to <2 x i64>
1134   ret <2 x i64> %bc
1135 }
1136
1137 define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
1138 ; X86-LABEL: test_mm256_i64gather_epi32:
1139 ; X86:       # %bb.0:
1140 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1141 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1142 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1143 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
1144 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1145 ; X86-NEXT:    vzeroupper
1146 ; X86-NEXT:    retl
1147 ;
1148 ; X64-LABEL: test_mm256_i64gather_epi32:
1149 ; X64:       # %bb.0:
1150 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1151 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1152 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
1153 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1154 ; X64-NEXT:    vzeroupper
1155 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.d.256 with an undef first operand, %a1
       ; used directly as <4 x i64> indices, a constant all-ones 128-bit mask
       ; and an i8 2 scale. Data and mask are <4 x i32>; the checks expect
       ; vpgatherqd with a ymm index register and a vzeroupper before return.
1156   %arg0 = bitcast i32 *%a0 to i8*
1157   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
1158   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
1159   %bc = bitcast <4 x i32> %call to <2 x i64>
1160   ret <2 x i64> %bc
1161 }
1162 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly
1163
1164 define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
1165 ; X86-LABEL: test_mm256_mask_i64gather_epi32:
1166 ; X86:       # %bb.0:
1167 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1168 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
1169 ; X86-NEXT:    vzeroupper
1170 ; X86-NEXT:    retl
1171 ;
1172 ; X64-LABEL: test_mm256_mask_i64gather_epi32:
1173 ; X64:       # %bb.0:
1174 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
1175 ; X64-NEXT:    vzeroupper
1176 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.d.256 with %a0 (as <4 x i32>) as the first
       ; operand, %a2 as <4 x i64> indices, %a3 (as <4 x i32>) as the mask and
       ; an i8 2 scale. The checks expect vpgatherqd with a ymm index register
       ; followed by vzeroupper.
1177   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
1178   %arg1 = bitcast i32 *%a1 to i8*
1179   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
1180   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
1181   %bc = bitcast <4 x i32> %call to <2 x i64>
1182   ret <2 x i64> %bc
1183 }
1184
1185 define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
1186 ; X86-LABEL: test_mm_i64gather_epi64:
1187 ; X86:       # %bb.0:
1188 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1189 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1190 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1191 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
1192 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
1193 ; X86-NEXT:    retl
1194 ;
1195 ; X64-LABEL: test_mm_i64gather_epi64:
1196 ; X64:       # %bb.0:
1197 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1198 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1199 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
1200 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
1201 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.q with an undef first operand, %a1 used
       ; directly as <2 x i64> indices, a constant all-ones mask and an i8 2
       ; scale. The checks expect vpgatherqq on xmm registers.
1202   %arg0 = bitcast i64 *%a0 to i8*
1203   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
1204   ret <2 x i64> %call
1205 }
1206 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly
1207
1208 define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
1209 ; X86-LABEL: test_mm_mask_i64gather_epi64:
1210 ; X86:       # %bb.0:
1211 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1212 ; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
1213 ; X86-NEXT:    retl
1214 ;
1215 ; X64-LABEL: test_mm_mask_i64gather_epi64:
1216 ; X64:       # %bb.0:
1217 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
1218 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.q with %a0 as the first operand, %a2 as
       ; indices, %a3 as the mask and an i8 2 scale; only the pointer needs a
       ; bitcast. The checks expect a single vpgatherqq on xmm registers.
1219   %arg1 = bitcast i64 *%a1 to i8*
1220   %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
1221   ret <2 x i64> %call
1222 }
1223
1224 define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
1225 ; X86-LABEL: test_mm256_i64gather_epi64:
1226 ; X86:       # %bb.0:
1227 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1228 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1229 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1230 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
1231 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
1232 ; X86-NEXT:    retl
1233 ;
1234 ; X64-LABEL: test_mm256_i64gather_epi64:
1235 ; X64:       # %bb.0:
1236 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
1237 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1238 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
1239 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
1240 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.q.256 with an undef first operand, %a1
       ; used directly as <4 x i64> indices, a constant all-ones mask and an
       ; i8 2 scale. The checks expect vpgatherqq on ymm registers.
1241   %arg0 = bitcast i64 *%a0 to i8*
1242   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
1243   ret <4 x i64> %call
1244 }
1245 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly
1246
1247 define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
1248 ; X86-LABEL: test_mm256_mask_i64gather_epi64:
1249 ; X86:       # %bb.0:
1250 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1251 ; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
1252 ; X86-NEXT:    retl
1253 ;
1254 ; X64-LABEL: test_mm256_mask_i64gather_epi64:
1255 ; X64:       # %bb.0:
1256 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
1257 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.q.256 with %a0 as the first operand, %a2
       ; as indices, %a3 as the mask and an i8 2 scale; only the pointer needs
       ; a bitcast. The checks expect a single vpgatherqq on ymm registers.
1258   %arg1 = bitcast i64 *%a1 to i8*
1259   %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
1260   ret <4 x i64> %call
1261 }
1262
1263 define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
1264 ; X86-LABEL: test_mm_i64gather_pd:
1265 ; X86:       # %bb.0:
1266 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1267 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1268 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1269 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
1270 ; X86-NEXT:    vmovapd %xmm1, %xmm0
1271 ; X86-NEXT:    retl
1272 ;
1273 ; X64-LABEL: test_mm_i64gather_pd:
1274 ; X64:       # %bb.0:
1275 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1276 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1277 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
1278 ; X64-NEXT:    vmovapd %xmm1, %xmm0
1279 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.pd with an undef first operand, %a1 used
       ; directly as <2 x i64> indices and an i8 2 scale. The mask is built as
       ; sext(fcmp oeq zero, zero) bitcast to <2 x double>; the checks expect it
       ; to be materialized with vpcmpeqd ahead of the vgatherqpd.
1280   %arg0 = bitcast double *%a0 to i8*
1281   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
1282   %sext = sext <2 x i1> %cmp to <2 x i64>
1283   %mask = bitcast <2 x i64> %sext to <2 x double>
1284   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
1285   ret <2 x double> %call
1286 }
1287 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly
1288
1289 define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
1290 ; X86-LABEL: test_mm_mask_i64gather_pd:
1291 ; X86:       # %bb.0:
1292 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1293 ; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
1294 ; X86-NEXT:    retl
1295 ;
1296 ; X64-LABEL: test_mm_mask_i64gather_pd:
1297 ; X64:       # %bb.0:
1298 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
1299 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.pd with %a0 as the first operand, %a2 as
       ; indices, %a3 as the mask and an i8 2 scale; only the pointer needs a
       ; bitcast. The checks expect a single vgatherqpd on xmm registers.
1300   %arg1 = bitcast double *%a1 to i8*
1301   %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
1302   ret <2 x double> %call
1303 }
1304
1305 define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
1306 ; X86-LABEL: test_mm256_i64gather_pd:
1307 ; X86:       # %bb.0:
1308 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1309 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1310 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1311 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
1312 ; X86-NEXT:    vmovapd %ymm1, %ymm0
1313 ; X86-NEXT:    retl
1314 ;
1315 ; X64-LABEL: test_mm256_i64gather_pd:
1316 ; X64:       # %bb.0:
1317 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1318 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
1319 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
1320 ; X64-NEXT:    vmovapd %ymm1, %ymm0
1321 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.pd.256 with an undef first operand, %a1
       ; used directly as <4 x i64> indices and an i8 2 scale. The mask comes
       ; from llvm.x86.avx.cmp.pd.256 on two zero vectors with immediate 0; the
       ; checks expect that compare to be emitted as vxorpd + vcmpeqpd.
1322   %arg0 = bitcast double *%a0 to i8*
1323   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
1324   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
1325   ret <4 x double> %call
1326 }
1327 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly
1328
1329 define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, double *%a1, <4 x i64> %a2, <4 x double> %a3) {
1330 ; X86-LABEL: test_mm256_mask_i64gather_pd:
1331 ; X86:       # %bb.0:
1332 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1333 ; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
1334 ; X86-NEXT:    retl
1335 ;
1336 ; X64-LABEL: test_mm256_mask_i64gather_pd:
1337 ; X64:       # %bb.0:
1338 ; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
1339 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.pd.256 with %a0 as the first operand, %a2
       ; as indices, %a3 as the mask and an i8 2 scale. The base pointer is
       ; typed double* to match the <4 x double> elements being gathered; it is
       ; bitcast to i8* for the intrinsic, so the expected code is unaffected.
1340   %arg1 = bitcast double *%a1 to i8*
1341   %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
1342   ret <4 x double> %call
1343 }
1344
1345 define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
1346 ; X86-LABEL: test_mm_i64gather_ps:
1347 ; X86:       # %bb.0:
1348 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1349 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1350 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1351 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
1352 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1353 ; X86-NEXT:    retl
1354 ;
1355 ; X64-LABEL: test_mm_i64gather_ps:
1356 ; X64:       # %bb.0:
1357 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1358 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1359 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
1360 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1361 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.ps with an undef first operand, %a1 used
       ; directly as <2 x i64> indices and an i8 2 scale. The mask is built as
       ; sext(fcmp oeq zero, zero) bitcast to <4 x float>; the checks expect it
       ; to be materialized with vpcmpeqd ahead of the vgatherqps.
1362   %arg0 = bitcast float *%a0 to i8*
1363   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1364   %sext = sext <4 x i1> %cmp to <4 x i32>
1365   %mask = bitcast <4 x i32> %sext to <4 x float>
1366   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
1367   ret <4 x float> %call
1368 }
1369 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly
1370
1371 define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
1372 ; X86-LABEL: test_mm_mask_i64gather_ps:
1373 ; X86:       # %bb.0:
1374 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1375 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
1376 ; X86-NEXT:    retl
1377 ;
1378 ; X64-LABEL: test_mm_mask_i64gather_ps:
1379 ; X64:       # %bb.0:
1380 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
1381 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.ps with %a0 as the first operand, %a2 as
       ; <2 x i64> indices, %a3 as the mask and an i8 2 scale; only the pointer
       ; needs a bitcast. The checks expect a single vgatherqps on xmm registers.
1382   %arg1 = bitcast float *%a1 to i8*
1383   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
1384   ret <4 x float> %call
1385 }
1386
1387 define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
1388 ; X86-LABEL: test_mm256_i64gather_ps:
1389 ; X86:       # %bb.0:
1390 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1391 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1392 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1393 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
1394 ; X86-NEXT:    vmovaps %xmm1, %xmm0
1395 ; X86-NEXT:    vzeroupper
1396 ; X86-NEXT:    retl
1397 ;
1398 ; X64-LABEL: test_mm256_i64gather_ps:
1399 ; X64:       # %bb.0:
1400 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1401 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1402 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
1403 ; X64-NEXT:    vmovaps %xmm1, %xmm0
1404 ; X64-NEXT:    vzeroupper
1405 ; X64-NEXT:    retq
       ; Calls llvm.x86.avx2.gather.q.ps.256 with an undef first operand, %a1
       ; used directly as <4 x i64> indices and an i8 2 scale. Data and mask are
       ; <4 x float>; the mask is sext(fcmp oeq zero, zero) bitcast to float.
       ; The checks expect vgatherqps with a ymm index register and a
       ; vzeroupper before return.
1406   %arg0 = bitcast float *%a0 to i8*
1407   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
1408   %sext = sext <4 x i1> %cmp to <4 x i32>
1409   %mask = bitcast <4 x i32> %sext to <4 x float>
1410   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
1411   ret <4 x float> %call
1412 }
1413 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly
1414
; Masked float gather with <4 x i64> indices: source (%a0) and mask (%a3) come
; from the caller and are passed straight to @llvm.x86.avx2.gather.q.ps.256 with
; the immediate 2. The expected output is one vgatherqps with a ymm index
; register and scale 2, followed by vzeroupper.
1415 define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
1416 ; X86-LABEL: test_mm256_mask_i64gather_ps:
1417 ; X86:       # %bb.0:
1418 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1419 ; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
1420 ; X86-NEXT:    vzeroupper
1421 ; X86-NEXT:    retl
1422 ;
1423 ; X64-LABEL: test_mm256_mask_i64gather_ps:
1424 ; X64:       # %bb.0:
1425 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
1426 ; X64-NEXT:    vzeroupper
1427 ; X64-NEXT:    retq
1428   %arg1 = bitcast float *%a1 to i8*
1429   %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
1430   ret <4 x float> %call
1431 }
1432
; Inserts the 128-bit vector %a1 into the low half of %a0, written as two
; shufflevectors: %a1 is first widened to <4 x i64>, then elements 4,5 (from the
; widened value) are combined with elements 2,3 of %a0. The expected output is a
; vblendps taking the low four dwords from ymm1 and the high four from ymm0.
1433 define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1434 ; CHECK-LABEL: test0_mm256_inserti128_si256:
1435 ; CHECK:       # %bb.0:
1436 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1437 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1438 ; CHECK-NEXT:    ret{{[l|q]}}
1439   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1440   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
1441   ret <4 x i64> %res
1442 }
1443
; Inserts the 128-bit vector %a1 into the high half of %a0: after widening %a1,
; the result keeps elements 0,1 of %a0 and takes elements 4,5 from the widened
; value. The expected output is vinsertf128 with immediate 1.
1444 define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
1445 ; CHECK-LABEL: test1_mm256_inserti128_si256:
1446 ; CHECK:       # %bb.0:
1447 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1448 ; CHECK-NEXT:    ret{{[l|q]}}
1449   %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
1450   %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1451   ret <4 x i64> %res
1452 }
1453
; Both <4 x i64> arguments are reinterpreted as <16 x i16> and passed to
; @llvm.x86.avx2.pmadd.wd; the <8 x i32> result is bitcast back to <4 x i64>.
; The expected output is a single vpmaddwd.
1454 define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1455 ; CHECK-LABEL: test_mm256_madd_epi16:
1456 ; CHECK:       # %bb.0:
1457 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
1458 ; CHECK-NEXT:    ret{{[l|q]}}
1459   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1460   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1461   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
1462   %bc = bitcast <8 x i32> %res to <4 x i64>
1463   ret <4 x i64> %bc
1464 }
1465 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
1466
; Both arguments are reinterpreted as <32 x i8> and passed to
; @llvm.x86.avx2.pmadd.ub.sw, which returns <16 x i16>. The expected output is a
; single vpmaddubsw.
1467 define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1468 ; CHECK-LABEL: test_mm256_maddubs_epi16:
1469 ; CHECK:       # %bb.0:
1470 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
1471 ; CHECK-NEXT:    ret{{[l|q]}}
1472   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1473   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1474   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
1475   %bc = bitcast <16 x i16> %res to <4 x i64>
1476   ret <4 x i64> %bc
1477 }
1478 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
1479
; 128-bit masked load of i32 elements: the pointer is bitcast to i8*, the mask
; %a1 is reinterpreted as <4 x i32>, and @llvm.x86.avx2.maskload.d is called.
; The expected output is vpmaskmovd in its load form (memory source, xmm
; destination); on i386 the pointer is first loaded from the stack.
1480 define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
1481 ; X86-LABEL: test_mm_maskload_epi32:
1482 ; X86:       # %bb.0:
1483 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1484 ; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
1485 ; X86-NEXT:    retl
1486 ;
1487 ; X64-LABEL: test_mm_maskload_epi32:
1488 ; X64:       # %bb.0:
1489 ; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
1490 ; X64-NEXT:    retq
1491   %arg0 = bitcast i32* %a0 to i8*
1492   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1493   %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
1494   %bc = bitcast <4 x i32> %call to <2 x i64>
1495   ret <2 x i64> %bc
1496 }
1497 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
1498
; 256-bit masked load of i32 elements via @llvm.x86.avx2.maskload.d.256. The
; expected output is vpmaskmovd with a memory source and ymm destination; no
; vzeroupper is expected because the function returns a 256-bit value in ymm0.
1499 define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
1500 ; X86-LABEL: test_mm256_maskload_epi32:
1501 ; X86:       # %bb.0:
1502 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1503 ; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
1504 ; X86-NEXT:    retl
1505 ;
1506 ; X64-LABEL: test_mm256_maskload_epi32:
1507 ; X64:       # %bb.0:
1508 ; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
1509 ; X64-NEXT:    retq
1510   %arg0 = bitcast i32* %a0 to i8*
1511   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1512   %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
1513   %bc = bitcast <8 x i32> %call to <4 x i64>
1514   ret <4 x i64> %bc
1515 }
1516 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
1517
; 128-bit masked load of i64 elements: only the pointer needs a bitcast, the
; mask is already <2 x i64>. @llvm.x86.avx2.maskload.q is expected to select
; vpmaskmovq with a memory source.
1518 define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
1519 ; X86-LABEL: test_mm_maskload_epi64:
1520 ; X86:       # %bb.0:
1521 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1522 ; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
1523 ; X86-NEXT:    retl
1524 ;
1525 ; X64-LABEL: test_mm_maskload_epi64:
1526 ; X64:       # %bb.0:
1527 ; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
1528 ; X64-NEXT:    retq
1529   %arg0 = bitcast i64* %a0 to i8*
1530   %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
1531   ret <2 x i64> %res
1532 }
1533 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
1534
; 256-bit masked load of i64 elements via @llvm.x86.avx2.maskload.q.256; the
; expected output is vpmaskmovq with a memory source and ymm destination.
1535 define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
1536 ; X86-LABEL: test_mm256_maskload_epi64:
1537 ; X86:       # %bb.0:
1538 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1539 ; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
1540 ; X86-NEXT:    retl
1541 ;
1542 ; X64-LABEL: test_mm256_maskload_epi64:
1543 ; X64:       # %bb.0:
1544 ; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
1545 ; X64-NEXT:    retq
1546   %arg0 = bitcast i64* %a0 to i8*
1547   %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
1548   ret <4 x i64> %res
1549 }
1550 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
1551
; 128-bit masked store of i32 elements via @llvm.x86.avx2.maskstore.d. Both
; vector arguments are reinterpreted as <4 x i32>; the expected output is
; vpmaskmovd in its store form (memory destination).
; Note that %a0 is typed float* here even though the test name says epi32; it
; is bitcast to i8* before the call either way.
1552 define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1553 ; X86-LABEL: test_mm_maskstore_epi32:
1554 ; X86:       # %bb.0:
1555 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1556 ; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
1557 ; X86-NEXT:    retl
1558 ;
1559 ; X64-LABEL: test_mm_maskstore_epi32:
1560 ; X64:       # %bb.0:
1561 ; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
1562 ; X64-NEXT:    retq
1563   %arg0 = bitcast float* %a0 to i8*
1564   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1565   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
1566   call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
1567   ret void
1568 }
; NOTE(review): this declaration carries readnone although the expected output
; above writes through the pointer - confirm whether the attribute is
; intentional or simply ignored for intrinsics.
1569 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone
1570
; 256-bit masked store of i32 elements via @llvm.x86.avx2.maskstore.d.256. The
; expected output is vpmaskmovd with a memory destination, followed by
; vzeroupper since ymm registers were used and the function returns void.
; As in the 128-bit variant, %a0 is typed float* and bitcast to i8*.
1571 define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1572 ; X86-LABEL: test_mm256_maskstore_epi32:
1573 ; X86:       # %bb.0:
1574 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1575 ; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
1576 ; X86-NEXT:    vzeroupper
1577 ; X86-NEXT:    retl
1578 ;
1579 ; X64-LABEL: test_mm256_maskstore_epi32:
1580 ; X64:       # %bb.0:
1581 ; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
1582 ; X64-NEXT:    vzeroupper
1583 ; X64-NEXT:    retq
1584   %arg0 = bitcast float* %a0 to i8*
1585   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1586   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
1587   call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
1588   ret void
1589 }
; NOTE(review): this declaration carries readnone although the expected output
; above writes through the pointer - confirm whether the attribute is
; intentional or simply ignored for intrinsics.
1590 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone
1591
; 128-bit masked store of i64 elements: only the pointer is bitcast, the vector
; operands are passed to @llvm.x86.avx2.maskstore.q unchanged. The expected
; output is vpmaskmovq with a memory destination.
1592 define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
1593 ; X86-LABEL: test_mm_maskstore_epi64:
1594 ; X86:       # %bb.0:
1595 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1596 ; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
1597 ; X86-NEXT:    retl
1598 ;
1599 ; X64-LABEL: test_mm_maskstore_epi64:
1600 ; X64:       # %bb.0:
1601 ; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
1602 ; X64-NEXT:    retq
1603   %arg0 = bitcast i64* %a0 to i8*
1604   call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
1605   ret void
1606 }
; NOTE(review): this declaration carries readnone although the expected output
; above writes through the pointer - confirm whether the attribute is
; intentional or simply ignored for intrinsics.
1607 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone
1608
; 256-bit masked store of i64 elements via @llvm.x86.avx2.maskstore.q.256. The
; expected output is vpmaskmovq with a memory destination, then vzeroupper.
1609 define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
1610 ; X86-LABEL: test_mm256_maskstore_epi64:
1611 ; X86:       # %bb.0:
1612 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1613 ; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
1614 ; X86-NEXT:    vzeroupper
1615 ; X86-NEXT:    retl
1616 ;
1617 ; X64-LABEL: test_mm256_maskstore_epi64:
1618 ; X64:       # %bb.0:
1619 ; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
1620 ; X64-NEXT:    vzeroupper
1621 ; X64-NEXT:    retq
1622   %arg0 = bitcast i64* %a0 to i8*
1623   call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
1624   ret void
1625 }
; NOTE(review): this declaration carries readnone although the expected output
; above writes through the pointer - confirm whether the attribute is
; intentional or simply ignored for intrinsics.
1626 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone
1627
; Signed maximum on <32 x i8> lanes, expressed with the generic @llvm.smax
; intrinsic; the expected output is vpmaxsb.
1628 define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1629 ; CHECK-LABEL: test_mm256_max_epi8:
1630 ; CHECK:       # %bb.0:
1631 ; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
1632 ; CHECK-NEXT:    ret{{[l|q]}}
1633   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1634   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1635   %sel = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1636   %bc = bitcast <32 x i8> %sel to <4 x i64>
1637   ret <4 x i64> %bc
1638 }
1639 declare <32 x i8> @llvm.smax.v32i8(<32 x i8>, <32 x i8>)
1640
; Signed maximum on <16 x i16> lanes via @llvm.smax; the expected output is
; vpmaxsw.
1641 define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1642 ; CHECK-LABEL: test_mm256_max_epi16:
1643 ; CHECK:       # %bb.0:
1644 ; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
1645 ; CHECK-NEXT:    ret{{[l|q]}}
1646   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1647   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1648   %sel = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1649   %bc = bitcast <16 x i16> %sel to <4 x i64>
1650   ret <4 x i64> %bc
1651 }
1652 declare <16 x i16> @llvm.smax.v16i16(<16 x i16>, <16 x i16>)
1653
; Signed maximum on <8 x i32> lanes via @llvm.smax; the expected output is
; vpmaxsd.
1654 define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1655 ; CHECK-LABEL: test_mm256_max_epi32:
1656 ; CHECK:       # %bb.0:
1657 ; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
1658 ; CHECK-NEXT:    ret{{[l|q]}}
1659   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1660   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1661   %sel = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1662   %bc = bitcast <8 x i32> %sel to <4 x i64>
1663   ret <4 x i64> %bc
1664 }
1665 declare <8 x i32> @llvm.smax.v8i32(<8 x i32>, <8 x i32>)
1666
; Unsigned maximum on <32 x i8> lanes via @llvm.umax; the expected output is
; vpmaxub.
1667 define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1668 ; CHECK-LABEL: test_mm256_max_epu8:
1669 ; CHECK:       # %bb.0:
1670 ; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
1671 ; CHECK-NEXT:    ret{{[l|q]}}
1672   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1673   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1674   %sel = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1675   %bc = bitcast <32 x i8> %sel to <4 x i64>
1676   ret <4 x i64> %bc
1677 }
1678 declare <32 x i8> @llvm.umax.v32i8(<32 x i8>, <32 x i8>)
1679
; Unsigned maximum on <16 x i16> lanes via @llvm.umax; the expected output is
; vpmaxuw.
1680 define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1681 ; CHECK-LABEL: test_mm256_max_epu16:
1682 ; CHECK:       # %bb.0:
1683 ; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
1684 ; CHECK-NEXT:    ret{{[l|q]}}
1685   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1686   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1687   %sel = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1688   %bc = bitcast <16 x i16> %sel to <4 x i64>
1689   ret <4 x i64> %bc
1690 }
1691 declare <16 x i16> @llvm.umax.v16i16(<16 x i16>, <16 x i16>)
1692
; Unsigned maximum on <8 x i32> lanes via @llvm.umax; the expected output is
; vpmaxud.
1693 define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1694 ; CHECK-LABEL: test_mm256_max_epu32:
1695 ; CHECK:       # %bb.0:
1696 ; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
1697 ; CHECK-NEXT:    ret{{[l|q]}}
1698   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1699   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1700   %sel = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1701   %bc = bitcast <8 x i32> %sel to <4 x i64>
1702   ret <4 x i64> %bc
1703 }
1704 declare <8 x i32> @llvm.umax.v8i32(<8 x i32>, <8 x i32>)
1705
; Signed minimum on <32 x i8> lanes via @llvm.smin; the expected output is
; vpminsb.
1706 define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
1707 ; CHECK-LABEL: test_mm256_min_epi8:
1708 ; CHECK:       # %bb.0:
1709 ; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
1710 ; CHECK-NEXT:    ret{{[l|q]}}
1711   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1712   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1713   %sel = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1714   %bc = bitcast <32 x i8> %sel to <4 x i64>
1715   ret <4 x i64> %bc
1716 }
1717 declare <32 x i8> @llvm.smin.v32i8(<32 x i8>, <32 x i8>)
1718
; Signed minimum on <16 x i16> lanes via @llvm.smin; the expected output is
; vpminsw.
1719 define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1720 ; CHECK-LABEL: test_mm256_min_epi16:
1721 ; CHECK:       # %bb.0:
1722 ; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
1723 ; CHECK-NEXT:    ret{{[l|q]}}
1724   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1725   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1726   %sel = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1727   %bc = bitcast <16 x i16> %sel to <4 x i64>
1728   ret <4 x i64> %bc
1729 }
1730 declare <16 x i16> @llvm.smin.v16i16(<16 x i16>, <16 x i16>)
1731
; Signed minimum on <8 x i32> lanes via @llvm.smin; the expected output is
; vpminsd.
1732 define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1733 ; CHECK-LABEL: test_mm256_min_epi32:
1734 ; CHECK:       # %bb.0:
1735 ; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
1736 ; CHECK-NEXT:    ret{{[l|q]}}
1737   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1738   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1739   %sel = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1740   %bc = bitcast <8 x i32> %sel to <4 x i64>
1741   ret <4 x i64> %bc
1742 }
1743 declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
1744
; Unsigned minimum on <32 x i8> lanes via @llvm.umin; the expected output is
; vpminub.
1745 define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1746 ; CHECK-LABEL: test_mm256_min_epu8:
1747 ; CHECK:       # %bb.0:
1748 ; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
1749 ; CHECK-NEXT:    ret{{[l|q]}}
1750   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1751   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1752   %sel = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
1753   %bc = bitcast <32 x i8> %sel to <4 x i64>
1754   ret <4 x i64> %bc
1755 }
1756 declare <32 x i8> @llvm.umin.v32i8(<32 x i8>, <32 x i8>)
1757
; Unsigned minimum on <16 x i16> lanes via @llvm.umin; the expected output is
; vpminuw.
1758 define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1759 ; CHECK-LABEL: test_mm256_min_epu16:
1760 ; CHECK:       # %bb.0:
1761 ; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
1762 ; CHECK-NEXT:    ret{{[l|q]}}
1763   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1764   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1765   %sel = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
1766   %bc = bitcast <16 x i16> %sel to <4 x i64>
1767   ret <4 x i64> %bc
1768 }
1769 declare <16 x i16> @llvm.umin.v16i16(<16 x i16>, <16 x i16>)
1770
; Unsigned minimum on <8 x i32> lanes via @llvm.umin; the expected output is
; vpminud.
1771 define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1772 ; CHECK-LABEL: test_mm256_min_epu32:
1773 ; CHECK:       # %bb.0:
1774 ; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
1775 ; CHECK-NEXT:    ret{{[l|q]}}
1776   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1777   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1778   %sel = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %arg0, <8 x i32> %arg1)
1779   %bc = bitcast <8 x i32> %sel to <4 x i64>
1780   ret <4 x i64> %bc
1781 }
1782 declare <8 x i32> @llvm.umin.v8i32(<8 x i32>, <8 x i32>)
1783
; The argument is reinterpreted as <32 x i8> and passed to
; @llvm.x86.avx2.pmovmskb, which returns an i32. The expected output is
; vpmovmskb into eax, then vzeroupper before returning the scalar result.
1784 define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
1785 ; CHECK-LABEL: test_mm256_movemask_epi8:
1786 ; CHECK:       # %bb.0:
1787 ; CHECK-NEXT:    vpmovmskb %ymm0, %eax
1788 ; CHECK-NEXT:    vzeroupper
1789 ; CHECK-NEXT:    ret{{[l|q]}}
1790   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1791   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
1792   ret i32 %res
1793 }
1794 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
1795
; Calls @llvm.x86.avx2.mpsadbw on <32 x i8> operands with the immediate 3; the
; same immediate is expected to appear on the selected vmpsadbw.
1796 define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
1797 ; CHECK-LABEL: test_mm256_mpsadbw_epu8:
1798 ; CHECK:       # %bb.0:
1799 ; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
1800 ; CHECK-NEXT:    ret{{[l|q]}}
1801   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
1802   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
1803   %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
1804   %bc = bitcast <16 x i16>  %call to <4 x i64>
1805   ret <4 x i64> %bc
1806 }
1807 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone
1808
; Written in generic IR rather than with a target intrinsic: each i64 lane of
; both operands is sign-extended from its low 32 bits (shl by 32 followed by
; ashr exact by 32) and the results are multiplied with mul nsw. The expected
; output folds the whole sequence into a single vpmuldq.
1809 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1810 ; CHECK-LABEL: test_mm256_mul_epi32:
1811 ; CHECK:       # %bb.0:
1812 ; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
1813 ; CHECK-NEXT:    ret{{[l|q]}}
1814   %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
1815   %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
1816   %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
1817   %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
1818   %res = mul nsw <4 x i64> %A1, %B1
1819   ret <4 x i64> %res
1820 }
; NOTE(review): this intrinsic is declared but not called by the function
; above - TODO confirm whether anything else in the file still needs it.
1821 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
1822
; Written in generic IR rather than with a target intrinsic: each i64 lane of
; both operands is masked to its low 32 bits (and with 4294967295) and the
; results are multiplied with mul nuw. The expected output is a single
; vpmuludq.
1823 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
1824 ; CHECK-LABEL: test_mm256_mul_epu32:
1825 ; CHECK:       # %bb.0:
1826 ; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
1827 ; CHECK-NEXT:    ret{{[l|q]}}
1828   %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1829   %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
1830   %res = mul nuw <4 x i64> %A, %B
1831   ret <4 x i64> %res
1832 }
; NOTE(review): this intrinsic is declared but not called by the function
; above - TODO confirm whether anything else in the file still needs it.
1833 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
1834
; Calls @llvm.x86.avx2.pmulh.w on <16 x i16> operands; the expected output is
; vpmulhw.
1835 define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1836 ; CHECK-LABEL: test_mm256_mulhi_epi16:
1837 ; CHECK:       # %bb.0:
1838 ; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
1839 ; CHECK-NEXT:    ret{{[l|q]}}
1840   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1841   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1842   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
1843   %bc = bitcast <16 x i16> %res to <4 x i64>
1844   ret <4 x i64> %bc
1845 }
1846 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
1847
; Calls @llvm.x86.avx2.pmulhu.w on <16 x i16> operands; the expected output is
; vpmulhuw.
1848 define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
1849 ; CHECK-LABEL: test_mm256_mulhi_epu16:
1850 ; CHECK:       # %bb.0:
1851 ; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
1852 ; CHECK-NEXT:    ret{{[l|q]}}
1853   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1854   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1855   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
1856   %bc = bitcast <16 x i16> %res to <4 x i64>
1857   ret <4 x i64> %bc
1858 }
1859 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
1860
; Calls @llvm.x86.avx2.pmul.hr.sw on <16 x i16> operands; the expected output
; is vpmulhrsw.
1861 define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1862 ; CHECK-LABEL: test_mm256_mulhrs_epi16:
1863 ; CHECK:       # %bb.0:
1864 ; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
1865 ; CHECK-NEXT:    ret{{[l|q]}}
1866   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1867   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1868   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
1869   %bc = bitcast <16 x i16> %res to <4 x i64>
1870   ret <4 x i64> %bc
1871 }
1872 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
1873
; Plain IR mul on <16 x i16> lanes (no intrinsic); the expected output is
; vpmullw.
1874 define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1875 ; CHECK-LABEL: test_mm256_mullo_epi16:
1876 ; CHECK:       # %bb.0:
1877 ; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
1878 ; CHECK-NEXT:    ret{{[l|q]}}
1879   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1880   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1881   %res = mul <16 x i16> %arg0, %arg1
1882   %bc = bitcast <16 x i16> %res to <4 x i64>
1883   ret <4 x i64> %bc
1884 }
1885
; Plain IR mul on <8 x i32> lanes (no intrinsic); the expected output is
; vpmulld.
1886 define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1887 ; CHECK-LABEL: test_mm256_mullo_epi32:
1888 ; CHECK:       # %bb.0:
1889 ; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
1890 ; CHECK-NEXT:    ret{{[l|q]}}
1891   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1892   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1893   %res = mul <8 x i32> %arg0, %arg1
1894   %bc = bitcast <8 x i32> %res to <4 x i64>
1895   ret <4 x i64> %bc
1896 }
1897
; Plain IR or on <4 x i64>; note that the expected mnemonic is vorps even
; though the operands are integer vectors.
1898 define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
1899 ; CHECK-LABEL: test_mm256_or_si256:
1900 ; CHECK:       # %bb.0:
1901 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1902 ; CHECK-NEXT:    ret{{[l|q]}}
1903   %res = or <4 x i64> %a0, %a1
1904   ret <4 x i64> %res
1905 }
1906
; Calls @llvm.x86.avx2.packsswb, which takes two <16 x i16> operands and
; returns <32 x i8>; the expected output is vpacksswb.
1907 define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1908 ; CHECK-LABEL: test_mm256_packs_epi16:
1909 ; CHECK:       # %bb.0:
1910 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
1911 ; CHECK-NEXT:    ret{{[l|q]}}
1912   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1913   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1914   %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
1915   %res = bitcast <32 x i8> %call to <4 x i64>
1916   ret <4 x i64> %res
1917 }
1918 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
1919
; Calls @llvm.x86.avx2.packssdw, which takes two <8 x i32> operands and returns
; <16 x i16>; the expected output is vpackssdw.
1920 define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1921 ; CHECK-LABEL: test_mm256_packs_epi32:
1922 ; CHECK:       # %bb.0:
1923 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
1924 ; CHECK-NEXT:    ret{{[l|q]}}
1925   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1926   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1927   %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
1928   %res = bitcast <16 x i16> %call to <4 x i64>
1929   ret <4 x i64> %res
1930 }
1931 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
1932
; Calls @llvm.x86.avx2.packuswb, which takes two <16 x i16> operands and
; returns <32 x i8>; the expected output is vpackuswb.
1933 define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
1934 ; CHECK-LABEL: test_mm256_packus_epi16:
1935 ; CHECK:       # %bb.0:
1936 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
1937 ; CHECK-NEXT:    ret{{[l|q]}}
1938   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
1939   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
1940   %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
1941   %res = bitcast <32 x i8> %call to <4 x i64>
1942   ret <4 x i64> %res
1943 }
1944 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
1945
; Calls @llvm.x86.avx2.packusdw, which takes two <8 x i32> operands and returns
; <16 x i16>; the expected output is vpackusdw.
1946 define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1947 ; CHECK-LABEL: test_mm256_packus_epi32:
1948 ; CHECK:       # %bb.0:
1949 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
1950 ; CHECK-NEXT:    ret{{[l|q]}}
1951   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1952   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1953   %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
1954   %res = bitcast <16 x i16> %call to <4 x i64>
1955   ret <4 x i64> %res
1956 }
1957 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
1958
; Written as a generic shufflevector selecting elements 2,3 of %a0 and 2,3 of
; %a1 (indices 2,3,6,7), i.e. the upper 128-bit halves of both inputs. The
; expected output is vperm2f128 with ymm0[2,3],ymm1[2,3].
1959 define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
1960 ; CHECK-LABEL: test_mm256_permute2x128_si256:
1961 ; CHECK:       # %bb.0:
1962 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
1963 ; CHECK-NEXT:    ret{{[l|q]}}
1964   %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
1965   ret <4 x i64> %res
1966 }
; NOTE(review): this intrinsic is declared but not called by the function
; above - TODO confirm whether anything else in the file still needs it.
1967 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
1968
; Single-input shufflevector on <4 x i64> with indices 3,0,2,0; the expected
; output is vpermpd with the same element order.
1969 define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
1970 ; CHECK-LABEL: test_mm256_permute4x64_epi64:
1971 ; CHECK:       # %bb.0:
1972 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
1973 ; CHECK-NEXT:    ret{{[l|q]}}
1974   %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
1975   ret <4 x i64> %res
1976 }
1977
; Single-input shufflevector on <4 x double> with indices 1,2,1,0; the expected
; output is vpermpd with the same element order.
1978 define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
1979 ; CHECK-LABEL: test_mm256_permute4x64_pd:
1980 ; CHECK:       # %bb.0:
1981 ; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
1982 ; CHECK-NEXT:    ret{{[l|q]}}
1983   %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
1984   ret <4 x double> %res
1985 }
1986
; Calls @llvm.x86.avx2.permd with both operands reinterpreted as <8 x i32>. The
; expected mnemonic is vpermps (not an integer-named form), with ymm1 (from
; %a1) as the middle operand.
1987 define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
1988 ; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
1989 ; CHECK:       # %bb.0:
1990 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
1991 ; CHECK-NEXT:    ret{{[l|q]}}
1992   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
1993   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1994   %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
1995   %res = bitcast <8 x i32> %call to <4 x i64>
1996   ret <4 x i64> %res
1997 }
1998 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
1999
; Calls @llvm.x86.avx2.permps with the <8 x float> data operand unchanged and
; the second operand reinterpreted as <8 x i32>; the expected output is
; vpermps.
2000 define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
2001 ; CHECK-LABEL: test_mm256_permutevar8x32_ps:
2002 ; CHECK:       # %bb.0:
2003 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
2004 ; CHECK-NEXT:    ret{{[l|q]}}
2005   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2006   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
2007   ret <8 x float> %res
2008 }
2009 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly
2010
; Calls @llvm.x86.avx2.psad.bw on <32 x i8> operands. The intrinsic already
; returns <4 x i64>, so no result bitcast is needed; the expected output is
; vpsadbw.
2011 define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2012 ; CHECK-LABEL: test_mm256_sad_epu8:
2013 ; CHECK:       # %bb.0:
2014 ; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
2015 ; CHECK-NEXT:    ret{{[l|q]}}
2016   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2017   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2018   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
2019   ret <4 x i64> %res
2020 }
2021 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
2022
; Single-input shufflevector on <8 x i32> that applies the pattern 3,3,0,0
; within each group of four elements (indices 3,3,0,0,7,7,4,4). The expected
; output is vpermilps with that element order.
2023 define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
2024 ; CHECK-LABEL: test_mm256_shuffle_epi32:
2025 ; CHECK:       # %bb.0:
2026 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
2027 ; CHECK-NEXT:    ret{{[l|q]}}
2028   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2029   %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
2030   %res = bitcast <8 x i32> %shuf to <4 x i64>
2031   ret <4 x i64> %res
2032 }
2033
; Calls @llvm.x86.avx2.pshuf.b on <32 x i8> operands; the expected output is
; vpshufb.
2034 define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2035 ; CHECK-LABEL: test_mm256_shuffle_epi8:
2036 ; CHECK:       # %bb.0:
2037 ; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
2038 ; CHECK-NEXT:    ret{{[l|q]}}
2039   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2040   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2041   %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
2042   %res = bitcast <32 x i8> %shuf to <4 x i64>
2043   ret <4 x i64> %res
2044 }
2045 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
2046
; Single-input shufflevector on <16 x i16> that keeps elements 0-3 and 8-11 in
; place and reorders elements 4-7 and 12-15 (as 7,6,6,5 and 15,14,14,13). The
; expected output is vpshufhw with that element order.
2047 define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
2048 ; CHECK-LABEL: test_mm256_shufflehi_epi16:
2049 ; CHECK:       # %bb.0:
2050 ; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
2051 ; CHECK-NEXT:    ret{{[l|q]}}
2052   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2053   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
2054   %res = bitcast <16 x i16> %shuf to <4 x i64>
2055   ret <4 x i64> %res
2056 }
2057
; Single-input shufflevector on <16 x i16> that reorders elements 0-3 and 8-11
; (as 3,0,1,1 and 11,8,9,9) and keeps elements 4-7 and 12-15 in place. The
; expected output is vpshuflw with that element order.
2058 define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
2059 ; CHECK-LABEL: test_mm256_shufflelo_epi16:
2060 ; CHECK:       # %bb.0:
2061 ; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
2062 ; CHECK-NEXT:    ret{{[l|q]}}
2063   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2064   %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
2065   %res = bitcast <16 x i16> %shuf to <4 x i64>
2066   ret <4 x i64> %res
2067 }
2068
; Calls @llvm.x86.avx2.psign.b on <32 x i8> operands; the expected output is
; vpsignb.
2069 define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2070 ; CHECK-LABEL: test_mm256_sign_epi8:
2071 ; CHECK:       # %bb.0:
2072 ; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
2073 ; CHECK-NEXT:    ret{{[l|q]}}
2074   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
2075   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2076   %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
2077   %res = bitcast <32 x i8> %call to <4 x i64>
2078   ret <4 x i64> %res
2079 }
2080 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
2081
; Calls @llvm.x86.avx2.psign.w on <16 x i16> operands; the expected output is
; vpsignw.
2082 define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2083 ; CHECK-LABEL: test_mm256_sign_epi16:
2084 ; CHECK:       # %bb.0:
2085 ; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
2086 ; CHECK-NEXT:    ret{{[l|q]}}
2087   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2088   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2089   %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
2090   %res = bitcast <16 x i16> %call to <4 x i64>
2091   ret <4 x i64> %res
2092 }
2093 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
2094
; Calls @llvm.x86.avx2.psign.d on <8 x i32> operands; the expected output is
; vpsignd.
2095 define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2096 ; CHECK-LABEL: test_mm256_sign_epi32:
2097 ; CHECK:       # %bb.0:
2098 ; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
2099 ; CHECK-NEXT:    ret{{[l|q]}}
2100   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2101   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
2102   %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
2103   %res = bitcast <8 x i32> %call to <4 x i64>
2104   ret <4 x i64> %res
2105 }
2106 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
2107
; Left shift of <16 x i16> lanes by a count supplied in a 128-bit vector
; (%a1, reinterpreted as <8 x i16>) via @llvm.x86.avx2.psll.w. The expected
; output is vpsllw with an xmm count operand.
2108 define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2109 ; CHECK-LABEL: test_mm256_sll_epi16:
2110 ; CHECK:       # %bb.0:
2111 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
2112 ; CHECK-NEXT:    ret{{[l|q]}}
2113   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
2114   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
2115   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
2116   %bc = bitcast <16 x i16> %res to <4 x i64>
2117   ret <4 x i64> %bc
2118 }
2119 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
2120
; Left shift of <8 x i32> lanes by a count supplied in a 128-bit vector (%a1,
; reinterpreted as <4 x i32>) via @llvm.x86.avx2.psll.d. The expected output
; is vpslld with an xmm count operand.
2121 define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2122 ; CHECK-LABEL: test_mm256_sll_epi32:
2123 ; CHECK:       # %bb.0:
2124 ; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
2125 ; CHECK-NEXT:    ret{{[l|q]}}
2126   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
2127   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
2128   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
2129   %bc = bitcast <8 x i32> %res to <4 x i64>
2130   ret <4 x i64> %bc
2131 }
2132 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
2133
2134 define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2135 ; CHECK-LABEL: test_mm256_sll_epi64:
2136 ; CHECK:       # %bb.0:
2137 ; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
2138 ; CHECK-NEXT:    ret{{[l|q]}}
2139   %shifted = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)  ; operands already have the intrinsic's types, so no bitcasts
2140   ret <4 x i64> %shifted
2141 }
2142 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
2143
2144 define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
2145 ; CHECK-LABEL: test_mm256_slli_epi16:
2146 ; CHECK:       # %bb.0:
2147 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
2148 ; CHECK-NEXT:    ret{{[l|q]}}
2149   %vec = bitcast <4 x i64> %a0 to <16 x i16>  ; data viewed as 16 x i16
2150   %shifted = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %vec, i32 3)  ; constant count 3 shows up as the $3 immediate asserted above
2151   %out = bitcast <16 x i16> %shifted to <4 x i64>
2152   ret <4 x i64> %out
2153 }
2154 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
2155
2156 define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
2157 ; CHECK-LABEL: test_mm256_slli_epi32:
2158 ; CHECK:       # %bb.0:
2159 ; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
2160 ; CHECK-NEXT:    ret{{[l|q]}}
2161   %vec = bitcast <4 x i64> %a0 to <8 x i32>  ; data viewed as 8 x i32
2162   %shifted = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %vec, i32 3)  ; constant count 3 shows up as the $3 immediate asserted above
2163   %out = bitcast <8 x i32> %shifted to <4 x i64>
2164   ret <4 x i64> %out
2165 }
2166 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
2167
2168 define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
2169 ; CHECK-LABEL: test_mm256_slli_epi64:
2170 ; CHECK:       # %bb.0:
2171 ; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
2172 ; CHECK-NEXT:    ret{{[l|q]}}
2173   %shifted = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)  ; no bitcast needed; count 3 is the $3 immediate asserted above
2174   ret <4 x i64> %shifted
2175 }
2176 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
2177
2178 define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
2179 ; CHECK-LABEL: test_mm256_slli_si256:
2180 ; CHECK:       # %bb.0:
2181 ; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
2182 ; CHECK-NEXT:    ret{{[l|q]}}
2183   %bytes = bitcast <4 x i64> %a0 to <32 x i8>  ; the byte shift is written as a shuffle over 32 x i8
2184   %shl = shufflevector <32 x i8> zeroinitializer, <32 x i8> %bytes, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>  ; indices below 32 select the zero operand, 32 and up select %bytes: 3 zero bytes then source bytes 0-12 in each 16-byte half
2185   %out = bitcast <32 x i8> %shl to <4 x i64>
2186   ret <4 x i64> %out
2187 }
2188
2189 define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2190 ; CHECK-LABEL: test_mm_sllv_epi32:
2191 ; CHECK:       # %bb.0:
2192 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
2193 ; CHECK-NEXT:    ret{{[l|q]}}
2194   %vals = bitcast <2 x i64> %a0 to <4 x i32>  ; values and counts are both viewed as 4 x i32
2195   %cnts = bitcast <2 x i64> %a1 to <4 x i32>
2196   %shifted = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %vals, <4 x i32> %cnts)  ; asserted above to select a single xmm vpsllvd
2197   %out = bitcast <4 x i32> %shifted to <2 x i64>
2198   ret <2 x i64> %out
2199 }
2200 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
2201
2202 define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2203 ; CHECK-LABEL: test_mm256_sllv_epi32:
2204 ; CHECK:       # %bb.0:
2205 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
2206 ; CHECK-NEXT:    ret{{[l|q]}}
2207   %vals = bitcast <4 x i64> %a0 to <8 x i32>  ; values and counts are both viewed as 8 x i32
2208   %cnts = bitcast <4 x i64> %a1 to <8 x i32>
2209   %shifted = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %vals, <8 x i32> %cnts)  ; asserted above to select a single ymm vpsllvd
2210   %out = bitcast <8 x i32> %shifted to <4 x i64>
2211   ret <4 x i64> %out
2212 }
2213 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2214
2215 define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2216 ; CHECK-LABEL: test_mm_sllv_epi64:
2217 ; CHECK:       # %bb.0:
2218 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
2219 ; CHECK-NEXT:    ret{{[l|q]}}
2220   %shifted = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)  ; arguments feed the intrinsic directly, no bitcasts
2221   ret <2 x i64> %shifted
2222 }
2223 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
2224
2225 define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2226 ; CHECK-LABEL: test_mm256_sllv_epi64:
2227 ; CHECK:       # %bb.0:
2228 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
2229 ; CHECK-NEXT:    ret{{[l|q]}}
2230   %shifted = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)  ; arguments feed the intrinsic directly, no bitcasts
2231   ret <4 x i64> %shifted
2232 }
2233 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2234
2235 define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2236 ; CHECK-LABEL: test_mm256_sra_epi16:
2237 ; CHECK:       # %bb.0:
2238 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
2239 ; CHECK-NEXT:    ret{{[l|q]}}
2240   %vec = bitcast <4 x i64> %a0 to <16 x i16>  ; data viewed as 16 x i16
2241   %cnt = bitcast <2 x i64> %a1 to <8 x i16>  ; 128-bit count operand viewed as 8 x i16
2242   %shifted = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %vec, <8 x i16> %cnt)  ; asserted above to select a single vpsraw
2243   %out = bitcast <16 x i16> %shifted to <4 x i64>
2244   ret <4 x i64> %out
2245 }
2246 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
2247
2248 define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2249 ; CHECK-LABEL: test_mm256_sra_epi32:
2250 ; CHECK:       # %bb.0:
2251 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
2252 ; CHECK-NEXT:    ret{{[l|q]}}
2253   %vec = bitcast <4 x i64> %a0 to <8 x i32>  ; data viewed as 8 x i32
2254   %cnt = bitcast <2 x i64> %a1 to <4 x i32>  ; 128-bit count operand viewed as 4 x i32
2255   %shifted = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %vec, <4 x i32> %cnt)  ; asserted above to select a single vpsrad
2256   %out = bitcast <8 x i32> %shifted to <4 x i64>
2257   ret <4 x i64> %out
2258 }
2259 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
2260
2261 define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
2262 ; CHECK-LABEL: test_mm256_srai_epi16:
2263 ; CHECK:       # %bb.0:
2264 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
2265 ; CHECK-NEXT:    ret{{[l|q]}}
2266   %vec = bitcast <4 x i64> %a0 to <16 x i16>  ; data viewed as 16 x i16
2267   %shifted = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %vec, i32 3)  ; constant count 3 shows up as the $3 immediate asserted above
2268   %out = bitcast <16 x i16> %shifted to <4 x i64>
2269   ret <4 x i64> %out
2270 }
2271 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
2272
2273 define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
2274 ; CHECK-LABEL: test_mm256_srai_epi32:
2275 ; CHECK:       # %bb.0:
2276 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
2277 ; CHECK-NEXT:    ret{{[l|q]}}
2278   %vec = bitcast <4 x i64> %a0 to <8 x i32>  ; data viewed as 8 x i32
2279   %shifted = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %vec, i32 3)  ; constant count 3 shows up as the $3 immediate asserted above
2280   %out = bitcast <8 x i32> %shifted to <4 x i64>
2281   ret <4 x i64> %out
2282 }
2283 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
2284
2285 define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2286 ; CHECK-LABEL: test_mm_srav_epi32:
2287 ; CHECK:       # %bb.0:
2288 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
2289 ; CHECK-NEXT:    ret{{[l|q]}}
2290   %vals = bitcast <2 x i64> %a0 to <4 x i32>  ; values and counts are both viewed as 4 x i32
2291   %cnts = bitcast <2 x i64> %a1 to <4 x i32>
2292   %shifted = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %vals, <4 x i32> %cnts)  ; asserted above to select a single xmm vpsravd
2293   %out = bitcast <4 x i32> %shifted to <2 x i64>
2294   ret <2 x i64> %out
2295 }
2296 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
2297
2298 define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2299 ; CHECK-LABEL: test_mm256_srav_epi32:
2300 ; CHECK:       # %bb.0:
2301 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
2302 ; CHECK-NEXT:    ret{{[l|q]}}
2303   %vals = bitcast <4 x i64> %a0 to <8 x i32>  ; values and counts are both viewed as 8 x i32
2304   %cnts = bitcast <4 x i64> %a1 to <8 x i32>
2305   %shifted = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %vals, <8 x i32> %cnts)  ; asserted above to select a single ymm vpsravd
2306   %out = bitcast <8 x i32> %shifted to <4 x i64>
2307   ret <4 x i64> %out
2308 }
2309 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2310
2311 define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
2312 ; CHECK-LABEL: test_mm256_srl_epi16:
2313 ; CHECK:       # %bb.0:
2314 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
2315 ; CHECK-NEXT:    ret{{[l|q]}}
2316   %vec = bitcast <4 x i64> %a0 to <16 x i16>  ; data viewed as 16 x i16
2317   %cnt = bitcast <2 x i64> %a1 to <8 x i16>  ; 128-bit count operand viewed as 8 x i16
2318   %shifted = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %vec, <8 x i16> %cnt)  ; asserted above to select a single vpsrlw
2319   %out = bitcast <16 x i16> %shifted to <4 x i64>
2320   ret <4 x i64> %out
2321 }
2322 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
2323
2324 define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
2325 ; CHECK-LABEL: test_mm256_srl_epi32:
2326 ; CHECK:       # %bb.0:
2327 ; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
2328 ; CHECK-NEXT:    ret{{[l|q]}}
2329   %vec = bitcast <4 x i64> %a0 to <8 x i32>  ; data viewed as 8 x i32
2330   %cnt = bitcast <2 x i64> %a1 to <4 x i32>  ; 128-bit count operand viewed as 4 x i32
2331   %shifted = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %vec, <4 x i32> %cnt)  ; asserted above to select a single vpsrld
2332   %out = bitcast <8 x i32> %shifted to <4 x i64>
2333   ret <4 x i64> %out
2334 }
2335 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
2336
2337 define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
2338 ; CHECK-LABEL: test_mm256_srl_epi64:
2339 ; CHECK:       # %bb.0:
2340 ; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
2341 ; CHECK-NEXT:    ret{{[l|q]}}
2342   %shifted = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)  ; operands already have the intrinsic's types, so no bitcasts
2343   ret <4 x i64> %shifted
2344 }
2345 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
2346
2347 define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
2348 ; CHECK-LABEL: test_mm256_srli_epi16:
2349 ; CHECK:       # %bb.0:
2350 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
2351 ; CHECK-NEXT:    ret{{[l|q]}}
2352   %vec = bitcast <4 x i64> %a0 to <16 x i16>  ; data viewed as 16 x i16
2353   %shifted = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %vec, i32 3)  ; constant count 3 shows up as the $3 immediate asserted above
2354   %out = bitcast <16 x i16> %shifted to <4 x i64>
2355   ret <4 x i64> %out
2356 }
2357 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
2358
2359 define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
2360 ; CHECK-LABEL: test_mm256_srli_epi32:
2361 ; CHECK:       # %bb.0:
2362 ; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
2363 ; CHECK-NEXT:    ret{{[l|q]}}
2364   %vec = bitcast <4 x i64> %a0 to <8 x i32>  ; data viewed as 8 x i32
2365   %shifted = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %vec, i32 3)  ; constant count 3 shows up as the $3 immediate asserted above
2366   %out = bitcast <8 x i32> %shifted to <4 x i64>
2367   ret <4 x i64> %out
2368 }
2369 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
2370
2371 define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
2372 ; CHECK-LABEL: test_mm256_srli_epi64:
2373 ; CHECK:       # %bb.0:
2374 ; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
2375 ; CHECK-NEXT:    ret{{[l|q]}}
2376   %shifted = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)  ; no bitcast needed; count 3 is the $3 immediate asserted above
2377   ret <4 x i64> %shifted
2378 }
2379 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
2380
2381 define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
2382 ; CHECK-LABEL: test_mm256_srli_si256:
2383 ; CHECK:       # %bb.0:
2384 ; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
2385 ; CHECK-NEXT:    ret{{[l|q]}}
2386   %bytes = bitcast <4 x i64> %a0 to <32 x i8>  ; the byte shift is written as a shuffle over 32 x i8
2387   %shr = shufflevector <32 x i8> %bytes, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>  ; indices below 32 select %bytes, 32 and up select the zero operand: each 16-byte half drops its low 3 bytes and gets 3 zero bytes on top
2388   %out = bitcast <32 x i8> %shr to <4 x i64>
2389   ret <4 x i64> %out
2390 }
2391
2392 define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
2393 ; CHECK-LABEL: test_mm_srlv_epi32:
2394 ; CHECK:       # %bb.0:
2395 ; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
2396 ; CHECK-NEXT:    ret{{[l|q]}}
2397   %vals = bitcast <2 x i64> %a0 to <4 x i32>  ; values and counts are both viewed as 4 x i32
2398   %cnts = bitcast <2 x i64> %a1 to <4 x i32>
2399   %shifted = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %vals, <4 x i32> %cnts)  ; asserted above to select a single xmm vpsrlvd
2400   %out = bitcast <4 x i32> %shifted to <2 x i64>
2401   ret <2 x i64> %out
2402 }
2403 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
2404
2405 define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
2406 ; CHECK-LABEL: test_mm256_srlv_epi32:
2407 ; CHECK:       # %bb.0:
2408 ; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
2409 ; CHECK-NEXT:    ret{{[l|q]}}
2410   %vals = bitcast <4 x i64> %a0 to <8 x i32>  ; values and counts are both viewed as 8 x i32
2411   %cnts = bitcast <4 x i64> %a1 to <8 x i32>
2412   %shifted = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %vals, <8 x i32> %cnts)  ; asserted above to select a single ymm vpsrlvd
2413   %out = bitcast <8 x i32> %shifted to <4 x i64>
2414   ret <4 x i64> %out
2415 }
2416 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
2417
2418 define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
2419 ; CHECK-LABEL: test_mm_srlv_epi64:
2420 ; CHECK:       # %bb.0:
2421 ; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
2422 ; CHECK-NEXT:    ret{{[l|q]}}
2423   %shifted = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)  ; arguments feed the intrinsic directly, no bitcasts
2424   ret <2 x i64> %shifted
2425 }
2426 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
2427
2428 define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
2429 ; CHECK-LABEL: test_mm256_srlv_epi64:
2430 ; CHECK:       # %bb.0:
2431 ; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
2432 ; CHECK-NEXT:    ret{{[l|q]}}
2433   %shifted = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)  ; arguments feed the intrinsic directly, no bitcasts
2434   ret <4 x i64> %shifted
2435 }
2436 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
2437
2438 define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
2439 ; X86-LABEL: test_mm256_stream_load_si256:
2440 ; X86:       # %bb.0:
2441 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2442 ; X86-NEXT:    vmovntdqa (%eax), %ymm0
2443 ; X86-NEXT:    retl
2444 ;
2445 ; X64-LABEL: test_mm256_stream_load_si256:
2446 ; X64:       # %bb.0:
2447 ; X64-NEXT:    vmovntdqa (%rdi), %ymm0
2448 ; X64-NEXT:    retq
2449   %ptr = bitcast <4 x i64> *%a0 to i8*  ; the intrinsic takes an i8*, so the vector pointer is cast first
2450   %vec = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %ptr)  ; asserted above to become vmovntdqa; the 32-bit run first loads the pointer from the stack
2451   ret <4 x i64> %vec
2452 }
2453 declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
2454
2455 define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2456 ; CHECK-LABEL: test_mm256_sub_epi8:
2457 ; CHECK:       # %bb.0:
2458 ; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
2459 ; CHECK-NEXT:    ret{{[l|q]}}
2460   %lhs = bitcast <4 x i64> %a0 to <32 x i8>  ; both operands viewed as 32 x i8
2461   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
2462   %diff = sub <32 x i8> %lhs, %rhs  ; plain IR sub, asserted above to select vpsubb
2463   %out = bitcast <32 x i8> %diff to <4 x i64>
2464   ret <4 x i64> %out
2465 }
2466
2467 define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2468 ; CHECK-LABEL: test_mm256_sub_epi16:
2469 ; CHECK:       # %bb.0:
2470 ; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
2471 ; CHECK-NEXT:    ret{{[l|q]}}
2472   %lhs = bitcast <4 x i64> %a0 to <16 x i16>  ; both operands viewed as 16 x i16
2473   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
2474   %diff = sub <16 x i16> %lhs, %rhs  ; plain IR sub, asserted above to select vpsubw
2475   %out = bitcast <16 x i16> %diff to <4 x i64>
2476   ret <4 x i64> %out
2477 }
2478
2479 define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2480 ; CHECK-LABEL: test_mm256_sub_epi32:
2481 ; CHECK:       # %bb.0:
2482 ; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
2483 ; CHECK-NEXT:    ret{{[l|q]}}
2484   %lhs = bitcast <4 x i64> %a0 to <8 x i32>  ; both operands viewed as 8 x i32
2485   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
2486   %diff = sub <8 x i32> %lhs, %rhs  ; plain IR sub, asserted above to select vpsubd
2487   %out = bitcast <8 x i32> %diff to <4 x i64>
2488   ret <4 x i64> %out
2489 }
2490
2491 define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2492 ; CHECK-LABEL: test_mm256_sub_epi64:
2493 ; CHECK:       # %bb.0:
2494 ; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
2495 ; CHECK-NEXT:    ret{{[l|q]}}
2496   %diff = sub <4 x i64> %a0, %a1  ; element type already matches, so no bitcasts; asserted above to select vpsubq
2497   ret <4 x i64> %diff
2498 }
2499
2500 define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
2501 ; CHECK-LABEL: test_mm256_subs_epi8:
2502 ; CHECK:       # %bb.0:
2503 ; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
2504 ; CHECK-NEXT:    ret{{[l|q]}}
2505   %lhs = bitcast <4 x i64> %a0 to <32 x i8>  ; both operands viewed as 32 x i8
2506   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
2507   %sat = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %lhs, <32 x i8> %rhs)  ; generic saturating-sub intrinsic, asserted above to select vpsubsb
2508   %out = bitcast <32 x i8> %sat to <4 x i64>
2509   ret <4 x i64> %out
2510 }
2511 declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2512
2513 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
2514 ; CHECK-LABEL: test_mm256_subs_epi16:
2515 ; CHECK:       # %bb.0:
2516 ; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
2517 ; CHECK-NEXT:    ret{{[l|q]}}
2518   %lhs = bitcast <4 x i64> %a0 to <16 x i16>  ; both operands viewed as 16 x i16
2519   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
2520   %sat = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %lhs, <16 x i16> %rhs)  ; generic saturating-sub intrinsic, asserted above to select vpsubsw
2521   %out = bitcast <16 x i16> %sat to <4 x i64>
2522   ret <4 x i64> %out
2523 }
2524 declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2525
2526 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
2527 ; CHECK-LABEL: test_mm256_subs_epu8:
2528 ; CHECK:       # %bb.0:
2529 ; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
2530 ; CHECK-NEXT:    ret{{[l|q]}}
2531   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>  ; both operands viewed as 32 x i8
2532   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
2533   %res = call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)  ; generic unsigned saturating-sub intrinsic, asserted above to select vpsubusb
2534   %bc = bitcast <32 x i8> %res to <4 x i64>
2535   ret <4 x i64> %bc
2536 }
2537 declare <32 x i8> @llvm.usub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
2538
2539 define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
2540 ; CHECK-LABEL: test_mm256_subs_epu16:
2541 ; CHECK:       # %bb.0:
2542 ; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
2543 ; CHECK-NEXT:    ret{{[l|q]}}
2544   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>  ; both operands viewed as 16 x i16
2545   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
2546   %res = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)  ; generic unsigned saturating-sub intrinsic, asserted above to select vpsubusw
2547   %bc = bitcast <16 x i16> %res to <4 x i64>
2548   ret <4 x i64> %bc
2549 }
2550 declare <16 x i16> @llvm.usub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
2551
2552 define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2553 ; CHECK-LABEL: test_mm256_unpackhi_epi8:
2554 ; CHECK:       # %bb.0:
2555 ; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
2556 ; CHECK-NEXT:    ret{{[l|q]}}
2557   %lhs = bitcast <4 x i64> %a0 to <32 x i8>  ; both operands viewed as 32 x i8
2558   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
2559   %mix = shufflevector <32 x i8> %lhs, <32 x i8> %rhs, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>  ; interleaves bytes 8-15 and 24-31 of %lhs with the same positions of %rhs (indices 32 and up)
2560   %out = bitcast <32 x i8> %mix to <4 x i64>
2561   ret <4 x i64> %out
2562 }
2563
2564 define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2565 ; CHECK-LABEL: test_mm256_unpackhi_epi16:
2566 ; CHECK:       # %bb.0:
2567 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
2568 ; CHECK-NEXT:    ret{{[l|q]}}
2569   %lhs = bitcast <4 x i64> %a0 to <16 x i16>  ; both operands viewed as 16 x i16
2570   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
2571   %mix = shufflevector <16 x i16> %lhs, <16 x i16> %rhs, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>  ; interleaves words 4-7 and 12-15 of %lhs with the same positions of %rhs (indices 16 and up)
2572   %out = bitcast <16 x i16> %mix to <4 x i64>
2573   ret <4 x i64> %out
2574 }
2575
2576 define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2577 ; CHECK-LABEL: test_mm256_unpackhi_epi32:
2578 ; CHECK:       # %bb.0:
2579 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
2580 ; CHECK-NEXT:    ret{{[l|q]}}
2581   %lhs = bitcast <4 x i64> %a0 to <8 x i32>  ; both operands viewed as 8 x i32
2582   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
2583   %mix = shufflevector <8 x i32> %lhs, <8 x i32> %rhs, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>  ; integer interleave; the assertions above expect the vunpckhps mnemonic
2584   %out = bitcast <8 x i32> %mix to <4 x i64>
2585   ret <4 x i64> %out
2586 }
2587
2588 define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2589 ; CHECK-LABEL: test_mm256_unpackhi_epi64:
2590 ; CHECK:       # %bb.0:
2591 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
2592 ; CHECK-NEXT:    ret{{[l|q]}}
2593   %mix = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>  ; odd elements of %a0 paired with odd elements of %a1; the assertions above expect vunpckhpd
2594   ret <4 x i64> %mix
2595 }
2596
2597 define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2598 ; CHECK-LABEL: test_mm256_unpacklo_epi8:
2599 ; CHECK:       # %bb.0:
2600 ; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
2601 ; CHECK-NEXT:    ret{{[l|q]}}
2602   %lhs = bitcast <4 x i64> %a0 to <32 x i8>  ; both operands viewed as 32 x i8
2603   %rhs = bitcast <4 x i64> %a1 to <32 x i8>
2604   %mix = shufflevector <32 x i8> %lhs, <32 x i8> %rhs, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>  ; interleaves bytes 0-7 and 16-23 of %lhs with the same positions of %rhs (indices 32 and up)
2605   %out = bitcast <32 x i8> %mix to <4 x i64>
2606   ret <4 x i64> %out
2607 }
2608
2609 define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2610 ; CHECK-LABEL: test_mm256_unpacklo_epi16:
2611 ; CHECK:       # %bb.0:
2612 ; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
2613 ; CHECK-NEXT:    ret{{[l|q]}}
2614   %lhs = bitcast <4 x i64> %a0 to <16 x i16>  ; both operands viewed as 16 x i16
2615   %rhs = bitcast <4 x i64> %a1 to <16 x i16>
2616   %mix = shufflevector <16 x i16> %lhs, <16 x i16> %rhs, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>  ; interleaves words 0-3 and 8-11 of %lhs with the same positions of %rhs (indices 16 and up)
2617   %out = bitcast <16 x i16> %mix to <4 x i64>
2618   ret <4 x i64> %out
2619 }
2620
2621 define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2622 ; CHECK-LABEL: test_mm256_unpacklo_epi32:
2623 ; CHECK:       # %bb.0:
2624 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
2625 ; CHECK-NEXT:    ret{{[l|q]}}
2626   %lhs = bitcast <4 x i64> %a0 to <8 x i32>  ; both operands viewed as 8 x i32
2627   %rhs = bitcast <4 x i64> %a1 to <8 x i32>
2628   %mix = shufflevector <8 x i32> %lhs, <8 x i32> %rhs, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>  ; integer interleave; the assertions above expect the vunpcklps mnemonic
2629   %out = bitcast <8 x i32> %mix to <4 x i64>
2630   ret <4 x i64> %out
2631 }
2632
2633 define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2634 ; CHECK-LABEL: test_mm256_unpacklo_epi64:
2635 ; CHECK:       # %bb.0:
2636 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2637 ; CHECK-NEXT:    ret{{[l|q]}}
2638   %mix = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>  ; even elements of %a0 paired with even elements of %a1; the assertions above expect vunpcklpd
2639   ret <4 x i64> %mix
2640 }
2641
2642 define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2643 ; CHECK-LABEL: test_mm256_xor_si256:
2644 ; CHECK:       # %bb.0:
2645 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
2646 ; CHECK-NEXT:    ret{{[l|q]}}
2647   %x = xor <4 x i64> %a0, %a1  ; plain integer xor; the assertions above expect the vxorps mnemonic
2648   ret <4 x i64> %x
2649 }
2650
2651 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone  ; NOTE(review): no caller in the nearby functions; presumably used by tests earlier in the file - confirm before removing
2652
2653 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone  ; NOTE(review): no caller in the nearby functions; presumably used by tests earlier in the file - confirm before removing