test/CodeGen/X86/avx-intrinsics-fast-isel.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
   3 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64
   4
   5 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
   6
   7 define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; plain <4 x double> fadd; both targets expect a single vaddpd on ymm registers
   8 ; CHECK-LABEL: test_mm256_add_pd:
   9 ; CHECK:       # %bb.0:
  10 ; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
  11 ; CHECK-NEXT:    ret{{[l|q]}}
  12   %res = fadd <4 x double> %a0, %a1
  13   ret <4 x double> %res
  14 }
  15
  16 define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; plain <8 x float> fadd; both targets expect a single vaddps on ymm registers
  17 ; CHECK-LABEL: test_mm256_add_ps:
  18 ; CHECK:       # %bb.0:
  19 ; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
  20 ; CHECK-NEXT:    ret{{[l|q]}}
  21   %res = fadd <8 x float> %a0, %a1
  22   ret <8 x float> %res
  23 }
  24
  25 define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; direct call to the addsub.pd.256 intrinsic; expected to select vaddsubpd on ymm registers
  26 ; CHECK-LABEL: test_mm256_addsub_pd:
  27 ; CHECK:       # %bb.0:
  28 ; CHECK-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
  29 ; CHECK-NEXT:    ret{{[l|q]}}
  30   %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  31   ret <4 x double> %res
  32 }
  33 declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
  34
  35 define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; direct call to the addsub.ps.256 intrinsic; expected to select vaddsubps on ymm registers
  36 ; CHECK-LABEL: test_mm256_addsub_ps:
  37 ; CHECK:       # %bb.0:
  38 ; CHECK-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
  39 ; CHECK-NEXT:    ret{{[l|q]}}
  40   %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  41   ret <8 x float> %res
  42 }
  43 declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
  44
  45 define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; bitwise AND of two <4 x double> values done on their i64 bit patterns; expected output is a single vandps
  46 ; CHECK-LABEL: test_mm256_and_pd:
  47 ; CHECK:       # %bb.0:
  48 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
  49 ; CHECK-NEXT:    ret{{[l|q]}}
  50   %1 = bitcast <4 x double> %a0 to <4 x i64> ; view both operands as i64 lanes for the integer 'and'
  51   %2 = bitcast <4 x double> %a1 to <4 x i64>
  52   %res = and <4 x i64> %1, %2
  53   %bc = bitcast <4 x i64> %res to <4 x double> ; back to the floating-point return type
  54   ret <4 x double> %bc
  55 }
  56
  57 define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; bitwise AND of two <8 x float> values done on their i32 bit patterns; expected output is a single vandps
  58 ; CHECK-LABEL: test_mm256_and_ps:
  59 ; CHECK:       # %bb.0:
  60 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
  61 ; CHECK-NEXT:    ret{{[l|q]}}
  62   %1 = bitcast <8 x float> %a0 to <8 x i32> ; view both operands as i32 lanes for the integer 'and'
  63   %2 = bitcast <8 x float> %a1 to <8 x i32>
  64   %res = and <8 x i32> %1, %2
  65   %bc = bitcast <8 x i32> %res to <8 x float> ; back to the floating-point return type
  66   ret <8 x float> %bc
  67 }
  68
  69 define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; (~a0) & a1 on i64 bit patterns; expected output builds all-ones in ymm2 (vxorps + vcmptrueps), xors it into ymm0, then vandps with ymm1
  70 ; CHECK-LABEL: test_mm256_andnot_pd:
  71 ; CHECK:       # %bb.0:
  72 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
  73 ; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
  74 ; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
  75 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
  76 ; CHECK-NEXT:    ret{{[l|q]}}
  77   %1 = bitcast <4 x double> %a0 to <4 x i64>
  78   %2 = bitcast <4 x double> %a1 to <4 x i64>
  79   %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1> ; xor with all-ones is the bitwise NOT of %a0
  80   %res = and <4 x i64> %3, %2
  81   %bc = bitcast <4 x i64> %res to <4 x double>
  82   ret <4 x double> %bc
  83 }
  84
  85 define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; (~a0) & a1 on i32 bit patterns; expected output builds all-ones in ymm2 (vxorps + vcmptrueps), xors it into ymm0, then vandps with ymm1
  86 ; CHECK-LABEL: test_mm256_andnot_ps:
  87 ; CHECK:       # %bb.0:
  88 ; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
  89 ; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
  90 ; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
  91 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
  92 ; CHECK-NEXT:    ret{{[l|q]}}
  93   %1 = bitcast <8 x float> %a0 to <8 x i32>
  94   %2 = bitcast <8 x float> %a1 to <8 x i32>
  95   %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> ; xor with all-ones is the bitwise NOT of %a0
  96   %res = and <8 x i32> %3, %2
  97   %bc = bitcast <8 x i32> %res to <8 x float>
  98   ret <8 x float> %bc
  99 }
 100
 101 define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; lane-wise blend written as a shufflevector; expected to select vblendps (the expected lane list is in 32-bit lanes)
 102 ; CHECK-LABEL: test_mm256_blend_pd:
 103 ; CHECK:       # %bb.0:
 104 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
 105 ; CHECK-NEXT:    ret{{[l|q]}}
 106   %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3> ; mask indices 4-7 select from %a1: lanes 1 and 2 come from %a1, lanes 0 and 3 from %a0
 107   ret <4 x double> %res
 108 }
 109
 110 define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; lane-wise blend written as a shufflevector; expected to select vblendps
 111 ; CHECK-LABEL: test_mm256_blend_ps:
 112 ; CHECK:       # %bb.0:
 113 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
 114 ; CHECK-NEXT:    ret{{[l|q]}}
 115   %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15> ; mask indices 8-15 select from %a1: lanes 1,2,3,7 come from %a1, the rest from %a0
 116   ret <8 x float> %res
 117 }
 118
 119 define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind { ; direct call to the blendv.pd.256 intrinsic with %a2 as the third operand; expected to select vblendvpd
 120 ; CHECK-LABEL: test_mm256_blendv_pd:
 121 ; CHECK:       # %bb.0:
 122 ; CHECK-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
 123 ; CHECK-NEXT:    ret{{[l|q]}}
 124   %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
 125   ret <4 x double> %res
 126 }
 127 declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
 128
 129 define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind { ; direct call to the blendv.ps.256 intrinsic with %a2 as the third operand; expected to select vblendvps
 130 ; CHECK-LABEL: test_mm256_blendv_ps:
 131 ; CHECK:       # %bb.0:
 132 ; CHECK-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
 133 ; CHECK-NEXT:    ret{{[l|q]}}
 134   %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
 135   ret <8 x float> %res
 136 }
 137 declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
 138
 139 define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind { ; loads a <2 x double> and repeats it in both 128-bit halves; expected to select vbroadcastf128 (i386 first loads the pointer argument from the stack)
 140 ; X86-LABEL: test_mm256_broadcast_pd:
 141 ; X86:       # %bb.0:
 142 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 143 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 144 ; X86-NEXT:    retl
 145 ;
 146 ; X64-LABEL: test_mm256_broadcast_pd:
 147 ; X64:       # %bb.0:
 148 ; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 149 ; X64-NEXT:    retq
 150   %ld = load <2 x double>, <2 x double>* %a0
 151   %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1> ; mask 0,1,0,1 duplicates the loaded pair
 152   ret <4 x double> %res
 153 }
 154
 155 define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind { ; loads a <4 x float> and repeats it in both 128-bit halves; expected to select vbroadcastf128 (i386 first loads the pointer argument from the stack)
 156 ; X86-LABEL: test_mm256_broadcast_ps:
 157 ; X86:       # %bb.0:
 158 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 159 ; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 160 ; X86-NEXT:    retl
 161 ;
 162 ; X64-LABEL: test_mm256_broadcast_ps:
 163 ; X64:       # %bb.0:
 164 ; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 165 ; X64-NEXT:    retq
 166   %ld = load <4 x float>, <4 x float>* %a0
 167   %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> ; mask 0-3,0-3 duplicates the loaded quad
 168   ret <8 x float> %res
 169 }
 170
 171 define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind { ; one scalar double load splatted into all four lanes; expected to fold into a single vbroadcastsd from memory
 172 ; X86-LABEL: test_mm256_broadcast_sd:
 173 ; X86:       # %bb.0:
 174 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 175 ; X86-NEXT:    vbroadcastsd (%eax), %ymm0
 176 ; X86-NEXT:    retl
 177 ;
 178 ; X64-LABEL: test_mm256_broadcast_sd:
 179 ; X64:       # %bb.0:
 180 ; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
 181 ; X64-NEXT:    retq
 182   %ld = load double, double* %a0
 183   %ins0 = insertelement <4 x double> undef, double %ld, i32 0 ; insertelement chain writes the same scalar into lanes 0..3
 184   %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
 185   %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
 186   %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
 187   ret <4 x double> %ins3
 188 }
 189
 190 define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind { ; one scalar float load splatted into all four lanes of a 128-bit vector; expected to fold into vbroadcastss from memory into xmm0
 191 ; X86-LABEL: test_mm_broadcast_ss:
 192 ; X86:       # %bb.0:
 193 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 194 ; X86-NEXT:    vbroadcastss (%eax), %xmm0
 195 ; X86-NEXT:    retl
 196 ;
 197 ; X64-LABEL: test_mm_broadcast_ss:
 198 ; X64:       # %bb.0:
 199 ; X64-NEXT:    vbroadcastss (%rdi), %xmm0
 200 ; X64-NEXT:    retq
 201   %ld = load float, float* %a0
 202   %ins0 = insertelement <4 x float> undef, float %ld, i32 0 ; insertelement chain writes the same scalar into lanes 0..3
 203   %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
 204   %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
 205   %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
 206   ret <4 x float> %ins3
 207 }
 208
 209 define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind { ; one scalar float load splatted into all eight lanes; expected to fold into vbroadcastss from memory into ymm0
 210 ; X86-LABEL: test_mm256_broadcast_ss:
 211 ; X86:       # %bb.0:
 212 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 213 ; X86-NEXT:    vbroadcastss (%eax), %ymm0
 214 ; X86-NEXT:    retl
 215 ;
 216 ; X64-LABEL: test_mm256_broadcast_ss:
 217 ; X64:       # %bb.0:
 218 ; X64-NEXT:    vbroadcastss (%rdi), %ymm0
 219 ; X64-NEXT:    retq
 220   %ld = load float, float* %a0
 221   %ins0 = insertelement <8 x float> undef, float %ld, i32 0 ; insertelement chain writes the same scalar into lanes 0..7
 222   %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
 223   %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
 224   %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
 225   %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
 226   %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
 227   %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
 228   %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
 229   ret <8 x float> %ins7
 230 }
 231
 232 define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind { ; pure bitcast <4 x double> -> <8 x float>; expected output is only the return
 233 ; CHECK-LABEL: test_mm256_castpd_ps:
 234 ; CHECK:       # %bb.0:
 235 ; CHECK-NEXT:    ret{{[l|q]}}
 236   %res = bitcast <4 x double> %a0 to <8 x float>
 237   ret <8 x float> %res
 238 }
 239
 240 define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind { ; pure bitcast <4 x double> -> <4 x i64>; expected output is only the return
 241 ; CHECK-LABEL: test_mm256_castpd_si256:
 242 ; CHECK:       # %bb.0:
 243 ; CHECK-NEXT:    ret{{[l|q]}}
 244   %res = bitcast <4 x double> %a0 to <4 x i64>
 245   ret <4 x i64> %res
 246 }
 247
 248 define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind { ; widens <2 x double> to <4 x double> leaving the upper lanes undef; expected output has only a '# kill' annotation before the return
 249 ; CHECK-LABEL: test_mm256_castpd128_pd256:
 250 ; CHECK:       # %bb.0:
 251 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 252 ; CHECK-NEXT:    ret{{[l|q]}}
 253   %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; lanes 0,1 kept; lanes 2,3 are undef
 254   ret <4 x double> %res
 255 }
 256
 257 define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind { ; keeps only the low two lanes of a <4 x double>; expected output is a '# kill' annotation plus vzeroupper before the return
 258 ; CHECK-LABEL: test_mm256_castpd256_pd128:
 259 ; CHECK:       # %bb.0:
 260 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 261 ; CHECK-NEXT:    vzeroupper
 262 ; CHECK-NEXT:    ret{{[l|q]}}
 263   %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1> ; mask 0,1 = low 128-bit half
 264   ret <2 x double> %res
 265 }
 266
 267 define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind { ; pure bitcast <8 x float> -> <4 x double>; expected output is only the return
 268 ; CHECK-LABEL: test_mm256_castps_pd:
 269 ; CHECK:       # %bb.0:
 270 ; CHECK-NEXT:    ret{{[l|q]}}
 271   %res = bitcast <8 x float> %a0 to <4 x double>
 272   ret <4 x double> %res
 273 }
 274
 275 define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind { ; pure bitcast <8 x float> -> <4 x i64>; expected output is only the return
 276 ; CHECK-LABEL: test_mm256_castps_si256:
 277 ; CHECK:       # %bb.0:
 278 ; CHECK-NEXT:    ret{{[l|q]}}
 279   %res = bitcast <8 x float> %a0 to <4 x i64>
 280   ret <4 x i64> %res
 281 }
 282
 283 define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind { ; widens <4 x float> to <8 x float> leaving the upper lanes undef; expected output has only a '# kill' annotation before the return
 284 ; CHECK-LABEL: test_mm256_castps128_ps256:
 285 ; CHECK:       # %bb.0:
 286 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 287 ; CHECK-NEXT:    ret{{[l|q]}}
 288   %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; lanes 0-3 kept; lanes 4-7 are undef
 289   ret <8 x float> %res
 290 }
 291
 292 define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind { ; keeps only the low four lanes of an <8 x float>; expected output is a '# kill' annotation plus vzeroupper before the return
 293 ; CHECK-LABEL: test_mm256_castps256_ps128:
 294 ; CHECK:       # %bb.0:
 295 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 296 ; CHECK-NEXT:    vzeroupper
 297 ; CHECK-NEXT:    ret{{[l|q]}}
 298   %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; mask 0-3 = low 128-bit half
 299   ret <4 x float> %res
 300 }
 301
 302 define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind { ; widens <2 x i64> to <4 x i64> leaving the upper lanes undef; expected output has only a '# kill' annotation before the return
 303 ; CHECK-LABEL: test_mm256_castsi128_si256:
 304 ; CHECK:       # %bb.0:
 305 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
 306 ; CHECK-NEXT:    ret{{[l|q]}}
 307   %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; lanes 0,1 kept; lanes 2,3 are undef
 308   ret <4 x i64> %res
 309 }
 310
 311 define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind { ; pure bitcast <4 x i64> -> <4 x double>; expected output is only the return
 312 ; CHECK-LABEL: test_mm256_castsi256_pd:
 313 ; CHECK:       # %bb.0:
 314 ; CHECK-NEXT:    ret{{[l|q]}}
 315   %res = bitcast <4 x i64> %a0 to <4 x double>
 316   ret <4 x double> %res
 317 }
 318
 319 define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind { ; pure bitcast <4 x i64> -> <8 x float>; expected output is only the return
 320 ; CHECK-LABEL: test_mm256_castsi256_ps:
 321 ; CHECK:       # %bb.0:
 322 ; CHECK-NEXT:    ret{{[l|q]}}
 323   %res = bitcast <4 x i64> %a0 to <8 x float>
 324   ret <8 x float> %res
 325 }
 326
 327 define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind { ; keeps only the low two lanes of a <4 x i64>; expected output is a '# kill' annotation plus vzeroupper before the return
 328 ; CHECK-LABEL: test_mm256_castsi256_si128:
 329 ; CHECK:       # %bb.0:
 330 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
 331 ; CHECK-NEXT:    vzeroupper
 332 ; CHECK-NEXT:    ret{{[l|q]}}
 333   %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1> ; mask 0,1 = low 128-bit half
 334   ret <2 x i64> %res
 335 }
 336
 337 define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind { ; round.pd.256 intrinsic with immediate 2 (the "ceil" case of this test); expected vroundpd $2
 338 ; CHECK-LABEL: test_mm256_ceil_pd:
 339 ; CHECK:       # %bb.0:
 340 ; CHECK-NEXT:    vroundpd $2, %ymm0, %ymm0
 341 ; CHECK-NEXT:    ret{{[l|q]}}
 342   %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2) ; the i32 operand is expected to appear unchanged as the instruction immediate
 343   ret <4 x double> %res
 344 }
 345 declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone
 346
 347 define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind { ; round.ps.256 intrinsic with immediate 2 (the "ceil" case of this test); expected vroundps $2
 348 ; CHECK-LABEL: test_mm256_ceil_ps:
 349 ; CHECK:       # %bb.0:
 350 ; CHECK-NEXT:    vroundps $2, %ymm0, %ymm0
 351 ; CHECK-NEXT:    ret{{[l|q]}}
 352   %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2) ; the i32 operand is expected to appear unchanged as the instruction immediate
 353   ret <8 x float> %res
 354 }
 355 declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone
 356
 357 define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind { ; 128-bit packed-double compare intrinsic with predicate immediate 13; expected output uses the vcmpgepd mnemonic on xmm registers
 358 ; CHECK-LABEL: test_mm_cmp_pd:
 359 ; CHECK:       # %bb.0:
 360 ; CHECK-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
 361 ; CHECK-NEXT:    ret{{[l|q]}}
 362   %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13) ; predicate 13 is expected to print as the "ge" mnemonic form
 363   ret <2 x double> %res
 364 }
 365 declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
 366
 367 define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; 256-bit packed-double compare intrinsic with predicate immediate 13; expected output uses the vcmpgepd mnemonic on ymm registers
 368 ; CHECK-LABEL: test_mm256_cmp_pd:
 369 ; CHECK:       # %bb.0:
 370 ; CHECK-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
 371 ; CHECK-NEXT:    ret{{[l|q]}}
 372   %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13) ; predicate 13 is expected to print as the "ge" mnemonic form
 373   ret <4 x double> %res
 374 }
 375 declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
 376
 377 define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind { ; 128-bit packed-float compare intrinsic with predicate immediate 13; expected output uses the vcmpgeps mnemonic on xmm registers
 378 ; CHECK-LABEL: test_mm_cmp_ps:
 379 ; CHECK:       # %bb.0:
 380 ; CHECK-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
 381 ; CHECK-NEXT:    ret{{[l|q]}}
 382   %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13) ; predicate 13 is expected to print as the "ge" mnemonic form
 383   ret <4 x float> %res
 384 }
 385 declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
 386
 387 define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; 256-bit packed-float compare intrinsic with predicate immediate 13; expected output uses the vcmpgeps mnemonic on ymm registers
 388 ; CHECK-LABEL: test_mm256_cmp_ps:
 389 ; CHECK:       # %bb.0:
 390 ; CHECK-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
 391 ; CHECK-NEXT:    ret{{[l|q]}}
 392   %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13) ; predicate 13 is expected to print as the "ge" mnemonic form
 393   ret <8 x float> %res
 394 }
 395 declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 396
 397 define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind { ; scalar-double compare intrinsic with predicate immediate 13; expected output uses the vcmpgesd mnemonic
 398 ; CHECK-LABEL: test_mm_cmp_sd:
 399 ; CHECK:       # %bb.0:
 400 ; CHECK-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
 401 ; CHECK-NEXT:    ret{{[l|q]}}
 402   %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13) ; predicate 13 is expected to print as the "ge" mnemonic form
 403   ret <2 x double> %res
 404 }
 405 declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
 406
 407 define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind { ; scalar-float compare intrinsic with predicate immediate 13; expected output uses the vcmpgess mnemonic
 408 ; CHECK-LABEL: test_mm_cmp_ss:
 409 ; CHECK:       # %bb.0:
 410 ; CHECK-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
 411 ; CHECK-NEXT:    ret{{[l|q]}}
 412   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13) ; predicate 13 is expected to print as the "ge" mnemonic form
 413   ret <4 x float> %res
 414 }
 415 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
 416
 417 define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind { ; signed i32 -> double conversion of four lanes; expected vcvtdq2pd from xmm0 into ymm0
 418 ; CHECK-LABEL: test_mm256_cvtepi32_pd:
 419 ; CHECK:       # %bb.0:
 420 ; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
 421 ; CHECK-NEXT:    ret{{[l|q]}}
 422   %arg0 = bitcast <2 x i64> %a0 to <4 x i32> ; the argument arrives as <2 x i64>; reinterpret it as four i32 lanes
 423   %res = sitofp <4 x i32> %arg0 to <4 x double>
 424   ret <4 x double> %res
 425 }
 426
 427 define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind { ; signed i32 -> float conversion of eight lanes; expected vcvtdq2ps on ymm0
 428 ; CHECK-LABEL: test_mm256_cvtepi32_ps:
 429 ; CHECK:       # %bb.0:
 430 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 431 ; CHECK-NEXT:    ret{{[l|q]}}
 432   %arg0 = bitcast <4 x i64> %a0 to <8 x i32> ; the argument arrives as <4 x i64>; reinterpret it as eight i32 lanes
 433   %res = sitofp <8 x i32> %arg0 to <8 x float>
 434   ret <8 x float> %res
 435 }
 436
 437 define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind { ; cvt.pd2dq.256 intrinsic; expected vcvtpd2dq from ymm0 into xmm0 followed by vzeroupper
 438 ; CHECK-LABEL: test_mm256_cvtpd_epi32:
 439 ; CHECK:       # %bb.0:
 440 ; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
 441 ; CHECK-NEXT:    vzeroupper
 442 ; CHECK-NEXT:    ret{{[l|q]}}
 443   %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
 444   %res = bitcast <4 x i32> %cvt to <2 x i64> ; the function returns the four i32 results as <2 x i64>
 445   ret <2 x i64> %res
 446 }
 447 declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone
 448
 449 define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind { ; cvt.pd2.ps.256 intrinsic; expected vcvtpd2ps from ymm0 into xmm0 followed by vzeroupper
 450 ; CHECK-LABEL: test_mm256_cvtpd_ps:
 451 ; CHECK:       # %bb.0:
 452 ; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
 453 ; CHECK-NEXT:    vzeroupper
 454 ; CHECK-NEXT:    ret{{[l|q]}}
 455   %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
 456   ret <4 x float> %res
 457 }
 458 declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone
 459
 460 define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind { ; cvt.ps2dq.256 intrinsic; expected vcvtps2dq on ymm0
 461 ; CHECK-LABEL: test_mm256_cvtps_epi32:
 462 ; CHECK:       # %bb.0:
 463 ; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
 464 ; CHECK-NEXT:    ret{{[l|q]}}
 465   %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
 466   %res = bitcast <8 x i32> %cvt to <4 x i64> ; the function returns the eight i32 results as <4 x i64>
 467   ret <4 x i64> %res
 468 }
 469 declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone
 470
 471 define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind { ; fpext of four floats to doubles; expected vcvtps2pd from xmm0 into ymm0
 472 ; CHECK-LABEL: test_mm256_cvtps_pd:
 473 ; CHECK:       # %bb.0:
 474 ; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
 475 ; CHECK-NEXT:    ret{{[l|q]}}
 476   %res = fpext <4 x float> %a0 to <4 x double>
 477   ret <4 x double> %res
 478 }
 479
 480 define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind { ; cvtt.pd2dq.256 intrinsic; expected vcvttpd2dq from ymm0 into xmm0 followed by vzeroupper
 481 ; CHECK-LABEL: test_mm256_cvttpd_epi32:
 482 ; CHECK:       # %bb.0:
 483 ; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
 484 ; CHECK-NEXT:    vzeroupper
 485 ; CHECK-NEXT:    ret{{[l|q]}}
 486   %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
 487   %res = bitcast <4 x i32> %cvt to <2 x i64> ; the function returns the four i32 results as <2 x i64>
 488   ret <2 x i64> %res
 489 }
 490 declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone
 491
 492 define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind { ; cvtt.ps2dq.256 intrinsic; expected vcvttps2dq on ymm0
 493 ; CHECK-LABEL: test_mm256_cvttps_epi32:
 494 ; CHECK:       # %bb.0:
 495 ; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
 496 ; CHECK-NEXT:    ret{{[l|q]}}
 497   %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
 498   %res = bitcast <8 x i32> %cvt to <4 x i64> ; the function returns the eight i32 results as <4 x i64>
 499   ret <4 x i64> %res
 500 }
 501 declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone
 502
 503 define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; plain <4 x double> fdiv (%a0 / %a1); expected a single vdivpd on ymm registers
 504 ; CHECK-LABEL: test_mm256_div_pd:
 505 ; CHECK:       # %bb.0:
 506 ; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
 507 ; CHECK-NEXT:    ret{{[l|q]}}
 508   %res = fdiv <4 x double> %a0, %a1
 509   ret <4 x double> %res
 510 }
 511
 512 define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; plain <8 x float> fdiv (%a0 / %a1); expected a single vdivps on ymm registers
 513 ; CHECK-LABEL: test_mm256_div_ps:
 514 ; CHECK:       # %bb.0:
 515 ; CHECK-NEXT:    vdivps %ymm1, %ymm0, %ymm0
 516 ; CHECK-NEXT:    ret{{[l|q]}}
 517   %res = fdiv <8 x float> %a0, %a1
 518   ret <8 x float> %res
 519 }
 520
 521 define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; dp.ps.256 intrinsic with immediate 7; expected vdpps $7 on ymm registers
 522 ; CHECK-LABEL: test_mm256_dp_ps:
 523 ; CHECK:       # %bb.0:
 524 ; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
 525 ; CHECK-NEXT:    ret{{[l|q]}}
 526   %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7) ; the i8 operand is expected to appear unchanged as the instruction immediate
 527   ret <8 x float> %res
 528 }
 529 declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
 530
 531 define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind { ; extracts byte 31 and zero-extends it to i32; expected to pull the upper 128-bit half (vextractf128 $1), vpextrb $15, movzbl, then vzeroupper
 532 ; CHECK-LABEL: test_mm256_extract_epi8:
 533 ; CHECK:       # %bb.0:
 534 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 535 ; CHECK-NEXT:    vpextrb $15, %xmm0, %eax
 536 ; CHECK-NEXT:    movzbl %al, %eax
 537 ; CHECK-NEXT:    vzeroupper
 538 ; CHECK-NEXT:    ret{{[l|q]}}
 539   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 540   %ext = extractelement <32 x i8> %arg0, i32 31 ; element 31 is byte 15 of the upper 128-bit half
 541   %res = zext i8 %ext to i32
 542   ret i32 %res
 543 }
 544
 545 define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind { ; extracts 16-bit element 11 and zero-extends it to i32; expected to pull the upper 128-bit half (vextractf128 $1), vpextrw $3, movzwl, then vzeroupper
 546 ; CHECK-LABEL: test_mm256_extract_epi16:
 547 ; CHECK:       # %bb.0:
 548 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 549 ; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
 550 ; CHECK-NEXT:    movzwl %ax, %eax
 551 ; CHECK-NEXT:    vzeroupper
 552 ; CHECK-NEXT:    ret{{[l|q]}}
 553   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 554   %ext = extractelement <16 x i16> %arg0, i32 11 ; element 11 is word 3 of the upper 128-bit half
 555   %res = zext i16 %ext to i32
 556   ret i32 %res
 557 }
 558
 559 define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind { ; extracts 32-bit element 5; expected to pull the upper 128-bit half (vextractf128 $1), vextractps $1 into eax, then vzeroupper
 560 ; CHECK-LABEL: test_mm256_extract_epi32:
 561 ; CHECK:       # %bb.0:
 562 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 563 ; CHECK-NEXT:    vextractps $1, %xmm0, %eax
 564 ; CHECK-NEXT:    vzeroupper
 565 ; CHECK-NEXT:    ret{{[l|q]}}
 566   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 567   %res = extractelement <8 x i32> %arg0, i32 5 ; element 5 is dword 1 of the upper 128-bit half
 568   ret i32 %res
 569 }
 570
 571 define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind { ; extracts 64-bit element 3; i386 is expected to return it as two 32-bit pieces (vextractps $2 -> eax, $3 -> edx), x86_64 as one vpextrq $1 into rax
 572 ; X86-LABEL: test_mm256_extract_epi64:
 573 ; X86:       # %bb.0:
 574 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
 575 ; X86-NEXT:    vextractps $2, %xmm0, %eax
 576 ; X86-NEXT:    vextractps $3, %xmm0, %edx
 577 ; X86-NEXT:    vzeroupper
 578 ; X86-NEXT:    retl
 579 ;
 580 ; X64-LABEL: test_mm256_extract_epi64:
 581 ; X64:       # %bb.0:
 582 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
 583 ; X64-NEXT:    vpextrq $1, %xmm0, %rax
 584 ; X64-NEXT:    vzeroupper
 585 ; X64-NEXT:    retq
 586   %res = extractelement <4 x i64> %a0, i32 3 ; element 3 is qword 1 of the upper 128-bit half
 587   ret i64 %res
 588 }
 589
 590 define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind { ; takes the upper two lanes of a <4 x double>; expected vextractf128 $1 followed by vzeroupper
 591 ; CHECK-LABEL: test_mm256_extractf128_pd:
 592 ; CHECK:       # %bb.0:
 593 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 594 ; CHECK-NEXT:    vzeroupper
 595 ; CHECK-NEXT:    ret{{[l|q]}}
 596   %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3> ; mask 2,3 = upper 128-bit half
 597   ret <2 x double> %res
 598 }
 599
 600 define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind { ; takes the upper four lanes of an <8 x float>; expected vextractf128 $1 followed by vzeroupper
 601 ; CHECK-LABEL: test_mm256_extractf128_ps:
 602 ; CHECK:       # %bb.0:
 603 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 604 ; CHECK-NEXT:    vzeroupper
 605 ; CHECK-NEXT:    ret{{[l|q]}}
 606   %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; mask 4-7 = upper 128-bit half
 607   ret <4 x float> %res
 608 }
 609
 610 define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind { ; takes the upper two lanes of a <4 x i64>; expected vextractf128 $1 followed by vzeroupper
 611 ; CHECK-LABEL: test_mm256_extractf128_si256:
 612 ; CHECK:       # %bb.0:
 613 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
 614 ; CHECK-NEXT:    vzeroupper
 615 ; CHECK-NEXT:    ret{{[l|q]}}
 616   %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3> ; mask 2,3 = upper 128-bit half
 617   ret <2 x i64> %res
 618 }
 619
 620 define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind { ; round.pd.256 intrinsic with immediate 1 (the "floor" case of this test); expected vroundpd $1
 621 ; CHECK-LABEL: test_mm256_floor_pd:
 622 ; CHECK:       # %bb.0:
 623 ; CHECK-NEXT:    vroundpd $1, %ymm0, %ymm0
 624 ; CHECK-NEXT:    ret{{[l|q]}}
 625   %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1) ; the i32 operand is expected to appear unchanged as the instruction immediate
 626   ret <4 x double> %res
 627 }
 628
 629 define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind { ; round.ps.256 intrinsic with immediate 1 (the "floor" case of this test); expected vroundps $1
 630 ; CHECK-LABEL: test_mm256_floor_ps:
 631 ; CHECK:       # %bb.0:
 632 ; CHECK-NEXT:    vroundps $1, %ymm0, %ymm0
 633 ; CHECK-NEXT:    ret{{[l|q]}}
 634   %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1) ; the i32 operand is expected to appear unchanged as the instruction immediate
 635   ret <8 x float> %res
 636 }
 637
 638 define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; direct call to the hadd.pd.256 intrinsic; expected to select vhaddpd on ymm registers
 639 ; CHECK-LABEL: test_mm256_hadd_pd:
 640 ; CHECK:       # %bb.0:
 641 ; CHECK-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
 642 ; CHECK-NEXT:    ret{{[l|q]}}
 643   %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
 644   ret <4 x double> %res
 645 }
 646 declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone
 647
 648 define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; direct call to the hadd.ps.256 intrinsic; expected to select vhaddps on ymm registers
 649 ; CHECK-LABEL: test_mm256_hadd_ps:
 650 ; CHECK:       # %bb.0:
 651 ; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
 652 ; CHECK-NEXT:    ret{{[l|q]}}
 653   %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
 654   ret <8 x float> %res
 655 }
 656 declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone
 657
 658 define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; direct call to the hsub.pd.256 intrinsic; expected to select vhsubpd on ymm registers
 659 ; CHECK-LABEL: test_mm256_hsub_pd:
 660 ; CHECK:       # %bb.0:
 661 ; CHECK-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
 662 ; CHECK-NEXT:    ret{{[l|q]}}
 663   %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
 664   ret <4 x double> %res
 665 }
 666 declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone
 667
 668 define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind { ; direct call to the hsub.ps.256 intrinsic; expected to select vhsubps on ymm registers
 669 ; CHECK-LABEL: test_mm256_hsub_ps:
 670 ; CHECK:       # %bb.0:
 671 ; CHECK-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
 672 ; CHECK-NEXT:    ret{{[l|q]}}
 673   %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
 674   ret <8 x float> %res
 675 }
 676 declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone
 677
 678 define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind { ; inserts %a1 at byte 4; expected vpinsrb $4 on xmm0 into xmm1, then vblendps merging that low half with the original upper half (i386 reads the byte from the stack, x86_64 zero-extends %dil)
 679 ; X86-LABEL: test_mm256_insert_epi8:
 680 ; X86:       # %bb.0:
 681 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 682 ; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
 683 ; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 684 ; X86-NEXT:    retl
 685 ;
 686 ; X64-LABEL: test_mm256_insert_epi8:
 687 ; X64:       # %bb.0:
 688 ; X64-NEXT:    movzbl %dil, %eax
 689 ; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
 690 ; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 691 ; X64-NEXT:    retq
 692   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
 693   %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4 ; element 4 lies in the lower 128-bit half
 694   %bc = bitcast <32 x i8> %res to <4 x i64>
 695   ret <4 x i64> %bc
 696 }
 697
 698 define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind { ; inserts %a1 at 16-bit element 14; expected to extract the upper half (vextractf128 $1), vpinsrw $6, then put it back with vinsertf128 $1
 699 ; X86-LABEL: test_mm256_insert_epi16:
 700 ; X86:       # %bb.0:
 701 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
 702 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 703 ; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
 704 ; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 705 ; X86-NEXT:    retl
 706 ;
 707 ; X64-LABEL: test_mm256_insert_epi16:
 708 ; X64:       # %bb.0:
 709 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 710 ; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
 711 ; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 712 ; X64-NEXT:    retq
 713   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
 714   %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14 ; element 14 is word 6 of the upper 128-bit half
 715   %bc = bitcast <16 x i16> %res to <4 x i64>
 716   ret <4 x i64> %bc
 717 }
 718
 719 define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind { ; inserts %a1 at 32-bit element 3; expected vpinsrd $3 on xmm0 into xmm1 (i386 reads the value straight from the stack), then vblendps merging that low half with the original upper half
 720 ; X86-LABEL: test_mm256_insert_epi32:
 721 ; X86:       # %bb.0:
 722 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
 723 ; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 724 ; X86-NEXT:    retl
 725 ;
 726 ; X64-LABEL: test_mm256_insert_epi32:
 727 ; X64:       # %bb.0:
 728 ; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
 729 ; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 730 ; X64-NEXT:    retq
 731   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
 732   %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3 ; element 3 lies in the lower 128-bit half
 733   %bc = bitcast <8 x i32> %res to <4 x i64>
 734   ret <4 x i64> %bc
 735 }
 736
 737 define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind { ; inserts %a1 at 64-bit element 3; i386 is expected to write it as two 32-bit pieces (vpinsrd $2 and $3 from the stack), x86_64 as one vpinsrq $1, both on the extracted upper half
 738 ; X86-LABEL: test_mm256_insert_epi64:
 739 ; X86:       # %bb.0:
 740 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
 741 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
 742 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
 743 ; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 744 ; X86-NEXT:    retl
 745 ;
 746 ; X64-LABEL: test_mm256_insert_epi64:
 747 ; X64:       # %bb.0:
 748 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
 749 ; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
 750 ; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 751 ; X64-NEXT:    retq
 752   %res = insertelement <4 x i64> %a0, i64 %a1, i32 3 ; element 3 is qword 1 of the upper 128-bit half
 753   ret <4 x i64> %res
 754 }
 755
 756 define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind { ; replaces the low two lanes of %a0 with %a1; expected a '# kill' annotation widening xmm1 and a vblendps taking the low half from ymm1
 757 ; CHECK-LABEL: test_mm256_insertf128_pd:
 758 ; CHECK:       # %bb.0:
 759 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 760 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 761 ; CHECK-NEXT:    ret{{[l|q]}}
 762   %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> ; widen %a1 to four lanes; upper lanes undef
 763   %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3> ; indices 4,5 = lanes 0,1 of %ext; indices 2,3 = upper lanes of %a0
 764   ret <4 x double> %res
 765 }
 766
 767 define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
     ; Widens %a1 to eight elements, then keeps elements 0-3 of %a0 and takes
     ; elements 4-7 from the widened value, i.e. %a1 replaces the high half.
     ; The expected code is vinsertf128 with immediate 1.
 768 ; CHECK-LABEL: test_mm256_insertf128_ps:
 769 ; CHECK:       # %bb.0:
 770 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 771 ; CHECK-NEXT:    ret{{[l|q]}}
 772   %widened = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 773   %merged = shufflevector <8 x float> %a0, <8 x float> %widened, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 774   ret <8 x float> %merged
 775 }
 776
 777 define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
     ; Integer counterpart of the insertf128_pd pattern: %a1 is widened and
     ; replaces elements 0-1 of %a0 while elements 2-3 are kept. The expected
     ; code is a single blend.
 778 ; CHECK-LABEL: test_mm256_insertf128_si256:
 779 ; CHECK:       # %bb.0:
 780 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
 781 ; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 782 ; CHECK-NEXT:    ret{{[l|q]}}
 783   %widened = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 784   %merged = shufflevector <4 x i64> %a0, <4 x i64> %widened, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
 785   ret <4 x i64> %merged
 786 }
 787
 788 define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
     ; Loads 32 bytes from %a0 through the llvm.x86.avx.ldu.dq.256 intrinsic
     ; and returns them as <4 x i64>. The expected instruction is vlddqu with
     ; a ymm destination.
 789 ; X86-LABEL: test_mm256_lddqu_si256:
 790 ; X86:       # %bb.0:
 791 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 792 ; X86-NEXT:    vlddqu (%eax), %ymm0
 793 ; X86-NEXT:    retl
 794 ;
 795 ; X64-LABEL: test_mm256_lddqu_si256:
 796 ; X64:       # %bb.0:
 797 ; X64-NEXT:    vlddqu (%rdi), %ymm0
 798 ; X64-NEXT:    retq
 799   %arg0 = bitcast <4 x i64>* %a0 to i8*
 800   %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
 801   %bc = bitcast <32 x i8> %res to <4 x i64>
 802   ret <4 x i64> %bc
 803 }
     ; The intrinsic reads memory through its pointer argument, so it is
     ; declared readonly, not readnone.
 804 declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly
 805
 806 define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
     ; 32-byte-aligned load of <4 x double> (align 32); the expected
     ; instruction is the aligned move vmovaps.
 807 ; X86-LABEL: test_mm256_load_pd:
 808 ; X86:       # %bb.0:
 809 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 810 ; X86-NEXT:    vmovaps (%eax), %ymm0
 811 ; X86-NEXT:    retl
 812 ;
 813 ; X64-LABEL: test_mm256_load_pd:
 814 ; X64:       # %bb.0:
 815 ; X64-NEXT:    vmovaps (%rdi), %ymm0
 816 ; X64-NEXT:    retq
 817   %vecptr = bitcast double* %a0 to <4 x double>*
 818   %loaded = load <4 x double>, <4 x double>* %vecptr, align 32
 819   ret <4 x double> %loaded
 820 }
 821
 822 define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
     ; 32-byte-aligned load of <8 x float> (align 32); the expected
     ; instruction is the aligned move vmovaps.
 823 ; X86-LABEL: test_mm256_load_ps:
 824 ; X86:       # %bb.0:
 825 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 826 ; X86-NEXT:    vmovaps (%eax), %ymm0
 827 ; X86-NEXT:    retl
 828 ;
 829 ; X64-LABEL: test_mm256_load_ps:
 830 ; X64:       # %bb.0:
 831 ; X64-NEXT:    vmovaps (%rdi), %ymm0
 832 ; X64-NEXT:    retq
 833   %vecptr = bitcast float* %a0 to <8 x float>*
 834   %loaded = load <8 x float>, <8 x float>* %vecptr, align 32
 835   ret <8 x float> %loaded
 836 }
 837
 838 define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
     ; 32-byte-aligned load of <4 x i64> (align 32); the expected instruction
     ; is the aligned move vmovaps.
 839 ; X86-LABEL: test_mm256_load_si256:
 840 ; X86:       # %bb.0:
 841 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 842 ; X86-NEXT:    vmovaps (%eax), %ymm0
 843 ; X86-NEXT:    retl
 844 ;
 845 ; X64-LABEL: test_mm256_load_si256:
 846 ; X64:       # %bb.0:
 847 ; X64-NEXT:    vmovaps (%rdi), %ymm0
 848 ; X64-NEXT:    retq
 849   %loaded = load <4 x i64>, <4 x i64>* %a0, align 32
 850   ret <4 x i64> %loaded
 851 }
 852
 853 define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
     ; Load of <4 x double> with align 1; the expected instruction is the
     ; unaligned move vmovups.
 854 ; X86-LABEL: test_mm256_loadu_pd:
 855 ; X86:       # %bb.0:
 856 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 857 ; X86-NEXT:    vmovups (%eax), %ymm0
 858 ; X86-NEXT:    retl
 859 ;
 860 ; X64-LABEL: test_mm256_loadu_pd:
 861 ; X64:       # %bb.0:
 862 ; X64-NEXT:    vmovups (%rdi), %ymm0
 863 ; X64-NEXT:    retq
 864   %vecptr = bitcast double* %a0 to <4 x double>*
 865   %loaded = load <4 x double>, <4 x double>* %vecptr, align 1
 866   ret <4 x double> %loaded
 867 }
 868
 869 define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
     ; Load of <8 x float> with align 1; the expected instruction is the
     ; unaligned move vmovups.
 870 ; X86-LABEL: test_mm256_loadu_ps:
 871 ; X86:       # %bb.0:
 872 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 873 ; X86-NEXT:    vmovups (%eax), %ymm0
 874 ; X86-NEXT:    retl
 875 ;
 876 ; X64-LABEL: test_mm256_loadu_ps:
 877 ; X64:       # %bb.0:
 878 ; X64-NEXT:    vmovups (%rdi), %ymm0
 879 ; X64-NEXT:    retq
 880   %vecptr = bitcast float* %a0 to <8 x float>*
 881   %loaded = load <8 x float>, <8 x float>* %vecptr, align 1
 882   ret <8 x float> %loaded
 883 }
 884
 885 define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
     ; Load of <4 x i64> with align 1; the expected instruction is the
     ; unaligned move vmovups.
 886 ; X86-LABEL: test_mm256_loadu_si256:
 887 ; X86:       # %bb.0:
 888 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 889 ; X86-NEXT:    vmovups (%eax), %ymm0
 890 ; X86-NEXT:    retl
 891 ;
 892 ; X64-LABEL: test_mm256_loadu_si256:
 893 ; X64:       # %bb.0:
 894 ; X64-NEXT:    vmovups (%rdi), %ymm0
 895 ; X64-NEXT:    retq
 896   %loaded = load <4 x i64>, <4 x i64>* %a0, align 1
 897   ret <4 x i64> %loaded
 898 }
 899
 900 define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
     ; Builds a 256-bit vector from two unaligned 128-bit loads: the data at
     ; %a1 becomes elements 0-3 and the data at %a0 becomes elements 4-7.
     ; The expected code loads the low half with vmovups and folds the second
     ; load into vinsertf128 as a memory operand.
 901 ; X86-LABEL: test_mm256_loadu2_m128:
 902 ; X86:       # %bb.0:
 903 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 904 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 905 ; X86-NEXT:    vmovups (%eax), %xmm0
 906 ; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
 907 ; X86-NEXT:    retl
 908 ;
 909 ; X64-LABEL: test_mm256_loadu2_m128:
 910 ; X64:       # %bb.0:
 911 ; X64-NEXT:    vmovups (%rsi), %xmm0
 912 ; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
 913 ; X64-NEXT:    retq
 914   %hi.ptr = bitcast float* %a0 to <4 x float>*
 915   %hi.half = load <4 x float>, <4 x float>* %hi.ptr, align 1
 916   %hi.wide = shufflevector <4 x float> %hi.half, <4 x float> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 917   %lo.ptr = bitcast float* %a1 to <4 x float>*
 918   %lo.half = load <4 x float>, <4 x float>* %lo.ptr, align 1
 919   %lo.wide = shufflevector <4 x float> %lo.half, <4 x float> %lo.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 920   %joined = shufflevector <8 x float> %lo.wide, <8 x float> %hi.wide, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 921   ret <8 x float> %joined
 922 }
 923
 924 define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
     ; Builds a 256-bit vector from two unaligned 128-bit loads: the data at
     ; %a1 becomes elements 0-1 and the data at %a0 becomes elements 2-3.
     ; The expected code loads the low half with vmovups and folds the second
     ; load into vinsertf128 as a memory operand.
 925 ; X86-LABEL: test_mm256_loadu2_m128d:
 926 ; X86:       # %bb.0:
 927 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 928 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 929 ; X86-NEXT:    vmovups (%eax), %xmm0
 930 ; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
 931 ; X86-NEXT:    retl
 932 ;
 933 ; X64-LABEL: test_mm256_loadu2_m128d:
 934 ; X64:       # %bb.0:
 935 ; X64-NEXT:    vmovups (%rsi), %xmm0
 936 ; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
 937 ; X64-NEXT:    retq
 938   %hi.ptr = bitcast double* %a0 to <2 x double>*
 939   %hi.half = load <2 x double>, <2 x double>* %hi.ptr, align 1
 940   %hi.wide = shufflevector <2 x double> %hi.half, <2 x double> %hi.half, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 941   %lo.ptr = bitcast double* %a1 to <2 x double>*
 942   %lo.half = load <2 x double>, <2 x double>* %lo.ptr, align 1
 943   %lo.wide = shufflevector <2 x double> %lo.half, <2 x double> %lo.half, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 944   %joined = shufflevector <4 x double> %lo.wide, <4 x double> %hi.wide, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 945   ret <4 x double> %joined
 946 }
 947
 948 define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
     ; Builds a 256-bit vector from two unaligned 128-bit loads: the data at
     ; %a1 becomes elements 0-1 and the data at %a0 becomes elements 2-3.
     ; The expected code loads the low half with vmovups and folds the second
     ; load into vinsertf128 as a memory operand.
 949 ; X86-LABEL: test_mm256_loadu2_m128i:
 950 ; X86:       # %bb.0:
 951 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 952 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 953 ; X86-NEXT:    vmovups (%eax), %xmm0
 954 ; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
 955 ; X86-NEXT:    retl
 956 ;
 957 ; X64-LABEL: test_mm256_loadu2_m128i:
 958 ; X64:       # %bb.0:
 959 ; X64-NEXT:    vmovups (%rsi), %xmm0
 960 ; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
 961 ; X64-NEXT:    retq
 962   %hi.ptr = bitcast i64* %a0 to <2 x i64>*
 963   %hi.half = load <2 x i64>, <2 x i64>* %hi.ptr, align 1
 964   %hi.wide = shufflevector <2 x i64> %hi.half, <2 x i64> %hi.half, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 965   %lo.ptr = bitcast i64* %a1 to <2 x i64>*
 966   %lo.half = load <2 x i64>, <2 x i64>* %lo.ptr, align 1
 967   %lo.wide = shufflevector <2 x i64> %lo.half, <2 x i64> %lo.half, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 968   %joined = shufflevector <4 x i64> %lo.wide, <4 x i64> %hi.wide, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 969   ret <4 x i64> %joined
 970 }
 971
 972 define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
     ; Masked 128-bit load of doubles from %a0 with mask vector %a1, via the
     ; llvm.x86.avx.maskload.pd intrinsic. The expected instruction is
     ; vmaskmovpd with a memory source.
 973 ; X86-LABEL: test_mm_maskload_pd:
 974 ; X86:       # %bb.0:
 975 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 976 ; X86-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
 977 ; X86-NEXT:    retl
 978 ;
 979 ; X64-LABEL: test_mm_maskload_pd:
 980 ; X64:       # %bb.0:
 981 ; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
 982 ; X64-NEXT:    retq
 983   %arg0 = bitcast double* %a0 to i8*
 984   %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
 985   ret <2 x double> %res
 986 }
     ; The intrinsic reads memory through its pointer argument, so it is
     ; declared readonly, not readnone.
 987 declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly
 988
 989 define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
     ; Masked 256-bit load of doubles from %a0 with mask vector %a1, via the
     ; llvm.x86.avx.maskload.pd.256 intrinsic. The expected instruction is
     ; vmaskmovpd with a memory source and ymm registers.
 990 ; X86-LABEL: test_mm256_maskload_pd:
 991 ; X86:       # %bb.0:
 992 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 993 ; X86-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
 994 ; X86-NEXT:    retl
 995 ;
 996 ; X64-LABEL: test_mm256_maskload_pd:
 997 ; X64:       # %bb.0:
 998 ; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
 999 ; X64-NEXT:    retq
1000   %arg0 = bitcast double* %a0 to i8*
1001   %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
1002   ret <4 x double> %res
1003 }
     ; The intrinsic reads memory through its pointer argument, so it is
     ; declared readonly, not readnone.
1004 declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readonly
1005
1006 define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
     ; Masked 128-bit load of floats from %a0, via the
     ; llvm.x86.avx.maskload.ps intrinsic. The mask %a1 is reinterpreted as
     ; <4 x i32> before the call. The expected instruction is vmaskmovps with
     ; a memory source.
1007 ; X86-LABEL: test_mm_maskload_ps:
1008 ; X86:       # %bb.0:
1009 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1010 ; X86-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
1011 ; X86-NEXT:    retl
1012 ;
1013 ; X64-LABEL: test_mm_maskload_ps:
1014 ; X64:       # %bb.0:
1015 ; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
1016 ; X64-NEXT:    retq
1017   %arg0 = bitcast float* %a0 to i8*
1018   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1019   %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
1020   ret <4 x float> %res
1021 }
     ; The intrinsic reads memory through its pointer argument, so it is
     ; declared readonly, not readnone.
1022 declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly
1023
1024 define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
     ; Masked 256-bit load of floats from %a0, via the
     ; llvm.x86.avx.maskload.ps.256 intrinsic. The mask %a1 is reinterpreted
     ; as <8 x i32> before the call. The expected instruction is vmaskmovps
     ; with a memory source and ymm registers.
1025 ; X86-LABEL: test_mm256_maskload_ps:
1026 ; X86:       # %bb.0:
1027 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1028 ; X86-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
1029 ; X86-NEXT:    retl
1030 ;
1031 ; X64-LABEL: test_mm256_maskload_ps:
1032 ; X64:       # %bb.0:
1033 ; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
1034 ; X64-NEXT:    retq
1035   %arg0 = bitcast float* %a0 to i8*
1036   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1037   %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
1038   ret <8 x float> %res
1039 }
     ; The intrinsic reads memory through its pointer argument, so it is
     ; declared readonly, not readnone.
1040 declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readonly
1041
1042 define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
     ; Masked 128-bit store of %a2 to %a0 with mask vector %a1, via the
     ; llvm.x86.avx.maskstore.pd intrinsic. The expected instruction is
     ; vmaskmovpd with a memory destination.
1043 ; X86-LABEL: test_mm_maskstore_pd:
1044 ; X86:       # %bb.0:
1045 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1046 ; X86-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
1047 ; X86-NEXT:    retl
1048 ;
1049 ; X64-LABEL: test_mm_maskstore_pd:
1050 ; X64:       # %bb.0:
1051 ; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
1052 ; X64-NEXT:    retq
1053   %arg0 = bitcast double* %a0 to i8*
1054   call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
1055   ret void
1056 }
     ; The intrinsic writes memory through its pointer argument, so the
     ; declaration carries no readnone/readonly attribute.
1057 declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
1058
1059 define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
     ; Masked 256-bit store of %a2 to %a0 with mask vector %a1, via the
     ; llvm.x86.avx.maskstore.pd.256 intrinsic. The expected code is
     ; vmaskmovpd with a memory destination, then vzeroupper before return.
1060 ; X86-LABEL: test_mm256_maskstore_pd:
1061 ; X86:       # %bb.0:
1062 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1063 ; X86-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
1064 ; X86-NEXT:    vzeroupper
1065 ; X86-NEXT:    retl
1066 ;
1067 ; X64-LABEL: test_mm256_maskstore_pd:
1068 ; X64:       # %bb.0:
1069 ; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
1070 ; X64-NEXT:    vzeroupper
1071 ; X64-NEXT:    retq
1072   %arg0 = bitcast double* %a0 to i8*
1073   call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
1074   ret void
1075 }
     ; The intrinsic writes memory through its pointer argument, so the
     ; declaration carries no readnone/readonly attribute.
1076 declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind
1077
1078 define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
     ; Masked 128-bit store of %a2 to %a0, via the llvm.x86.avx.maskstore.ps
     ; intrinsic. The mask %a1 is reinterpreted as <4 x i32> before the call.
     ; The expected instruction is vmaskmovps with a memory destination.
1079 ; X86-LABEL: test_mm_maskstore_ps:
1080 ; X86:       # %bb.0:
1081 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1082 ; X86-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
1083 ; X86-NEXT:    retl
1084 ;
1085 ; X64-LABEL: test_mm_maskstore_ps:
1086 ; X64:       # %bb.0:
1087 ; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
1088 ; X64-NEXT:    retq
1089   %arg0 = bitcast float* %a0 to i8*
1090   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
1091   call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
1092   ret void
1093 }
     ; The intrinsic writes memory through its pointer argument, so the
     ; declaration carries no readnone/readonly attribute.
1094 declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
1095
1096 define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
     ; Masked 256-bit store of %a2 to %a0, via the
     ; llvm.x86.avx.maskstore.ps.256 intrinsic. The mask %a1 is reinterpreted
     ; as <8 x i32> before the call. The expected code is vmaskmovps with a
     ; memory destination, then vzeroupper before return.
1097 ; X86-LABEL: test_mm256_maskstore_ps:
1098 ; X86:       # %bb.0:
1099 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1100 ; X86-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
1101 ; X86-NEXT:    vzeroupper
1102 ; X86-NEXT:    retl
1103 ;
1104 ; X64-LABEL: test_mm256_maskstore_ps:
1105 ; X64:       # %bb.0:
1106 ; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
1107 ; X64-NEXT:    vzeroupper
1108 ; X64-NEXT:    retq
1109   %arg0 = bitcast float* %a0 to i8*
1110   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
1111   call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
1112   ret void
1113 }
     ; The intrinsic writes memory through its pointer argument, so the
     ; declaration carries no readnone/readonly attribute.
1114 declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
1115
1116 define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
     ; Calls llvm.x86.avx.max.pd.256 on %a0 and %a1; the expected instruction
     ; is a single vmaxpd on ymm registers.
1117 ; CHECK-LABEL: test_mm256_max_pd:
1118 ; CHECK:       # %bb.0:
1119 ; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
1120 ; CHECK-NEXT:    ret{{[l|q]}}
1121   %maxval = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
1122   ret <4 x double> %maxval
1123 }
1124 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
1125
1126 define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
     ; Calls llvm.x86.avx.max.ps.256 on %a0 and %a1; the expected instruction
     ; is a single vmaxps on ymm registers.
1127 ; CHECK-LABEL: test_mm256_max_ps:
1128 ; CHECK:       # %bb.0:
1129 ; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
1130 ; CHECK-NEXT:    ret{{[l|q]}}
1131   %maxval = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
1132   ret <8 x float> %maxval
1133 }
1134 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
1135
1136 define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
     ; Calls llvm.x86.avx.min.pd.256 on %a0 and %a1; the expected instruction
     ; is a single vminpd on ymm registers.
1137 ; CHECK-LABEL: test_mm256_min_pd:
1138 ; CHECK:       # %bb.0:
1139 ; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
1140 ; CHECK-NEXT:    ret{{[l|q]}}
1141   %minval = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
1142   ret <4 x double> %minval
1143 }
1144 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
1145
1146 define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
     ; Calls llvm.x86.avx.min.ps.256 on %a0 and %a1; the expected instruction
     ; is a single vminps on ymm registers.
1147 ; CHECK-LABEL: test_mm256_min_ps:
1148 ; CHECK:       # %bb.0:
1149 ; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0
1150 ; CHECK-NEXT:    ret{{[l|q]}}
1151   %minval = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
1152   ret <8 x float> %minval
1153 }
1154 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
1155
1156 define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
     ; Duplicates the even-indexed doubles of %a0 (mask 0,0,2,2); the
     ; expected instruction is vmovddup.
1157 ; CHECK-LABEL: test_mm256_movedup_pd:
1158 ; CHECK:       # %bb.0:
1159 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
1160 ; CHECK-NEXT:    ret{{[l|q]}}
1161   %dup = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
1162   ret <4 x double> %dup
1163 }
1164
1165 define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
     ; Duplicates the odd-indexed floats of %a0 (mask 1,1,3,3,5,5,7,7); the
     ; expected instruction is vmovshdup.
1166 ; CHECK-LABEL: test_mm256_movehdup_ps:
1167 ; CHECK:       # %bb.0:
1168 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
1169 ; CHECK-NEXT:    ret{{[l|q]}}
1170   %dup = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
1171   ret <8 x float> %dup
1172 }
1173
1174 define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
     ; Duplicates the even-indexed floats of %a0 (mask 0,0,2,2,4,4,6,6); the
     ; expected instruction is vmovsldup.
1175 ; CHECK-LABEL: test_mm256_moveldup_ps:
1176 ; CHECK:       # %bb.0:
1177 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
1178 ; CHECK-NEXT:    ret{{[l|q]}}
1179   %dup = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
1180   ret <8 x float> %dup
1181 }
1182
1183 define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
     ; Calls llvm.x86.avx.movmsk.pd.256 and returns its i32 result. The
     ; expected code is vmovmskpd into %eax, then vzeroupper before return.
1184 ; CHECK-LABEL: test_mm256_movemask_pd:
1185 ; CHECK:       # %bb.0:
1186 ; CHECK-NEXT:    vmovmskpd %ymm0, %eax
1187 ; CHECK-NEXT:    vzeroupper
1188 ; CHECK-NEXT:    ret{{[l|q]}}
1189   %mask = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
1190   ret i32 %mask
1191 }
1192 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
1193
1194 define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
     ; Calls llvm.x86.avx.movmsk.ps.256 and returns its i32 result. The
     ; expected code is vmovmskps into %eax, then vzeroupper before return.
1195 ; CHECK-LABEL: test_mm256_movemask_ps:
1196 ; CHECK:       # %bb.0:
1197 ; CHECK-NEXT:    vmovmskps %ymm0, %eax
1198 ; CHECK-NEXT:    vzeroupper
1199 ; CHECK-NEXT:    ret{{[l|q]}}
1200   %mask = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
1201   ret i32 %mask
1202 }
1203 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
1204
1205 define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
     ; Element-wise fmul of two <4 x double> vectors; the expected
     ; instruction is a single vmulpd.
1206 ; CHECK-LABEL: test_mm256_mul_pd:
1207 ; CHECK:       # %bb.0:
1208 ; CHECK-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
1209 ; CHECK-NEXT:    ret{{[l|q]}}
1210   %product = fmul <4 x double> %a0, %a1
1211   ret <4 x double> %product
1212 }
1213
1214 define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
     ; Element-wise fmul of two <8 x float> vectors; the expected instruction
     ; is a single vmulps.
1215 ; CHECK-LABEL: test_mm256_mul_ps:
1216 ; CHECK:       # %bb.0:
1217 ; CHECK-NEXT:    vmulps %ymm1, %ymm0, %ymm0
1218 ; CHECK-NEXT:    ret{{[l|q]}}
1219   %product = fmul <8 x float> %a0, %a1
1220   ret <8 x float> %product
1221 }
1222
1223 define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
     ; Bitwise OR of two <4 x double> vectors, done on their <4 x i64> bit
     ; patterns and cast back. The expected instruction is vorps.
1224 ; CHECK-LABEL: test_mm256_or_pd:
1225 ; CHECK:       # %bb.0:
1226 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1227 ; CHECK-NEXT:    ret{{[l|q]}}
1228   %lhs.bits = bitcast <4 x double> %a0 to <4 x i64>
1229   %rhs.bits = bitcast <4 x double> %a1 to <4 x i64>
1230   %or.bits = or <4 x i64> %lhs.bits, %rhs.bits
1231   %or.fp = bitcast <4 x i64> %or.bits to <4 x double>
1232   ret <4 x double> %or.fp
1233 }
1234
1235 define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
     ; Bitwise OR of two <8 x float> vectors, done on their <8 x i32> bit
     ; patterns and cast back. The expected instruction is vorps.
1236 ; CHECK-LABEL: test_mm256_or_ps:
1237 ; CHECK:       # %bb.0:
1238 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
1239 ; CHECK-NEXT:    ret{{[l|q]}}
1240   %lhs.bits = bitcast <8 x float> %a0 to <8 x i32>
1241   %rhs.bits = bitcast <8 x float> %a1 to <8 x i32>
1242   %or.bits = or <8 x i32> %lhs.bits, %rhs.bits
1243   %or.fp = bitcast <8 x i32> %or.bits to <8 x float>
1244   ret <8 x float> %or.fp
1245 }
1246
1247 define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
     ; Swaps the two doubles of %a0 (mask 1,0); the expected instruction is
     ; an immediate-form vpermilpd.
1248 ; CHECK-LABEL: test_mm_permute_pd:
1249 ; CHECK:       # %bb.0:
1250 ; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
1251 ; CHECK-NEXT:    ret{{[l|q]}}
1252   %swapped = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
1253   ret <2 x double> %swapped
1254 }
1255
1256 define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
     ; Swaps the doubles within each 128-bit half of %a0 (mask 1,0,3,2); the
     ; expected instruction is an immediate-form vpermilpd.
1257 ; CHECK-LABEL: test_mm256_permute_pd:
1258 ; CHECK:       # %bb.0:
1259 ; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
1260 ; CHECK-NEXT:    ret{{[l|q]}}
1261   %swapped = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
1262   ret <4 x double> %swapped
1263 }
1264
1265 define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
     ; Reverses the four floats of %a0 (mask 3,2,1,0); the expected
     ; instruction is an immediate-form vpermilps.
1266 ; CHECK-LABEL: test_mm_permute_ps:
1267 ; CHECK:       # %bb.0:
1268 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
1269 ; CHECK-NEXT:    ret{{[l|q]}}
1270   %reversed = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1271   ret <4 x float> %reversed
1272 }
1273
1274 define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
     ; Second permute_ps case with mask 2,1,2,3: only element 0 changes (it
     ; receives element 2). The expected instruction is vpermilps.
1275 ; CHECK-LABEL: test2_mm_permute_ps:
1276 ; CHECK:       # %bb.0:
1277 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
1278 ; CHECK-NEXT:    ret{{[l|q]}}
1279   %permuted = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
1280   ret <4 x float> %permuted
1281 }
1282
1283 define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
     ; Reverses the floats within each 128-bit half of %a0
     ; (mask 3,2,1,0,7,6,5,4); the expected instruction is vpermilps.
1284 ; CHECK-LABEL: test_mm256_permute_ps:
1285 ; CHECK:       # %bb.0:
1286 ; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
1287 ; CHECK-NEXT:    ret{{[l|q]}}
1288   %reversed = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
1289   ret <8 x float> %reversed
1290 }
1291
1292 define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
     ; Shuffles a zero vector with %a1 (mask 0,1,4,5): the low half of the
     ; result is zero and the high half is the low half of %a1. %a0 is not
     ; used. The expected instruction is vperm2f128 with a zeroed low half.
1293 ; CHECK-LABEL: test_mm256_permute2f128_pd:
1294 ; CHECK:       # %bb.0:
1295 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
1296 ; CHECK-NEXT:    ret{{[l|q]}}
1297   %perm = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1298   ret <4 x double> %perm
1299 }
     ; Not called by the function above, which uses a plain shufflevector.
1300 declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
1301
1302 ; PR26667
     ; Both shuffle operands are %a1 and the mask (0-3, 12-15) selects the low
     ; half of the first operand and the high half of the second, so the
     ; result is %a1 unchanged. The expected code is a plain register move.
1303 define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
1304 ; CHECK-LABEL: test_mm256_permute2f128_ps:
1305 ; CHECK:       # %bb.0:
1306 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
1307 ; CHECK-NEXT:    ret{{[l|q]}}
1308   %perm = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
1309   ret <8 x float> %perm
1310 }
     ; Not called by the function above, which uses a plain shufflevector.
1311 declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
1312
1313 define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
     ; Shuffles %a1 (as <8 x i32>) with itself using mask 4-7, 8-11, which
     ; swaps its two 128-bit halves. The bitcast of %a0 is not used by the
     ; shuffle. The expected instruction is vperm2f128 on ymm1 alone.
1314 ; CHECK-LABEL: test_mm256_permute2f128_si256:
1315 ; CHECK:       # %bb.0:
1316 ; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
1317 ; CHECK-NEXT:    ret{{[l|q]}}
1318   %a0.dwords = bitcast <4 x i64> %a0 to <8 x i32>
1319   %a1.dwords = bitcast <4 x i64> %a1 to <8 x i32>
1320   %perm = shufflevector <8 x i32> %a1.dwords, <8 x i32> %a1.dwords, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
1321   %packed = bitcast <8 x i32> %perm to <4 x i64>
1322   ret <4 x i64> %packed
1323 }
     ; Not called by the function above, which uses a plain shufflevector.
1324 declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
1325
1326 define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
     ; Calls llvm.x86.avx.vpermilvar.pd with data %a0 and control vector %a1;
     ; the expected instruction is the register-controlled vpermilpd.
1327 ; CHECK-LABEL: test_mm_permutevar_pd:
1328 ; CHECK:       # %bb.0:
1329 ; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1330 ; CHECK-NEXT:    ret{{[l|q]}}
1331   %permuted = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
1332   ret <2 x double> %permuted
1333 }
1334 declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone
1335
1336 define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
     ; Calls llvm.x86.avx.vpermilvar.pd.256 with data %a0 and control vector
     ; %a1; the expected instruction is the register-controlled vpermilpd.
1337 ; CHECK-LABEL: test_mm256_permutevar_pd:
1338 ; CHECK:       # %bb.0:
1339 ; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
1340 ; CHECK-NEXT:    ret{{[l|q]}}
1341   %permuted = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
1342   ret <4 x double> %permuted
1343 }
1344 declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone
1345
1346 define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
     ; Calls llvm.x86.avx.vpermilvar.ps with data %a0; the control vector %a1
     ; is reinterpreted as <4 x i32> first. The expected instruction is the
     ; register-controlled vpermilps.
1347 ; CHECK-LABEL: test_mm_permutevar_ps:
1348 ; CHECK:       # %bb.0:
1349 ; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1350 ; CHECK-NEXT:    ret{{[l|q]}}
1351   %ctrl = bitcast <2 x i64> %a1 to <4 x i32>
1352   %permuted = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %ctrl)
1353   ret <4 x float> %permuted
1354 }
1355 declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone
1356
1357 define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
     ; Calls llvm.x86.avx.vpermilvar.ps.256 with data %a0; the control vector
     ; %a1 is reinterpreted as <8 x i32> first. The expected instruction is
     ; the register-controlled vpermilps.
1358 ; CHECK-LABEL: test_mm256_permutevar_ps:
1359 ; CHECK:       # %bb.0:
1360 ; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
1361 ; CHECK-NEXT:    ret{{[l|q]}}
1362   %ctrl = bitcast <4 x i64> %a1 to <8 x i32>
1363   %permuted = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %ctrl)
1364   ret <8 x float> %permuted
1365 }
1366 declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
1367
1368 define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
     ; Calls llvm.x86.avx.rcp.ps.256 on %a0; the expected instruction is a
     ; single vrcpps on a ymm register.
1369 ; CHECK-LABEL: test_mm256_rcp_ps:
1370 ; CHECK:       # %bb.0:
1371 ; CHECK-NEXT:    vrcpps %ymm0, %ymm0
1372 ; CHECK-NEXT:    ret{{[l|q]}}
1373   %recip = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
1374   ret <8 x float> %recip
1375 }
1376 declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
1377
1378 define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
     ; Calls llvm.x86.avx.round.pd.256 with immediate 4; the expected
     ; instruction is vroundpd carrying the same immediate.
1379 ; CHECK-LABEL: test_mm256_round_pd:
1380 ; CHECK:       # %bb.0:
1381 ; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0
1382 ; CHECK-NEXT:    ret{{[l|q]}}
1383   %rounded = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
1384   ret <4 x double> %rounded
1385 }
1386
1387 define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
     ; Calls llvm.x86.avx.round.ps.256 with immediate 4; the expected
     ; instruction is vroundps carrying the same immediate.
1388 ; CHECK-LABEL: test_mm256_round_ps:
1389 ; CHECK:       # %bb.0:
1390 ; CHECK-NEXT:    vroundps $4, %ymm0, %ymm0
1391 ; CHECK-NEXT:    ret{{[l|q]}}
1392   %rounded = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
1393   ret <8 x float> %rounded
1394 }
1395
1396 define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
     ; Calls llvm.x86.avx.rsqrt.ps.256 on %a0; the expected instruction is a
     ; single vrsqrtps on a ymm register.
1397 ; CHECK-LABEL: test_mm256_rsqrt_ps:
1398 ; CHECK:       # %bb.0:
1399 ; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0
1400 ; CHECK-NEXT:    ret{{[l|q]}}
1401   %rsqrt = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
1402   ret <8 x float> %rsqrt
1403 }
1404 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
1405
1406 define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
1407 ; X86-LABEL: test_mm256_set_epi8:
1408 ; X86:       # %bb.0:
1409 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1410 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
1411 ; X86-NEXT:    vmovd %ecx, %xmm0
1412 ; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
1413 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1414 ; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1415 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1416 ; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
1417 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1418 ; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1419 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1420 ; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
1421 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1422 ; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1423 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1424 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
1425 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1426 ; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1427 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1428 ; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
1429 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1430 ; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1431 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1432 ; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
1433 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1434 ; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1435 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1436 ; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
1437 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1438 ; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1439 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1440 ; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1441 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1442 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
1443 ; X86-NEXT:    vmovd %ecx, %xmm1
1444 ; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
1445 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1446 ; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
1447 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1448 ; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
1449 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1450 ; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
1451 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1452 ; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
1453 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1454 ; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
1455 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1456 ; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
1457 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1458 ; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
1459 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1460 ; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
1461 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1462 ; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
1463 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1464 ; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
1465 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1466 ; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
1467 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1468 ; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
1469 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1470 ; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
1471 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1472 ; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
1473 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1474 ; X86-NEXT:    retl
1475 ;
1476 ; X64-LABEL: test_mm256_set_epi8:
1477 ; X64:       # %bb.0:
1478 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
1479 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1480 ; X64-NEXT:    vmovd %eax, %xmm0
1481 ; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
1482 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1483 ; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
1484 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1485 ; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
1486 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1487 ; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
1488 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1489 ; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
1490 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1491 ; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
1492 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1493 ; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
1494 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1495 ; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
1496 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1497 ; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
1498 ; X64-NEXT:    movzbl %r9b, %eax
1499 ; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
1500 ; X64-NEXT:    movzbl %r8b, %eax
1501 ; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
1502 ; X64-NEXT:    movzbl %cl, %eax
1503 ; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
1504 ; X64-NEXT:    movzbl %dl, %eax
1505 ; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
1506 ; X64-NEXT:    movzbl %sil, %eax
1507 ; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
1508 ; X64-NEXT:    movzbl %dil, %eax
1509 ; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
1510 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1511 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
1512 ; X64-NEXT:    vmovd %ecx, %xmm1
1513 ; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
1514 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1515 ; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
1516 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1517 ; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
1518 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1519 ; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
1520 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1521 ; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
1522 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1523 ; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
1524 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1525 ; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
1526 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1527 ; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
1528 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1529 ; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
1530 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1531 ; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
1532 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1533 ; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
1534 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1535 ; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
1536 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1537 ; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
1538 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1539 ; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
1540 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
1541 ; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
1542 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1543 ; X64-NEXT:    retq
1544   %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
1545   %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
1546   %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
1547   %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
1548   %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
1549   %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
1550   %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
1551   %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
1552   %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
1553   %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
1554   %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
1555   %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
1556   %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
1557   %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
1558   %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
1559   %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
1560   %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
1561   %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
1562   %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
1563   %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
1564   %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
1565   %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
1566   %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
1567   %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
1568   %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
1569   %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
1570   %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
1571   %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
1572   %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
1573   %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
1574   %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
1575   %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
1576   %res = bitcast <32 x i8> %res31 to <4 x i64>
1577   ret <4 x i64> %res
1578 }
1579
; Builds a <16 x i16> vector from sixteen scalar i16 arguments and returns it
; bitcast to <4 x i64>. Arguments are placed in reverse order: %a15 goes to
; element 0 and %a0 to element 15.
; The expected assembly for both runs assembles two 128-bit halves with
; vmovd/vpinsrw and joins them with a single vinsertf128. In the x86_64 run six
; of the inserts take their value straight from argument registers.
1580 define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
1581 ; X86-LABEL: test_mm256_set_epi16:
1582 ; X86:       # %bb.0:
1583 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1584 ; X86-NEXT:    vmovd %eax, %xmm0
1585 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1586 ; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
1587 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1588 ; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
1589 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1590 ; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
1591 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1592 ; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
1593 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1594 ; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
1595 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1596 ; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
1597 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1598 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
1599 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1600 ; X86-NEXT:    vmovd %eax, %xmm1
1601 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1602 ; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
1603 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1604 ; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
1605 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1606 ; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
1607 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1608 ; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
1609 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1610 ; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
1611 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1612 ; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
1613 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1614 ; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
1615 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1616 ; X86-NEXT:    retl
1617 ;
1618 ; X64-LABEL: test_mm256_set_epi16:
1619 ; X64:       # %bb.0:
1620 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1621 ; X64-NEXT:    vmovd %eax, %xmm0
1622 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1623 ; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
1624 ; X64-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0
1625 ; X64-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
1626 ; X64-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
1627 ; X64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
1628 ; X64-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
1629 ; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
1630 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1631 ; X64-NEXT:    vmovd %eax, %xmm1
1632 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1633 ; X64-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
1634 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1635 ; X64-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
1636 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1637 ; X64-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
1638 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1639 ; X64-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
1640 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1641 ; X64-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
1642 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1643 ; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
1644 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
1645 ; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
1646 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1647 ; X64-NEXT:    retq
1648   %res0  = insertelement <16 x i16> undef,  i16 %a15, i32 0
1649   %res1  = insertelement <16 x i16> %res0,  i16 %a14, i32 1
1650   %res2  = insertelement <16 x i16> %res1,  i16 %a13, i32 2
1651   %res3  = insertelement <16 x i16> %res2,  i16 %a12, i32 3
1652   %res4  = insertelement <16 x i16> %res3,  i16 %a11, i32 4
1653   %res5  = insertelement <16 x i16> %res4,  i16 %a10, i32 5
1654   %res6  = insertelement <16 x i16> %res5,  i16 %a9 , i32 6
1655   %res7  = insertelement <16 x i16> %res6,  i16 %a8 , i32 7
1656   %res8  = insertelement <16 x i16> %res7,  i16 %a7 , i32 8
1657   %res9  = insertelement <16 x i16> %res8,  i16 %a6 , i32 9
1658   %res10 = insertelement <16 x i16> %res9,  i16 %a5 , i32 10
1659   %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
1660   %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
1661   %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
1662   %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
1663   %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
1664   %res = bitcast <16 x i16> %res15 to <4 x i64>
1665   ret <4 x i64> %res
1666 }
1667
; Builds an <8 x i32> vector from eight scalar i32 arguments and returns it
; bitcast to <4 x i64>. Arguments are placed in reverse order: %a7 goes to
; element 0 and %a0 to element 7.
; The expected assembly builds each 128-bit half with vmovd plus three vpinsrd
; instructions and joins the halves with vinsertf128.
1668 define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
1669 ; X86-LABEL: test_mm256_set_epi32:
1670 ; X86:       # %bb.0:
1671 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1672 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
1673 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
1674 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
1675 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1676 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1677 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
1678 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
1679 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1680 ; X86-NEXT:    retl
1681 ;
1682 ; X64-LABEL: test_mm256_set_epi32:
1683 ; X64:       # %bb.0:
1684 ; X64-NEXT:    vmovd %ecx, %xmm0
1685 ; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
1686 ; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
1687 ; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
1688 ; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1689 ; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
1690 ; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
1691 ; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
1692 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1693 ; X64-NEXT:    retq
1694   %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
1695   %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
1696   %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
1697   %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
1698   %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
1699   %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
1700   %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
1701   %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
1702   %res = bitcast <8 x i32> %res7 to <4 x i64>
1703   ret <4 x i64> %res
1704 }
1705
; Builds a <4 x i64> vector from four scalar i64 arguments, placed in reverse
; order: %a3 goes to element 0 and %a0 to element 3. No bitcast is needed
; because the element type already matches the return type.
; The i386 run is expected to assemble the vector from 32-bit pieces
; (vmovd/vpinsrd); the x86_64 run uses vmovq and vpunpcklqdq per half.
1706 define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
1707 ; X86-LABEL: test_mm256_set_epi64x:
1708 ; X86:       # %bb.0:
1709 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1710 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
1711 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
1712 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
1713 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1714 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
1715 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
1716 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
1717 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1718 ; X86-NEXT:    retl
1719 ;
1720 ; X64-LABEL: test_mm256_set_epi64x:
1721 ; X64:       # %bb.0:
1722 ; X64-NEXT:    vmovq %rdi, %xmm0
1723 ; X64-NEXT:    vmovq %rsi, %xmm1
1724 ; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1725 ; X64-NEXT:    vmovq %rdx, %xmm1
1726 ; X64-NEXT:    vmovq %rcx, %xmm2
1727 ; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
1728 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1729 ; X64-NEXT:    retq
1730   %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
1731   %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
1732   %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
1733   %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
1734   ret <4 x i64> %res3
1735 }
1736
; Concatenates two <4 x float> values into one <8 x float> with a single
; shufflevector: %a1 supplies elements 0-3 (mask indices 0-3) and %a0 supplies
; elements 4-7 (mask indices 4-7).
; Both runs share one set of assertions: a single vinsertf128 placing %a0's
; register into the upper half.
1737 define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
1738 ; CHECK-LABEL: test_mm256_set_m128:
1739 ; CHECK:       # %bb.0:
1740 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1741 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1742 ; CHECK-NEXT:    ret{{[l|q]}}
1743   %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1744   ret <8 x float> %res
1745 }
1746
; Concatenates two <2 x double> values into one <4 x double>. Both inputs are
; first bitcast to <4 x float>, joined with a shufflevector (%a1 in the low
; half, %a0 in the high half), and the <8 x float> result is bitcast back to
; <4 x double>.
; Both runs share one set of assertions: a single vinsertf128.
1747 define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
1748 ; CHECK-LABEL: test_mm256_set_m128d:
1749 ; CHECK:       # %bb.0:
1750 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1751 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1752 ; CHECK-NEXT:    ret{{[l|q]}}
1753   %arg0 = bitcast <2 x double> %a0 to <4 x float>
1754   %arg1 = bitcast <2 x double> %a1 to <4 x float>
1755   %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1756   %bc = bitcast <8 x float> %res to <4 x double>
1757   ret <4 x double> %bc
1758 }
1759
; Concatenates two <2 x i64> values into one <4 x i64>. Both inputs are first
; bitcast to <4 x float>, joined with a shufflevector (%a1 in the low half,
; %a0 in the high half), and the <8 x float> result is bitcast to <4 x i64>.
; Both runs share one set of assertions: a single vinsertf128.
1760 define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
1761 ; CHECK-LABEL: test_mm256_set_m128i:
1762 ; CHECK:       # %bb.0:
1763 ; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1764 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1765 ; CHECK-NEXT:    ret{{[l|q]}}
1766   %arg0 = bitcast <2 x i64> %a0 to <4 x float>
1767   %arg1 = bitcast <2 x i64> %a1 to <4 x float>
1768   %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1769   %bc = bitcast <8 x float> %res to <4 x i64>
1770   ret <4 x i64> %bc
1771 }
1772
; Builds a <4 x double> vector from four scalar double arguments, placed in
; reverse order: %a3 goes to element 0 and %a0 to element 3.
; The expected assembly pairs the scalars into 128-bit halves with vmovlhps and
; joins them with vinsertf128. The i386 run first loads each scalar from memory
; with vmovsd.
1773 define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
1774 ; X86-LABEL: test_mm256_set_pd:
1775 ; X86:       # %bb.0:
1776 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1777 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1778 ; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1779 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
1780 ; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
1781 ; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1782 ; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1783 ; X86-NEXT:    retl
1784 ;
1785 ; X64-LABEL: test_mm256_set_pd:
1786 ; X64:       # %bb.0:
1787 ; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
1788 ; X64-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
1789 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1790 ; X64-NEXT:    retq
1791   %res0 = insertelement <4 x double> undef, double %a3, i32 0
1792   %res1 = insertelement <4 x double> %res0, double %a2, i32 1
1793   %res2 = insertelement <4 x double> %res1, double %a1, i32 2
1794   %res3 = insertelement <4 x double> %res2, double %a0, i32 3
1795   ret <4 x double> %res3
1796 }
1797
; Builds an <8 x float> vector from eight scalar float arguments, placed in
; reverse order: %a7 goes to element 0 and %a0 to element 7.
; The expected assembly fills each 128-bit half with a chain of vinsertps
; instructions and joins the halves with vinsertf128. The i386 run first loads
; each scalar from memory with vmovss.
1798 define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
1799 ; X86-LABEL: test_mm256_set_ps:
1800 ; X86:       # %bb.0:
1801 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1802 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1803 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
1804 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1805 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
1806 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1807 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
1808 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1809 ; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1810 ; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
1811 ; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1812 ; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
1813 ; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
1814 ; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
1815 ; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1816 ; X86-NEXT:    retl
1817 ;
1818 ; X64-LABEL: test_mm256_set_ps:
1819 ; X64:       # %bb.0:
1820 ; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
1821 ; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
1822 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
1823 ; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
1824 ; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
1825 ; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
1826 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1827 ; X64-NEXT:    retq
1828   %res0 = insertelement <8 x float> undef, float %a7, i32 0
1829   %res1 = insertelement <8 x float> %res0, float %a6, i32 1
1830   %res2 = insertelement <8 x float> %res1, float %a5, i32 2
1831   %res3 = insertelement <8 x float> %res2, float %a4, i32 3
1832   %res4 = insertelement <8 x float> %res3, float %a3, i32 4
1833   %res5 = insertelement <8 x float> %res4, float %a2, i32 5
1834   %res6 = insertelement <8 x float> %res5, float %a1, i32 6
1835   %res7 = insertelement <8 x float> %res6, float %a0, i32 7
1836   ret <8 x float> %res7
1837 }
1838
; Splats a single i8 argument into all 32 elements of a <32 x i8> vector via a
; chain of insertelement instructions, then returns it bitcast to <4 x i64>.
; The expected assembly broadcasts the byte within one xmm register using
; vpshufb with an all-zero mask (produced by vpxor), then duplicates that
; register into the upper half with vinsertf128.
1839 define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
1840 ; X86-LABEL: test_mm256_set1_epi8:
1841 ; X86:       # %bb.0:
1842 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
1843 ; X86-NEXT:    vmovd %eax, %xmm0
1844 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1845 ; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1846 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1847 ; X86-NEXT:    retl
1848 ;
1849 ; X64-LABEL: test_mm256_set1_epi8:
1850 ; X64:       # %bb.0:
1851 ; X64-NEXT:    movzbl %dil, %eax
1852 ; X64-NEXT:    vmovd %eax, %xmm0
1853 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1854 ; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1855 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1856 ; X64-NEXT:    retq
1857   %res0  = insertelement <32 x i8> undef,  i8 %a0, i32 0
1858   %res1  = insertelement <32 x i8> %res0,  i8 %a0, i32 1
1859   %res2  = insertelement <32 x i8> %res1,  i8 %a0, i32 2
1860   %res3  = insertelement <32 x i8> %res2,  i8 %a0, i32 3
1861   %res4  = insertelement <32 x i8> %res3,  i8 %a0, i32 4
1862   %res5  = insertelement <32 x i8> %res4,  i8 %a0, i32 5
1863   %res6  = insertelement <32 x i8> %res5,  i8 %a0, i32 6
1864   %res7  = insertelement <32 x i8> %res6,  i8 %a0, i32 7
1865   %res8  = insertelement <32 x i8> %res7,  i8 %a0, i32 8
1866   %res9  = insertelement <32 x i8> %res8,  i8 %a0, i32 9
1867   %res10 = insertelement <32 x i8> %res9,  i8 %a0, i32 10
1868   %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
1869   %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
1870   %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
1871   %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
1872   %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
1873   %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
1874   %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
1875   %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
1876   %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
1877   %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
1878   %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
1879   %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
1880   %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
1881   %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
1882   %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
1883   %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
1884   %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
1885   %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
1886   %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
1887   %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
1888   %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
1889   %res = bitcast <32 x i8> %res31 to <4 x i64>
1890   ret <4 x i64> %res
1891 }
1892
; Splats a single i16 argument into all 16 elements of a <16 x i16> vector via
; a chain of insertelement instructions, then returns it bitcast to <4 x i64>.
; The expected assembly broadcasts the word within one xmm register using
; vpshuflw followed by vpshufd, then duplicates that register into the upper
; half with vinsertf128.
1893 define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
1894 ; X86-LABEL: test_mm256_set1_epi16:
1895 ; X86:       # %bb.0:
1896 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
1897 ; X86-NEXT:    vmovd %eax, %xmm0
1898 ; X86-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
1899 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1900 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1901 ; X86-NEXT:    retl
1902 ;
1903 ; X64-LABEL: test_mm256_set1_epi16:
1904 ; X64:       # %bb.0:
1905 ; X64-NEXT:    vmovd %edi, %xmm0
1906 ; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
1907 ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1908 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1909 ; X64-NEXT:    retq
1910   %res0  = insertelement <16 x i16> undef,  i16 %a0, i32 0
1911   %res1  = insertelement <16 x i16> %res0,  i16 %a0, i32 1
1912   %res2  = insertelement <16 x i16> %res1,  i16 %a0, i32 2
1913   %res3  = insertelement <16 x i16> %res2,  i16 %a0, i32 3
1914   %res4  = insertelement <16 x i16> %res3,  i16 %a0, i32 4
1915   %res5  = insertelement <16 x i16> %res4,  i16 %a0, i32 5
1916   %res6  = insertelement <16 x i16> %res5,  i16 %a0, i32 6
1917   %res7  = insertelement <16 x i16> %res6,  i16 %a0, i32 7
1918   %res8  = insertelement <16 x i16> %res7,  i16 %a0, i32 8
1919   %res9  = insertelement <16 x i16> %res8,  i16 %a0, i32 9
1920   %res10 = insertelement <16 x i16> %res9,  i16 %a0, i32 10
1921   %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
1922   %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
1923   %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
1924   %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
1925   %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
1926   %res = bitcast <16 x i16> %res15 to <4 x i64>
1927   ret <4 x i64> %res
1928 }
1929
; Splats a single i32 argument into all eight elements of an <8 x i32> vector
; via a chain of insertelement instructions, then returns it bitcast to
; <4 x i64>.
; The expected assembly broadcasts within one xmm register (vmovss + vpermilps
; in the i386 run, vmovd + vpshufd in the x86_64 run) and duplicates that
; register into the upper half with vinsertf128.
1930 define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
1931 ; X86-LABEL: test_mm256_set1_epi32:
1932 ; X86:       # %bb.0:
1933 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1934 ; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
1935 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1936 ; X86-NEXT:    retl
1937 ;
1938 ; X64-LABEL: test_mm256_set1_epi32:
1939 ; X64:       # %bb.0:
1940 ; X64-NEXT:    vmovd %edi, %xmm0
1941 ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1942 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1943 ; X64-NEXT:    retq
1944   %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
1945   %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
1946   %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
1947   %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
1948   %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
1949   %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
1950   %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
1951   %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
1952   %res = bitcast <8 x i32> %res7 to <4 x i64>
1953   ret <4 x i64> %res
1954 }
1955
; Splats a single i64 argument into all four elements of a <4 x i64> vector
; via a chain of insertelement instructions and returns it directly.
; The expected assembly forms the 64-bit value in an xmm register (vmovd +
; vpinsrd in the i386 run, vmovq in the x86_64 run), repeats it with vpshufd,
; and duplicates that register into the upper half with vinsertf128.
1956 define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
1957 ; X86-LABEL: test_mm256_set1_epi64x:
1958 ; X86:       # %bb.0:
1959 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1960 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
1961 ; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1962 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1963 ; X86-NEXT:    retl
1964 ;
1965 ; X64-LABEL: test_mm256_set1_epi64x:
1966 ; X64:       # %bb.0:
1967 ; X64-NEXT:    vmovq %rdi, %xmm0
1968 ; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
1969 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1970 ; X64-NEXT:    retq
1971   %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
1972   %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
1973   %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
1974   %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
1975   ret <4 x i64> %res3
1976 }
1977
; Splats a single double argument into all four elements of a <4 x double>
; vector via a chain of insertelement instructions.
; The expected assembly duplicates the scalar within one xmm register using
; vmovddup and copies that register into the upper half with vinsertf128. The
; i386 run first loads the scalar from memory with vmovsd.
1978 define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
1979 ; X86-LABEL: test_mm256_set1_pd:
1980 ; X86:       # %bb.0:
1981 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
1982 ; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1983 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1984 ; X86-NEXT:    retl
1985 ;
1986 ; X64-LABEL: test_mm256_set1_pd:
1987 ; X64:       # %bb.0:
1988 ; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
1989 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
1990 ; X64-NEXT:    retq
1991   %res0 = insertelement <4 x double> undef, double %a0, i32 0
1992   %res1 = insertelement <4 x double> %res0, double %a0, i32 1
1993   %res2 = insertelement <4 x double> %res1, double %a0, i32 2
1994   %res3 = insertelement <4 x double> %res2, double %a0, i32 3
1995   ret <4 x double> %res3
1996 }
1997
; Splats a single float argument into all eight elements of an <8 x float>
; vector via a chain of insertelement instructions.
; The expected assembly broadcasts the scalar within one xmm register using
; vpermilps and copies that register into the upper half with vinsertf128. The
; i386 run first loads the scalar from memory with vmovss.
1998 define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
1999 ; X86-LABEL: test_mm256_set1_ps:
2000 ; X86:       # %bb.0:
2001 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2002 ; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2003 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2004 ; X86-NEXT:    retl
2005 ;
2006 ; X64-LABEL: test_mm256_set1_ps:
2007 ; X64:       # %bb.0:
2008 ; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
2009 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
2010 ; X64-NEXT:    retq
2011   %res0 = insertelement <8 x float> undef, float %a0, i32 0
2012   %res1 = insertelement <8 x float> %res0, float %a0, i32 1
2013   %res2 = insertelement <8 x float> %res1, float %a0, i32 2
2014   %res3 = insertelement <8 x float> %res2, float %a0, i32 3
2015   %res4 = insertelement <8 x float> %res3, float %a0, i32 4
2016   %res5 = insertelement <8 x float> %res4, float %a0, i32 5
2017   %res6 = insertelement <8 x float> %res5, float %a0, i32 6
2018   %res7 = insertelement <8 x float> %res6, float %a0, i32 7
2019   ret <8 x float> %res7
2020 }
2021
2022 define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
2023 ; X86-LABEL: test_mm256_setr_epi8:
2024 ; X86:       # %bb.0:
2025 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2026 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
2027 ; X86-NEXT:    vmovd %ecx, %xmm0
2028 ; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
2029 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2030 ; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
2031 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2032 ; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
2033 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2034 ; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
2035 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2036 ; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
2037 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2038 ; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
2039 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2040 ; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
2041 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2042 ; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
2043 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2044 ; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
2045 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2046 ; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
2047 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2048 ; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
2049 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2050 ; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
2051 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2052 ; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
2053 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2054 ; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
2055 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2056 ; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2057 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2058 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
2059 ; X86-NEXT:    vmovd %ecx, %xmm1
2060 ; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
2061 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2062 ; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
2063 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2064 ; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
2065 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2066 ; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
2067 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2068 ; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
2069 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2070 ; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
2071 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2072 ; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
2073 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2074 ; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
2075 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2076 ; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
2077 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2078 ; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
2079 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2080 ; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
2081 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2082 ; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
2083 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2084 ; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
2085 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2086 ; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
2087 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
2088 ; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
2089 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2090 ; X86-NEXT:    retl
2091 ;
2092 ; X64-LABEL: test_mm256_setr_epi8:
2093 ; X64:       # %bb.0:
2094 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
2095 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2096 ; X64-NEXT:    vmovd %eax, %xmm0
2097 ; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
2098 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2099 ; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
2100 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2101 ; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
2102 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2103 ; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
2104 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2105 ; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
2106 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2107 ; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
2108 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2109 ; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
2110 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2111 ; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
2112 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2113 ; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
2114 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2115 ; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
2116 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2117 ; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
2118 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2119 ; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
2120 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2121 ; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
2122 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2123 ; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
2124 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2125 ; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
2126 ; X64-NEXT:    movzbl %sil, %eax
2127 ; X64-NEXT:    movzbl %dil, %esi
2128 ; X64-NEXT:    vmovd %esi, %xmm1
2129 ; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
2130 ; X64-NEXT:    movzbl %dl, %eax
2131 ; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
2132 ; X64-NEXT:    movzbl %cl, %eax
2133 ; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
2134 ; X64-NEXT:    movzbl %r8b, %eax
2135 ; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
2136 ; X64-NEXT:    movzbl %r9b, %eax
2137 ; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
2138 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2139 ; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
2140 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2141 ; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
2142 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2143 ; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
2144 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2145 ; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
2146 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2147 ; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
2148 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2149 ; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
2150 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2151 ; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
2152 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2153 ; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
2154 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2155 ; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
2156 ; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
2157 ; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
2158 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2159 ; X64-NEXT:    retq
2160   %res0  = insertelement <32 x i8> undef,  i8 %a0 , i32 0
2161   %res1  = insertelement <32 x i8> %res0,  i8 %a1 , i32 1
2162   %res2  = insertelement <32 x i8> %res1,  i8 %a2 , i32 2
2163   %res3  = insertelement <32 x i8> %res2,  i8 %a3 , i32 3
2164   %res4  = insertelement <32 x i8> %res3,  i8 %a4 , i32 4
2165   %res5  = insertelement <32 x i8> %res4,  i8 %a5 , i32 5
2166   %res6  = insertelement <32 x i8> %res5,  i8 %a6 , i32 6
2167   %res7  = insertelement <32 x i8> %res6,  i8 %a7 , i32 7
2168   %res8  = insertelement <32 x i8> %res7,  i8 %a8 , i32 8
2169   %res9  = insertelement <32 x i8> %res8,  i8 %a9 , i32 9
2170   %res10 = insertelement <32 x i8> %res9,  i8 %a10, i32 10
2171   %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
2172   %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
2173   %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
2174   %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
2175   %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
2176   %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
2177   %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
2178   %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
2179   %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
2180   %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
2181   %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
2182   %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
2183   %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
2184   %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
2185   %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
2186   %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
2187   %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
2188   %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
2189   %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
2190   %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
2191   %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
2192   %res = bitcast <32 x i8> %res31 to <4 x i64>
2193   ret <4 x i64> %res
2194 }
2195
; Builds a <16 x i16> vector by inserting %a0..%a15 into lanes 0..15 of an
; undef vector, then bitcasts the result to <4 x i64>. Both targets are
; expected to assemble each 128-bit half with vmovd/vpinsrw and to join the
; halves with vinsertf128; x86-64 takes the first six elements from registers.
2196 define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
2197 ; X86-LABEL: test_mm256_setr_epi16:
2198 ; X86:       # %bb.0:
2199 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2200 ; X86-NEXT:    vmovd %eax, %xmm0
2201 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2202 ; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2203 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2204 ; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2205 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2206 ; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2207 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2208 ; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2209 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2210 ; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2211 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2212 ; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2213 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2214 ; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2215 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2216 ; X86-NEXT:    vmovd %eax, %xmm1
2217 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2218 ; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
2219 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2220 ; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
2221 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2222 ; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
2223 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2224 ; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
2225 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2226 ; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
2227 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2228 ; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2229 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
2230 ; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2231 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2232 ; X86-NEXT:    retl
2233 ;
2234 ; X64-LABEL: test_mm256_setr_epi16:
2235 ; X64:       # %bb.0:
2236 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2237 ; X64-NEXT:    vmovd %eax, %xmm0
2238 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2239 ; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
2240 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2241 ; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
2242 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2243 ; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
2244 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2245 ; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
2246 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2247 ; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
2248 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2249 ; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
2250 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2251 ; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
2252 ; X64-NEXT:    vmovd %edi, %xmm1
2253 ; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
2254 ; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
2255 ; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
2256 ; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
2257 ; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
2258 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2259 ; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
2260 ; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
2261 ; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2262 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2263 ; X64-NEXT:    retq
2264   %res0  = insertelement <16 x i16> undef,  i16 %a0 , i32 0
2265   %res1  = insertelement <16 x i16> %res0,  i16 %a1 , i32 1
2266   %res2  = insertelement <16 x i16> %res1,  i16 %a2 , i32 2
2267   %res3  = insertelement <16 x i16> %res2,  i16 %a3 , i32 3
2268   %res4  = insertelement <16 x i16> %res3,  i16 %a4 , i32 4
2269   %res5  = insertelement <16 x i16> %res4,  i16 %a5 , i32 5
2270   %res6  = insertelement <16 x i16> %res5,  i16 %a6 , i32 6
2271   %res7  = insertelement <16 x i16> %res6,  i16 %a7 , i32 7
2272   %res8  = insertelement <16 x i16> %res7,  i16 %a8 , i32 8
2273   %res9  = insertelement <16 x i16> %res8,  i16 %a9 , i32 9
2274   %res10 = insertelement <16 x i16> %res9,  i16 %a10, i32 10
2275   %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
2276   %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
2277   %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
2278   %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
2279   %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
2280   %res = bitcast <16 x i16> %res15 to <4 x i64>
2281   ret <4 x i64> %res
2282 }
2283
; Inserts the eight i32 arguments into lanes 0..7 of an undef <8 x i32> and
; bitcasts the result to <4 x i64>. Each 128-bit half is expected to be formed
; with vmovd/vpinsrd and the halves merged with vinsertf128.
2284 define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
2285 ; X86-LABEL: test_mm256_setr_epi32:
2286 ; X86:       # %bb.0:
2287 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2288 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2289 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2290 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2291 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2292 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2293 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
2294 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
2295 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2296 ; X86-NEXT:    retl
2297 ;
2298 ; X64-LABEL: test_mm256_setr_epi32:
2299 ; X64:       # %bb.0:
2300 ; X64-NEXT:    vmovd %r8d, %xmm0
2301 ; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
2302 ; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
2303 ; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
2304 ; X64-NEXT:    vmovd %edi, %xmm1
2305 ; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
2306 ; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
2307 ; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
2308 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2309 ; X64-NEXT:    retq
2310   %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
2311   %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
2312   %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
2313   %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
2314   %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
2315   %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
2316   %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
2317   %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
2318   %res = bitcast <8 x i32> %res7 to <4 x i64>
2319   ret <4 x i64> %res
2320 }
2321
; Inserts the four i64 arguments into lanes 0..3 of an undef <4 x i64>.
; The i386 checks expect the vector to be built from 32-bit stack pieces
; (vmovd/vpinsrd); the x86-64 checks expect vmovq + vpunpcklqdq per half.
; Both merge the halves with vinsertf128.
2322 define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
2323 ; X86-LABEL: test_mm256_setr_epi64x:
2324 ; X86:       # %bb.0:
2325 ; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2326 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
2327 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
2328 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
2329 ; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2330 ; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
2331 ; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
2332 ; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
2333 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2334 ; X86-NEXT:    retl
2335 ;
2336 ; X64-LABEL: test_mm256_setr_epi64x:
2337 ; X64:       # %bb.0:
2338 ; X64-NEXT:    vmovq %rcx, %xmm0
2339 ; X64-NEXT:    vmovq %rdx, %xmm1
2340 ; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2341 ; X64-NEXT:    vmovq %rsi, %xmm1
2342 ; X64-NEXT:    vmovq %rdi, %xmm2
2343 ; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2344 ; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2345 ; X64-NEXT:    retq
2346   %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
2347   %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
2348   %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
2349   %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
2350   ret <4 x i64> %res3
2351 }
2352
; Concatenates two <4 x float> arguments into one <8 x float> using an
; identity shufflevector mask (0..7): %a0 becomes the low half and %a1 the
; high half. Expected to lower to a single vinsertf128 on both targets.
2353 define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
2354 ; CHECK-LABEL: test_mm256_setr_m128:
2355 ; CHECK:       # %bb.0:
2356 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2357 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2358 ; CHECK-NEXT:    ret{{[l|q]}}
2359   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2360   ret <8 x float> %res
2361 }
2362
; Bitcasts both <2 x double> arguments to <4 x float>, concatenates them with
; an identity shufflevector mask (0..7), and bitcasts the <8 x float> result
; to <4 x double>. Expected to lower to a single vinsertf128 on both targets.
2363 define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
2364 ; CHECK-LABEL: test_mm256_setr_m128d:
2365 ; CHECK:       # %bb.0:
2366 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2367 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2368 ; CHECK-NEXT:    ret{{[l|q]}}
2369   %arg0 = bitcast <2 x double> %a0 to <4 x float>
2370   %arg1 = bitcast <2 x double> %a1 to <4 x float>
2371   %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2372   %bc = bitcast <8 x float> %res to <4 x double>
2373   ret <4 x double> %bc
2374 }
2375
; Bitcasts both <2 x i64> arguments to <4 x float>, concatenates them with an
; identity shufflevector mask (0..7), and bitcasts the <8 x float> result to
; <4 x i64>. Expected to lower to a single vinsertf128 on both targets.
2376 define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
2377 ; CHECK-LABEL: test_mm256_setr_m128i:
2378 ; CHECK:       # %bb.0:
2379 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2380 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
2381 ; CHECK-NEXT:    ret{{[l|q]}}
2382   %arg0 = bitcast <2 x i64> %a0 to <4 x float>
2383   %arg1 = bitcast <2 x i64> %a1 to <4 x float>
2384   %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
2385   %bc = bitcast <8 x float> %res to <4 x i64>
2386   ret <4 x i64> %bc
2387 }
2388
; Inserts the four double arguments into lanes 0..3 of an undef <4 x double>.
; The i386 checks expect each scalar to be loaded with vmovsd first; the
; x86-64 checks combine the incoming xmm0..xmm3 directly. Pairs are joined
; with vmovlhps and the two halves with vinsertf128.
2389 define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
2390 ; X86-LABEL: test_mm256_setr_pd:
2391 ; X86:       # %bb.0:
2392 ; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
2393 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2394 ; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
2395 ; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
2396 ; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
2397 ; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
2398 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2399 ; X86-NEXT:    retl
2400 ;
2401 ; X64-LABEL: test_mm256_setr_pd:
2402 ; X64:       # %bb.0:
2403 ; X64-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
2404 ; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
2405 ; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2406 ; X64-NEXT:    retq
2407   %res0 = insertelement <4 x double> undef, double %a0, i32 0
2408   %res1 = insertelement <4 x double> %res0, double %a1, i32 1
2409   %res2 = insertelement <4 x double> %res1, double %a2, i32 2
2410   %res3 = insertelement <4 x double> %res2, double %a3, i32 3
2411   ret <4 x double> %res3
2412 }
2413
; Inserts the eight float arguments into lanes 0..7 of an undef <8 x float>.
; The i386 checks expect each scalar to be loaded with vmovss first; the
; x86-64 checks combine the incoming xmm0..xmm7 directly. Each half is
; assembled with a chain of three vinsertps and the halves are merged with
; vinsertf128.
2414 define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
2415 ; X86-LABEL: test_mm256_setr_ps:
2416 ; X86:       # %bb.0:
2417 ; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
2418 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2419 ; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2420 ; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
2421 ; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
2422 ; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
2423 ; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
2424 ; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
2425 ; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
2426 ; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
2427 ; X86-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
2428 ; X86-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
2429 ; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
2430 ; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
2431 ; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
2432 ; X86-NEXT:    retl
2433 ;
2434 ; X64-LABEL: test_mm256_setr_ps:
2435 ; X64:       # %bb.0:
2436 ; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
2437 ; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
2438 ; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
2439 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
2440 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
2441 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
2442 ; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
2443 ; X64-NEXT:    retq
2444   %res0 = insertelement <8 x float> undef, float %a0, i32 0
2445   %res1 = insertelement <8 x float> %res0, float %a1, i32 1
2446   %res2 = insertelement <8 x float> %res1, float %a2, i32 2
2447   %res3 = insertelement <8 x float> %res2, float %a3, i32 3
2448   %res4 = insertelement <8 x float> %res3, float %a4, i32 4
2449   %res5 = insertelement <8 x float> %res4, float %a5, i32 5
2450   %res6 = insertelement <8 x float> %res5, float %a6, i32 6
2451   %res7 = insertelement <8 x float> %res6, float %a7, i32 7
2452   ret <8 x float> %res7
2453 }
2454
; Returns an all-zero <4 x double>. The checks expect a single vxorps on xmm0
; before the return on both targets.
2455 define <4 x double> @test_mm256_setzero_pd() nounwind {
2456 ; CHECK-LABEL: test_mm256_setzero_pd:
2457 ; CHECK:       # %bb.0:
2458 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2459 ; CHECK-NEXT:    ret{{[l|q]}}
2460   ret <4 x double> zeroinitializer
2461 }
2462
; Returns an all-zero <8 x float>. The checks expect a single vxorps on xmm0
; before the return on both targets.
2463 define <8 x float> @test_mm256_setzero_ps() nounwind {
2464 ; CHECK-LABEL: test_mm256_setzero_ps:
2465 ; CHECK:       # %bb.0:
2466 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2467 ; CHECK-NEXT:    ret{{[l|q]}}
2468   ret <8 x float> zeroinitializer
2469 }
2470
; Returns an all-zero <4 x i64>. The checks expect a single vxorps on xmm0
; before the return on both targets.
2471 define <4 x i64> @test_mm256_setzero_si256() nounwind {
2472 ; CHECK-LABEL: test_mm256_setzero_si256:
2473 ; CHECK:       # %bb.0:
2474 ; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
2475 ; CHECK-NEXT:    ret{{[l|q]}}
2476   ret <4 x i64> zeroinitializer
2477 }
2478
; Shuffles %a0 and %a1 with mask <0, 4, 2, 6>, i.e. element 0 of each source
; followed by element 2 of each source. Expected to select vunpcklpd on ymm.
2479 define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2480 ; CHECK-LABEL: test_mm256_shuffle_pd:
2481 ; CHECK:       # %bb.0:
2482 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2483 ; CHECK-NEXT:    ret{{[l|q]}}
2484   %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
2485   ret <4 x double> %res
2486 }
2487
; Shuffles %a0 and %a1 with mask <0, 0, 8, 8, 4, 4, 12, 12>: within each
; 128-bit lane, the lane's first element of %a0 twice, then the lane's first
; element of %a1 twice. Expected to select a single vshufps on ymm.
2488 define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2489 ; CHECK-LABEL: test_mm256_shuffle_ps:
2490 ; CHECK:       # %bb.0:
2491 ; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
2492 ; CHECK-NEXT:    ret{{[l|q]}}
2493   %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
2494   ret <8 x float> %res
2495 }
2496
; Calls the generic llvm.sqrt.v4f64 intrinsic on %a0 from an explicitly named
; `entry` block (hence the `# %entry` suffix on the block-label check).
; Expected to lower to vsqrtpd on ymm0.
2497 define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
2498 ; CHECK-LABEL: test_mm256_sqrt_pd:
2499 ; CHECK:       # %bb.0: # %entry
2500 ; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0
2501 ; CHECK-NEXT:    ret{{[l|q]}}
2502 entry:
2503   %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
2504   ret <4 x double> %0
2505 }
2506
2507 declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1
2508
; Calls the generic llvm.sqrt.v8f32 intrinsic on %a0 from an explicitly named
; `entry` block (hence the `# %entry` suffix on the block-label check).
; Expected to lower to vsqrtps on ymm0.
2509 define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
2510 ; CHECK-LABEL: test_mm256_sqrt_ps:
2511 ; CHECK:       # %bb.0: # %entry
2512 ; CHECK-NEXT:    vsqrtps %ymm0, %ymm0
2513 ; CHECK-NEXT:    ret{{[l|q]}}
2514 entry:
2515   %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
2516   ret <8 x float> %0
2517 }
2518
2519 declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
2520
; Casts the double* argument to <4 x double>* and stores %a1 through it with
; 32-byte alignment. Expected to use the aligned vmovaps store, followed by
; vzeroupper before the return.
2521 define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
2522 ; X86-LABEL: test_mm256_store_pd:
2523 ; X86:       # %bb.0:
2524 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2525 ; X86-NEXT:    vmovaps %ymm0, (%eax)
2526 ; X86-NEXT:    vzeroupper
2527 ; X86-NEXT:    retl
2528 ;
2529 ; X64-LABEL: test_mm256_store_pd:
2530 ; X64:       # %bb.0:
2531 ; X64-NEXT:    vmovaps %ymm0, (%rdi)
2532 ; X64-NEXT:    vzeroupper
2533 ; X64-NEXT:    retq
2534   %arg0 = bitcast double* %a0 to <4 x double>*
2535   store <4 x double> %a1, <4 x double>* %arg0, align 32
2536   ret void
2537 }
2538
; Casts the float* argument to <8 x float>* and stores %a1 through it with
; 32-byte alignment. Expected to use the aligned vmovaps store, followed by
; vzeroupper before the return.
2539 define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
2540 ; X86-LABEL: test_mm256_store_ps:
2541 ; X86:       # %bb.0:
2542 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2543 ; X86-NEXT:    vmovaps %ymm0, (%eax)
2544 ; X86-NEXT:    vzeroupper
2545 ; X86-NEXT:    retl
2546 ;
2547 ; X64-LABEL: test_mm256_store_ps:
2548 ; X64:       # %bb.0:
2549 ; X64-NEXT:    vmovaps %ymm0, (%rdi)
2550 ; X64-NEXT:    vzeroupper
2551 ; X64-NEXT:    retq
2552   %arg0 = bitcast float* %a0 to <8 x float>*
2553   store <8 x float> %a1, <8 x float>* %arg0, align 32
2554   ret void
2555 }
2556
; Stores the <4 x i64> value %a1 through %a0 with 32-byte alignment.
; Expected to use the aligned vmovaps store, followed by vzeroupper before
; the return.
2557 define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
2558 ; X86-LABEL: test_mm256_store_si256:
2559 ; X86:       # %bb.0:
2560 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2561 ; X86-NEXT:    vmovaps %ymm0, (%eax)
2562 ; X86-NEXT:    vzeroupper
2563 ; X86-NEXT:    retl
2564 ;
2565 ; X64-LABEL: test_mm256_store_si256:
2566 ; X64:       # %bb.0:
2567 ; X64-NEXT:    vmovaps %ymm0, (%rdi)
2568 ; X64-NEXT:    vzeroupper
2569 ; X64-NEXT:    retq
2570   store <4 x i64> %a1, <4 x i64>* %a0, align 32
2571   ret void
2572 }
2573
; Casts the double* argument to <4 x double>* and stores %a1 through it with
; alignment 1 (unaligned). Expected to use vmovups, followed by vzeroupper
; before the return.
2574 define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
2575 ; X86-LABEL: test_mm256_storeu_pd:
2576 ; X86:       # %bb.0:
2577 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2578 ; X86-NEXT:    vmovups %ymm0, (%eax)
2579 ; X86-NEXT:    vzeroupper
2580 ; X86-NEXT:    retl
2581 ;
2582 ; X64-LABEL: test_mm256_storeu_pd:
2583 ; X64:       # %bb.0:
2584 ; X64-NEXT:    vmovups %ymm0, (%rdi)
2585 ; X64-NEXT:    vzeroupper
2586 ; X64-NEXT:    retq
2587   %arg0 = bitcast double* %a0 to <4 x double>*
2588   store <4 x double> %a1, <4 x double>* %arg0, align 1
2589   ret void
2590 }
2591
; Casts the float* argument to <8 x float>* and stores %a1 through it with
; alignment 1 (unaligned). Expected to use vmovups, followed by vzeroupper
; before the return.
2592 define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
2593 ; X86-LABEL: test_mm256_storeu_ps:
2594 ; X86:       # %bb.0:
2595 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2596 ; X86-NEXT:    vmovups %ymm0, (%eax)
2597 ; X86-NEXT:    vzeroupper
2598 ; X86-NEXT:    retl
2599 ;
2600 ; X64-LABEL: test_mm256_storeu_ps:
2601 ; X64:       # %bb.0:
2602 ; X64-NEXT:    vmovups %ymm0, (%rdi)
2603 ; X64-NEXT:    vzeroupper
2604 ; X64-NEXT:    retq
2605   %arg0 = bitcast float* %a0 to <8 x float>*
2606   store <8 x float> %a1, <8 x float>* %arg0, align 1
2607   ret void
2608 }
2609
; Stores the <4 x i64> value %a1 through %a0 with alignment 1 (unaligned).
; Expected to use vmovups, followed by vzeroupper before the return.
2610 define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
2611 ; X86-LABEL: test_mm256_storeu_si256:
2612 ; X86:       # %bb.0:
2613 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2614 ; X86-NEXT:    vmovups %ymm0, (%eax)
2615 ; X86-NEXT:    vzeroupper
2616 ; X86-NEXT:    retl
2617 ;
2618 ; X64-LABEL: test_mm256_storeu_si256:
2619 ; X64:       # %bb.0:
2620 ; X64-NEXT:    vmovups %ymm0, (%rdi)
2621 ; X64-NEXT:    vzeroupper
2622 ; X64-NEXT:    retq
2623   store <4 x i64> %a1, <4 x i64>* %a0, align 1
2624   ret void
2625 }
2626
; Splits %a2 into its low (elements 0-3) and high (elements 4-7) halves with
; shufflevector and stores them, align 1, to %a0 and %a1 respectively.
; Expected sequence: vmovups of xmm0, vextractf128 $1, a second vmovups,
; then vzeroupper.
2627 define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
2628 ; X86-LABEL: test_mm256_storeu2_m128:
2629 ; X86:       # %bb.0:
2630 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2631 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2632 ; X86-NEXT:    vmovups %xmm0, (%ecx)
2633 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
2634 ; X86-NEXT:    vmovups %xmm0, (%eax)
2635 ; X86-NEXT:    vzeroupper
2636 ; X86-NEXT:    retl
2637 ;
2638 ; X64-LABEL: test_mm256_storeu2_m128:
2639 ; X64:       # %bb.0:
2640 ; X64-NEXT:    vmovups %xmm0, (%rdi)
2641 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
2642 ; X64-NEXT:    vmovups %xmm0, (%rsi)
2643 ; X64-NEXT:    vzeroupper
2644 ; X64-NEXT:    retq
2645   %arg0 = bitcast float* %a0 to <4 x float>*
2646   %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
2647   store <4 x float> %lo, <4 x float>* %arg0, align 1
2648   %arg1 = bitcast float* %a1 to <4 x float>*
2649   %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
2650   store <4 x float> %hi, <4 x float>* %arg1, align 1
2651   ret void
2652 }
2653
; Splits %a2 into its low (elements 0-1) and high (elements 2-3) halves with
; shufflevector and stores them, align 1, to %a0 and %a1 respectively.
; Expected sequence: vmovups of xmm0, vextractf128 $1, a second vmovups,
; then vzeroupper.
2654 define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
2655 ; X86-LABEL: test_mm256_storeu2_m128d:
2656 ; X86:       # %bb.0:
2657 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2658 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2659 ; X86-NEXT:    vmovups %xmm0, (%ecx)
2660 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
2661 ; X86-NEXT:    vmovups %xmm0, (%eax)
2662 ; X86-NEXT:    vzeroupper
2663 ; X86-NEXT:    retl
2664 ;
2665 ; X64-LABEL: test_mm256_storeu2_m128d:
2666 ; X64:       # %bb.0:
2667 ; X64-NEXT:    vmovups %xmm0, (%rdi)
2668 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
2669 ; X64-NEXT:    vmovups %xmm0, (%rsi)
2670 ; X64-NEXT:    vzeroupper
2671 ; X64-NEXT:    retq
2672   %arg0 = bitcast double* %a0 to <2 x double>*
2673   %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
2674   store <2 x double> %lo, <2 x double>* %arg0, align 1
2675   %arg1 = bitcast double* %a1 to <2 x double>*
2676   %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
2677   store <2 x double> %hi, <2 x double>* %arg1, align 1
2678   ret void
2679 }
2680
; Splits %a2 into its low (elements 0-1) and high (elements 2-3) halves with
; shufflevector and stores them, align 1, to %a0 and %a1 respectively. The
; bitcasts on %a0/%a1 are same-type (no-op) casts. Expected sequence: vmovups
; of xmm0, vextractf128 $1, a second vmovups, then vzeroupper.
2681 define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
2682 ; X86-LABEL: test_mm256_storeu2_m128i:
2683 ; X86:       # %bb.0:
2684 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2685 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
2686 ; X86-NEXT:    vmovups %xmm0, (%ecx)
2687 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
2688 ; X86-NEXT:    vmovups %xmm0, (%eax)
2689 ; X86-NEXT:    vzeroupper
2690 ; X86-NEXT:    retl
2691 ;
2692 ; X64-LABEL: test_mm256_storeu2_m128i:
2693 ; X64:       # %bb.0:
2694 ; X64-NEXT:    vmovups %xmm0, (%rdi)
2695 ; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
2696 ; X64-NEXT:    vmovups %xmm0, (%rsi)
2697 ; X64-NEXT:    vzeroupper
2698 ; X64-NEXT:    retq
2699   %arg0 = bitcast <2 x i64>* %a0 to <2 x i64>*
2700   %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
2701   store <2 x i64> %lo, <2 x i64>* %arg0, align 1
2702   %arg1 = bitcast <2 x i64>* %a1 to <2 x i64>*
2703   %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
2704   store <2 x i64> %hi, <2 x i64>* %arg1, align 1
2705   ret void
2706 }
2707
; Casts the double* argument to <4 x double>* and stores %a1 through it with
; 32-byte alignment and !nontemporal metadata. Expected to select the
; non-temporal vmovntps store, followed by vzeroupper before the return.
2708 define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
2709 ; X86-LABEL: test_mm256_stream_pd:
2710 ; X86:       # %bb.0:
2711 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2712 ; X86-NEXT:    vmovntps %ymm0, (%eax)
2713 ; X86-NEXT:    vzeroupper
2714 ; X86-NEXT:    retl
2715 ;
2716 ; X64-LABEL: test_mm256_stream_pd:
2717 ; X64:       # %bb.0:
2718 ; X64-NEXT:    vmovntps %ymm0, (%rdi)
2719 ; X64-NEXT:    vzeroupper
2720 ; X64-NEXT:    retq
2721   %arg0 = bitcast double* %a0 to <4 x double>*
2722   store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
2723   ret void
2724 }
2725
; Casts the float* argument to <8 x float>* and stores %a1 through it with
; 32-byte alignment and !nontemporal metadata. Expected to select the
; non-temporal vmovntps store, followed by vzeroupper before the return.
2726 define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
2727 ; X86-LABEL: test_mm256_stream_ps:
2728 ; X86:       # %bb.0:
2729 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2730 ; X86-NEXT:    vmovntps %ymm0, (%eax)
2731 ; X86-NEXT:    vzeroupper
2732 ; X86-NEXT:    retl
2733 ;
2734 ; X64-LABEL: test_mm256_stream_ps:
2735 ; X64:       # %bb.0:
2736 ; X64-NEXT:    vmovntps %ymm0, (%rdi)
2737 ; X64-NEXT:    vzeroupper
2738 ; X64-NEXT:    retq
2739   %arg0 = bitcast float* %a0 to <8 x float>*
2740   store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
2741   ret void
2742 }
2743
; Stores the <4 x i64> value %a1 through %a0 with 32-byte alignment and
; !nontemporal metadata. Expected to select the non-temporal vmovntps store,
; followed by vzeroupper before the return.
2744 define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
2745 ; X86-LABEL: test_mm256_stream_si256:
2746 ; X86:       # %bb.0:
2747 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
2748 ; X86-NEXT:    vmovntps %ymm0, (%eax)
2749 ; X86-NEXT:    vzeroupper
2750 ; X86-NEXT:    retl
2751 ;
2752 ; X64-LABEL: test_mm256_stream_si256:
2753 ; X64:       # %bb.0:
2754 ; X64-NEXT:    vmovntps %ymm0, (%rdi)
2755 ; X64-NEXT:    vzeroupper
2756 ; X64-NEXT:    retq
2757   store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
2758   ret void
2759 }
2760
; Element-wise fsub of two <4 x double> values (%a0 - %a1). Expected to lower
; to a single vsubpd on ymm registers.
2761 define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2762 ; CHECK-LABEL: test_mm256_sub_pd:
2763 ; CHECK:       # %bb.0:
2764 ; CHECK-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
2765 ; CHECK-NEXT:    ret{{[l|q]}}
2766   %res = fsub <4 x double> %a0, %a1
2767   ret <4 x double> %res
2768 }
2769
; Element-wise fsub of two <8 x float> values (%a0 - %a1). Expected to lower
; to a single vsubps on ymm registers.
2770 define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2771 ; CHECK-LABEL: test_mm256_sub_ps:
2772 ; CHECK:       # %bb.0:
2773 ; CHECK-NEXT:    vsubps %ymm1, %ymm0, %ymm0
2774 ; CHECK-NEXT:    ret{{[l|q]}}
2775   %res = fsub <8 x float> %a0, %a1
2776   ret <8 x float> %res
2777 }
2778
; Calls llvm.x86.avx.vtestc.pd on two <2 x double> values and returns its i32
; result. Expected sequence: eax is zeroed, vtestpd on xmm sets the flags,
; and setb writes the result into al.
2779 define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
2780 ; CHECK-LABEL: test_mm_testc_pd:
2781 ; CHECK:       # %bb.0:
2782 ; CHECK-NEXT:    xorl %eax, %eax
2783 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
2784 ; CHECK-NEXT:    setb %al
2785 ; CHECK-NEXT:    ret{{[l|q]}}
2786   %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
2787   ret i32 %res
2788 }
2789 declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone
2790
; Calls llvm.x86.avx.vtestc.pd.256 on two <4 x double> values and returns its
; i32 result. Expected sequence: eax is zeroed, vtestpd on ymm sets the flags,
; setb writes the result into al, and vzeroupper precedes the return.
2791 define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2792 ; CHECK-LABEL: test_mm256_testc_pd:
2793 ; CHECK:       # %bb.0:
2794 ; CHECK-NEXT:    xorl %eax, %eax
2795 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
2796 ; CHECK-NEXT:    setb %al
2797 ; CHECK-NEXT:    vzeroupper
2798 ; CHECK-NEXT:    ret{{[l|q]}}
2799   %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
2800   ret i32 %res
2801 }
2802 declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone
2803
; Calls llvm.x86.avx.vtestc.ps on two <4 x float> values and returns its i32
; result. Expected sequence: eax is zeroed, vtestps on xmm sets the flags,
; and setb writes the result into al.
2804 define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2805 ; CHECK-LABEL: test_mm_testc_ps:
2806 ; CHECK:       # %bb.0:
2807 ; CHECK-NEXT:    xorl %eax, %eax
2808 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
2809 ; CHECK-NEXT:    setb %al
2810 ; CHECK-NEXT:    ret{{[l|q]}}
2811   %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
2812   ret i32 %res
2813 }
2814 declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
2815
; Calls llvm.x86.avx.vtestc.ps.256 on two <8 x float> values and returns its
; i32 result. Expected sequence: eax is zeroed, vtestps on ymm sets the flags,
; setb writes the result into al, and vzeroupper precedes the return.
2816 define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2817 ; CHECK-LABEL: test_mm256_testc_ps:
2818 ; CHECK:       # %bb.0:
2819 ; CHECK-NEXT:    xorl %eax, %eax
2820 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
2821 ; CHECK-NEXT:    setb %al
2822 ; CHECK-NEXT:    vzeroupper
2823 ; CHECK-NEXT:    ret{{[l|q]}}
2824   %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
2825   ret i32 %res
2826 }
2827 declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone
2828
; Calls llvm.x86.avx.ptestc.256 on two <4 x i64> values and returns its i32
; result. Expected sequence: eax is zeroed, vptest on ymm sets the flags,
; setb writes the result into al, and vzeroupper precedes the return.
2829 define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2830 ; CHECK-LABEL: test_mm256_testc_si256:
2831 ; CHECK:       # %bb.0:
2832 ; CHECK-NEXT:    xorl %eax, %eax
2833 ; CHECK-NEXT:    vptest %ymm1, %ymm0
2834 ; CHECK-NEXT:    setb %al
2835 ; CHECK-NEXT:    vzeroupper
2836 ; CHECK-NEXT:    ret{{[l|q]}}
2837   %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
2838   ret i32 %res
2839 }
2840 declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
2841
; Calls llvm.x86.avx.vtestnzc.pd on two <2 x double> values and returns its
; i32 result. Expected sequence: eax is zeroed, vtestpd on xmm sets the flags,
; and seta writes the result into al.
2842 define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
2843 ; CHECK-LABEL: test_mm_testnzc_pd:
2844 ; CHECK:       # %bb.0:
2845 ; CHECK-NEXT:    xorl %eax, %eax
2846 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
2847 ; CHECK-NEXT:    seta %al
2848 ; CHECK-NEXT:    ret{{[l|q]}}
2849   %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
2850   ret i32 %res
2851 }
2852 declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone
2853
; Covers llvm.x86.avx.vtestnzc.pd.256 on two <4 x double> operands and
; returns the i32 result. Expected sequence: zero %eax, vtestpd on ymm
; registers, seta into %al, then a vzeroupper ahead of the return.
2854 define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2855 ; CHECK-LABEL: test_mm256_testnzc_pd:
2856 ; CHECK:       # %bb.0:
2857 ; CHECK-NEXT:    xorl %eax, %eax
2858 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
2859 ; CHECK-NEXT:    seta %al
2860 ; CHECK-NEXT:    vzeroupper
2861 ; CHECK-NEXT:    ret{{[l|q]}}
2862   %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
2863   ret i32 %res
2864 }
2865 declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone
2866
; Covers llvm.x86.avx.vtestnzc.ps on two <4 x float> operands and returns
; the i32 result. Expected sequence: zero %eax, vtestps on xmm registers,
; then seta into %al.
2867 define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2868 ; CHECK-LABEL: test_mm_testnzc_ps:
2869 ; CHECK:       # %bb.0:
2870 ; CHECK-NEXT:    xorl %eax, %eax
2871 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
2872 ; CHECK-NEXT:    seta %al
2873 ; CHECK-NEXT:    ret{{[l|q]}}
2874   %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
2875   ret i32 %res
2876 }
2877 declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone
2878
; Covers llvm.x86.avx.vtestnzc.ps.256 on two <8 x float> operands and
; returns the i32 result. Expected sequence: zero %eax, vtestps on ymm
; registers, seta into %al, then a vzeroupper ahead of the return.
2879 define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2880 ; CHECK-LABEL: test_mm256_testnzc_ps:
2881 ; CHECK:       # %bb.0:
2882 ; CHECK-NEXT:    xorl %eax, %eax
2883 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
2884 ; CHECK-NEXT:    seta %al
2885 ; CHECK-NEXT:    vzeroupper
2886 ; CHECK-NEXT:    ret{{[l|q]}}
2887   %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
2888   ret i32 %res
2889 }
2890 declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone
2891
; Covers llvm.x86.avx.ptestnzc.256 on two <4 x i64> operands and returns the
; i32 result. Expected sequence: zero %eax, vptest on ymm registers, seta
; into %al, then a vzeroupper ahead of the return.
2892 define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2893 ; CHECK-LABEL: test_mm256_testnzc_si256:
2894 ; CHECK:       # %bb.0:
2895 ; CHECK-NEXT:    xorl %eax, %eax
2896 ; CHECK-NEXT:    vptest %ymm1, %ymm0
2897 ; CHECK-NEXT:    seta %al
2898 ; CHECK-NEXT:    vzeroupper
2899 ; CHECK-NEXT:    ret{{[l|q]}}
2900   %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
2901   ret i32 %res
2902 }
2903 declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
2904
; Covers llvm.x86.avx.vtestz.pd on two <2 x double> operands and returns the
; i32 result. Expected sequence: zero %eax, vtestpd on xmm registers, then
; sete into %al.
2905 define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
2906 ; CHECK-LABEL: test_mm_testz_pd:
2907 ; CHECK:       # %bb.0:
2908 ; CHECK-NEXT:    xorl %eax, %eax
2909 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
2910 ; CHECK-NEXT:    sete %al
2911 ; CHECK-NEXT:    ret{{[l|q]}}
2912   %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
2913   ret i32 %res
2914 }
2915 declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone
2916
; Covers llvm.x86.avx.vtestz.pd.256 on two <4 x double> operands and returns
; the i32 result. Expected sequence: zero %eax, vtestpd on ymm registers,
; sete into %al, then a vzeroupper ahead of the return.
2917 define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2918 ; CHECK-LABEL: test_mm256_testz_pd:
2919 ; CHECK:       # %bb.0:
2920 ; CHECK-NEXT:    xorl %eax, %eax
2921 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
2922 ; CHECK-NEXT:    sete %al
2923 ; CHECK-NEXT:    vzeroupper
2924 ; CHECK-NEXT:    ret{{[l|q]}}
2925   %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
2926   ret i32 %res
2927 }
2928 declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone
2929
; Covers llvm.x86.avx.vtestz.ps on two <4 x float> operands and returns the
; i32 result. Expected sequence: zero %eax, vtestps on xmm registers, then
; sete into %al.
2930 define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
2931 ; CHECK-LABEL: test_mm_testz_ps:
2932 ; CHECK:       # %bb.0:
2933 ; CHECK-NEXT:    xorl %eax, %eax
2934 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
2935 ; CHECK-NEXT:    sete %al
2936 ; CHECK-NEXT:    ret{{[l|q]}}
2937   %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
2938   ret i32 %res
2939 }
2940 declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
2941
; Covers llvm.x86.avx.vtestz.ps.256 on two <8 x float> operands and returns
; the i32 result. Expected sequence: zero %eax, vtestps on ymm registers,
; sete into %al, then a vzeroupper ahead of the return.
2942 define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
2943 ; CHECK-LABEL: test_mm256_testz_ps:
2944 ; CHECK:       # %bb.0:
2945 ; CHECK-NEXT:    xorl %eax, %eax
2946 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
2947 ; CHECK-NEXT:    sete %al
2948 ; CHECK-NEXT:    vzeroupper
2949 ; CHECK-NEXT:    ret{{[l|q]}}
2950   %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
2951   ret i32 %res
2952 }
2953 declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone
2954
; Covers llvm.x86.avx.ptestz.256 on two <4 x i64> operands and returns the
; i32 result. Expected sequence: zero %eax, vptest on ymm registers, sete
; into %al, then a vzeroupper ahead of the return.
2955 define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
2956 ; CHECK-LABEL: test_mm256_testz_si256:
2957 ; CHECK:       # %bb.0:
2958 ; CHECK-NEXT:    xorl %eax, %eax
2959 ; CHECK-NEXT:    vptest %ymm1, %ymm0
2960 ; CHECK-NEXT:    sete %al
2961 ; CHECK-NEXT:    vzeroupper
2962 ; CHECK-NEXT:    ret{{[l|q]}}
2963   %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
2964   ret i32 %res
2965 }
2966 declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone
2967
; Returns an undef <2 x double>. No instruction is expected to materialize
; the value: the function body is asserted to be a bare return.
2968 define <2 x double> @test_mm_undefined_pd() nounwind {
2969 ; CHECK-LABEL: test_mm_undefined_pd:
2970 ; CHECK:       # %bb.0:
2971 ; CHECK-NEXT:    ret{{[l|q]}}
2972   ret <2 x double> undef
2973 }
2974
; Returns an undef <4 x double>. The function body is asserted to be a bare
; return with no instruction producing the value.
2975 define <4 x double> @test_mm256_undefined_pd() nounwind {
2976 ; CHECK-LABEL: test_mm256_undefined_pd:
2977 ; CHECK:       # %bb.0:
2978 ; CHECK-NEXT:    ret{{[l|q]}}
2979   ret <4 x double> undef
2980 }
2981
; Returns an undef <8 x float>. The function body is asserted to be a bare
; return with no instruction producing the value.
2982 define <8 x float> @test_mm256_undefined_ps() nounwind {
2983 ; CHECK-LABEL: test_mm256_undefined_ps:
2984 ; CHECK:       # %bb.0:
2985 ; CHECK-NEXT:    ret{{[l|q]}}
2986   ret <8 x float> undef
2987 }
2988
; Returns an undef <4 x i64>. The function body is asserted to be a bare
; return with no instruction producing the value.
2989 define <4 x i64> @test_mm256_undefined_si256() nounwind {
2990 ; CHECK-LABEL: test_mm256_undefined_si256:
2991 ; CHECK:       # %bb.0:
2992 ; CHECK-NEXT:    ret{{[l|q]}}
2993   ret <4 x i64> undef
2994 }
2995
; Interleaves the odd-indexed doubles of %a0 and %a1 via shufflevector mask
; <1, 5, 3, 7> (mask values 4..7 select from %a1). A single vunpckhpd is
; expected; the {{.*#+}} pattern skips ahead to the shuffle-decode comment
; in the assembly so the lane selection itself is verified.
2996 define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
2997 ; CHECK-LABEL: test_mm256_unpackhi_pd:
2998 ; CHECK:       # %bb.0:
2999 ; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3000 ; CHECK-NEXT:    ret{{[l|q]}}
3001   %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
3002   ret <4 x double> %res
3003 }
3004
; Interleaves elements 2,3 and 6,7 of %a0 with the same elements of %a1 via
; shufflevector mask <2, 10, 3, 11, 6, 14, 7, 15> (mask values 8..15 select
; from %a1). A single vunpckhps is expected, and the shuffle-decode comment
; in the assembly is matched to verify the lane selection.
3005 define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
3006 ; CHECK-LABEL: test_mm256_unpackhi_ps:
3007 ; CHECK:       # %bb.0:
3008 ; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
3009 ; CHECK-NEXT:    ret{{[l|q]}}
3010   %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
3011   ret <8 x float> %res
3012 }
3013
; Interleaves the even-indexed doubles of %a0 and %a1 via shufflevector mask
; <0, 4, 2, 6> (mask values 4..7 select from %a1). A single vunpcklpd is
; expected, and the shuffle-decode comment in the assembly is matched to
; verify the lane selection.
3014 define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
3015 ; CHECK-LABEL: test_mm256_unpacklo_pd:
3016 ; CHECK:       # %bb.0:
3017 ; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
3018 ; CHECK-NEXT:    ret{{[l|q]}}
3019   %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
3020   ret <4 x double> %res
3021 }
3022
; Interleaves elements 0,1 and 4,5 of %a0 with the same elements of %a1 via
; shufflevector mask <0, 8, 1, 9, 4, 12, 5, 13> (mask values 8..15 select
; from %a1). A single vunpcklps is expected, and the shuffle-decode comment
; in the assembly is matched to verify the lane selection.
3023 define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
3024 ; CHECK-LABEL: test_mm256_unpacklo_ps:
3025 ; CHECK:       # %bb.0:
3026 ; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
3027 ; CHECK-NEXT:    ret{{[l|q]}}
3028   %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
3029   ret <8 x float> %res
3030 }
3031
; Bitwise XOR of two <4 x double> values, expressed as bitcast to <4 x i64>,
; integer xor, and bitcast back. Note that the expected instruction is
; vxorps (not vxorpd) even though the operands are doubles.
3032 define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
3033 ; CHECK-LABEL: test_mm256_xor_pd:
3034 ; CHECK:       # %bb.0:
3035 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3036 ; CHECK-NEXT:    ret{{[l|q]}}
3037   %1 = bitcast <4 x double> %a0 to <4 x i64>
3038   %2 = bitcast <4 x double> %a1 to <4 x i64>
3039   %res = xor <4 x i64> %1, %2
3040   %bc = bitcast <4 x i64> %res to <4 x double>
3041   ret <4 x double> %bc
3042 }
3043
; Bitwise XOR of two <8 x float> values, expressed as bitcast to <8 x i32>,
; integer xor, and bitcast back. The bitcasts are expected to fold away,
; leaving a single vxorps on the ymm operands.
3044 define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
3045 ; CHECK-LABEL: test_mm256_xor_ps:
3046 ; CHECK:       # %bb.0:
3047 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
3048 ; CHECK-NEXT:    ret{{[l|q]}}
3049   %1 = bitcast <8 x float> %a0 to <8 x i32>
3050   %2 = bitcast <8 x float> %a1 to <8 x i32>
3051   %res = xor <8 x i32> %1, %2
3052   %bc = bitcast <8 x i32> %res to <8 x float>
3053   ret <8 x float> %bc
3054 }
3055
; Calls the argument-less llvm.x86.avx.vzeroall intrinsic and returns void.
; Exactly one vzeroall instruction is expected before the return.
3056 define void @test_mm256_zeroall() nounwind {
3057 ; CHECK-LABEL: test_mm256_zeroall:
3058 ; CHECK:       # %bb.0:
3059 ; CHECK-NEXT:    vzeroall
3060 ; CHECK-NEXT:    ret{{[l|q]}}
3061   call void @llvm.x86.avx.vzeroall()
3062   ret void
3063 }
3064 declare void @llvm.x86.avx.vzeroall() nounwind readnone
3065
; Calls the argument-less llvm.x86.avx.vzeroupper intrinsic and returns void.
; Exactly one vzeroupper is expected before the return (the -NEXT assertions
; leave no room for a second one).
3066 define void @test_mm256_zeroupper() nounwind {
3067 ; CHECK-LABEL: test_mm256_zeroupper:
3068 ; CHECK:       # %bb.0:
3069 ; CHECK-NEXT:    vzeroupper
3070 ; CHECK-NEXT:    ret{{[l|q]}}
3071   call void @llvm.x86.avx.vzeroupper()
3072   ret void
3073 }
3074 declare void @llvm.x86.avx.vzeroupper() nounwind readnone
3075
; Widens <2 x double> %a0 to <4 x double>: mask <0, 1, 2, 3> takes elements
; 0..1 from %a0 and elements 2..3 from the zeroinitializer operand, so the
; upper half is zero. Expected lowering is a single vmovaps %xmm0, %xmm0
; (presumably relying on the 128-bit VEX move clearing the upper ymm bits;
; confirm against the ISA reference).
3076 define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
3077 ; CHECK-LABEL: test_mm256_zextpd128_pd256:
3078 ; CHECK:       # %bb.0:
3079 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
3080 ; CHECK-NEXT:    ret{{[l|q]}}
3081   %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3082   ret <4 x double> %res
3083 }
3084
; Widens <4 x float> %a0 to <8 x float>: mask <0..7> takes elements 0..3
; from %a0 and elements 4..7 from the zeroinitializer operand, so the upper
; half is zero. Expected lowering is a single vmovaps %xmm0, %xmm0
; (presumably relying on the 128-bit VEX move clearing the upper ymm bits;
; confirm against the ISA reference).
3085 define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
3086 ; CHECK-LABEL: test_mm256_zextps128_ps256:
3087 ; CHECK:       # %bb.0:
3088 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
3089 ; CHECK-NEXT:    ret{{[l|q]}}
3090   %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3091   ret <8 x float> %res
3092 }
3093
; Widens <2 x i64> %a0 to <4 x i64>: mask <0, 1, 2, 3> takes elements 0..1
; from %a0 and elements 2..3 from the zeroinitializer operand, so the upper
; half is zero. The expected move is vmovaps even for this integer vector
; (presumably relying on the 128-bit VEX move clearing the upper ymm bits;
; confirm against the ISA reference).
3094 define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
3095 ; CHECK-LABEL: test_mm256_zextsi128_si256:
3096 ; CHECK:       # %bb.0:
3097 ; CHECK-NEXT:    vmovaps %xmm0, %xmm0
3098 ; CHECK-NEXT:    ret{{[l|q]}}
3099   %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3100   ret <4 x i64> %res
3101 }
3102
; Metadata node holding the single constant i32 1. Nothing in this part of
; the file references it; NOTE(review): presumably it is the operand of
; !nontemporal attachments on stores earlier in the file -- confirm there.
3103 !0 = !{i32 1}