llvm/test/CodeGen/X86/avx.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
   2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X86
   3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64
   4
   ; Vector select of two <4 x i32> values under a <4 x i1> mask.
   ; Both RUN lines share the CHECK prefix here: the mask in xmm0 is shifted
   ; left by 31 (bit 0 lands in bit 31 of each dword) and the select is then
   ; expected to lower to a single 128-bit vblendvps.
   ; The final pattern ret{{[l|q]}} accepts both retl (i686) and retq (x86_64).
   5 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
   6 ; CHECK-LABEL: blendvb_fallback_v4i32:
   7 ; CHECK:       ## %bb.0:
   8 ; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
   9 ; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
  10 ; CHECK-NEXT:    ret{{[l|q]}}
  11   %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
  12   ret <4 x i32> %ret
  13 }
  14
  ; Vector select of two <8 x i32> values under an <8 x i1> mask.
  ; Expected lowering (identical for both RUN lines):
  ;   - the low four mask words of xmm0 are zero-extended to dwords (vpmovzxwd)
  ;     and the high four are widened with vpunpckhwd,
  ;   - each half is shifted left by 31 so mask bit 0 lands in bit 31,
  ;   - the halves are joined into ymm0 with vinsertf128,
  ;   - a single 256-bit vblendvps performs the select.
  15 define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
  16 ; CHECK-LABEL: blendvb_fallback_v8i32:
  17 ; CHECK:       ## %bb.0:
  18 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
  19 ; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
  20 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
  21 ; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
  22 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
  23 ; CHECK-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
  24 ; CHECK-NEXT:    ret{{[l|q]}}
  25   %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  26   ret <8 x i32> %ret
  27 }
  28
  ; Vector select of two <8 x float> values under an <8 x i1> mask.
  ; The expected instruction sequence is the same as for the <8 x i32> select
  ; in this file: the mask words are widened to dwords in two 128-bit halves,
  ; each half is shifted left by 31, the halves are merged with vinsertf128,
  ; and one 256-bit vblendvps does the select.
  29 define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
  30 ; CHECK-LABEL: blendvb_fallback_v8f32:
  31 ; CHECK:       ## %bb.0:
  32 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
  33 ; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
  34 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
  35 ; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
  36 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
  37 ; CHECK-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
  38 ; CHECK-NEXT:    ret{{[l|q]}}
  39   %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
  40   ret <8 x float> %ret
  41 }
  42
  ; SSE4.1 insertps intrinsic: (first vector, second vector, i32 control immediate).
  ; The tests in this file pass immediates 48, 96 and 192 and check how a load
  ; feeding the second operand is folded into the instruction.
  ; NOTE(review): per the Intel SDM the immediate is laid out as
  ; imm8[7:6] = source element, imm8[5:4] = destination lane, imm8[3:0] = zero
  ; mask; the expected output in this file is consistent with that layout.
  43 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
  44
  ; insertps with immediate 48 whose second operand is a full <4 x float> load.
  ; The load is expected to fold into the memory operand of vinsertps $48,
  ; giving xmm0 = xmm0[0,1,2],mem[0] (lane 3 replaced by the first loaded float).
  ; On i686 the pointer is first read from the stack into %eax; on x86_64 it is
  ; used directly from %rdi.
  45 define <4 x float> @insertps_from_vector_load(<4 x float> %a, ptr nocapture readonly %pb) {
  46 ; X86-LABEL: insertps_from_vector_load:
  47 ; X86:       ## %bb.0:
  48 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
  49 ; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
  50 ; X86-NEXT:    retl
  51 ;
  52 ; X64-LABEL: insertps_from_vector_load:
  53 ; X64:       ## %bb.0:
  54 ; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
  55 ; X64-NEXT:    retq
  56   %1 = load <4 x float>, ptr %pb, align 16
  57   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  58   ret <4 x float> %2
  59 }
  60
  61 ;; Use a non-zero CountS for insertps
  ;; The IR immediate is 96, but the expected instruction is vinsertps $32 with
  ;; a 4-byte displacement on the folded load: the source-element selection is
  ;; absorbed into the address, leaving only the destination lane in the
  ;; immediate. Expected result: xmm0 = xmm0[0,1],mem[0],xmm0[3].
  62 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, ptr nocapture readonly %pb) {
  63 ; X86-LABEL: insertps_from_vector_load_offset:
  64 ; X86:       ## %bb.0:
  65 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
  66 ; X86-NEXT:    vinsertps $32, 4(%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
  67 ; X86-NEXT:    retl
  68 ;
  69 ; X64-LABEL: insertps_from_vector_load_offset:
  70 ; X64:       ## %bb.0:
  71 ; X64-NEXT:    vinsertps $32, 4(%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1],mem[0],xmm0[3]
  72 ; X64-NEXT:    retq
  73   %1 = load <4 x float>, ptr %pb, align 16
  74   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  75   ret <4 x float> %2
  76 }
  77
  ; insertps with immediate 192 on a <4 x float> loaded from %pb[%index].
  ; Expected lowering: the index is scaled by 16 (shift left by 4, the size of
  ; one <4 x float>), and the load folds into vinsertps $0 with a 12-byte
  ; displacement, i.e. the source-element selection again moves into the
  ; address. Expected result: xmm0 = mem[0],xmm0[1,2,3].
  ; On i686 only 32-bit loads of the pointer and the index appear (movl).
  78 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, ptr nocapture readonly %pb, i64 %index) {
  79 ; X86-LABEL: insertps_from_vector_load_offset_2:
  80 ; X86:       ## %bb.0:
  81 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
  82 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
  83 ; X86-NEXT:    shll $4, %ecx
  84 ; X86-NEXT:    vinsertps $0, 12(%eax,%ecx), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
  85 ; X86-NEXT:    retl
  86 ;
  87 ; X64-LABEL: insertps_from_vector_load_offset_2:
  88 ; X64:       ## %bb.0:
  89 ; X64-NEXT:    shlq $4, %rsi
  90 ; X64-NEXT:    vinsertps $0, 12(%rdi,%rsi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
  91 ; X64-NEXT:    retq
  92   %1 = getelementptr inbounds <4 x float>, ptr %pb, i64 %index
  93   %2 = load <4 x float>, ptr %1, align 16
  94   %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
  95   ret <4 x float> %3
  96 }
  97
  ; insertps with immediate 48 whose second operand is a splat of one scalar
  ; float loaded from %fb[%index].
  ; Expected lowering: no separate broadcast; the scalar load folds directly
  ; into vinsertps $48 using a (base,index,4) address, giving
  ; xmm0 = xmm0[0,1,2],mem[0].
  ; The splat chain starts from poison rather than undef: all four lanes are
  ; overwritten by the insertelement sequence, so the base value never reaches
  ; the intrinsic.
  98 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, ptr nocapture readonly %fb, i64 %index) {
  99 ; X86-LABEL: insertps_from_broadcast_loadf32:
 100 ; X86:       ## %bb.0:
 101 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 102 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 103 ; X86-NEXT:    vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 104 ; X86-NEXT:    retl
 105 ;
 106 ; X64-LABEL: insertps_from_broadcast_loadf32:
 107 ; X64:       ## %bb.0:
 108 ; X64-NEXT:    vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 109 ; X64-NEXT:    retq
 110   %1 = getelementptr inbounds float, ptr %fb, i64 %index
 111   %2 = load float, ptr %1, align 4
 112   %3 = insertelement <4 x float> poison, float %2, i32 0
 113   %4 = insertelement <4 x float> %3, float %2, i32 1
 114   %5 = insertelement <4 x float> %4, float %2, i32 2
 115   %6 = insertelement <4 x float> %5, float %2, i32 3
 116   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
 117   ret <4 x float> %7
 118 }
 119
 ; insertps with immediate 48 whose second operand is a splat of element 0 of
 ; a <4 x float> loaded (align 4) from %b.
 ; Expected lowering: the extract + splat disappears and the load folds into
 ; the memory operand of vinsertps $48, giving xmm0 = xmm0[0,1,2],mem[0].
 ; The splat chain starts from poison rather than undef: all four lanes are
 ; overwritten by the insertelement sequence, so the base value never reaches
 ; the intrinsic.
 120 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, ptr nocapture readonly %b) {
 121 ; X86-LABEL: insertps_from_broadcast_loadv4f32:
 122 ; X86:       ## %bb.0:
 123 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 124 ; X86-NEXT:    vinsertps $48, (%eax), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 125 ; X86-NEXT:    retl
 126 ;
 127 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
 128 ; X64:       ## %bb.0:
 129 ; X64-NEXT:    vinsertps $48, (%rdi), %xmm0, %xmm0 ## xmm0 = xmm0[0,1,2],mem[0]
 130 ; X64-NEXT:    retq
 131   %1 = load <4 x float>, ptr %b, align 4
 132   %2 = extractelement <4 x float> %1, i32 0
 133   %3 = insertelement <4 x float> poison, float %2, i32 0
 134   %4 = insertelement <4 x float> %3, float %2, i32 1
 135   %5 = insertelement <4 x float> %4, float %2, i32 2
 136   %6 = insertelement <4 x float> %5, float %2, i32 3
 137   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
 138   ret <4 x float> %7
 139 }
 140
 ; One scalar float loaded from %fb[%index] is splatted and inserted with
 ; immediate 48 into four different vectors (%a, %b, %c, %d); the four results
 ; are then summed pairwise.
 ; Expected lowering: because the splat has several users, it is materialized
 ; once with vbroadcastss into xmm4, and each insertion becomes a vblendps
 ; taking lane 3 from xmm4 instead of a vinsertps with a folded load.
 ; The splat chain starts from poison rather than undef: all four lanes are
 ; overwritten by the insertelement sequence, so the base value never reaches
 ; the intrinsic calls.
 141 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, ptr nocapture readonly %fb, i64 %index) {
 142 ; X86-LABEL: insertps_from_broadcast_multiple_use:
 143 ; X86:       ## %bb.0:
 144 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 145 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 146 ; X86-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
 147 ; X86-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
 148 ; X86-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 149 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 150 ; X86-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
 151 ; X86-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 152 ; X86-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 153 ; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 154 ; X86-NEXT:    retl
 155 ;
 156 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 157 ; X64:       ## %bb.0:
 158 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
 159 ; X64-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
 160 ; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
 161 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 162 ; X64-NEXT:    vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[3]
 163 ; X64-NEXT:    vblendps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[3]
 164 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 165 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 166 ; X64-NEXT:    retq
 167   %1 = getelementptr inbounds float, ptr %fb, i64 %index
 168   %2 = load float, ptr %1, align 4
 169   %3 = insertelement <4 x float> poison, float %2, i32 0
 170   %4 = insertelement <4 x float> %3, float %2, i32 1
 171   %5 = insertelement <4 x float> %4, float %2, i32 2
 172   %6 = insertelement <4 x float> %5, float %2, i32 3
 173   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
 174   %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
 175   %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
 176   %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
 177   %11 = fadd <4 x float> %7, %8
 178   %12 = fadd <4 x float> %9, %10
 179   %13 = fadd <4 x float> %11, %12
 180   ret <4 x float> %13
 181 }