llvm/test/CodeGen/X86/avx.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X32
   3 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s --check-prefixes=CHECK,X64
   4
   5 define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
     ; Per-lane select between two <4 x i32> values under a <4 x i1> mask.
     ; Both RUN configurations share the CHECK prefix here, so i686 and x86_64
     ; are expected to produce the same sequence: the mask in xmm0 is shifted
     ; left by 31 to move each lane's condition bit into the sign bit (the bit
     ; vblendvps tests), then one vblendvps picks between xmm1 and xmm2.
     ; The return regex accepts either retl or retq.
   6 ; CHECK-LABEL: blendvb_fallback_v4i32:
   7 ; CHECK:       ## %bb.0:
   8 ; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
   9 ; CHECK-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
  10 ; CHECK-NEXT:    ret{{[l|q]}}
  11   %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
  12   ret <4 x i32> %ret
  13 }
  14
  15 define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
     ; 256-bit variant: select between two <8 x i32> values under a <8 x i1> mask.
     ; Expected sequence, per the checks below: the eight 16-bit mask lanes in
     ; xmm0 are widened to 32 bits in two halves (vpmovzxwd for lanes 0-3,
     ; vpunpckhwd for lanes 4-7), each half is shifted left by 31, the halves
     ; are joined into a ymm register with vinsertf128, and a single ymm
     ; vblendvps does the select.
     ; NOTE(review): the 128-bit split is presumably because the corei7-avx
     ; target has no 256-bit integer shift -- confirm.
  16 ; CHECK-LABEL: blendvb_fallback_v8i32:
  17 ; CHECK:       ## %bb.0:
  18 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
  19 ; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
  20 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
  21 ; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
  22 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
  23 ; CHECK-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
  24 ; CHECK-NEXT:    ret{{[l|q]}}
  25   %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
  26   ret <8 x i32> %ret
  27 }
  28
  29 define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
     ; Floating-point counterpart of the <8 x i32> select: same <8 x i1> mask,
     ; <8 x float> operands. The checks below expect the identical sequence:
     ; widen the mask in two 128-bit halves (vpmovzxwd / vpunpckhwd), shift
     ; each half left by 31, join the halves with vinsertf128, then a single
     ; ymm vblendvps.
  30 ; CHECK-LABEL: blendvb_fallback_v8f32:
  31 ; CHECK:       ## %bb.0:
  32 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
  33 ; CHECK-NEXT:    vpslld $31, %xmm3, %xmm3
  34 ; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
  35 ; CHECK-NEXT:    vpslld $31, %xmm0, %xmm0
  36 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm0
  37 ; CHECK-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
  38 ; CHECK-NEXT:    ret{{[l|q]}}
  39   %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
  40   ret <8 x float> %ret
  41 }
  42
  43 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
  44
  45 define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
     ; insertps whose second operand is a full <4 x float> load from %pb.
     ; Immediate 48 (0x30) encodes CountS=0, CountD=3, ZMask=0: source element 0
     ; goes to destination lane 3. The checks expect the load to be folded into
     ; vinsertps as a memory operand (mem[0]) with no separate vector load.
  46 ; On X32, account for the argument's move to registers
  47 ; X32-LABEL: insertps_from_vector_load:
  48 ; X32:       ## %bb.0:
  49 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  50 ; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  51 ; X32-NEXT:    retl
  52 ;
  53 ; X64-LABEL: insertps_from_vector_load:
  54 ; X64:       ## %bb.0:
  55 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
  56 ; X64-NEXT:    retq
  57   %1 = load <4 x float>, <4 x float>* %pb, align 16
  58   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
  59   ret <4 x float> %2
  60 }
  61
  62 ;; Use a non-zero CountS for insertps
  63 define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
     ; Immediate 96 (0x60) encodes CountS=1, CountD=2, ZMask=0: source element 1
     ; goes to destination lane 2. The checks expect the load to be folded, with
     ; the selected element appearing as mem[0] in lane 2.
     ; NOTE(review): the generated pattern {{.*#+}} skips vinsertps's explicit
     ; operands, so the load's displacement is not actually matched despite the
     ; comment below -- confirm whether that coverage should be restored.
  64 ; On X32, account for the argument's move to registers
  65 ;; Try to match a bit more of the instr, since we need the load's offset.
  66 ; X32-LABEL: insertps_from_vector_load_offset:
  67 ; X32:       ## %bb.0:
  68 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  69 ; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  70 ; X32-NEXT:    retl
  71 ;
  72 ; X64-LABEL: insertps_from_vector_load_offset:
  73 ; X64:       ## %bb.0:
  74 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
  75 ; X64-NEXT:    retq
  76   %1 = load <4 x float>, <4 x float>* %pb, align 16
  77   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
  78   ret <4 x float> %2
  79 }
  80
  81 define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
     ; Same folding test with a variable vector index. Immediate 192 (0xC0)
     ; encodes CountS=3, CountD=0, ZMask=0: source element 3 goes to lane 0.
     ; The getelementptr steps in units of <4 x float> (16 bytes), which shows up
     ; as a shift-left by 4 of the index (shll on i686, shlq on x86_64) ahead of
     ; the folded vinsertps.
     ; NOTE(review): the operands are hidden behind {{.*#+}}, so the final
     ; address/displacement is not pinned by these checks -- confirm intent.
  82 ; On X32, account for the argument's move to registers
  83 ;; Try to match a bit more of the instr, since we need the load's offset.
  84 ; X32-LABEL: insertps_from_vector_load_offset_2:
  85 ; X32:       ## %bb.0:
  86 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
  87 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
  88 ; X32-NEXT:    shll $4, %ecx
  89 ; X32-NEXT:    vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  90 ; X32-NEXT:    retl
  91 ;
  92 ; X64-LABEL: insertps_from_vector_load_offset_2:
  93 ; X64:       ## %bb.0:
  94 ; X64-NEXT:    shlq $4, %rsi
  95 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
  96 ; X64-NEXT:    retq
  97   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
  98   %2 = load <4 x float>, <4 x float>* %1, align 16
  99   %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
 100   ret <4 x float> %3
 101 }
 102
 103 define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
     ; The inserted vector is a splat of one float loaded from %fb[%index]
     ; (four insertelement ops). Immediate 48 (0x30) takes source element 0 into
     ; lane 3, so only the scalar is needed: the checks expect no broadcast, just
     ; one vinsertps with a memory operand (after two stack loads on i686).
 104 ; On X32, account for the arguments' move to registers
 105 ; X32-LABEL: insertps_from_broadcast_loadf32:
 106 ; X32:       ## %bb.0:
 107 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 108 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 109 ; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 110 ; X32-NEXT:    retl
 111 ;
 112 ; X64-LABEL: insertps_from_broadcast_loadf32:
 113 ; X64:       ## %bb.0:
 114 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 115 ; X64-NEXT:    retq
 116   %1 = getelementptr inbounds float, float* %fb, i64 %index
 117   %2 = load float, float* %1, align 4
 118   %3 = insertelement <4 x float> undef, float %2, i32 0
 119   %4 = insertelement <4 x float> %3, float %2, i32 1
 120   %5 = insertelement <4 x float> %4, float %2, i32 2
 121   %6 = insertelement <4 x float> %5, float %2, i32 3
 122   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
 123   ret <4 x float> %7
 124 }
 125
 126 define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
     ; Splat built from element 0 of a <4 x float> load (align 4), then insertps
     ; with immediate 48 (0x30: source element 0 into lane 3). The checks expect
     ; the load/extract/splat chain to collapse into a single vinsertps with a
     ; memory operand.
 127 ; On X32, account for the arguments' move to registers
 128 ; X32-LABEL: insertps_from_broadcast_loadv4f32:
 129 ; X32:       ## %bb.0:
 130 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 131 ; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 132 ; X32-NEXT:    retl
 133 ;
 134 ; X64-LABEL: insertps_from_broadcast_loadv4f32:
 135 ; X64:       ## %bb.0:
 136 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 137 ; X64-NEXT:    retq
 138   %1 = load <4 x float>, <4 x float>* %b, align 4
 139   %2 = extractelement <4 x float> %1, i32 0
 140   %3 = insertelement <4 x float> undef, float %2, i32 0
 141   %4 = insertelement <4 x float> %3, float %2, i32 1
 142   %5 = insertelement <4 x float> %4, float %2, i32 2
 143   %6 = insertelement <4 x float> %5, float %2, i32 3
 144   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
 145   ret <4 x float> %7
 146 }
 147
 148 ;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
 149 define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
     ; One splat of the float at %fb[%index] feeds four insertps calls (immediate
     ; 48: source element 0 into lane 3), whose results are summed pairwise.
     ; With the splat used four times, the checks expect one vbroadcastss from
     ; memory into xmm4, four register-form vinsertps reading xmm4[0], and three
     ; vaddps. The address is a base plus index scaled by 4 (the float size).
 150 ; On X32, account for the arguments' move to registers
 151 ; X32-LABEL: insertps_from_broadcast_multiple_use:
 152 ; X32:       ## %bb.0:
 153 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 154 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 155 ; X32-NEXT:    vbroadcastss (%ecx,%eax,4), %xmm4
 156 ; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
 157 ; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 158 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 159 ; X32-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
 160 ; X32-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
 161 ; X32-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 162 ; X32-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 163 ; X32-NEXT:    retl
 164 ;
 165 ; X64-LABEL: insertps_from_broadcast_multiple_use:
 166 ; X64:       ## %bb.0:
 167 ; X64-NEXT:    vbroadcastss (%rdi,%rsi,4), %xmm4
 168 ; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
 169 ; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
 170 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 171 ; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm4[0]
 172 ; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm4[0]
 173 ; X64-NEXT:    vaddps %xmm2, %xmm1, %xmm1
 174 ; X64-NEXT:    vaddps %xmm1, %xmm0, %xmm0
 175 ; X64-NEXT:    retq
 176   %1 = getelementptr inbounds float, float* %fb, i64 %index
 177   %2 = load float, float* %1, align 4
 178   %3 = insertelement <4 x float> undef, float %2, i32 0
 179   %4 = insertelement <4 x float> %3, float %2, i32 1
 180   %5 = insertelement <4 x float> %4, float %2, i32 2
 181   %6 = insertelement <4 x float> %5, float %2, i32 3
 182   %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
 183   %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
 184   %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
 185   %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
 186   %11 = fadd <4 x float> %7, %8
 187   %12 = fadd <4 x float> %9, %10
 188   %13 = fadd <4 x float> %11, %12
 189   ret <4 x float> %13
 190 }