llvm/test/CodeGen/X86/avx-intel-ocl.ll

   1 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck -check-prefix=X86 %s
   2 ; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=X86 %s
   3 ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=X64 %s
   5
   6 declare <16 x float> @func_float16_ptr(<16 x float>, ptr) ; external callee; invoked with intel_ocl_bicc by testf16_inp and testf16_regs
   7 declare <16 x float> @func_float16(<16 x float>, <16 x float>) ; external callee; invoked with the default convention by test_prolog_epilog
   8 declare i32 @func_int(i32, i32) ; external callee; invoked with intel_ocl_bicc by test_int
   9
  10 ; WIN64-LABEL: testf16_inp
  11 ; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
  12 ; WIN64: vaddps  {{.*}}, {{%ymm[0-1]}}
  13 ; WIN64: leaq    {{.*}}(%rsp), %rcx
  14 ; WIN64: call
  15 ; WIN64: ret
  16
  17 ; X86-LABEL: testf16_inp
  18 ; X86: vaddps  {{.*}}, {{%ymm[0-1]}}
  19 ; X86: vaddps  {{.*}}, {{%ymm[0-1]}}
  20 ; Push is not deemed profitable if we're realigning the stack.
  21 ; X86: {{pushl|movl}}   %eax
  22 ; X86: call
  23 ; X86: ret
  24
  25 ; X64-LABEL: testf16_inp
  26 ; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
  27 ; X64: vaddps  {{.*}}, {{%ymm[0-1]}}
  28 ; X64: movq    %rsp, %rdi
  29 ; X64: call
  30 ; X64: ret
  31
   32 ; test calling conventions - input parameters
   33 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { ; passes a 16-float vector plus a stack-slot pointer to an intel_ocl_bicc callee
   34   %y = alloca <16 x float>, align 16 ; stack slot whose address becomes the callee's pointer argument
   35   %x = fadd <16 x float> %a, %b ; the checks above expect two vaddps writing ymm0/ymm1 before the call
   36   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y) ; checks expect an rsp-relative address in rcx (WIN64 run) or rsp copied to rdi (X64 run); the 32-bit runs expect %eax pushed or stored (presumably the address)
   37   %2 = load <16 x float>, ptr %y, align 16 ; read the slot back after the call
   38   %3 = fadd <16 x float> %2, %1 ; add the slot contents to the returned vector
   39   ret <16 x float> %3
   40 }
  41
   42 ; test calling conventions - preserved registers
  43
  44 ; preserved ymm6-ymm15
  45 ; WIN64-LABEL: testf16_regs
  46 ; WIN64: call
  47 ; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
  48 ; WIN64: vaddps  {{%ymm[6-7]}}, {{%ymm[0-1]}}, {{%ymm[0-1]}}
  49 ; WIN64: ret
  50
  51 ; preserved ymm8-ymm15
  52 ; X64-LABEL: testf16_regs
  53 ; X64: call
  54 ; X64: vaddps  {{%ymm[0-1]}}, {{%ymm[8-9]}}, {{%ymm[0-1]}}
  55 ; X64: vaddps  {{%ymm[0-1]}}, {{%ymm[8-9]}}, {{%ymm[0-1]}}
  56 ; X64: ret
  57
   58 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { ; same shape as testf16_inp, but %b is used again after the call
   59   %y = alloca <16 x float>, align 16 ; stack slot whose address is passed to the callee
   60   %x = fadd <16 x float> %a, %b
   61   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y) ; %b is live across this call
   62   %2 = load <16 x float>, ptr %y, align 16 ; read the slot back after the call
   63   %3 = fadd <16 x float> %1, %b ; post-call use of %b; the checks above expect a vaddps operand in ymm6/ymm7 (WIN64 run) or ymm8/ymm9 (X64 run)
   64   %4 = fadd <16 x float> %2, %3
   65   ret <16 x float> %4
   66 }
  67
  68 ; test calling conventions - prolog and epilog
  69 ; WIN64-LABEL: test_prolog_epilog
  70 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  71 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  72 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  73 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  74 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  75 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  76 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  77 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  78 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  79 ; WIN64: vmovups {{%ymm([6-9]|1[0-5])}}, {{.*(%rbp).*}}     # 32-byte Spill
  80 ; WIN64: call
  81 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  82 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  83 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  84 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  85 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  86 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  87 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  88 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  89 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  90 ; WIN64: vmovups {{.*(%rbp).*}}, {{%ymm([6-9]|1[0-5])}}     # 32-byte Reload
  91
  92 ; X64-LABEL: test_prolog_epilog
  93 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  94 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  95 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  96 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  97 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  98 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
  99 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
 100 ; X64: vmovups {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rsp)  ## 32-byte Spill
 101 ; X64: call
 102 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 103 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 104 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 105 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 106 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 107 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 108 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
 109 ; X64: vmovups {{.*}}(%rsp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
  110 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { ; here the function under test is itself intel_ocl_bicc
  111    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) ; default-convention call; the checks above expect ymm spills before it and reloads after it (ten matching ymm6-ymm15 for the WIN64 run, eight matching ymm8-ymm15 for the X64 run)
  112    ret <16 x float> %c
  113 }
 114
 115 ; test functions with integer parameters
 116 ; pass parameters on stack for 32-bit platform
 117 ; X86-LABEL: test_int
 118 ; X86: pushl {{.*}}
 119 ; X86: pushl {{.*}}
 120 ; X86: call
 121 ; X86: addl {{.*}}, %eax
 122
 123 ; pass parameters in registers for 64-bit platform
 124 ; X64-LABEL: test_int
 125 ; X64: movl {{.*}}, %esi
 126 ; X64: leal {{.*}}, %edi
 127 ; X64: call
 128 ; X64: addl {{.*}}, %eax
  129 define i32 @test_int(i32 %a, i32 %b) nounwind { ; integer arguments to an intel_ocl_bicc callee
  130     %c1 = add i32 %a, %b ; first argument for the call: a + b
  131         %c2 = call intel_ocl_bicc i32 @func_int(i32 %c1, i32 %a) ; checks above expect two pushl on the 32-bit runs, and esi/edi to be set on the X64 run
  132     %c = add i32 %c2, %b ; %b is used again after the call; checks expect an addl into %eax
  133         ret i32 %c
  134 }
 135
 136 ; WIN64-LABEL: test_float4
 137 ; WIN64-NOT: vzeroupper
 138 ; WIN64: call
 139 ; WIN64-NOT: vzeroupper
 140 ; WIN64: call
 141 ; WIN64: ret
 142
 143 ; X64-LABEL: test_float4
 144 ; X64-NOT: vzeroupper
 145 ; X64: call
 146 ; X64-NOT: vzeroupper
 147 ; X64: call
 148 ; X64: ret
 149
 150 ; X86-LABEL: test_float4
 151 ; X86: vzeroupper
 152 ; X86: call
 153 ; X86: vzeroupper
 154 ; X86: call
 155 ; X86: ret
 156
  157 declare <4 x float> @func_float4(<4 x float>, <4 x float>, <4 x float>) ; external callee; invoked twice with intel_ocl_bicc by test_float4
 158
  159 define <8 x float> @test_float4(<8 x float> %a, <8 x float> %b, <8 x float> %c) nounwind readnone { ; splits 8-float inputs into halves and calls a 4-float intel_ocl_bicc callee on each half
  160 entry:
  161   %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low four lanes of %a
  162   %1 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low four lanes of %b
  163   %2 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low four lanes of %c
  164   %call.i = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %0, <4 x float> %1, <4 x float> %2) nounwind ; first call (low halves); checks above expect no vzeroupper before it on the 64-bit runs and one on the 32-bit runs
  165   %3 = shufflevector <4 x float> %call.i, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the first result to 8 lanes, upper four undefined
  166   %4 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high four lanes of %a
  167   %5 = shufflevector <8 x float> %b, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high four lanes of %b
  168   %6 = shufflevector <8 x float> %c, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high four lanes of %c
  169   %call.i2 = tail call intel_ocl_bicc <4 x float> @func_float4(<4 x float> %4, <4 x float> %5, <4 x float> %6) nounwind ; second call (high halves); same vzeroupper expectations as the first call
  170   %7 = shufflevector <4 x float> %call.i2, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> ; widen the second result to 8 lanes, upper four undefined
  171   %8 = shufflevector <8 x float> %3, <8 x float> %7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> ; lanes 0-3 come from %3, lanes 4-7 from the low four lanes of %7 (mask indices 8-11 select the second operand)
  172   ret <8 x float> %8
  173 }