llvm/test/CodeGen/X86/sse-intel-ocl.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
   3 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
   4 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
   5
   6 declare <16 x float> @func_float16_ptr(<16 x float>, ptr) ; external callee: one vector argument plus a pointer
   7 declare <16 x float> @func_float16(<16 x float>, <16 x float>) ; external callee: two vector arguments
   8
   9 ; test calling conventions - input parameters
  10 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
     ; Adds %a and %b, hands the sum and a pointer to a local <16 x float>
     ; slot to @func_float16_ptr through an intel_ocl_bicc call, and returns
     ; the slot contents added to the call result.
     ; The expected code below has the sum in xmm0-xmm3 at the call on all
     ; three triples; the pointer is pushed on the stack for i386 Windows,
     ; placed in rcx for x86-64 Windows and in rdi for Darwin.
     ; The assertions are autogenerated; regenerate them rather than editing
     ; them by hand.
  11 ; WIN32-LABEL: testf16_inp:
  12 ; WIN32:       # %bb.0:
  13 ; WIN32-NEXT:    pushl %ebp
  14 ; WIN32-NEXT:    movl %esp, %ebp
  15 ; WIN32-NEXT:    andl $-16, %esp
  16 ; WIN32-NEXT:    subl $80, %esp
  17 ; WIN32-NEXT:    movups 72(%ebp), %xmm4
  18 ; WIN32-NEXT:    movups 8(%ebp), %xmm3
  19 ; WIN32-NEXT:    addps %xmm4, %xmm3
  20 ; WIN32-NEXT:    movups 56(%ebp), %xmm4
  21 ; WIN32-NEXT:    movups 40(%ebp), %xmm5
  22 ; WIN32-NEXT:    movups 24(%ebp), %xmm6
  23 ; WIN32-NEXT:    movl %esp, %eax
  24 ; WIN32-NEXT:    addps %xmm6, %xmm0
  25 ; WIN32-NEXT:    addps %xmm5, %xmm1
  26 ; WIN32-NEXT:    addps %xmm4, %xmm2
  27 ; WIN32-NEXT:    pushl %eax
  28 ; WIN32-NEXT:    calll _func_float16_ptr
  29 ; WIN32-NEXT:    addl $4, %esp
  30 ; WIN32-NEXT:    addps (%esp), %xmm0
  31 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
  32 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
  33 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
  34 ; WIN32-NEXT:    movl %ebp, %esp
  35 ; WIN32-NEXT:    popl %ebp
  36 ; WIN32-NEXT:    retl
  37 ;
  38 ; WIN64-LABEL: testf16_inp:
  39 ; WIN64:       # %bb.0:
  40 ; WIN64-NEXT:    subq $104, %rsp
  41 ; WIN64-NEXT:    movaps (%r9), %xmm3
  42 ; WIN64-NEXT:    movaps (%r8), %xmm2
  43 ; WIN64-NEXT:    movaps (%rdx), %xmm1
  44 ; WIN64-NEXT:    movaps (%rcx), %xmm0
  45 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
  46 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
  47 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
  48 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
  49 ; WIN64-NEXT:    addps (%r8), %xmm0
  50 ; WIN64-NEXT:    addps (%rdx), %xmm1
  51 ; WIN64-NEXT:    addps (%rcx), %xmm2
  52 ; WIN64-NEXT:    addps (%rax), %xmm3
  53 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
  54 ; WIN64-NEXT:    callq func_float16_ptr
  55 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
  56 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
  57 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
  58 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
  59 ; WIN64-NEXT:    addq $104, %rsp
  60 ; WIN64-NEXT:    retq
  61 ;
  62 ; NOT_WIN-LABEL: testf16_inp:
  63 ; NOT_WIN:       ## %bb.0:
  64 ; NOT_WIN-NEXT:    subq $72, %rsp
  65 ; NOT_WIN-NEXT:    addps %xmm4, %xmm0
  66 ; NOT_WIN-NEXT:    addps %xmm5, %xmm1
  67 ; NOT_WIN-NEXT:    addps %xmm6, %xmm2
  68 ; NOT_WIN-NEXT:    addps %xmm7, %xmm3
  69 ; NOT_WIN-NEXT:    movq %rsp, %rdi
  70 ; NOT_WIN-NEXT:    callq _func_float16_ptr
  71 ; NOT_WIN-NEXT:    addps (%rsp), %xmm0
  72 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
  73 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
  74 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
  75 ; NOT_WIN-NEXT:    addq $72, %rsp
  76 ; NOT_WIN-NEXT:    retq
       ; %y is the 16-byte-aligned local slot whose address is passed to the callee.
  77   %y = alloca <16 x float>, align 16
  78   %x = fadd <16 x float> %a, %b
       ; Only the call site uses intel_ocl_bicc; this function keeps the default convention.
  79   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
       ; Read the slot back after the call and combine it with the returned vector.
  80   %2 = load <16 x float>, ptr %y, align 16
  81   %3 = fadd <16 x float> %2, %1
  82   ret <16 x float> %3
  83 }
  84
  85 ; test calling conventions - preserved registers
  86
  87 ; preserves xmm6-xmm15 on windows, xmm8-xmm15 on other platforms.
  88 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
     ; Same shape as an add / intel_ocl_bicc call / load / add sequence, but %b
     ; is used again after the call, so its four 128-bit parts must stay
     ; available across the call to @func_float16_ptr.
     ; The expected code below keeps %b in xmm6-xmm9 on x86-64 Windows (with
     ; those registers spilled on entry and reloaded before returning), keeps
     ; it in xmm8-xmm11 on Darwin with no spills, and reloads it from the
     ; incoming stack arguments after the call on i386 Windows.
     ; The assertions are autogenerated; regenerate them rather than editing
     ; them by hand.
  89 ; WIN32-LABEL: testf16_regs:
  90 ; WIN32:       # %bb.0:
  91 ; WIN32-NEXT:    pushl %ebp
  92 ; WIN32-NEXT:    movl %esp, %ebp
  93 ; WIN32-NEXT:    andl $-16, %esp
  94 ; WIN32-NEXT:    subl $80, %esp
  95 ; WIN32-NEXT:    movups 72(%ebp), %xmm6
  96 ; WIN32-NEXT:    movups 8(%ebp), %xmm3
  97 ; WIN32-NEXT:    movups 56(%ebp), %xmm7
  98 ; WIN32-NEXT:    movups 40(%ebp), %xmm5
  99 ; WIN32-NEXT:    movups 24(%ebp), %xmm4
 100 ; WIN32-NEXT:    movl %esp, %eax
 101 ; WIN32-NEXT:    addps %xmm4, %xmm0
 102 ; WIN32-NEXT:    addps %xmm5, %xmm1
 103 ; WIN32-NEXT:    addps %xmm7, %xmm2
 104 ; WIN32-NEXT:    addps %xmm6, %xmm3
 105 ; WIN32-NEXT:    pushl %eax
 106 ; WIN32-NEXT:    calll _func_float16_ptr
 107 ; WIN32-NEXT:    addl $4, %esp
 108 ; WIN32-NEXT:    movups 72(%ebp), %xmm4
 109 ; WIN32-NEXT:    addps %xmm4, %xmm3
 110 ; WIN32-NEXT:    movups 56(%ebp), %xmm4
 111 ; WIN32-NEXT:    addps %xmm4, %xmm2
 112 ; WIN32-NEXT:    movups 40(%ebp), %xmm4
 113 ; WIN32-NEXT:    addps %xmm4, %xmm1
 114 ; WIN32-NEXT:    movups 24(%ebp), %xmm4
 115 ; WIN32-NEXT:    addps %xmm4, %xmm0
 116 ; WIN32-NEXT:    addps (%esp), %xmm0
 117 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
 118 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
 119 ; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
 120 ; WIN32-NEXT:    movl %ebp, %esp
 121 ; WIN32-NEXT:    popl %ebp
 122 ; WIN32-NEXT:    retl
 123 ;
 124 ; WIN64-LABEL: testf16_regs:
 125 ; WIN64:       # %bb.0:
 126 ; WIN64-NEXT:    subq $168, %rsp
 127 ; WIN64-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 128 ; WIN64-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 129 ; WIN64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 130 ; WIN64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 131 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 132 ; WIN64-NEXT:    movaps (%rax), %xmm6
 133 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 134 ; WIN64-NEXT:    movaps (%rax), %xmm7
 135 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 136 ; WIN64-NEXT:    movaps (%rax), %xmm8
 137 ; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 138 ; WIN64-NEXT:    movaps (%rax), %xmm9
 139 ; WIN64-NEXT:    movaps (%rcx), %xmm0
 140 ; WIN64-NEXT:    addps %xmm9, %xmm0
 141 ; WIN64-NEXT:    movaps (%rdx), %xmm1
 142 ; WIN64-NEXT:    addps %xmm8, %xmm1
 143 ; WIN64-NEXT:    movaps (%r8), %xmm2
 144 ; WIN64-NEXT:    addps %xmm7, %xmm2
 145 ; WIN64-NEXT:    movaps (%r9), %xmm3
 146 ; WIN64-NEXT:    addps %xmm6, %xmm3
 147 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 148 ; WIN64-NEXT:    callq func_float16_ptr
 149 ; WIN64-NEXT:    addps %xmm6, %xmm3
 150 ; WIN64-NEXT:    addps %xmm7, %xmm2
 151 ; WIN64-NEXT:    addps %xmm8, %xmm1
 152 ; WIN64-NEXT:    addps %xmm9, %xmm0
 153 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
 154 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
 155 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
 156 ; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
 157 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 158 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 159 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
 160 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
 161 ; WIN64-NEXT:    addq $168, %rsp
 162 ; WIN64-NEXT:    retq
 163 ;
 164 ; NOT_WIN-LABEL: testf16_regs:
 165 ; NOT_WIN:       ## %bb.0:
 166 ; NOT_WIN-NEXT:    subq $72, %rsp
 167 ; NOT_WIN-NEXT:    movaps %xmm7, %xmm9
 168 ; NOT_WIN-NEXT:    movaps %xmm6, %xmm10
 169 ; NOT_WIN-NEXT:    movaps %xmm5, %xmm11
 170 ; NOT_WIN-NEXT:    movaps %xmm4, %xmm8
 171 ; NOT_WIN-NEXT:    addps %xmm4, %xmm0
 172 ; NOT_WIN-NEXT:    addps %xmm5, %xmm1
 173 ; NOT_WIN-NEXT:    addps %xmm6, %xmm2
 174 ; NOT_WIN-NEXT:    addps %xmm7, %xmm3
 175 ; NOT_WIN-NEXT:    movq %rsp, %rdi
 176 ; NOT_WIN-NEXT:    callq _func_float16_ptr
 177 ; NOT_WIN-NEXT:    addps %xmm9, %xmm3
 178 ; NOT_WIN-NEXT:    addps %xmm10, %xmm2
 179 ; NOT_WIN-NEXT:    addps %xmm11, %xmm1
 180 ; NOT_WIN-NEXT:    addps %xmm8, %xmm0
 181 ; NOT_WIN-NEXT:    addps (%rsp), %xmm0
 182 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
 183 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
 184 ; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
 185 ; NOT_WIN-NEXT:    addq $72, %rsp
 186 ; NOT_WIN-NEXT:    retq
       ; %y is the 16-byte-aligned local slot whose address is passed to the callee.
 187   %y = alloca <16 x float>, align 16
 188   %x = fadd <16 x float> %a, %b
 189   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
 190   %2 = load <16 x float>, ptr %y, align 16
       ; Second use of %b, after the call: this is what forces %b to survive the call.
 191   %3 = fadd <16 x float> %1, %b
 192   %4 = fadd <16 x float> %2, %3
 193   ret <16 x float> %4
 194 }
 195
 196 ; test calling conventions - prolog and epilog
 197 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
     ; Here the function itself is declared intel_ocl_bicc, while its call to
     ; @func_float16 carries no calling-convention keyword. It forwards %a and
     ; %b unchanged and returns the callee's result.
     ; The expected code below saves and restores xmm6 and xmm7 around the
     ; call on x86-64 Windows, saves and restores xmm8-xmm15 on Darwin, and
     ; shows no xmm spills on i386 Windows.
     ; The assertions are autogenerated; regenerate them rather than editing
     ; them by hand.
 198 ; WIN32-LABEL: test_prolog_epilog:
 199 ; WIN32:       # %bb.0:
 200 ; WIN32-NEXT:    pushl %ebp
 201 ; WIN32-NEXT:    movl %esp, %ebp
 202 ; WIN32-NEXT:    andl $-16, %esp
 203 ; WIN32-NEXT:    subl $96, %esp
 204 ; WIN32-NEXT:    movups 8(%ebp), %xmm4
 205 ; WIN32-NEXT:    movups 24(%ebp), %xmm5
 206 ; WIN32-NEXT:    movups 40(%ebp), %xmm6
 207 ; WIN32-NEXT:    movups 56(%ebp), %xmm7
 208 ; WIN32-NEXT:    movups %xmm7, {{[0-9]+}}(%esp)
 209 ; WIN32-NEXT:    movups %xmm6, {{[0-9]+}}(%esp)
 210 ; WIN32-NEXT:    movups %xmm5, {{[0-9]+}}(%esp)
 211 ; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
 212 ; WIN32-NEXT:    movups %xmm3, (%esp)
 213 ; WIN32-NEXT:    calll _func_float16
 214 ; WIN32-NEXT:    movl %ebp, %esp
 215 ; WIN32-NEXT:    popl %ebp
 216 ; WIN32-NEXT:    retl
 217 ;
 218 ; WIN64-LABEL: test_prolog_epilog:
 219 ; WIN64:       # %bb.0:
 220 ; WIN64-NEXT:    subq $232, %rsp
 221 ; WIN64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 222 ; WIN64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 223 ; WIN64-NEXT:    movaps (%r9), %xmm4
 224 ; WIN64-NEXT:    movaps (%rdx), %xmm5
 225 ; WIN64-NEXT:    movaps (%r8), %xmm6
 226 ; WIN64-NEXT:    movaps (%rcx), %xmm7
 227 ; WIN64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
 228 ; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
 229 ; WIN64-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
 230 ; WIN64-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
 231 ; WIN64-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
 232 ; WIN64-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
 233 ; WIN64-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
 234 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 235 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 236 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 237 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 238 ; WIN64-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
 239 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 240 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 241 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
 242 ; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
 243 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 244 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 245 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
 246 ; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %r9
 247 ; WIN64-NEXT:    callq func_float16
 248 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 249 ; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
 250 ; WIN64-NEXT:    addq $232, %rsp
 251 ; WIN64-NEXT:    retq
 252 ;
 253 ; NOT_WIN-LABEL: test_prolog_epilog:
 254 ; NOT_WIN:       ## %bb.0:
 255 ; NOT_WIN-NEXT:    subq $136, %rsp
 256 ; NOT_WIN-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 257 ; NOT_WIN-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 258 ; NOT_WIN-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 259 ; NOT_WIN-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 260 ; NOT_WIN-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 261 ; NOT_WIN-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 262 ; NOT_WIN-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
 263 ; NOT_WIN-NEXT:    movaps %xmm8, (%rsp) ## 16-byte Spill
 264 ; NOT_WIN-NEXT:    callq _func_float16
 265 ; NOT_WIN-NEXT:    movaps (%rsp), %xmm8 ## 16-byte Reload
 266 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 ## 16-byte Reload
 267 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 ## 16-byte Reload
 268 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 ## 16-byte Reload
 269 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 ## 16-byte Reload
 270 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 ## 16-byte Reload
 271 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 ## 16-byte Reload
 272 ; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 ## 16-byte Reload
 273 ; NOT_WIN-NEXT:    addq $136, %rsp
 274 ; NOT_WIN-NEXT:    retq
        ; Call with no calling-convention keyword from inside an intel_ocl_bicc function.
 275    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
 276    ret <16 x float> %c
 277 }