libavcodec/x86/fmtconvert.asm

   1 ;******************************************************************************
   2 ;* x86 optimized Format Conversion Utils
   3 ;* Copyright (c) 2008 Loren Merritt
   4 ;*
   5 ;* This file is part of Libav.
   6 ;*
   7 ;* Libav is free software; you can redistribute it and/or
   8 ;* modify it under the terms of the GNU Lesser General Public
   9 ;* License as published by the Free Software Foundation; either
  10 ;* version 2.1 of the License, or (at your option) any later version.
  11 ;*
  12 ;* Libav is distributed in the hope that it will be useful,
  13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15 ;* Lesser General Public License for more details.
  16 ;*
  17 ;* You should have received a copy of the GNU Lesser General Public
  18 ;* License along with Libav; if not, write to the Free Software
  19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20 ;******************************************************************************
  21
  22 %include "libavutil/x86/x86util.asm"
  23
  24 SECTION .text
  25
  26 ;------------------------------------------------------------------------------
  27 ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
  28 ;                                    int len);
  29 ;------------------------------------------------------------------------------
   30 %macro INT32_TO_FLOAT_FMUL_SCALAR 1 ; dst[i] = (float)src[i] * mul; the macro argument is forwarded to cglobal
   31 %if UNIX64 ; mul is not declared as a named argument here; presumably it already sits in xmm0 (SysV float arg) - confirm
   32 cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
   33 %else ; Win64 and x86-32: mul is declared as a named argument so that len maps to the fourth slot
   34 cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
   35 %endif
   36 %if WIN64 ; Microsoft x64 passes the third argument, when float, in xmm2
   37     SWAP 0, 2 ; make m0 refer to the register holding mul (x86inc SWAP presumably renames, no move emitted - confirm)
   38 %elif ARCH_X86_32
   39     movss   m0, mulm ; load mul from its argument memory slot
   40 %endif
   41     SPLATD  m0 ; replicate mul into every lane of m0 (SPLATD is expected from x86util.asm)
   42     shl     lend, 2 ; element count -> byte count (4 bytes each); 32-bit write zero-extends on x86-64
   43     add     srcq, lenq ; srcq = one past the end of src
   44     add     dstq, lenq ; dstq = one past the end of dst
   45     neg     lenq ; lenq = negative byte offset, counts up towards zero
   46 .loop: ; each iteration converts 32 bytes = 8 elements
   47 %if cpuflag(sse2)
   48     cvtdq2ps  m1, [srcq+lenq   ] ; elements 0-3: int32 -> float
   49     cvtdq2ps  m2, [srcq+lenq+16] ; elements 4-7: int32 -> float
   50 %else ; SSE1 path: cvtpi2ps converts only two int32 at a time into the low half
   51     cvtpi2ps  m1, [srcq+lenq   ] ; elements 0-1 -> low half of m1
   52     cvtpi2ps  m3, [srcq+lenq+ 8] ; elements 2-3 -> low half of m3
   53     cvtpi2ps  m2, [srcq+lenq+16] ; elements 4-5 -> low half of m2
   54     cvtpi2ps  m4, [srcq+lenq+24] ; elements 6-7 -> low half of m4
   55     movlhps   m1, m3 ; m1 = elements 0-3 (low half of m3 copied into high half of m1)
   56     movlhps   m2, m4 ; m2 = elements 4-7
   57 %endif
   58     mulps     m1, m0 ; scale elements 0-3 by mul
   59     mulps     m2, m0 ; scale elements 4-7 by mul
   60     mova  [dstq+lenq   ], m1 ; NOTE(review): mova is presumably an aligned store, so dst must be 16-byte aligned - confirm
   61     mova  [dstq+lenq+16], m2
   62     add     lenq, 32 ; advance by 8 elements; sets the flags tested by jl
   63     jl .loop ; repeat while the offset is still negative; the body has already run once, so len is expected to be a non-zero multiple of 8
   64 %if notcpuflag(sse2)
   65     ;; cvtpi2ps is documented to switch to MMX state even if the source is a
   66     ;; memory location; this is possibly an error in the documentation since
   67     ;; every tested CPU disagrees with that. Use emms anyway since the vast
   68     ;; majority of machines will use the SSE2 variant
   69     emms
   70 %endif
   71     RET
   72 %endmacro
  73
   74 INIT_XMM sse ; build the SSE1 variant (cvtpi2ps path, ends with emms)
   75 INT32_TO_FLOAT_FMUL_SCALAR 5 ; 5 matches the registers m0-m4 used by the cvtpi2ps path (presumably cglobal's XMM register count - confirm)
   76 INIT_XMM sse2 ; build the SSE2 variant (cvtdq2ps path)
   77 INT32_TO_FLOAT_FMUL_SCALAR 3 ; 3 matches the registers m0-m2 used by the cvtdq2ps path