vp8/encoder/x86/preproc_mmx.c

   1 /*
   2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11
  12 #include "memory.h"
  13 #include "preproc.h"
  14 #include "pragmas.h"
  15
  16 /****************************************************************************
  17 *  Macros
  18 ****************************************************************************/
   19 #define FRAMECOUNT 7 /* history slots kept per pixel group; used below as loop count and ring modulus */
   20 #define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) ) /* X rounded up to a multiple of 32. NOTE(review): X is not parenthesized, the mask keeps only the low 32 bits, and nothing in this listing uses the macro */
  21
  22 /****************************************************************************
  23 *  Imports
  24 ****************************************************************************/
   25 extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled); /* defined elsewhere and not called in this listing; presumably reports CPU SIMD support through the three out-parameters -- TODO confirm at the definition */
  26
  27 /****************************************************************************
  28 *  Exported Global Variables
  29 ****************************************************************************/
   30 void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength); /* same signature as temp_filter_wmt / temp_filter_mmx; never assigned in this listing -- presumably pointed at one of them during start-up, TODO confirm */
  31
   32 /****************************************************************************
   33  *
   34  *  ROUTINE       : temp_filter_wmt
   35  *
   36  *  INPUTS        : pre_proc_instance *ppi : Pre-processor instance; the routine
   37  *                                           uses frame, frame_buffer, fixed_divide.
   38  *                  unsigned char *s     : Source pixels, read 8 at a time.
   39  *                  unsigned char *d     : Destination pixels, written 8 at a time.
   40  *                  int bytes            : Number of bytes to filter.
   41  *                  int strength         : Right-shift applied to squared differences.
   42  *
   43  *  OUTPUTS       : d[], the ppi->frame_buffer contents and ppi->frame are updated.
   44  *  RETURNS       : void
   45  *
   46  *  FUNCTION      : Performs a closeness-adjusted temporal blur across FRAMECOUNT
   47  *                  stored frames, 8 pixels per iteration, using XMM registers.
   48  *  SPECIAL NOTES : Destination frame can be same as source frame.
   49  *
   50  ****************************************************************************/
   51 void temp_filter_wmt
   52 (
   53     pre_proc_instance *ppi,
   54     unsigned char *s,
   55     unsigned char *d,
   56     int bytes,
   57     int strength
   58 )
   59 {
   60     int byte = 0;                                 /* source bytes handled so far */
   61     unsigned char *frameptr = ppi->frame_buffer;  /* walks the history: FRAMECOUNT slots of 8 bytes per pixel group */
   62     /* word constants for the weight computation; 16-byte aligned because they are used as xmm memory operands */
   63     __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
   64     __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
   65     /* first call (frame counter still 0): seed the history and pass the source through unfiltered */
   66     if (ppi->frame == 0)
   67     {
   68         do /* NOTE(review): do/while body runs once even when bytes <= 0 */
   69         {
   70             int i;
   71             int frame = 0;
   72             /* write the same 8 source bytes into every one of the FRAMECOUNT slots of this group */
   73             do
   74             {
   75                 for (i = 0; i < 8; i++)
   76                 {
   77                     *frameptr = s[byte+i];
   78                     ++frameptr;
   79                 }
   80
   81                 ++frame;
   82             }
   83             while (frame < FRAMECOUNT);
   84             /* output equals input on this first pass */
   85             for (i = 0; i < 8; i++)
   86                 d[byte+i] = s[byte+i];
   87
   88             byte += 8;
   89
   90         }
   91         while (byte < bytes); /* NOTE(review): a trailing partial group is still processed as a full 8 bytes */
   92     }
   93     else
   94     {
   95         int i;
   96         int offset2 = (ppi->frame % FRAMECOUNT); /* history slot overwritten with the current frame */
   97
   98         do /* NOTE(review): same do/while caveats as the first-frame loop above */
   99         {
  100             __declspec(align(16)) unsigned short counts[8]; /* per-pixel total of the weights */
  101             __declspec(align(16)) unsigned short sums[8];   /* per-pixel weighted pixel sum plus rounding term */
  102             __asm
  103             {
  104                 mov         eax, offset2            // slot index for the current frame
  105                 mov         edi, s                  // source pixels
  106                 pxor        xmm1, xmm1              // weighted-sum accumulator = 0
  107
  108                 pxor        xmm7, xmm7              // zero register for byte->word unpacking
  109
  110                 mov         esi, frameptr           // history slots of this pixel group
  111                 pxor        xmm2, xmm2              // weight total = 0
  112
  113                 movq        xmm3, QWORD PTR [edi]   // load 8 source bytes
  114
  115                 movq        QWORD PTR [esi+8*eax], xmm3  // store them into slot offset2 before filtering
  116
  117                 punpcklbw   xmm3, xmm2              // xmm3 = source pixels widened to words
  118                 mov         ecx,  FRAMECOUNT        // slots left to visit
  119                 // the slot just written equals the source, so it contributes with the full weight 16
  120                 next_frame:
  121                 movq        xmm4, QWORD PTR [esi]   // 8 stored bytes of this slot
  122                 punpcklbw   xmm4, xmm7              // widened to words
  123                 movdqa      xmm6, xmm4              // keep the stored pixel values
  124                 psubsw      xmm4, xmm3              // stored - source, per word
  125                 pmullw      xmm4, xmm4              // squared difference (low 16 bits)
  126                 movd        xmm5, strength          // shift count
  127                 psrlw       xmm4, xmm5              // squared difference >> strength
  128                 pmullw      xmm4, threes            // 3 * modifier
  129                 movdqa      xmm5, sixteens          // 16s
  130                 psubusw     xmm5, xmm4              // weight = 16 - modifier, saturating at 0
  131                 movdqa      xmm4, xmm5              // copy of the weights
  132                 pmullw      xmm4, xmm6              // weight * stored pixel
  133                 paddusw     xmm1, xmm4              // sums   += weight * pixel (saturating)
  134                 paddusw     xmm2, xmm5              // counts += weight (saturating)
  135                 add         esi, 8                  // next slot
  136                 dec         ecx                     // one fewer slot to visit
  137                 jnz         next_frame
  138
  139                 movdqa      counts, xmm2            // export the weight totals
  140                 psrlw       xmm2, 1                 // divide count by 2 for rounding
  141                 paddusw     xmm1, xmm2              // rounding added in
  142
  143                 mov         frameptr, esi           // esi is now at the next group's first slot
  144
  145                 movdqa      sums, xmm1              // export the rounded weighted sums
  146             }
  147             /* presumably fixed_divide[n] is about 65536/n, making this sums/counts -- TODO confirm where the table is built */
  148             for (i = 0; i < 8; i++)
  149             {
  150                 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
  151                 blurvalue >>= 16;
  152                 d[i] = blurvalue; /* converted to unsigned char on store */
  153             }
  154
  155             s += 8;
  156             d += 8;
  157             byte += 8;
  158         }
  159         while (byte < bytes);
  160     }
  161
  162     ++ppi->frame; /* leaves the first-frame path and moves the ring slot for the next call */
  163     __asm emms // clears MMX state; the asm above only uses xmm registers
  164 }
 165
  166 /****************************************************************************
  167  *
  168  *  ROUTINE       : temp_filter_mmx
  169  *
  170  *  INPUTS        : pre_proc_instance *ppi : Pre-processor instance; the routine
  171  *                                           uses frame, frame_buffer, fixed_divide.
  172  *                  unsigned char *s     : Source pixels, read 4 at a time.
  173  *                  unsigned char *d     : Destination pixels, written 4 at a time.
  174  *                  int bytes            : Number of bytes to filter.
  175  *                  int strength         : Right-shift applied to squared differences.
  176  *
  177  *  OUTPUTS       : d[], the ppi->frame_buffer contents and ppi->frame are updated.
  178  *  RETURNS       : void
  179  *
  180  *  FUNCTION      : Performs a closeness-adjusted temporal blur across FRAMECOUNT
  181  *                  stored frames, 4 pixels per iteration, using MMX registers.
  182  *  SPECIAL NOTES : Destination frame can be same as source frame.
  183  *
  184  ****************************************************************************/
  185 void temp_filter_mmx
  186 (
  187     pre_proc_instance *ppi,
  188     unsigned char *s,
  189     unsigned char *d,
  190     int bytes,
  191     int strength
  192 )
  193 {
  194     int byte = 0;                                 /* source bytes handled so far */
  195     unsigned char *frameptr = ppi->frame_buffer;  /* walks the history: FRAMECOUNT slots of 4 bytes per pixel group */
  196     /* word constants for the weight computation, used as memory operands in the asm below */
  197     __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
  198     __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
  199     /* first call (frame counter still 0): seed the history and pass the source through unfiltered */
  200     if (ppi->frame == 0)
  201     {
  202         do /* NOTE(review): do/while body runs once even when bytes <= 0 */
  203         {
  204             int i;
  205             int frame = 0;
  206             /* write the same 4 source bytes into every one of the FRAMECOUNT slots of this group */
  207             do
  208             {
  209                 for (i = 0; i < 4; i++)
  210                 {
  211                     *frameptr = s[byte+i];
  212                     ++frameptr;
  213                 }
  214
  215                 ++frame;
  216             }
  217             while (frame < FRAMECOUNT);
  218             /* output equals input on this first pass */
  219             for (i = 0; i < 4; i++)
  220                 d[byte+i] = s[byte+i];
  221
  222             byte += 4;
  223
  224         }
  225         while (byte < bytes); /* NOTE(review): a trailing partial group is still processed as a full 4 bytes */
  226     }
  227     else
  228     {
  229         int i;
  230         int offset2 = (ppi->frame % FRAMECOUNT); /* history slot overwritten with the current frame */
  231
  232         do /* NOTE(review): same do/while caveats as the first-frame loop above */
  233         {
  234             __declspec(align(16)) unsigned short counts[8]; /* per-pixel weight totals; only elements 0..3 are written and read */
  235             __declspec(align(16)) unsigned short sums[8];   /* per-pixel weighted sums plus rounding; only elements 0..3 are used */
  236             __asm
  237             {
  238
  239                 mov         eax, offset2            // slot index for the current frame
  240                 mov         edi, s                  // source pixels
  241                 pxor        mm1, mm1                // weighted-sum accumulator = 0
  242                 pxor        mm7, mm7                // zero register for byte->word unpacking
  243
  244                 mov         esi, frameptr           // history slots of this pixel group
  245                 pxor        mm2, mm2                // weight total = 0
  246
  247                 movd        mm3, DWORD PTR [edi]    // load 4 source bytes
  248                 movd        DWORD PTR [esi+4*eax], mm3  // store them into slot offset2 before filtering
  249
  250                 punpcklbw   mm3, mm2                // mm3 = source pixels widened to words
  251                 mov         ecx,  FRAMECOUNT        // slots left to visit
  252                 // the slot just written equals the source, so it contributes with the full weight 16
  253                 next_frame:
  254                 movd        mm4, DWORD PTR [esi]    // 4 stored bytes of this slot
  255                 punpcklbw   mm4, mm7                // widened to words
  256                 movq        mm6, mm4                // keep the stored pixel values
  257                 psubsw      mm4, mm3                // stored - source, per word
  258                 pmullw      mm4, mm4                // squared difference (low 16 bits)
  259                 movd        mm5, strength           // shift count
  260                 psrlw       mm4, mm5                // squared difference >> strength
  261                 pmullw      mm4, threes             // 3 * modifier
  262                 movq        mm5, sixteens           // 16s
  263                 psubusw     mm5, mm4                // weight = 16 - modifier, saturating at 0
  264                 movq        mm4, mm5                // copy of the weights
  265                 pmullw      mm4, mm6                // weight * stored pixel
  266                 paddusw     mm1, mm4                // sums   += weight * pixel (saturating)
  267                 paddusw     mm2, mm5                // counts += weight (saturating)
  268                 add         esi, 4                  // next slot
  269                 dec         ecx                     // one fewer slot to visit
  270                 jnz         next_frame
  271
  272                 movq        counts, mm2             // export the 4 weight totals
  273                 psrlw       mm2, 1                  // divide count by 2 for rounding
  274                 paddusw     mm1, mm2                // rounding added in
  275
  276                 mov         frameptr, esi           // esi is now at the next group's first slot
  277
  278                 movq        sums, mm1               // export the 4 rounded weighted sums
  279
  280             }
  281             /* presumably fixed_divide[n] is about 65536/n, making this sums/counts -- TODO confirm where the table is built */
  282             for (i = 0; i < 4; i++)
  283             {
  284                 int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
  285                 blurvalue >>= 16;
  286                 d[i] = blurvalue; /* converted to unsigned char on store */
  287             }
  288
  289             s += 4;
  290             d += 4;
  291             byte += 4;
  292         }
  293         while (byte < bytes);
  294     }
  295
  296     ++ppi->frame; /* leaves the first-frame path and moves the ring slot for the next call */
  297     __asm emms // clears MMX state after the mm-register code above
  298 }