hvirtual/mpeg2enc/mblock_sub44_sads.c

   1 /*
   2  *
   3  * mblock_sub44_sads.c
   4  * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
   5  *
   6  * Fast block sum-absolute difference computation for a rectangular area 4*x
   7  * by y where y > h against a 4 by h block.
   8  *
   9  * Used for 4*4 sub-sampled motion compensation calculations.
  10  *
  11  * This is actually just a shell that uses templates from the included
  12  * file "mblock_sub44_sads_x86_h.c".  I didn't trust the compiler to do a good
  13  * job on nested inlining.  One day I'll experiment.
  14  *
  15  *
  16  * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
  17  * based on the original MSSG reference design
  18  *
  19  * mpeg2enc is free software; you can redistribute new parts
  20  * and/or modify under the terms of the GNU General Public License
  21  * as published by
  22  * the Free Software Foundation; either version 2 of the License, or
  23  * (at your option) any later version.
  24  *
  25  * mpeg2dec is distributed in the hope that it will be useful,
  26  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  27  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  28  * GNU General Public License for more details.
  29  *
  30  * See the files for those sections (c) MSSG
  31  *
  32  * You should have received a copy of the GNU General Public License
  33  * along with this program; if not, write to the Free Software
  34  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  35  */
  36
  37 #include "config.h"
  38 #include "global.h"
  39 #include "mpeg2enc.h"
  40 #include "simd.h"
  41 #include "attributes.h"
  42 #include "mmx.h"
  43 #include "fastintfns.h"
  44
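/*
  Register usage:
  mm0-mm3  Hold the rows of the current block (one row per register)
  mm4      Used for accumulating partial SAD
  mm5-mm6  Scratch registers for the row currently being compared
  mm7      Holds zero for unpacking bytes to words; the plain MMX SAD
           routine also uses it as a temporary and re-clears it for
           every row
 */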
  51
  52 static inline void mmx_zero_reg (void)
  53 {
  54         /*  load 0 into mm7      */
  55         pxor_r2r (mm7, mm7);
  56 }
  57
  58 /*
  59  * Load a 4*4 block of 4*4 sub-sampled pels (qpels) into the MMX
  60  * registers
  61  *
  62  */
  63
  64 static __inline__ void load_blk(uint8_t *blk,uint32_t rowstride,int h)
  65 {
  66         movq_m2r( *blk, mm0);
  67         blk += rowstride;
  68         movq_m2r( *blk, mm1);
  69         if( h == 2 )
  70                 return;
  71         blk += rowstride;
  72         movq_m2r( *blk, mm2);
  73         blk += rowstride;
  74         movq_m2r( *blk, mm3);
  75 }
  76
  77 /*
  78  * Do a shift right on the 4*4 block in the MMX registers
  79  *
  80  */
  81 static __inline__ void shift_blk(const uint32_t shift)
  82 {
  83         psrlq_i2r( shift,mm0);
  84         psrlq_i2r( shift,mm1);
  85         psrlq_i2r( shift,mm2);
  86         psrlq_i2r( shift,mm3);
  87 }
  88
  89 /*
  90  * Compute the Sum absolute differences between the 4*h block in
  91  * the MMX registers
  92  *
  93  * and the 4*h block pointed to by refblk
  94  *
  95  * h == 2 || h == 4
  96  *
  97  * TODO: Currently always loads and shifts 4*4 even if 4*2 is required.
  98  *
  99  */
 100
 101 static __inline__ int qblock_sad_mmxe(uint8_t *refblk,
 102                                                                   uint32_t h,
 103                                                                   uint32_t rowstride)
 104 {
 105         int res;
 106         pxor_r2r        (mm4,mm4);
 107
 108         movq_r2r        (mm0,mm5);              /* First row */
 109         movd_m2r        (*refblk, mm6);
 110         pxor_r2r    ( mm7, mm7);
 111         refblk += rowstride;
 112         punpcklbw_r2r   ( mm7, mm5);
 113         punpcklbw_r2r   ( mm7, mm6);
 114         psadbw_r2r      ( mm5, mm6);
 115         paddw_r2r     ( mm6, mm4 );
 116
 117
 118
 119         movq_r2r        (mm1,mm5);              /* Second row */
 120         movd_m2r        (*refblk, mm6);
 121         refblk += rowstride;
 122         punpcklbw_r2r   ( mm7, mm5);
 123         punpcklbw_r2r   ( mm7, mm6);
 124         psadbw_r2r      ( mm5, mm6);
 125         paddw_r2r     ( mm6, mm4 );
 126
 127         if( h == 4 )
 128         {
 129
 130                 movq_r2r        (mm2,mm5);              /* Third row */
 131                 movd_m2r        (*refblk, mm6);
 132                 refblk += rowstride;
 133                 punpcklbw_r2r   ( mm7, mm5);
 134                 punpcklbw_r2r   ( mm7, mm6);
 135                 psadbw_r2r      ( mm5, mm6);
 136                 paddw_r2r     ( mm6, mm4 );
 137
 138
 139                 movq_r2r        (mm3,mm5);              /* Fourth row */
 140                 movd_m2r        (*refblk, mm6);
 141                 punpcklbw_r2r   ( mm7, mm5);
 142                 punpcklbw_r2r   ( mm7, mm6);
 143                 psadbw_r2r      ( mm5, mm6);
 144                 paddw_r2r     ( mm6, mm4 );
 145
 146         }
 147         movd_r2m      ( mm4, res );
 148
 149         return res;
 150 }
 151
 152
 153
 154 static __inline__ int qblock_sad_mmx(uint8_t *refblk,
 155                                                                   uint32_t h,
 156                                                                   uint32_t rowstride)
 157 {
 158         int res;
 159         pxor_r2r        (mm4,mm4);
 160
 161         movq_r2r        (mm0,mm5);              /* First row */
 162         movd_m2r        (*refblk, mm6);
 163         pxor_r2r    ( mm7, mm7);
 164         refblk += rowstride;
 165         punpcklbw_r2r   ( mm7, mm5);
 166
 167         punpcklbw_r2r   ( mm7, mm6);
 168
 169         movq_r2r                ( mm5, mm7);
 170         psubusw_r2r     ( mm6, mm5);
 171
 172         psubusw_r2r   ( mm7, mm6);
 173
 174         paddw_r2r     ( mm5, mm4);
 175         paddw_r2r     ( mm6, mm4 );
 176
 177
 178
 179         movq_r2r        (mm1,mm5);              /* Second row */
 180         movd_m2r        (*refblk, mm6);
 181         pxor_r2r    ( mm7, mm7);
 182         refblk += rowstride;
 183         punpcklbw_r2r   ( mm7, mm5);
 184         punpcklbw_r2r   ( mm7, mm6);
 185         movq_r2r                ( mm5, mm7);
 186         psubusw_r2r     ( mm6, mm5);
 187         psubusw_r2r   ( mm7, mm6);
 188         paddw_r2r     ( mm5, mm4);
 189         paddw_r2r     ( mm6, mm4 );
 190
 191         if( h == 4 )
 192         {
 193
 194                 movq_r2r        (mm2,mm5);              /* Third row */
 195                 movd_m2r        (*refblk, mm6);
 196                 pxor_r2r    ( mm7, mm7);
 197                 refblk += rowstride;
 198                 punpcklbw_r2r   ( mm7, mm5);
 199                 punpcklbw_r2r   ( mm7, mm6);
 200                 movq_r2r                ( mm5, mm7);
 201                 psubusw_r2r     ( mm6, mm5);
 202                 psubusw_r2r   ( mm7, mm6);
 203                 paddw_r2r     ( mm5, mm4);
 204                 paddw_r2r     ( mm6, mm4 );
 205
 206                 movq_r2r        (mm3,mm5);              /* Fourth row */
 207                 movd_m2r        (*refblk, mm6);
 208                 pxor_r2r    ( mm7, mm7);
 209                 punpcklbw_r2r   ( mm7, mm5);
 210                 punpcklbw_r2r   ( mm7, mm6);
 211                 movq_r2r                ( mm5, mm7);
 212                 psubusw_r2r     ( mm6, mm5);
 213                 psubusw_r2r   ( mm7, mm6);
 214                 paddw_r2r     ( mm5, mm4);
 215                 paddw_r2r     ( mm6, mm4 );
 216         }
 217
 218
 219         movq_r2r      ( mm4, mm5 );
 220     psrlq_i2r     ( 32, mm5 );
 221     paddw_r2r     ( mm5, mm4 );
 222         movq_r2r      ( mm4, mm6 );
 223     psrlq_i2r     ( 16, mm6 );
 224     paddw_r2r     ( mm6, mm4 );
 225         movd_r2m      ( mm4, res );
 226
 227         return res & 0xffff;
 228 }
 229
 230
/*
 * The template file is included twice, each time with SIMD_SUFFIX(x)
 * defined to paste a different instruction-set suffix onto x, and the
 * macro is undefined again after each inclusion.
 *
 * NOTE(review): presumably the template uses SIMD_SUFFIX both to name
 * the functions it defines and to select qblock_sad_mmxe or
 * qblock_sad_mmx - confirm against mblock_sub44_sads_x86_h.c.
 */

/*
 * Do the Extended MMX versions
 */
#define SIMD_SUFFIX(x) x##_mmxe
#include "mblock_sub44_sads_x86_h.c"
#undef SIMD_SUFFIX
/*
 * Do the original MMX versions
 */
#define SIMD_SUFFIX(x) x##_mmx
#include "mblock_sub44_sads_x86_h.c"
#undef SIMD_SUFFIX
 243
 244
 245
 246