libdirac_common/mot_comp_mmx.cpp

   1 /* ***** BEGIN LICENSE BLOCK *****
   2 *
   3 * $Id$ $Name$
   4 *
   5 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
   6 *
   7 * The contents of this file are subject to the Mozilla Public License
   8 * Version 1.1 (the "License"); you may not use this file except in compliance
   9 * with the License. You may obtain a copy of the License at
  10 * http://www.mozilla.org/MPL/
  11 *
  12 * Software distributed under the License is distributed on an "AS IS" basis,
  13 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
  14 * the specific language governing rights and limitations under the License.
  15 *
  16 * The Original Code is BBC Research and Development code.
  17 *
  18 * The Initial Developer of the Original Code is the British Broadcasting
  19 * Corporation.
  20 * Portions created by the Initial Developer are Copyright (C) 2004.
  21 * All Rights Reserved.
  22 *
  23 * Contributor(s): Anuradha Suraparaju (Original Author)
  24 *
  25 * Alternatively, the contents of this file may be used under the terms of
  26 * the GNU General Public License Version 2 (the "GPL"), or the GNU Lesser
  27 * Public License Version 2.1 (the "LGPL"), in which case the provisions of
  28 * the GPL or the LGPL are applicable instead of those above. If you wish to
  29 * allow use of your version of this file only under the terms of the either
  30 * the GPL or LGPL and not to allow others to use your version of this file
  31 * under the MPL, indicate your decision by deleting the provisions above
  32 * and replace them with the notice and other provisions required by the GPL
  33 * or LGPL. If you do not delete the provisions above, a recipient may use
  34 * your version of this file under the terms of any one of the MPL, the GPL
  35 * or the LGPL.
  36 * ***** END LICENSE BLOCK ***** */
  37
  38 #if defined(HAVE_MMX)
  39 #include <mmintrin.h>
  40 #include <libdirac_common/mot_comp.h>
  41 #include <libdirac_common/mot_comp_mmx.h>
  42 #include <libdirac_common/motion.h>
  43 #include <libdirac_common/dirac_assertions.h>
  44 using namespace dirac;
  45
  46 inline void check_active_columns(
  47         int x, int xmax, ValueType act_cols1[4],
  48         ValueType act_cols2[4], ValueType *row1, ValueType *row2)
  49 {
  50     // check if we need any clipping
  51     if (x >= 0 && (x+3) < xmax) {
  52         // special case, nothing to do
  53         memcpy(act_cols1, &row1[x], 4 * sizeof(ValueType));
  54         memcpy(act_cols2, &row2[x], 4 * sizeof(ValueType));
  55     }
  56     else
  57     {
  58         act_cols1[0] = row1[BChk(x,xmax)];
  59         act_cols2[0] = row2[BChk(x,xmax)];
  60         act_cols1[1] = row1[BChk(x+1,xmax)];
  61         act_cols2[1] = row2[BChk(x+1,xmax)];
  62         act_cols1[2] = row1[BChk(x+2,xmax)];
  63         act_cols2[2] = row2[BChk(x+2,xmax)];
  64         act_cols1[3] = row1[BChk(x+3,xmax)];
  65         act_cols2[3] = row2[BChk(x+3,xmax)];
  66     }
  67 }
  68
  69 void MotionCompensator_QuarterPixel::BlockPixelPred(
  70                                    TwoDArray<ValueType> &block_data ,
  71                                    const ImageCoords& pos ,
  72                                    const ImageCoords& orig_pic_size ,
  73                                    const PicArray &refup_data ,
  74                                    const MVector &mv)
  75 {
  76     // Set up the start point in the reference image by rounding the motion vector
  77     // to 1/2 pel accuracy.NB: bit shift rounds negative values DOWN, as required
  78     const MVector roundvec( mv.x>>1 , mv.y>>1 );
  79
  80     //Get the remainder after rounding. NB rmdr values always 0 or 1
  81     const MVector rmdr( mv.x & 1 , mv.y & 1 );
  82
  83     //Where to start in the upconverted image
  84     const ImageCoords start_pos( std::max(pos.x,0) , std::max(pos.y,0) );
  85     // check that we are doing MC within true pic boundaries
  86     if (start_pos.x >= orig_pic_size.x || start_pos.y >= orig_pic_size.y)
  87         return;
  88     const ImageCoords ref_start( ( start_pos.x<<1 ) + roundvec.x ,( start_pos.y<<1 ) + roundvec.y );
  89
  90     //An additional stage to make sure the block to be copied does not fall outside
  91     //the reference image.
  92     const int refXlen = refup_data.LengthX();
  93     const int trueRefXlen = (orig_pic_size.x << 1) - 1;
  94     const int trueRefYlen = (orig_pic_size.y << 1) - 1;
  95
  96     ValueType *block_curr = &block_data[0][0];
  97
  98     bool do_bounds_checking = false;
  99     //Check if there are going to be any problems copying the block from
 100     //the upvconverted reference image.
 101     if( ref_start.x < 0 )
 102         do_bounds_checking = true;
 103     else if( ref_start.x + (block_data.LengthX()<<1 ) >= trueRefXlen )
 104         do_bounds_checking = true;
 105     if( ref_start.y < 0 )
 106         do_bounds_checking = true;
 107     else if( ref_start.y + (block_data.LengthY()<<1 ) >= trueRefYlen)
 108         do_bounds_checking = true;
 109
 110     if( !do_bounds_checking )
 111     {
 112         int stopX = (block_data.LengthX()>>2)<<2;
 113         ValueType *refup_curr = &refup_data[ref_start.y][ref_start.x];
 114         const int refup_next( ( refXlen - block_data.LengthX() )*2 ); //go down 2 rows and back to beginning of block line
 115         if( rmdr.x == 0 && rmdr.y == 0 )
 116         {
 117             __m64 m1, m2;
 118             for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
 119             {
 120                 int x;
 121                 for( x=0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
 122                 {
 123                     m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 124                     m2 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 125                     // *block_curr = refup_curr[0]
 126                     *(__m64 *)block_curr = _mm_unpacklo_pi16 (m1, m2);
 127                 }
 128                 // Mopup the last value
 129                 for ( x=stopX ; x < block_data.LengthX(); ++x)
 130                 {
 131                     *block_curr = *refup_curr;
 132                     ++block_curr;
 133                     refup_curr+=2;
 134                 }
 135             }
 136             _mm_empty();
 137         }
 138         else if( rmdr.y == 0 )
 139         {
 140             __m64 round = _mm_set_pi16 (1, 1, 1, 1);
 141             __m64 m1, m2, m3;
 142
 143             for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
 144             {
 145                 int x;
 146                 for( x=0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
 147                 {
 148                     m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 149                     m3 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 150                     m2 = _mm_unpackhi_pi16 (m1, m3);
 151                     m1 = _mm_unpacklo_pi16 (m1, m3);
 152
 153                     // (refup_curr[0] + refup_curr[1] + 1)>>1
 154                     m1 = _mm_add_pi16 (m1, m2);
 155                     m1 = _mm_add_pi16 (m1, round);
 156                     *(__m64 *)block_curr = _mm_srai_pi16 (m1, 1);
 157                 }
 158
 159                 // Mopup the last value
 160                 for ( x=stopX; x < block_data.LengthX(); ++x)
 161                 {
 162                     *block_curr = ((    *refup_curr  +
 163                                        *(refup_curr+1)  + 1
 164                                   ) >> 1);
 165                     ++block_curr;
 166                     refup_curr+=2;
 167                 }
 168             }
 169             _mm_empty();
 170         }
 171         else if( rmdr.x == 0 )
 172         {
 173             __m64 round = _mm_set_pi16 (1, 1, 1, 1);
 174             __m64 m1, m2, m3;
 175             for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
 176             {
 177                 int x;
 178                 for( x = 0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
 179                 {
 180                     m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 181                     m2 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 182                     // m1 now contains r00 r02 r04 r06
 183                     m1 = _mm_unpacklo_pi16 (m1, m2);
 184
 185                     m3 = _mm_unpacklo_pi16 (*(__m64 *)(refup_curr+refXlen), *(__m64 *)(refup_curr+refXlen+4));
 186                     m2 = _mm_unpackhi_pi16 (*(__m64 *)(refup_curr+refXlen), *(__m64 *)(refup_curr+refXlen+4));
 187                     // m1 now contains r10 r12 r14 r16
 188                     m2 = _mm_unpacklo_pi16 (m3, m2);
 189
 190                     // (refup_curr[0] + (refup_curr+refXlen)[0] + 1)>>1
 191                     m1 = _mm_add_pi16 (m1, m2);
 192                     m1 = _mm_add_pi16 (m1, round);
 193                     *(__m64 *)block_curr = _mm_srai_pi16 (m1, 1);
 194                 }
 195                 for ( x=stopX; x < block_data.LengthX(); ++x)
 196                 {
 197                     *block_curr = (( *refup_curr + *(refup_curr+refXlen) +
 198                                        1
 199                                    ) >> 1);
 200                     ++block_curr;
 201                     refup_curr+=2;
 202                 }
 203             }
 204             _mm_empty();
 205         }
 206         else
 207         {
 208             __m64 round = _mm_set_pi16 (2, 2, 2, 2);
 209             __m64 m1, m2, m3;
 210             for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
 211             {
 212                 int x;
 213                 for( x = 0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
 214                 {
 215                     m1 = _mm_add_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+refXlen));
 216                     m2 = _mm_add_pi16 (*(__m64 *)(refup_curr+4), *(__m64 *)(refup_curr+refXlen+4));
 217                     m3 = _mm_unpacklo_pi16 (m1, m2);
 218                     m1 = _mm_unpackhi_pi16 (m1, m2);
 219
 220                     m2 = _mm_unpackhi_pi16 (m3, m1);
 221                     m1 = _mm_unpacklo_pi16 (m3, m1);
 222
 223                     m1 = _mm_add_pi16 (m1, m2);
 224                     m1 = _mm_add_pi16 (m1, round);
 225                     *(__m64 *)block_curr = _mm_srai_pi16 (m1, 2);
 226                 }
 227                 for ( x=stopX; x < block_data.LengthX(); ++x)
 228                 {
 229                     *block_curr = ((  *refup_curr  +
 230                                        *(refup_curr+1)  +
 231                                        *(refup_curr+refXlen)  +
 232                                        *(refup_curr+refXlen+1)  +
 233                                        2
 234                                    ) >> 2);
 235                     ++block_curr;
 236                     refup_curr+=2;
 237                 }
 238             }
 239             _mm_empty();
 240         }
 241     }
 242     else
 243     {
 244         // We're 2doing bounds checking because we'll fall off the edge of the reference otherwise.
 245
 246         //weights for doing linear interpolation, calculated from the remainder values
 247         const ValueType linear_wts[4] = {  (2 - rmdr.x) * (2 - rmdr.y),    //tl
 248                                            rmdr.x * (2 - rmdr.y),          //tr
 249                                            (2 - rmdr.x) * rmdr.y,          //bl
 250                                            rmdr.x * rmdr.y };              //br
 251
 252         ValueType act_cols1[4], act_cols2[4];
 253         int uX, uY, c, l;
 254        for(c = 0, uY = ref_start.y; c < block_data.LengthY(); ++c, uY += 2)
 255        {
 256            for(l = 0, uX=ref_start.x; l < block_data.LengthX(); ++l, ++block_curr, uX += 2)
 257            {
 258                check_active_columns(uX, trueRefXlen, act_cols1, act_cols2, refup_data[BChk(uY, trueRefYlen)], refup_data[BChk(uY+1, trueRefYlen)]);
 259
 260                *block_curr = ((     linear_wts[0] * act_cols1[0] +
 261                                     linear_wts[1] * act_cols1[1] +
 262                                     linear_wts[2] * act_cols2[0] +
 263                                     linear_wts[3] * act_cols2[1] +
 264                                     2
 265                                ) >> 2);
 266            }//l
 267        }//c
 268     }
 269 }
 270
 271 void MotionCompensator_HalfPixel::BlockPixelPred(
 272                                    TwoDArray<ValueType> &block_data ,
 273                                    const ImageCoords& pos ,
 274                                    const ImageCoords& orig_pic_size ,
 275                                    const PicArray &refup_data ,
 276                                    const MVector &mv)
 277 {
 278     //Where to start in the upconverted image
 279     const ImageCoords start_pos( std::max(pos.x,0) , std::max(pos.y,0) );
 280     const ImageCoords ref_start( ( start_pos.x<<1 ) + mv.x ,( start_pos.y<<1 ) + mv.y );
 281
 282     //An additional stage to make sure the block to be copied does not fall outside
 283     //the reference image.
 284     const int refXlen = refup_data.LengthX();
 285     //const int refYlen = refup_data.LengthY();
 286     const int trueRefXlen = (orig_pic_size.x << 1) - 1;
 287     const int trueRefYlen = (orig_pic_size.y << 1) - 1;
 288
 289     bool do_bounds_checking = false;
 290
 291     //Check if there are going to be any problems copying the block from
 292     //the upvconverted reference image.
 293
 294     if( ref_start.x < 0 )
 295         do_bounds_checking = true;
 296     else if( ref_start.x + ((block_data.LengthX() - 1 )<<1 ) >= trueRefXlen )
 297         do_bounds_checking = true;
 298     if( ref_start.y < 0 )
 299         do_bounds_checking = true;
 300     else if( ref_start.y + ((block_data.LengthY() - 1 )<<1 ) >= trueRefYlen)
 301         do_bounds_checking = true;
 302
 303     ValueType *block_curr = &block_data[0][0];
 304
 305     if( !do_bounds_checking )
 306     {
 307         ValueType *refup_curr = &refup_data[ref_start.y][ref_start.x];
 308         const int refup_next( (refXlen - block_data.LengthX())*2 );// go down 2 rows and back up
 309 #if 1
 310         int stopX = (block_data.LengthX()>>2)<<2;
 311         {
 312             __m64 m1, m2;
 313
 314             for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
 315             {
 316                 int x;
 317                 for( x=0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
 318                 {
 319                     m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 320                     m2 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
 321                     *(__m64 *)block_curr  = _mm_unpacklo_pi16 (m1, m2);
 322                 }
 323                 // Mopup the last value
 324                 for ( x=stopX ; x < block_data.LengthX(); ++x)
 325                 {
 326                     *block_curr = *refup_curr;
 327                     ++block_curr;
 328                     refup_curr+=2;
 329                 }
 330             }
 331             _mm_empty();
 332         }
 333 #else
 334
 335         for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
 336         {
 337             for( int x=0; x < block_data.LengthX(); ++x, ++block_curr, refup_curr+=2 )
 338             {
 339                 *block_curr =  refup_curr[0];
 340             }
 341         }
 342 #endif
 343     }
 344     else
 345     {
 346         // We're doing bounds checking because we'll fall off the edge of the reference otherwise.
 347         for( int y=0, ry=ref_start.y, by=BChk(ry,trueRefYlen);
 348              y<block_data.LengthY();
 349              ++y, ry+=2,by=BChk(ry,trueRefYlen))
 350         {
 351              for( int x=0 , rx=ref_start.x , bx=BChk(rx,trueRefXlen);
 352                   x<block_data.LengthX() ;
 353                   ++x, ++block_curr, rx+=2 , bx=BChk(rx,trueRefXlen))
 354              {
 355                  *block_curr = refup_data[by][bx];
 356              }// x
 357         }// y
 358     }
 359 }
 360
 361 void MotionCompensator::AdjustBlockBySpatialWeights (
 362                                        TwoDArray<ValueType>& val_block,
 363                                        const ImageCoords &pos,
 364                                        const TwoDArray<ValueType> &wt_array)
 365 {
 366     ImageCoords start_pos (std::max(0, pos.x), std::max(0, pos.y));
 367     ImageCoords wt_start (start_pos.x - pos.x, start_pos.y - pos.y);
 368
 369     ValueType *val_curr = &val_block[0][0];
 370     ValueType *wt_curr = &wt_array[wt_start.y][wt_start.x];
 371
 372     // go down at row and back to beginning of weights line
 373     const int wt_next = wt_array.LengthX() - val_block.LengthX();
 374
 375     const int stopX = (val_block.LengthX()>>2)<<2;
 376
 377     for ( int j = 0; j < val_block.LengthY(); ++j, wt_curr += wt_next)
 378     {
 379         for ( int i =  0; i < stopX; i+=4, val_curr+=4, wt_curr+=4)
 380         {
 381             /*
 382             * NOTE: Using only the low 16 bits of the result of multiplication
 383             * by weights because the result is supposed to fit in 16 bit
 384             * words. For some weights could result in overflow and errors
 385             */
 386            __m64 *out = (__m64 *)val_curr;
 387            *out = _mm_mullo_pi16 (*(__m64 *)val_curr, *(__m64 *)wt_curr);
 388         }
 389         for (int i = stopX; i < val_block.LengthX(); ++i, ++val_curr, ++wt_curr)
 390         {
 391             *val_curr = *val_curr * *wt_curr;
 392         }
 393     }
 394     _mm_empty();
 395 }
 396
 397 namespace dirac
 398 {
 399     void CompensateComponentAddAndShift_mmx (int start_y, int end_y,
 400                                            int weight_bits,
 401                                            const ImageCoords& orig_pic_size,
 402                                            TwoDArray<ValueType> &comp_data,
 403                                            PicArray &pic_data_out)
 404     {
 405         if (start_y >= end_y)
 406             return;
 407         const int round_val = 1<<(weight_bits-1);
 408         int stopX = pic_data_out.FirstX() + ((orig_pic_size.x>>2)<<2);
 409         int x_end_truepic_data = pic_data_out.FirstX() + orig_pic_size.x;
 410         int x_end_data = pic_data_out.FirstX() + pic_data_out.LengthX();
 411         __m64 mround_val = _mm_set_pi16 (round_val, round_val, round_val, round_val);
 412         ValueType *pic_row = &comp_data[0][comp_data.FirstX()];
 413         ValueType *out_row = &pic_data_out[start_y][pic_data_out.FirstX()];
 414         for ( int i = start_y; i < end_y; i++)
 415         {
 416             for ( int j =  pic_data_out.FirstX(); j < stopX; j+=4)
 417             {
 418                 __m64 in1 = _mm_add_pi16 (*(__m64 *)pic_row, mround_val);
 419                 in1 = _mm_srai_pi16 (in1, weight_bits);
 420                 __m64 *out = (__m64 *)out_row;
 421                 *out = _mm_add_pi16 (in1, *out);
 422                 pic_row += 4;
 423                 out_row += 4;
 424             }
 425             for ( int j =stopX; j < x_end_truepic_data; j++)
 426             {
 427                 *out_row += static_cast<ValueType>( (*pic_row + round_val) >> weight_bits );
 428                 ++out_row;
 429                 ++pic_row;
 430             }
 431             // Now pad past the true picture with the last true pic val in
 432             // current row
 433             ValueType last_true_val = *(out_row - 1);
 434             for ( int j = x_end_truepic_data; j < x_end_data; ++j)
 435             {
 436                 *out_row = last_true_val;
 437                 ++out_row;
 438                 ++pic_row;
 439             }
 440          }
 441         _mm_empty();
 442     }
 443
 444     void AddMCBlock_mmx (const ImageCoords& start_pos,
 445                         TwoDArray<ValueType> &comp_strip,
 446                         TwoDArray<ValueType>& block_data)
 447     {
 448         const int stopX = (block_data.LengthX()>>2)<<2;
 449
 450         const int comp_next = comp_strip.LengthX()-block_data.LengthX();
 451         ValueType *comp_curr = &comp_strip[start_pos.y][start_pos.x];
 452         ValueType *block_curr = &block_data[0][0];
 453
 454         for (int j = 0; j < block_data.LengthY(); ++j, comp_curr += comp_next)
 455         {
 456             for (int i = 0; i < stopX; i+=4, comp_curr+=4, block_curr+=4)
 457             {
 458                 __m64 *out = (__m64 *)comp_curr;
 459                 // mc_tmp[y][x] += val
 460                 *out = _mm_add_pi16 (*(__m64 *)comp_curr, *(__m64 *)block_curr);
 461             }
 462             for (int i = stopX; i < block_data.LengthX(); ++i, ++comp_curr, ++block_curr)
 463             {
 464                 *comp_curr += *block_curr;
 465             }
 466         }
 467         _mm_empty();
 468     }
 469 }
 470 #endif