// Source: [BrunelResearch-dirac.git] / libdirac_common / mot_comp_mmx.cpp
// Blob: 6e8b7ee343f21933a3d9390170e4cd0474daf899
// Commit message: "Updating Contact email"
/* ***** BEGIN LICENSE BLOCK *****
*
* $Id$ $Name$
*
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
* the specific language governing rights and limitations under the License.
*
* The Original Code is BBC Research and Development code.
*
* The Initial Developer of the Original Code is the British Broadcasting
* Corporation.
* Portions created by the Initial Developer are Copyright (C) 2004.
* All Rights Reserved.
*
* Contributor(s): Anuradha Suraparaju (Original Author)
*
* Alternatively, the contents of this file may be used under the terms of
* the GNU General Public License Version 2 (the "GPL"), or the GNU Lesser
* Public License Version 2.1 (the "LGPL"), in which case the provisions of
* the GPL or the LGPL are applicable instead of those above. If you wish to
* allow use of your version of this file only under the terms of the either
* the GPL or LGPL and not to allow others to use your version of this file
* under the MPL, indicate your decision by deleting the provisions above
* and replace them with the notice and other provisions required by the GPL
* or LGPL. If you do not delete the provisions above, a recipient may use
* your version of this file under the terms of any one of the MPL, the GPL
* or the LGPL.
* ***** END LICENSE BLOCK ***** */
#if defined(HAVE_MMX)
#include <mmintrin.h>

#include <algorithm>
#include <cstring>

#include <libdirac_common/mot_comp.h>
#include <libdirac_common/mot_comp_mmx.h>
#include <libdirac_common/motion.h>
#include <libdirac_common/dirac_assertions.h>
using namespace dirac;
46 inline void check_active_columns(
47 int x, int xmax, ValueType act_cols1[4],
48 ValueType act_cols2[4], ValueType *row1, ValueType *row2)
50 // check if we need any clipping
51 if (x >= 0 && (x+3) < xmax) {
52 // special case, nothing to do
53 memcpy(act_cols1, &row1[x], 4 * sizeof(ValueType));
54 memcpy(act_cols2, &row2[x], 4 * sizeof(ValueType));
56 else
58 act_cols1[0] = row1[BChk(x,xmax)];
59 act_cols2[0] = row2[BChk(x,xmax)];
60 act_cols1[1] = row1[BChk(x+1,xmax)];
61 act_cols2[1] = row2[BChk(x+1,xmax)];
62 act_cols1[2] = row1[BChk(x+2,xmax)];
63 act_cols2[2] = row2[BChk(x+2,xmax)];
64 act_cols1[3] = row1[BChk(x+3,xmax)];
65 act_cols2[3] = row2[BChk(x+3,xmax)];
69 void MotionCompensator_QuarterPixel::BlockPixelPred(
70 TwoDArray<ValueType> &block_data ,
71 const ImageCoords& pos ,
72 const ImageCoords& orig_pic_size ,
73 const PicArray &refup_data ,
74 const MVector &mv)
76 // Set up the start point in the reference image by rounding the motion vector
77 // to 1/2 pel accuracy.NB: bit shift rounds negative values DOWN, as required
78 const MVector roundvec( mv.x>>1 , mv.y>>1 );
80 //Get the remainder after rounding. NB rmdr values always 0 or 1
81 const MVector rmdr( mv.x & 1 , mv.y & 1 );
83 //Where to start in the upconverted image
84 const ImageCoords start_pos( std::max(pos.x,0) , std::max(pos.y,0) );
85 // check that we are doing MC within true pic boundaries
86 if (start_pos.x >= orig_pic_size.x || start_pos.y >= orig_pic_size.y)
87 return;
88 const ImageCoords ref_start( ( start_pos.x<<1 ) + roundvec.x ,( start_pos.y<<1 ) + roundvec.y );
90 //An additional stage to make sure the block to be copied does not fall outside
91 //the reference image.
92 const int refXlen = refup_data.LengthX();
93 const int trueRefXlen = (orig_pic_size.x << 1) - 1;
94 const int trueRefYlen = (orig_pic_size.y << 1) - 1;
96 ValueType *block_curr = &block_data[0][0];
98 bool do_bounds_checking = false;
99 //Check if there are going to be any problems copying the block from
100 //the upvconverted reference image.
101 if( ref_start.x < 0 )
102 do_bounds_checking = true;
103 else if( ref_start.x + (block_data.LengthX()<<1 ) >= trueRefXlen )
104 do_bounds_checking = true;
105 if( ref_start.y < 0 )
106 do_bounds_checking = true;
107 else if( ref_start.y + (block_data.LengthY()<<1 ) >= trueRefYlen)
108 do_bounds_checking = true;
110 if( !do_bounds_checking )
112 int stopX = (block_data.LengthX()>>2)<<2;
113 ValueType *refup_curr = &refup_data[ref_start.y][ref_start.x];
114 const int refup_next( ( refXlen - block_data.LengthX() )*2 ); //go down 2 rows and back to beginning of block line
115 if( rmdr.x == 0 && rmdr.y == 0 )
117 __m64 m1, m2;
118 for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
120 int x;
121 for( x=0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
123 m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
124 m2 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
125 // *block_curr = refup_curr[0]
126 *(__m64 *)block_curr = _mm_unpacklo_pi16 (m1, m2);
128 // Mopup the last value
129 for ( x=stopX ; x < block_data.LengthX(); ++x)
131 *block_curr = *refup_curr;
132 ++block_curr;
133 refup_curr+=2;
136 _mm_empty();
138 else if( rmdr.y == 0 )
140 __m64 round = _mm_set_pi16 (1, 1, 1, 1);
141 __m64 m1, m2, m3;
143 for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
145 int x;
146 for( x=0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
148 m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
149 m3 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
150 m2 = _mm_unpackhi_pi16 (m1, m3);
151 m1 = _mm_unpacklo_pi16 (m1, m3);
153 // (refup_curr[0] + refup_curr[1] + 1)>>1
154 m1 = _mm_add_pi16 (m1, m2);
155 m1 = _mm_add_pi16 (m1, round);
156 *(__m64 *)block_curr = _mm_srai_pi16 (m1, 1);
159 // Mopup the last value
160 for ( x=stopX; x < block_data.LengthX(); ++x)
162 *block_curr = (( *refup_curr +
163 *(refup_curr+1) + 1
164 ) >> 1);
165 ++block_curr;
166 refup_curr+=2;
169 _mm_empty();
171 else if( rmdr.x == 0 )
173 __m64 round = _mm_set_pi16 (1, 1, 1, 1);
174 __m64 m1, m2, m3;
175 for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
177 int x;
178 for( x = 0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
180 m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
181 m2 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
182 // m1 now contains r00 r02 r04 r06
183 m1 = _mm_unpacklo_pi16 (m1, m2);
185 m3 = _mm_unpacklo_pi16 (*(__m64 *)(refup_curr+refXlen), *(__m64 *)(refup_curr+refXlen+4));
186 m2 = _mm_unpackhi_pi16 (*(__m64 *)(refup_curr+refXlen), *(__m64 *)(refup_curr+refXlen+4));
187 // m1 now contains r10 r12 r14 r16
188 m2 = _mm_unpacklo_pi16 (m3, m2);
190 // (refup_curr[0] + (refup_curr+refXlen)[0] + 1)>>1
191 m1 = _mm_add_pi16 (m1, m2);
192 m1 = _mm_add_pi16 (m1, round);
193 *(__m64 *)block_curr = _mm_srai_pi16 (m1, 1);
195 for ( x=stopX; x < block_data.LengthX(); ++x)
197 *block_curr = (( *refup_curr + *(refup_curr+refXlen) +
199 ) >> 1);
200 ++block_curr;
201 refup_curr+=2;
204 _mm_empty();
206 else
208 __m64 round = _mm_set_pi16 (2, 2, 2, 2);
209 __m64 m1, m2, m3;
210 for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
212 int x;
213 for( x = 0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
215 m1 = _mm_add_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+refXlen));
216 m2 = _mm_add_pi16 (*(__m64 *)(refup_curr+4), *(__m64 *)(refup_curr+refXlen+4));
217 m3 = _mm_unpacklo_pi16 (m1, m2);
218 m1 = _mm_unpackhi_pi16 (m1, m2);
220 m2 = _mm_unpackhi_pi16 (m3, m1);
221 m1 = _mm_unpacklo_pi16 (m3, m1);
223 m1 = _mm_add_pi16 (m1, m2);
224 m1 = _mm_add_pi16 (m1, round);
225 *(__m64 *)block_curr = _mm_srai_pi16 (m1, 2);
227 for ( x=stopX; x < block_data.LengthX(); ++x)
229 *block_curr = (( *refup_curr +
230 *(refup_curr+1) +
231 *(refup_curr+refXlen) +
232 *(refup_curr+refXlen+1) +
234 ) >> 2);
235 ++block_curr;
236 refup_curr+=2;
239 _mm_empty();
242 else
244 // We're 2doing bounds checking because we'll fall off the edge of the reference otherwise.
246 //weights for doing linear interpolation, calculated from the remainder values
247 const ValueType linear_wts[4] = { (2 - rmdr.x) * (2 - rmdr.y), //tl
248 rmdr.x * (2 - rmdr.y), //tr
249 (2 - rmdr.x) * rmdr.y, //bl
250 rmdr.x * rmdr.y }; //br
252 ValueType act_cols1[4], act_cols2[4];
253 int uX, uY, c, l;
254 for(c = 0, uY = ref_start.y; c < block_data.LengthY(); ++c, uY += 2)
256 for(l = 0, uX=ref_start.x; l < block_data.LengthX(); ++l, ++block_curr, uX += 2)
258 check_active_columns(uX, trueRefXlen, act_cols1, act_cols2, refup_data[BChk(uY, trueRefYlen)], refup_data[BChk(uY+1, trueRefYlen)]);
260 *block_curr = (( linear_wts[0] * act_cols1[0] +
261 linear_wts[1] * act_cols1[1] +
262 linear_wts[2] * act_cols2[0] +
263 linear_wts[3] * act_cols2[1] +
265 ) >> 2);
266 }//l
267 }//c
271 void MotionCompensator_HalfPixel::BlockPixelPred(
272 TwoDArray<ValueType> &block_data ,
273 const ImageCoords& pos ,
274 const ImageCoords& orig_pic_size ,
275 const PicArray &refup_data ,
276 const MVector &mv)
278 //Where to start in the upconverted image
279 const ImageCoords start_pos( std::max(pos.x,0) , std::max(pos.y,0) );
280 const ImageCoords ref_start( ( start_pos.x<<1 ) + mv.x ,( start_pos.y<<1 ) + mv.y );
282 //An additional stage to make sure the block to be copied does not fall outside
283 //the reference image.
284 const int refXlen = refup_data.LengthX();
285 //const int refYlen = refup_data.LengthY();
286 const int trueRefXlen = (orig_pic_size.x << 1) - 1;
287 const int trueRefYlen = (orig_pic_size.y << 1) - 1;
289 bool do_bounds_checking = false;
291 //Check if there are going to be any problems copying the block from
292 //the upvconverted reference image.
294 if( ref_start.x < 0 )
295 do_bounds_checking = true;
296 else if( ref_start.x + ((block_data.LengthX() - 1 )<<1 ) >= trueRefXlen )
297 do_bounds_checking = true;
298 if( ref_start.y < 0 )
299 do_bounds_checking = true;
300 else if( ref_start.y + ((block_data.LengthY() - 1 )<<1 ) >= trueRefYlen)
301 do_bounds_checking = true;
303 ValueType *block_curr = &block_data[0][0];
305 if( !do_bounds_checking )
307 ValueType *refup_curr = &refup_data[ref_start.y][ref_start.x];
308 const int refup_next( (refXlen - block_data.LengthX())*2 );// go down 2 rows and back up
309 #if 1
310 int stopX = (block_data.LengthX()>>2)<<2;
312 __m64 m1, m2;
314 for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
316 int x;
317 for( x=0; x < stopX; x+=4, block_curr+=4, refup_curr+=8 )
319 m1 = _mm_unpacklo_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
320 m2 = _mm_unpackhi_pi16 (*(__m64 *)refup_curr, *(__m64 *)(refup_curr+4));
321 *(__m64 *)block_curr = _mm_unpacklo_pi16 (m1, m2);
323 // Mopup the last value
324 for ( x=stopX ; x < block_data.LengthX(); ++x)
326 *block_curr = *refup_curr;
327 ++block_curr;
328 refup_curr+=2;
331 _mm_empty();
333 #else
335 for( int y=0; y < block_data.LengthY(); ++y, refup_curr+=refup_next )
337 for( int x=0; x < block_data.LengthX(); ++x, ++block_curr, refup_curr+=2 )
339 *block_curr = refup_curr[0];
342 #endif
344 else
346 // We're doing bounds checking because we'll fall off the edge of the reference otherwise.
347 for( int y=0, ry=ref_start.y, by=BChk(ry,trueRefYlen);
348 y<block_data.LengthY();
349 ++y, ry+=2,by=BChk(ry,trueRefYlen))
351 for( int x=0 , rx=ref_start.x , bx=BChk(rx,trueRefXlen);
352 x<block_data.LengthX() ;
353 ++x, ++block_curr, rx+=2 , bx=BChk(rx,trueRefXlen))
355 *block_curr = refup_data[by][bx];
356 }// x
357 }// y
361 void MotionCompensator::AdjustBlockBySpatialWeights (
362 TwoDArray<ValueType>& val_block,
363 const ImageCoords &pos,
364 const TwoDArray<ValueType> &wt_array)
366 ImageCoords start_pos (std::max(0, pos.x), std::max(0, pos.y));
367 ImageCoords wt_start (start_pos.x - pos.x, start_pos.y - pos.y);
369 ValueType *val_curr = &val_block[0][0];
370 ValueType *wt_curr = &wt_array[wt_start.y][wt_start.x];
372 // go down at row and back to beginning of weights line
373 const int wt_next = wt_array.LengthX() - val_block.LengthX();
375 const int stopX = (val_block.LengthX()>>2)<<2;
377 for ( int j = 0; j < val_block.LengthY(); ++j, wt_curr += wt_next)
379 for ( int i = 0; i < stopX; i+=4, val_curr+=4, wt_curr+=4)
382 * NOTE: Using only the low 16 bits of the result of multiplication
383 * by weights because the result is supposed to fit in 16 bit
384 * words. For some weights could result in overflow and errors
386 __m64 *out = (__m64 *)val_curr;
387 *out = _mm_mullo_pi16 (*(__m64 *)val_curr, *(__m64 *)wt_curr);
389 for (int i = stopX; i < val_block.LengthX(); ++i, ++val_curr, ++wt_curr)
391 *val_curr = *val_curr * *wt_curr;
394 _mm_empty();
397 namespace dirac
399 void CompensateComponentAddAndShift_mmx (int start_y, int end_y,
400 int weight_bits,
401 const ImageCoords& orig_pic_size,
402 TwoDArray<ValueType> &comp_data,
403 PicArray &pic_data_out)
405 if (start_y >= end_y)
406 return;
407 const int round_val = 1<<(weight_bits-1);
408 int stopX = pic_data_out.FirstX() + ((orig_pic_size.x>>2)<<2);
409 int x_end_truepic_data = pic_data_out.FirstX() + orig_pic_size.x;
410 int x_end_data = pic_data_out.FirstX() + pic_data_out.LengthX();
411 __m64 mround_val = _mm_set_pi16 (round_val, round_val, round_val, round_val);
412 ValueType *pic_row = &comp_data[0][comp_data.FirstX()];
413 ValueType *out_row = &pic_data_out[start_y][pic_data_out.FirstX()];
414 for ( int i = start_y; i < end_y; i++)
416 for ( int j = pic_data_out.FirstX(); j < stopX; j+=4)
418 __m64 in1 = _mm_add_pi16 (*(__m64 *)pic_row, mround_val);
419 in1 = _mm_srai_pi16 (in1, weight_bits);
420 __m64 *out = (__m64 *)out_row;
421 *out = _mm_add_pi16 (in1, *out);
422 pic_row += 4;
423 out_row += 4;
425 for ( int j =stopX; j < x_end_truepic_data; j++)
427 *out_row += static_cast<ValueType>( (*pic_row + round_val) >> weight_bits );
428 ++out_row;
429 ++pic_row;
431 // Now pad past the true picture with the last true pic val in
432 // current row
433 ValueType last_true_val = *(out_row - 1);
434 for ( int j = x_end_truepic_data; j < x_end_data; ++j)
436 *out_row = last_true_val;
437 ++out_row;
438 ++pic_row;
441 _mm_empty();
444 void AddMCBlock_mmx (const ImageCoords& start_pos,
445 TwoDArray<ValueType> &comp_strip,
446 TwoDArray<ValueType>& block_data)
448 const int stopX = (block_data.LengthX()>>2)<<2;
450 const int comp_next = comp_strip.LengthX()-block_data.LengthX();
451 ValueType *comp_curr = &comp_strip[start_pos.y][start_pos.x];
452 ValueType *block_curr = &block_data[0][0];
454 for (int j = 0; j < block_data.LengthY(); ++j, comp_curr += comp_next)
456 for (int i = 0; i < stopX; i+=4, comp_curr+=4, block_curr+=4)
458 __m64 *out = (__m64 *)comp_curr;
459 // mc_tmp[y][x] += val
460 *out = _mm_add_pi16 (*(__m64 *)comp_curr, *(__m64 *)block_curr);
462 for (int i = stopX; i < block_data.LengthX(); ++i, ++comp_curr, ++block_curr)
464 *comp_curr += *block_curr;
467 _mm_empty();
470 #endif