/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
14 #include "vpx_ports/mem.h"
/* 6-tap subpel filter kernels implemented in assembly elsewhere in the
 * project (MMX and SSE2 variants). Horizontal passes write 16-bit
 * intermediates; vertical passes consume them and emit 8-bit pixels. */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);

extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);

extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);

extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);
/* 4x4 bilinear-filtered variance kernel (MMX assembly).
 * NOTE(review): this declaration was truncated in this copy; the filter and
 * sum parameters are reconstructed from the upstream libvpx prototype --
 * confirm against the assembly source. */
extern void vp8_filter_block2d_bil4x4_var_mmx
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
/* Sum of squared differences and sum of differences for a 4x4 block (MMX).
 * NOTE(review): truncated in this copy; stride/output parameters are
 * reconstructed from the upstream libvpx prototype. */
extern unsigned int vp8_get4x4var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);

/* Sum of squares over a macroblock of residual coefficients (SSE2). */
extern unsigned int vp8_get_mb_ss_sse2
(
    const short *src_ptr
);
/* SSE2 block-difference kernels. Each returns/outputs the sum of squared
 * differences (SSE) and the signed sum of differences (Sum) between a source
 * block and a reference block.
 * NOTE(review): these declarations were truncated in this copy; the stride
 * and output parameters are reconstructed from upstream libvpx -- confirm
 * against the assembly sources. */
unsigned int vp8_get16x16var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
unsigned int vp8_get16x16pred_error_sse2
(
    const unsigned char *src_ptr,
    int src_stride,
    const unsigned char *ref_ptr,
    int ref_stride
);
unsigned int vp8_get8x8var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
/* General bilinear-filtered variance kernel (SSE2), 8 pixels wide,
 * `Height` rows.
 * NOTE(review): truncated in this copy; Height/filter/sum parameters are
 * reconstructed from the upstream libvpx prototype -- call sites below pass
 * exactly this argument order. */
void vp8_filter_block2d_bil_var_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
84 void vp8_half_horiz_vert_variance16x_h_sse2
86 const unsigned char *ref_ptr
,
87 int ref_pixels_per_line
,
88 const unsigned char *src_ptr
,
89 int src_pixels_per_line
,
92 unsigned int *sumsquared
94 void vp8_half_horiz_variance16x_h_sse2
96 const unsigned char *ref_ptr
,
97 int ref_pixels_per_line
,
98 const unsigned char *src_ptr
,
99 int src_pixels_per_line
,
102 unsigned int *sumsquared
104 void vp8_half_vert_variance16x_h_sse2
106 const unsigned char *ref_ptr
,
107 int ref_pixels_per_line
,
108 const unsigned char *src_ptr
,
109 int src_pixels_per_line
,
112 unsigned int *sumsquared
115 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx
[8][8]);
/* 4x4 variance: SSE - mean-corrected term (>>4 == /16 pixels).
 * NOTE(review): body truncated in this copy; reconstructed from upstream
 * libvpx. The `unsigned int *sse` out-parameter matches the era of this file
 * (half-pel wrappers present) -- confirm against the variance prototype
 * table. */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    /* variance = SSE - sum^2 / N, N = 16 pixels */
    return (var - ((avg * avg) >> 4));
}
/* 8x8 variance via the SSE2 8x8 SSE/Sum kernel (>>6 == /64 pixels).
 * NOTE(review): body truncated in this copy; reconstructed from upstream
 * libvpx -- confirm the *sse out-parameter against the project's variance
 * prototype. */
unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    /* variance = SSE - sum^2 / N, N = 64 pixels */
    return (var - ((avg * avg) >> 6));
}
/* 16x16 variance via the SSE2 16x16 SSE/Sum kernel (>>8 == /256 pixels).
 * NOTE(review): body truncated in this copy; reconstructed from upstream
 * libvpx. */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    /* variance = SSE - sum^2 / N, N = 256 pixels */
    return (sse0 - ((sum0 * sum0) >> 8));
}
/* 16x16 mean squared error: the raw SSE, no mean correction.
 * NOTE(review): body truncated in this copy; reconstructed from upstream
 * libvpx (SSE is both stored through *sse and returned). */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;
    return sse0;
}
/* 16x8 variance as two side-by-side 8x8 kernels (>>7 == /128 pixels).
 * NOTE(review): body truncated in this copy; the accumulation of
 * sse0+sse1 / sum0+sum1 into var/avg is reconstructed from upstream
 * libvpx. */
unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    /* left and right 8x8 halves */
    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;

    *sse = var;
    return (var - ((avg * avg) >> 7));
}
/* 8x16 variance as two stacked 8x8 kernels (>>7 == /128 pixels).
 * NOTE(review): body truncated in this copy; reconstructed from upstream
 * libvpx. */
unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    /* top and bottom 8x8 halves */
    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;

    *sse = var;
    return (var - ((avg * avg) >> 7));
}
225 unsigned int vp8_sub_pixel_variance4x4_wmt
227 const unsigned char *src_ptr
,
228 int src_pixels_per_line
,
231 const unsigned char *dst_ptr
,
232 int dst_pixels_per_line
,
238 vp8_filter_block2d_bil4x4_var_mmx(
239 src_ptr
, src_pixels_per_line
,
240 dst_ptr
, dst_pixels_per_line
,
241 vp8_vp7_bilinear_filters_mmx
[xoffset
], vp8_vp7_bilinear_filters_mmx
[yoffset
],
245 return (xxsum
- ((xsum
* xsum
) >> 4));
249 unsigned int vp8_sub_pixel_variance8x8_wmt
251 const unsigned char *src_ptr
,
252 int src_pixels_per_line
,
255 const unsigned char *dst_ptr
,
256 int dst_pixels_per_line
,
263 if (xoffset
== 4 && yoffset
== 0)
265 vp8_half_horiz_variance16x_h_sse2(
266 src_ptr
, src_pixels_per_line
,
267 dst_ptr
, dst_pixels_per_line
, 8,
270 else if (xoffset
== 0 && yoffset
== 4)
272 vp8_half_vert_variance16x_h_sse2(
273 src_ptr
, src_pixels_per_line
,
274 dst_ptr
, dst_pixels_per_line
, 8,
277 else if (xoffset
== 4 && yoffset
== 4)
279 vp8_half_horiz_vert_variance16x_h_sse2(
280 src_ptr
, src_pixels_per_line
,
281 dst_ptr
, dst_pixels_per_line
, 8,
286 vp8_filter_block2d_bil_var_sse2(
287 src_ptr
, src_pixels_per_line
,
288 dst_ptr
, dst_pixels_per_line
, 8,
294 return (xxsum
- ((xsum
* xsum
) >> 6));
297 unsigned int vp8_sub_pixel_variance16x16_wmt
299 const unsigned char *src_ptr
,
300 int src_pixels_per_line
,
303 const unsigned char *dst_ptr
,
304 int dst_pixels_per_line
,
309 unsigned int xxsum0
, xxsum1
;
312 // note we could avoid these if statements if the calling function
313 // just called the appropriate functions inside.
314 if (xoffset
== 4 && yoffset
== 0)
316 vp8_half_horiz_variance16x_h_sse2(
317 src_ptr
, src_pixels_per_line
,
318 dst_ptr
, dst_pixels_per_line
, 16,
321 vp8_half_horiz_variance16x_h_sse2(
322 src_ptr
+ 8, src_pixels_per_line
,
323 dst_ptr
+ 8, dst_pixels_per_line
, 16,
326 else if (xoffset
== 0 && yoffset
== 4)
328 vp8_half_vert_variance16x_h_sse2(
329 src_ptr
, src_pixels_per_line
,
330 dst_ptr
, dst_pixels_per_line
, 16,
333 vp8_half_vert_variance16x_h_sse2(
334 src_ptr
+ 8, src_pixels_per_line
,
335 dst_ptr
+ 8, dst_pixels_per_line
, 16,
338 else if (xoffset
== 4 && yoffset
== 4)
340 vp8_half_horiz_vert_variance16x_h_sse2(
341 src_ptr
, src_pixels_per_line
,
342 dst_ptr
, dst_pixels_per_line
, 16,
345 vp8_half_horiz_vert_variance16x_h_sse2(
346 src_ptr
+ 8, src_pixels_per_line
,
347 dst_ptr
+ 8, dst_pixels_per_line
, 16,
352 vp8_filter_block2d_bil_var_sse2(
353 src_ptr
, src_pixels_per_line
,
354 dst_ptr
, dst_pixels_per_line
, 16,
360 vp8_filter_block2d_bil_var_sse2(
361 src_ptr
+ 8, src_pixels_per_line
,
362 dst_ptr
+ 8, dst_pixels_per_line
, 16,
371 return (xxsum0
- ((xsum0
* xsum0
) >> 8));
/* Sub-pixel 16x16 MSE: reuse the sub-pixel variance routine for its SSE
 * side effect and return the raw SSE (no mean correction).
 * NOTE(review): body truncated in this copy; `return *sse;` reconstructed
 * from upstream libvpx. */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset,
                                    dst_ptr, dst_pixels_per_line, sse);
    return *sse;
}
388 unsigned int vp8_sub_pixel_variance16x8_wmt
390 const unsigned char *src_ptr
,
391 int src_pixels_per_line
,
394 const unsigned char *dst_ptr
,
395 int dst_pixels_per_line
,
401 unsigned int xxsum0
, xxsum1
;
403 if (xoffset
== 4 && yoffset
== 0)
405 vp8_half_horiz_variance16x_h_sse2(
406 src_ptr
, src_pixels_per_line
,
407 dst_ptr
, dst_pixels_per_line
, 8,
410 vp8_half_horiz_variance16x_h_sse2(
411 src_ptr
+ 8, src_pixels_per_line
,
412 dst_ptr
+ 8, dst_pixels_per_line
, 8,
415 else if (xoffset
== 0 && yoffset
== 4)
417 vp8_half_vert_variance16x_h_sse2(
418 src_ptr
, src_pixels_per_line
,
419 dst_ptr
, dst_pixels_per_line
, 8,
422 vp8_half_vert_variance16x_h_sse2(
423 src_ptr
+ 8, src_pixels_per_line
,
424 dst_ptr
+ 8, dst_pixels_per_line
, 8,
427 else if (xoffset
== 4 && yoffset
== 4)
429 vp8_half_horiz_vert_variance16x_h_sse2(
430 src_ptr
, src_pixels_per_line
,
431 dst_ptr
, dst_pixels_per_line
, 8,
434 vp8_half_horiz_vert_variance16x_h_sse2(
435 src_ptr
+ 8, src_pixels_per_line
,
436 dst_ptr
+ 8, dst_pixels_per_line
, 8,
441 vp8_filter_block2d_bil_var_sse2(
442 src_ptr
, src_pixels_per_line
,
443 dst_ptr
, dst_pixels_per_line
, 8,
447 vp8_filter_block2d_bil_var_sse2(
448 src_ptr
+ 8, src_pixels_per_line
,
449 dst_ptr
+ 8, dst_pixels_per_line
, 8,
458 return (xxsum0
- ((xsum0
* xsum0
) >> 7));
461 unsigned int vp8_sub_pixel_variance8x16_wmt
463 const unsigned char *src_ptr
,
464 int src_pixels_per_line
,
467 const unsigned char *dst_ptr
,
468 int dst_pixels_per_line
,
475 if (xoffset
== 4 && yoffset
== 0)
477 vp8_half_horiz_variance16x_h_sse2(
478 src_ptr
, src_pixels_per_line
,
479 dst_ptr
, dst_pixels_per_line
, 16,
482 else if (xoffset
== 0 && yoffset
== 4)
484 vp8_half_vert_variance16x_h_sse2(
485 src_ptr
, src_pixels_per_line
,
486 dst_ptr
, dst_pixels_per_line
, 16,
489 else if (xoffset
== 4 && yoffset
== 4)
491 vp8_half_horiz_vert_variance16x_h_sse2(
492 src_ptr
, src_pixels_per_line
,
493 dst_ptr
, dst_pixels_per_line
, 16,
498 vp8_filter_block2d_bil_var_sse2(
499 src_ptr
, src_pixels_per_line
,
500 dst_ptr
, dst_pixels_per_line
, 16,
506 return (xxsum
- ((xsum
* xsum
) >> 7));
/* Interlaced 16x16 variance: the four 8x8 quadrants are addressed with the
 * half-stride offset (stride >> 1) selecting the second field.
 * NOTE(review): body truncated in this copy; parameter list and *sse store
 * reconstructed from upstream libvpx. */
unsigned int vp8_i_variance16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride,
                       ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride,
                       ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;

    *sse = var;
    /* variance = SSE - sum^2 / N, N = 256 pixels */
    return (var - ((avg * avg) >> 8));
}
/* Interlaced 8x16 variance: two 8x8 blocks, the second field reached via
 * the half-stride offset (stride >> 1); >>7 == /128 pixels.
 * NOTE(review): body truncated in this copy; reconstructed from upstream
 * libvpx. */
unsigned int vp8_i_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride,
                       ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;

    *sse = var;
    return (var - ((avg * avg) >> 7));
}
/* Interlaced sub-pixel 16x16 variance: delegate to the progressive routine
 * with half strides (one field of the interlaced frame). */
unsigned int vp8_i_sub_pixel_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1),
                                           xoffset, yoffset,
                                           dst_ptr, (dst_pixels_per_line >> 1), sse);
}
/* Interlaced sub-pixel 8x16 variance: delegate to the progressive routine
 * with half strides (one field of the interlaced frame). */
unsigned int vp8_i_sub_pixel_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1),
                                          xoffset, yoffset,
                                          dst_ptr, (dst_pixels_per_line >> 1), sse);
}
/* 16x16 variance at the exact horizontal half-pel offset: two 8-wide
 * columns through the specialized kernel, partial sums combined;
 * >>8 == /256 pixels.
 * NOTE(review): body truncated in this copy; combine step and epilogue
 * reconstructed from upstream libvpx. */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
/* 16x16 variance at the exact vertical half-pel offset: two 8-wide columns
 * through the specialized kernel, partial sums combined; >>8 == /256 pixels.
 * NOTE(review): body truncated in this copy; combine step and epilogue
 * reconstructed from upstream libvpx. */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_vert_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
/* 16x16 variance at the diagonal half-pel offset (half-pel in both x and y):
 * two 8-wide columns through the specialized kernel, partial sums combined;
 * >>8 == /256 pixels.
 * NOTE(review): body truncated in this copy; combine step and epilogue
 * reconstructed from upstream libvpx. */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}