/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_ports/mem.h"
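
/* Prototypes for the helper routines implemented in MMX assembly.  The
 * vp8_get*var helpers return a block's sum of squared differences through
 * *SSE and its signed difference sum through *Sum; the C wrappers below
 * combine those partial results into variance values.
 */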
extern void filter_block1d_h6_mmx
(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter
);
extern void filter_block1d_v6_mmx
(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *vp7_filter
);
extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
extern unsigned int vp8_get8x8var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern unsigned int vp8_get4x4var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
extern unsigned int vp8_get4x4sse_cs_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride
);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
extern void vp8_filter_block2d_bil_var_mmx
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
extern unsigned int vp8_get16x16pred_error_mmx
(
    unsigned char *src_ptr,
    int src_stride,
    unsigned char *ref_ptr,
    int ref_stride
);
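
/* Scalar cross-check for the MMX sum-of-squares routine: s accumulates
 * sum(zz[i]^2) over the 256-entry test block in plain C so it can be
 * compared against x, the value produced by vp8_get_mb_ss_mmx(zz).
 */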
void vp8_test_get_mb_ss(void)
{
    short zz[] =
    {
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
    };
    int s = 0, x = vp8_get_mb_ss_mmx(zz);
    int y;

    for (y = 0; y < 256; y++)
        s += (zz[y] * zz[y]);
}
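
/* The variance wrappers below all use the identity
 *
 *     variance = SSE - sum^2 / N
 *
 * where N is the block's pixel count, so the divide becomes a shift:
 * >> 4 for 4x4, >> 6 for 8x8, >> 7 for 16x8 and 8x16, >> 8 for 16x16.
 * Blocks larger than 8x8 are assembled from multiple 8x8 MMX calls
 * whose partial SSE/sum results are added together.
 */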
unsigned int vp8_get16x16var_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *SUM
)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    /* one 8x8 call per quadrant of the 16x16 block */
    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;

    *SSE = var;
    *SUM = avg;
    return (var - ((avg * avg) >> 8));
}
unsigned int vp8_variance4x4_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - ((avg * avg) >> 4));
}
unsigned int vp8_variance8x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg);
    *sse = var;
    return (var - ((avg * avg) >> 6));
}
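
/* MSE differs from variance only in that the mean term is not subtracted:
 * the combined SSE of the four 8x8 sub-blocks is returned as-is.
 */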
unsigned int vp8_mse16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    *sse = var;
    return var;
}
unsigned int vp8_variance16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - ((avg * avg) >> 8));
}
unsigned int vp8_variance16x8_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - ((avg * avg) >> 7));
}
unsigned int vp8_variance8x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - ((avg * avg) >> 7));
}
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
///////////////////////////////////////////////////////////////////////////
DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
{
    { 128, 128, 128, 128,   0,   0,   0,   0 },
    { 112, 112, 112, 112,  16,  16,  16,  16 },
    {  96,  96,  96,  96,  32,  32,  32,  32 },
    {  80,  80,  80,  80,  48,  48,  48,  48 },
    {  64,  64,  64,  64,  64,  64,  64,  64 },
    {  48,  48,  48,  48,  80,  80,  80,  80 },
    {  32,  32,  32,  32,  96,  96,  96,  96 },
    {  16,  16,  16,  16, 112, 112, 112, 112 }
};
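
/* Each row of the table is one eighth-pel phase of the bilinear filter:
 * four copies of the first-tap weight followed by four copies of the
 * second-tap weight, the pair always summing to 128 (a 7-bit fraction).
 * Row 0 is the full-pel passthrough; row 4 is the exact half-pel
 * average {64, 64}.
 */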
unsigned int vp8_sub_pixel_variance4x4_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;

    vp8_filter_block2d_bil4x4_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 4));
}
unsigned int vp8_sub_pixel_variance8x8_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    /* filter the left and right 8-pixel-wide halves separately */
    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
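
/* Sub-pixel MSE reuses the 16x16 sub-pixel variance kernel: the SSE
 * side-output written through *sse is exactly the value needed here.
 */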
unsigned int vp8_sub_pixel_mse16x16_mmx(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
    return *sse;
}
unsigned int vp8_sub_pixel_variance16x8_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 7));
}
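
/* The vp8_i_* variants below appear to handle field-based (interlaced)
 * blocks: instead of stepping 8 full rows to reach the lower half, they
 * step by half the stride (f2soffset / f2doffset), which lands on the
 * second field when two fields share one buffer.  This reading is an
 * assumption inferred from the addressing pattern.
 */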
unsigned int vp8_i_variance16x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;
    *sse = var;
    return (var - ((avg * avg) >> 8));
}
unsigned int vp8_i_variance8x16_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - ((avg * avg) >> 7));
}
unsigned int vp8_i_sub_pixel_variance16x16_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;
    int f2soffset = (src_pixels_per_line >> 1);
    int f2doffset = (dst_pixels_per_line >> 1);

    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset, src_pixels_per_line,
        dst_ptr + f2doffset, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset + 8, src_pixels_per_line,
        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
unsigned int vp8_i_sub_pixel_variance8x16_mmx
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;
    int f2soffset = (src_pixels_per_line >> 1);
    int f2doffset = (dst_pixels_per_line >> 1);

    vp8_filter_block2d_bil_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum0, &xxsum0
    );

    vp8_filter_block2d_bil_var_mmx(
        src_ptr + f2soffset, src_pixels_per_line,
        dst_ptr + f2doffset, dst_pixels_per_line, 8,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum1, &xxsum1
    );

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
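
/* Half-pixel specializations: offset 4 selects the {64, 64} row of
 * vp8_vp7_bilinear_filters_mmx, i.e. an exact (a + b) / 2 average, so
 * these simply forward to the sub-pixel 16x16 variance with the
 * appropriate horizontal/vertical phase.
 */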
unsigned int vp8_variance_halfpixvar16x16_h_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
                                           ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_v_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
                                           ref_ptr, recon_stride, sse);
}
unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
                                           ref_ptr, recon_stride, sse);
}