Define RDCOST only once
[libvpx.git] / vp8 / encoder / x86 / variance_sse2.c
blob0edda30623b88bbeb44fc29ef0224f016c028887
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
12 #include "vp8/encoder/variance.h"
13 #include "vp8/common/pragmas.h"
14 #include "vpx_ports/mem.h"
/*
 * Routines defined outside this translation unit (no body is visible here).
 */
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr,
                                  unsigned short *output_ptr,
                                  unsigned int src_pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);

extern void filter_block1d_v6_mmx(const short *src_ptr,
                                  unsigned char *output_ptr,
                                  unsigned int pixels_per_line,
                                  unsigned int pixel_step,
                                  unsigned int output_height,
                                  unsigned int output_width,
                                  short *vp7_filter);

extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr,
                                    unsigned short *output_ptr,
                                    unsigned int src_pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);

extern void filter_block1d8_v6_sse2(const short *src_ptr,
                                    unsigned char *output_ptr,
                                    unsigned int pixels_per_line,
                                    unsigned int pixel_step,
                                    unsigned int output_height,
                                    unsigned int output_width,
                                    short *vp7_filter);

/* 4x4 bilinear-filtered helper; results are written to *sum / *sumsquared. */
extern void vp8_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
                                              int ref_pixels_per_line,
                                              const unsigned char *src_ptr,
                                              int src_pixels_per_line,
                                              const short *HFilter,
                                              const short *VFilter,
                                              int *sum,
                                              unsigned int *sumsquared);

/* 4x4 helper; results are written to *SSE / *Sum. */
extern unsigned int vp8_get4x4var_mmx(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride,
                                      unsigned int *SSE,
                                      int *Sum);
/*
 * Prototypes for the SSE2 helpers used by the wrappers in this file.
 * Their bodies are not part of this file.
 */
unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);

unsigned int vp8_get16x16var_sse2(const unsigned char *src_ptr, int source_stride,
                                  const unsigned char *ref_ptr, int recon_stride,
                                  unsigned int *SSE, int *Sum);

unsigned int vp8_get16x16pred_error_sse2(const unsigned char *src_ptr, int src_stride,
                                         const unsigned char *ref_ptr, int ref_stride);

unsigned int vp8_get8x8var_sse2(const unsigned char *src_ptr, int source_stride,
                                const unsigned char *ref_ptr, int recon_stride,
                                unsigned int *SSE, int *Sum);

/* General bilinear path: takes the x/y offsets directly. */
void vp8_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                     const unsigned char *src_ptr, int src_pixels_per_line,
                                     unsigned int Height,
                                     int xoffset, int yoffset,
                                     int *sum, unsigned int *sumsquared);

/*
 * Half-pixel helpers.  All share one signature; "8x"/"16x" in the name is
 * presumably the block width, with the row count passed as Height.
 */
void vp8_half_horiz_vert_variance8x_h_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                           const unsigned char *src_ptr, int src_pixels_per_line,
                                           unsigned int Height,
                                           int *sum, unsigned int *sumsquared);

void vp8_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                            const unsigned char *src_ptr, int src_pixels_per_line,
                                            unsigned int Height,
                                            int *sum, unsigned int *sumsquared);

void vp8_half_horiz_variance8x_h_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                      const unsigned char *src_ptr, int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum, unsigned int *sumsquared);

void vp8_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                       const unsigned char *src_ptr, int src_pixels_per_line,
                                       unsigned int Height,
                                       int *sum, unsigned int *sumsquared);

void vp8_half_vert_variance8x_h_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                     const unsigned char *src_ptr, int src_pixels_per_line,
                                     unsigned int Height,
                                     int *sum, unsigned int *sumsquared);

void vp8_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr, int ref_pixels_per_line,
                                      const unsigned char *src_ptr, int src_pixels_per_line,
                                      unsigned int Height,
                                      int *sum, unsigned int *sumsquared);
145 DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
/*
 * 4x4 variance wrapper around vp8_get4x4var_mmx().
 *
 * The helper fills in two values (named SSE and Sum in its prototype); the
 * result is SSE - ((Sum * Sum) >> 4).  The shift divides by 16, presumably
 * the number of pixels in a 4x4 block.
 */
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride)
{
    unsigned int sq_err;
    int diff_sum;
    int mean_term;

    vp8_get4x4var_mmx(src_ptr, source_stride,
                      ref_ptr, recon_stride,
                      &sq_err, &diff_sum);

    mean_term = (diff_sum * diff_sum) >> 4;
    return sq_err - mean_term;
}
/*
 * 8x8 variance wrapper around vp8_get8x8var_sse2().
 *
 * Returns SSE - ((Sum * Sum) >> 6) using the two values the helper writes
 * back.  The shift divides by 64, presumably the pixel count of the block.
 */
unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride)
{
    unsigned int sq_err;
    int diff_sum;
    int mean_term;

    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sq_err, &diff_sum);

    mean_term = (diff_sum * diff_sum) >> 6;
    return sq_err - mean_term;
}
/*
 * 16x16 variance wrapper around vp8_get16x16var_sse2().
 *
 * Stores the helper's SSE output in *sse and returns
 * SSE - ((Sum * Sum) >> 8).
 *
 * The square is computed in unsigned arithmetic.  If Sum is the sum of 256
 * 8-bit pixel differences (as the helper's parameter names suggest), its
 * magnitude can reach 255 * 256 = 65280; 65280^2 does not fit in a signed
 * int (signed overflow is undefined behaviour) but does fit in 32 unsigned
 * bits, so the unsigned product is exact.
 */
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
    *sse = sse0;

    /* (unsigned)sum0 * sum0 == sum0^2 modulo 2^32, for either sign of sum0. */
    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
}
/*
 * 16x16 wrapper that reports only the SSE output of vp8_get16x16var_sse2():
 * the value is both stored in *sse and returned.  The helper's Sum output
 * is received but not used.
 */
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sq_err;
    int ignored_sum;    /* the helper requires somewhere to write Sum */

    vp8_get16x16var_sse2(src_ptr, source_stride,
                         ref_ptr, recon_stride,
                         &sq_err, &ignored_sum);

    *sse = sq_err;
    return sq_err;
}
/*
 * 16x8 variance built from two vp8_get8x8var_sse2() calls: one at the given
 * pointers and one 8 bytes further along both rows.
 *
 * The two SSE outputs and the two Sum outputs are added; the combined SSE is
 * stored in *sse and the function returns SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_left, sse_right, sse_total;
    int sum_left, sum_right, sum_total;

    /* Left 8 columns. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_left, &sum_left);

    /* Right 8 columns. */
    vp8_get8x8var_sse2(src_ptr + 8, source_stride,
                       ref_ptr + 8, recon_stride,
                       &sse_right, &sum_right);

    sse_total = sse_left + sse_right;
    sum_total = sum_left + sum_right;

    *sse = sse_total;
    return sse_total - ((sum_total * sum_total) >> 7);
}
/*
 * 8x16 variance built from two vp8_get8x8var_sse2() calls: one at the given
 * pointers and one 8 rows (8 * stride bytes) further down each buffer.
 *
 * The two SSE outputs and the two Sum outputs are added; the combined SSE is
 * stored in *sse and the function returns SSE - ((Sum * Sum) >> 7).
 */
unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int  source_stride,
    const unsigned char *ref_ptr,
    int  recon_stride,
    unsigned int *sse)
{
    unsigned int sse_top, sse_bottom, sse_total;
    int sum_top, sum_bottom, sum_total;

    /* Top 8 rows. */
    vp8_get8x8var_sse2(src_ptr, source_stride,
                       ref_ptr, recon_stride,
                       &sse_top, &sum_top);

    /* Bottom 8 rows. */
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sse_bottom, &sum_bottom);

    sse_total = sse_top + sse_bottom;
    sum_total = sum_top + sum_bottom;

    *sse = sse_total;
    return sse_total - ((sum_total * sum_total) >> 7);
}
255 unsigned int vp8_sub_pixel_variance4x4_wmt
257 const unsigned char *src_ptr,
258 int src_pixels_per_line,
259 int xoffset,
260 int yoffset,
261 const unsigned char *dst_ptr,
262 int dst_pixels_per_line,
263 unsigned int *sse
266 int xsum;
267 unsigned int xxsum;
268 vp8_filter_block2d_bil4x4_var_mmx(
269 src_ptr, src_pixels_per_line,
270 dst_ptr, dst_pixels_per_line,
271 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
272 &xsum, &xxsum
274 *sse = xxsum;
275 return (xxsum - ((xsum * xsum) >> 4));
/*
 * 8x8 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the dedicated
 * half-pixel helpers; every other combination goes through the general
 * bilinear helper.  All helpers are called with a height of 8.  The
 * sum-of-squares output is stored in *sse and the function returns
 * sumsquared - ((sum * sum) >> 6).
 */
unsigned int vp8_sub_pixel_variance8x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int diff_sum;
    unsigned int sq_sum;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line, 8,
                                              &diff_sum, &sq_sum);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 8,
                                         &diff_sum, &sq_sum);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 8,
                                        &diff_sum, &sq_sum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 8,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_sum);
    }

    *sse = sq_sum;
    return sq_sum - ((diff_sum * diff_sum) >> 6);
}
/*
 * 16x16 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) use the 16-wide half-pixel
 * helpers.  Any other combination runs the general bilinear helper twice,
 * once at the given pointers and once 8 bytes further along, and adds the
 * two results.  All helpers are called with a height of 16.
 *
 * The sum-of-squares total is stored in *sse and the function returns
 * sumsquared - ((sum * sum) >> 8).
 *
 * The square is computed in unsigned arithmetic.  If the sum covers 256
 * 8-bit pixel differences (as the helper names suggest), its magnitude can
 * reach 65280, and 65280^2 overflows a signed int (undefined behaviour)
 * while still fitting in 32 unsigned bits.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    /*
     * These tests could be avoided if the caller invoked the appropriate
     * helper directly.
     */
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);
    }
    else
    {
        /* First 8 columns. */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0
        );

        /* Next 8 columns. */
        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum1, &xxsum1
        );

        xsum0 += xsum1;
        xxsum0 += xxsum1;
    }

    *sse = xxsum0;

    /* (unsigned)xsum0 * xsum0 == xsum0^2 modulo 2^32, for either sign. */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
/*
 * 16x16 sub-pixel wrapper that returns the value
 * vp8_sub_pixel_variance16x16_wmt() stores in *sse, discarding that
 * function's own return value.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    /* Called only for its side effect of filling in *sse. */
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
/*
 * 16x8 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) use the 16-wide half-pixel
 * helpers.  Any other combination runs the general bilinear helper twice,
 * once at the given pointers and once 8 bytes further along, and adds the
 * two results.  All helpers are called with a height of 8.
 *
 * The sum-of-squares total is stored in *sse and the function returns
 * sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance16x8_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int diff_sum, diff_sum_hi;
    unsigned int sq_sum, sq_sum_hi;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line, 8,
                                               &diff_sum, &sq_sum);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line, 8,
                                          &diff_sum, &sq_sum);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 8,
                                         &diff_sum, &sq_sum);
    }
    else
    {
        /* First 8 columns. */
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 8,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_sum);

        /* Next 8 columns. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line, 8,
                                        xoffset, yoffset,
                                        &diff_sum_hi, &sq_sum_hi);

        diff_sum += diff_sum_hi;
        sq_sum += sq_sum_hi;
    }

    *sse = sq_sum;
    return sq_sum - ((diff_sum * diff_sum) >> 7);
}
/*
 * 8x16 sub-pixel variance.
 *
 * Offsets of exactly (4,0), (0,4) and (4,4) are routed to the 8-wide
 * half-pixel helpers; every other combination goes through the general
 * bilinear helper.  All helpers are called with a height of 16.  The
 * sum-of-squares output is stored in *sse and the function returns
 * sumsquared - ((sum * sum) >> 7).
 */
unsigned int vp8_sub_pixel_variance8x16_wmt
(
    const unsigned char  *src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int diff_sum;
    unsigned int sq_sum;

    /* The three special cases are mutually exclusive, so order is free. */
    if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line, 16,
                                              &diff_sum, &sq_sum);
    }
    else if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 16,
                                         &diff_sum, &sq_sum);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 16,
                                        &diff_sum, &sq_sum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 16,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_sum);
    }

    *sse = sq_sum;
    return sq_sum - ((diff_sum * diff_sum) >> 7);
}
/*
 * 16x16 variance through vp8_half_horiz_variance16x_h_sse2(), called with a
 * height of 16.
 *
 * The helper's sum-of-squares output is stored in *sse and the function
 * returns sumsquared - ((sum * sum) >> 8).
 *
 * The square is computed in unsigned arithmetic.  If the sum covers 256
 * 8-bit pixel differences (as the helper name suggests), its magnitude can
 * reach 65280, and 65280^2 overflows a signed int (undefined behaviour)
 * while still fitting in 32 unsigned bits.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /* (unsigned)xsum0 * xsum0 == xsum0^2 modulo 2^32, for either sign. */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
/*
 * 16x16 variance through vp8_half_vert_variance16x_h_sse2(), called with a
 * height of 16.
 *
 * The helper's sum-of-squares output is stored in *sse and the function
 * returns sumsquared - ((sum * sum) >> 8).
 *
 * The square is computed in unsigned arithmetic.  If the sum covers 256
 * 8-bit pixel differences (as the helper name suggests), its magnitude can
 * reach 65280, and 65280^2 overflows a signed int (undefined behaviour)
 * while still fitting in 32 unsigned bits.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /* (unsigned)xsum0 * xsum0 == xsum0^2 modulo 2^32, for either sign. */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}
/*
 * 16x16 variance through vp8_half_horiz_vert_variance16x_h_sse2(), called
 * with a height of 16.
 *
 * The helper's sum-of-squares output is stored in *sse and the function
 * returns sumsquared - ((sum * sum) >> 8).
 *
 * The square is computed in unsigned arithmetic.  If the sum covers 256
 * 8-bit pixel differences (as the helper name suggests), its magnitude can
 * reach 65280, and 65280^2 overflows a signed int (undefined behaviour)
 * while still fitting in 32 unsigned bits.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int  src_pixels_per_line,
    const unsigned char *dst_ptr,
    int  dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0;
    unsigned int xxsum0;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    *sse = xxsum0;

    /* (unsigned)xsum0 * xsum0 == xsum0^2 modulo 2^32, for either sign. */
    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
}