/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
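
/* SSE2- and MMX-accelerated variance, MSE and sub-pixel variance kernels for
 * the VP8 encoder.  The extern declarations below describe the low-level
 * assembly helpers that the C wrappers in this file call into.
 */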
extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
extern void vp8_filter_block2d_bil4x4_var_mmx
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared
);
extern unsigned int vp8_get4x4var_mmx
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
unsigned int vp8_get_mb_ss_sse2
(
    const short *src_ptr
);
unsigned int vp8_get16x16var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
unsigned int vp8_get16x16pred_error_sse2
(
    const unsigned char *src_ptr,
    int src_stride,
    const unsigned char *ref_ptr,
    int ref_stride
);
unsigned int vp8_get8x8var_sse2
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum
);
void vp8_filter_block2d_bil_var_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int xoffset,
    int yoffset,
    int *sum,
    unsigned int *sumsquared
);
void vp8_half_horiz_vert_variance16x_h_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared
);
void vp8_half_horiz_variance16x_h_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared
);
void vp8_half_vert_variance16x_h_sse2
(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared
);
DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
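
/* All of the variance functions below follow the same pattern: the helper
 * returns the sum of squared differences (SSE) and the sum of differences,
 * and variance = SSE - sum^2 / N, where N is the number of pixels in the
 * block.  The right shifts (>> 4, >> 6, >> 7, >> 8) implement the divide by
 * N for 4x4, 8x8, 16x8/8x16 and 16x16 blocks respectively.
 */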
unsigned int vp8_variance4x4_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride)
{
    unsigned int var;
    int avg;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    return (var - ((avg * avg) >> 4));

}
unsigned int vp8_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride)
{
    unsigned int var;
    int avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;

    return (var - ((avg * avg) >> 6));

}
unsigned int vp8_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    *sse = sse0;
    return (sse0 - ((sum0 * sum0) >> 8));
}
unsigned int vp8_mse16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0;
    int sum0;

    /* MSE keeps the full sum of squared differences; the mean term is not
     * subtracted here. */
    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    *sse = sse0;
    return sse0;

}
unsigned int vp8_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - ((avg * avg) >> 7));

}
unsigned int vp8_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;

    var = sse0 + sse1;
    avg = sum0 + sum1;
    *sse = var;
    return (var - ((avg * avg) >> 7));

}
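
/* Sub-pixel variance: xoffset/yoffset select one of the eight bilinear filter
 * phases in vp8_vp7_bilinear_filters_mmx.  An offset of 4 is the half-pixel
 * position, which is routed to the specialised half_horiz / half_vert /
 * half_horiz_vert kernels; all other offsets go through the generic bilinear
 * filter path.
 */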
unsigned int vp8_sub_pixel_variance4x4_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;
    vp8_filter_block2d_bil4x4_var_mmx(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line,
        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
        &xsum, &xxsum
    );
    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 4));
}
unsigned int vp8_sub_pixel_variance8x8_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;

    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &xsum, &xxsum);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &xsum, &xxsum);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &xsum, &xxsum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            xoffset, yoffset,
            &xsum, &xxsum);
    }

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 6));
}
unsigned int vp8_sub_pixel_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    // note we could avoid these if statements if the calling function
    // just called the appropriate functions inside.
    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);

        vp8_half_horiz_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            &xsum1, &xxsum1);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);

        vp8_half_vert_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            &xsum1, &xxsum1);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum0, &xxsum0);

        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            &xsum1, &xxsum1);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum0, &xxsum0
        );

        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum1, &xxsum1
        );
    }

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
unsigned int vp8_sub_pixel_mse16x16_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
    return *sse;
}
unsigned int vp8_sub_pixel_variance16x8_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &xsum0, &xxsum0);

        vp8_half_horiz_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 8,
            &xsum1, &xxsum1);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &xsum0, &xxsum0);

        vp8_half_vert_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 8,
            &xsum1, &xxsum1);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            &xsum0, &xxsum0);

        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 8,
            &xsum1, &xxsum1);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 8,
            xoffset, yoffset,
            &xsum0, &xxsum0);

        vp8_filter_block2d_bil_var_sse2(
            src_ptr + 8, src_pixels_per_line,
            dst_ptr + 8, dst_pixels_per_line, 8,
            xoffset, yoffset,
            &xsum1, &xxsum1);
    }

    xsum0 += xsum1;
    xxsum0 += xxsum1;

    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 7));
}
unsigned int vp8_sub_pixel_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    int xsum;
    unsigned int xxsum;

    if (xoffset == 4 && yoffset == 0)
    {
        vp8_half_horiz_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum, &xxsum);
    }
    else if (xoffset == 0 && yoffset == 4)
    {
        vp8_half_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum, &xxsum);
    }
    else if (xoffset == 4 && yoffset == 4)
    {
        vp8_half_horiz_vert_variance16x_h_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            &xsum, &xxsum);
    }
    else
    {
        vp8_filter_block2d_bil_var_sse2(
            src_ptr, src_pixels_per_line,
            dst_ptr, dst_pixels_per_line, 16,
            xoffset, yoffset,
            &xsum, &xxsum);
    }

    *sse = xxsum;
    return (xxsum - ((xsum * xsum) >> 7));
}
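
/* The vp8_i_* variants below address their operands with a doubled stride:
 * the second 8-row half of the block starts at (stride >> 1), and the
 * sub-pixel versions simply halve the passed strides before reusing the
 * regular functions above.
 */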
unsigned int vp8_i_variance16x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, sse2, sse3, var;
    int sum0, sum1, sum2, sum3, avg;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);

    var = sse0 + sse1 + sse2 + sse3;
    avg = sum0 + sum1 + sum2 + sum3;

    *sse = var;
    return (var - ((avg * avg) >> 8));

}
unsigned int vp8_i_variance8x16_wmt(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *sse)
{
    unsigned int sse0, sse1, var;
    int sum0, sum1, avg;
    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;

    var = sse0 + sse1;
    avg = sum0 + sum1;

    *sse = var;
    return (var - ((avg * avg) >> 7));

}
unsigned int vp8_i_sub_pixel_variance16x16_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
}
unsigned int vp8_i_sub_pixel_variance8x16_wmt
(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse
)
{
    return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
}
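
/* Dedicated half-pixel variance entry points: these perform the same
 * computation as vp8_sub_pixel_variance16x16_wmt with offsets (4,0), (0,4)
 * and (4,4), without the offset dispatch.
 */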
unsigned int vp8_variance_halfpixvar16x16_h_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_horiz_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
unsigned int vp8_variance_halfpixvar16x16_v_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_vert_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const unsigned char *dst_ptr,
    int dst_pixels_per_line,
    unsigned int *sse)
{
    int xsum0, xsum1;
    unsigned int xxsum0, xxsum1;

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr, src_pixels_per_line,
        dst_ptr, dst_pixels_per_line, 16,
        &xsum0, &xxsum0);

    vp8_half_horiz_vert_variance16x_h_sse2(
        src_ptr + 8, src_pixels_per_line,
        dst_ptr + 8, dst_pixels_per_line, 16,
        &xsum1, &xxsum1);

    xsum0 += xsum1;
    xxsum0 += xxsum1;
    *sse = xxsum0;
    return (xxsum0 - ((xsum0 * xsum0) >> 8));
}