Merge "vp8_rd_pick_best_mbsegmentation code restructure"
[libvpx.git] / vp8 / common / x86 / postproc_mmx.c
blob6b6321ace3d672617105085cfb36956262e7135b
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
/* Standard C headers plus the project headers this file relies on. */
12 #include <math.h>
13 #include <stdlib.h>
14 #include "vpx_scale/yv12config.h"
15 #include "pragmas.h"
/*
 * Fixed-point scale of the Blur taps below: the five non-zero tap rows sum to
 * 128, and the MMX filter shifts its rounded sum right by VP8_FILTER_SHIFT.
 * VP8_FILTER_WEIGHT itself is not referenced by the code visible in this file.
 */
17 #define VP8_FILTER_WEIGHT 128
18 #define VP8_FILTER_SHIFT 7
22 /* static constants */
/*
 * Blur: rows of eight 16-bit taps (16 bytes per row).
 * vp8_post_proc_down_and_across_mmx loads 8 bytes from byte offsets
 * 0, 16, 32, 48 and 64, i.e. taps 16, 16, 64, 16, 16; the all-zero row at
 * offset 80 is not read by the code visible in this file.
 */
23 __declspec(align(16))
24 const static short Blur[48] =
27 16, 16, 16, 16, 16, 16, 16, 16,
28 16, 16, 16, 16, 16, 16, 16, 16,
29 64, 64, 64, 64, 64, 64, 64, 64,
30 16, 16, 16, 16, 16, 16, 16, 16,
31 16, 16, 16, 16, 16, 16, 16, 16,
32 0, 0, 0, 0, 0, 0, 0, 0,
/*
 * Rounding constants, written as macros so they can be emitted either at file
 * scope (default) or inside a function when RELOCATEABLE is defined.
 *   rd   : four 16-bit words of 0x0040 (64), added before the >> 7 in the MMX path.
 *   rd42 : two quadwords, each four 16-bit words of 0x0004, added before the
 *          >> 3 in the XMM path.
 */
35 #define RD __declspec(align(16)) __int64 rd = 0x0040004000400040;
36 #define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
38 #ifndef RELOCATEABLE
39 const static RD;
40 const static R4D2;
41 #endif
44 /* external references */
/*
 * vp8_gaussian is only called from the compiled-out (#if 0) noise routines;
 * vp8_rv is read by the vp8_mbpost_proc_down_* routines;
 * vp8_q2mbl maps the caller's flimit in the vp8_mbpost_proc_* routines.
 */
45 extern double vp8_gaussian(double sigma, double mu, double x);
46 extern short vp8_rv[];
47 extern int vp8_q2mbl(int x) ;
/*
 * vp8_post_proc_down_and_across_mmx
 *
 * Thresholded 5-tap blur, applied first vertically ("down", src -> dst) and
 * then horizontally ("across", in place on the dst row), four pixels per
 * iteration using MMX registers.
 *
 *   src_ptr / src_pixels_per_line : source row pointer and stride; rows -2..+2
 *                                   around the current row are read.
 *   dst_ptr / dst_pixels_per_line : destination row pointer and stride.
 *   rows                          : number of rows processed (loop counter ecx).
 *   cols                          : column bound; the column loops step by 4
 *                                   and test "edx < cols" after the step, so
 *                                   at least 4 pixels are processed and a
 *                                   non-multiple of 4 is rounded up.
 *   flimit                        : its low 16 bits are broadcast to mm2; a
 *                                   pixel keeps its unfiltered value when the
 *                                   absolute difference to any of its four
 *                                   neighbours is greater than this.
 *
 * Filtered value = (sum of neighbours * Blur taps + rd) >> VP8_FILTER_SHIFT.
 * The across pass loads 8 bytes at [edi+edx] and [edi+edx-2] and starts by
 * loading the 4 bytes at [edi-4], so bytes just outside the row are touched.
 */
51 void vp8_post_proc_down_and_across_mmx
53 unsigned char *src_ptr,
54 unsigned char *dst_ptr,
55 int src_pixels_per_line,
56 int dst_pixels_per_line,
57 int rows,
58 int cols,
59 int flimit
62 #ifdef RELOCATEABLE
64 R4D2
65 #endif
67 __asm
69 push ebx
70 lea ebx, Blur
// broadcast the low 16 bits of flimit into all four words of mm2
71 movd mm2, flimit
72 punpcklwd mm2, mm2
73 punpckldq mm2, mm2
75 mov esi, src_ptr
76 mov edi, dst_ptr
78 mov ecx, DWORD PTR rows
79 mov eax, src_pixels_per_line ;
80 destination pitch?
81 pxor mm0, mm0 ;
82 mm0 = 00000000
84 nextrow:
86 xor edx, edx ;
88 clear out edx for use as loop counter
89 nextcol:
// ---- down pass: weighted sum of rows 0, +1, +2, -2, -1 (in that order) ----
// mm1 keeps the unfiltered centre pixels, mm3 accumulates the weighted sum,
// mm7 accumulates the "difference > flimit" mask.
91 pxor mm7, mm7 ;
93 mm7 = 00000000
94 movq mm6, [ebx + 32 ] ;
95 mm6 = kernel 2 taps
96 movq mm3, [esi] ;
97 mm4 = r0 p0..p7
98 punpcklbw mm3, mm0 ;
99 mm3 = p0..p3
100 movq mm1, mm3 ;
101 mm1 = p0..p3
102 pmullw mm3, mm6 ;
103 mm3 *= kernel 2 modifiers
105 movq mm6, [ebx + 48] ;
106 mm6 = kernel 3 taps
107 movq mm5, [esi + eax] ;
108 mm4 = r1 p0..p7
109 punpcklbw mm5, mm0 ;
110 mm5 = r1 p0..p3
111 pmullw mm6, mm5 ;
112 mm6 *= p0..p3 * kernel 3 modifiers
113 paddusw mm3, mm6 ;
114 mm3 += mm6
117 thresholding
118 movq mm7, mm1 ;
119 mm7 = r0 p0..p3
120 psubusw mm7, mm5 ;
121 mm7 = r0 p0..p3 - r1 p0..p3
122 psubusw mm5, mm1 ;
123 mm5 = r1 p0..p3 - r0 p0..p3
124 paddusw mm7, mm5 ;
125 mm7 = abs(r0 p0..p3 - r1 p0..p3)
126 pcmpgtw mm7, mm2
128 movq mm6, [ebx + 64 ] ;
129 mm6 = kernel 4 modifiers
130 movq mm5, [esi + 2*eax] ;
131 mm4 = r2 p0..p7
132 punpcklbw mm5, mm0 ;
133 mm5 = r2 p0..p3
134 pmullw mm6, mm5 ;
135 mm5 *= kernel 4 modifiers
136 paddusw mm3, mm6 ;
137 mm3 += mm5
140 thresholding
141 movq mm6, mm1 ;
142 mm6 = r0 p0..p3
143 psubusw mm6, mm5 ;
144 mm6 = r0 p0..p3 - r2 p0..p3
145 psubusw mm5, mm1 ;
146 mm5 = r2 p0..p3 - r2 p0..p3
147 paddusw mm6, mm5 ;
148 mm6 = abs(r0 p0..p3 - r2 p0..p3)
149 pcmpgtw mm6, mm2
150 por mm7, mm6 ;
151 accumulate thresholds
// eax is negated here so [esi+2*eax] / [esi+eax] address rows -2 and -1
154 neg eax
155 movq mm6, [ebx ] ;
156 kernel 0 taps
157 movq mm5, [esi+2*eax] ;
158 mm4 = r-2 p0..p7
159 punpcklbw mm5, mm0 ;
160 mm5 = r-2 p0..p3
161 pmullw mm6, mm5 ;
162 mm5 *= kernel 0 modifiers
163 paddusw mm3, mm6 ;
164 mm3 += mm5
167 thresholding
168 movq mm6, mm1 ;
169 mm6 = r0 p0..p3
170 psubusw mm6, mm5 ;
171 mm6 = p0..p3 - r-2 p0..p3
172 psubusw mm5, mm1 ;
173 mm5 = r-2 p0..p3 - p0..p3
174 paddusw mm6, mm5 ;
175 mm6 = abs(r0 p0..p3 - r-2 p0..p3)
176 pcmpgtw mm6, mm2
177 por mm7, mm6 ;
178 accumulate thresholds
180 movq mm6, [ebx + 16] ;
181 kernel 1 taps
182 movq mm4, [esi+eax] ;
183 mm4 = r-1 p0..p7
184 punpcklbw mm4, mm0 ;
185 mm4 = r-1 p0..p3
186 pmullw mm6, mm4 ;
187 mm4 *= kernel 1 modifiers.
188 paddusw mm3, mm6 ;
189 mm3 += mm5
192 thresholding
193 movq mm6, mm1 ;
194 mm6 = r0 p0..p3
195 psubusw mm6, mm4 ;
196 mm6 = p0..p3 - r-2 p0..p3
197 psubusw mm4, mm1 ;
198 mm5 = r-1 p0..p3 - p0..p3
199 paddusw mm6, mm4 ;
200 mm6 = abs(r0 p0..p3 - r-1 p0..p3)
201 pcmpgtw mm6, mm2
202 por mm7, mm6 ;
203 accumulate thresholds
// round and scale the sum, then merge: source pixel where the mask is set,
// blurred pixel elsewhere; pack to bytes and store four pixels to dst
206 paddusw mm3, rd ;
207 mm3 += round value
208 psraw mm3, VP8_FILTER_SHIFT ;
209 mm3 /= 128
211 pand mm1, mm7 ;
212 mm1 select vals > thresh from source
213 pandn mm7, mm3 ;
214 mm7 select vals < thresh from blurred result
215 paddusw mm1, mm7 ;
216 combination
218 packuswb mm1, mm0 ;
219 pack to bytes
221 movd [edi], mm1 ;
222 neg eax ;
223 pitch is positive
226 add esi, 4
227 add edi, 4
228 add edx, 4
230 cmp edx, cols
231 jl nextcol
232 // done with all the cols; start the across filtering in place on the dst row
// rewind esi/edi to the start of the row; eax (the source pitch) is saved on
// the stack because eax is reused below to carry the previous 4-pixel result
233 sub esi, edx
234 sub edi, edx
237 push eax
238 xor edx, edx
239 mov eax, [edi-4];
241 acrossnextcol:
// ---- across pass: same taps applied to p0, p+1, p+2, p-2, p-1 of the dst row ----
242 pxor mm7, mm7 ;
243 mm7 = 00000000
244 movq mm6, [ebx + 32 ] ;
245 movq mm4, [edi+edx] ;
246 mm4 = p0..p7
247 movq mm3, mm4 ;
248 mm3 = p0..p7
249 punpcklbw mm3, mm0 ;
250 mm3 = p0..p3
251 movq mm1, mm3 ;
252 mm1 = p0..p3
253 pmullw mm3, mm6 ;
254 mm3 *= kernel 2 modifiers
256 movq mm6, [ebx + 48]
257 psrlq mm4, 8 ;
258 mm4 = p1..p7
259 movq mm5, mm4 ;
260 mm5 = p1..p7
261 punpcklbw mm5, mm0 ;
262 mm5 = p1..p4
263 pmullw mm6, mm5 ;
264 mm6 *= p1..p4 * kernel 3 modifiers
265 paddusw mm3, mm6 ;
266 mm3 += mm6
269 thresholding
270 movq mm7, mm1 ;
271 mm7 = p0..p3
272 psubusw mm7, mm5 ;
273 mm7 = p0..p3 - p1..p4
274 psubusw mm5, mm1 ;
275 mm5 = p1..p4 - p0..p3
276 paddusw mm7, mm5 ;
277 mm7 = abs(p0..p3 - p1..p4)
278 pcmpgtw mm7, mm2
280 movq mm6, [ebx + 64 ]
281 psrlq mm4, 8 ;
282 mm4 = p2..p7
283 movq mm5, mm4 ;
284 mm5 = p2..p7
285 punpcklbw mm5, mm0 ;
286 mm5 = p2..p5
287 pmullw mm6, mm5 ;
288 mm5 *= kernel 4 modifiers
289 paddusw mm3, mm6 ;
290 mm3 += mm5
293 thresholding
294 movq mm6, mm1 ;
295 mm6 = p0..p3
296 psubusw mm6, mm5 ;
297 mm6 = p0..p3 - p1..p4
298 psubusw mm5, mm1 ;
299 mm5 = p1..p4 - p0..p3
300 paddusw mm6, mm5 ;
301 mm6 = abs(p0..p3 - p1..p4)
302 pcmpgtw mm6, mm2
303 por mm7, mm6 ;
304 accumulate thresholds
307 movq mm6, [ebx ]
308 movq mm4, [edi+edx-2] ;
309 mm4 = p-2..p5
310 movq mm5, mm4 ;
311 mm5 = p-2..p5
312 punpcklbw mm5, mm0 ;
313 mm5 = p-2..p1
314 pmullw mm6, mm5 ;
315 mm5 *= kernel 0 modifiers
316 paddusw mm3, mm6 ;
317 mm3 += mm5
320 thresholding
321 movq mm6, mm1 ;
322 mm6 = p0..p3
323 psubusw mm6, mm5 ;
324 mm6 = p0..p3 - p1..p4
325 psubusw mm5, mm1 ;
326 mm5 = p1..p4 - p0..p3
327 paddusw mm6, mm5 ;
328 mm6 = abs(p0..p3 - p1..p4)
329 pcmpgtw mm6, mm2
330 por mm7, mm6 ;
331 accumulate thresholds
333 movq mm6, [ebx + 16]
334 psrlq mm4, 8 ;
335 mm4 = p-1..p5
336 punpcklbw mm4, mm0 ;
337 mm4 = p-1..p2
338 pmullw mm6, mm4 ;
339 mm4 *= kernel 1 modifiers.
340 paddusw mm3, mm6 ;
341 mm3 += mm5
344 thresholding
345 movq mm6, mm1 ;
346 mm6 = p0..p3
347 psubusw mm6, mm4 ;
348 mm6 = p0..p3 - p1..p4
349 psubusw mm4, mm1 ;
350 mm5 = p1..p4 - p0..p3
351 paddusw mm6, mm4 ;
352 mm6 = abs(p0..p3 - p1..p4)
353 pcmpgtw mm6, mm2
354 por mm7, mm6 ;
355 accumulate thresholds
357 paddusw mm3, rd ;
358 mm3 += round value
359 psraw mm3, VP8_FILTER_SHIFT ;
360 mm3 /= 128
362 pand mm1, mm7 ;
363 mm1 select vals > thresh from source
364 pandn mm7, mm3 ;
365 mm7 select vals < thresh from blurred result
366 paddusw mm1, mm7 ;
367 combination
369 packuswb mm1, mm0 ;
370 pack to bytes
// the store lags one group behind: eax (previous result) is written only now,
// after this group has already loaded its p-2/p-1 neighbours from memory
371 mov DWORD PTR [edi+edx-4], eax ;
372 store previous four bytes
373 movd eax, mm1
375 add edx, 4
376 cmp edx, cols
377 jl acrossnextcol;
// flush the final pending group, then restore the source pitch into eax
379 mov DWORD PTR [edi+edx-4], eax
380 pop eax
382 // done with this row
383 add esi, eax ;
384 next line
385 mov eax, dst_pixels_per_line ;
386 destination pitch?
387 add edi, eax ;
388 next destination
389 mov eax, src_pixels_per_line ;
390 destination pitch?
392 dec ecx ;
393 decrement count
394 jnz nextrow ;
395 next row
396 pop ebx
/*
 * vp8_post_proc_down_and_across_xmm
 *
 * SSE2 counterpart of vp8_post_proc_down_and_across_mmx, processing eight
 * pixels per iteration. The kernel is computed without a tap table: the
 * centre pixel is shifted left by 2 (x4), the four neighbours are added,
 * rd42 (4 per word) is added and the sum is shifted right by 3.
 *
 *   src_ptr / src_pixels_per_line : source row pointer and stride; rows -2..+2
 *                                   around the current row are read.
 *   dst_ptr / dst_pixels_per_line : destination row pointer and stride.
 *   rows                          : number of rows processed (loop counter ecx).
 *   cols                          : column bound; the column loops step by 8
 *                                   and test "edx < cols" after the step.
 *   flimit                        : its low 16 bits are broadcast to all eight
 *                                   words of xmm2; a pixel keeps its unfiltered
 *                                   value when the absolute difference to any
 *                                   of its four neighbours is greater than this.
 *
 * The across pass works in place on the dst row and carries the previous
 * 8-byte result in the MMX register mm0; it begins by loading the 8 bytes at
 * [edi-8] and also loads 4 bytes at [edi+edx+6].
 */
403 void vp8_post_proc_down_and_across_xmm
405 unsigned char *src_ptr,
406 unsigned char *dst_ptr,
407 int src_pixels_per_line,
408 int dst_pixels_per_line,
409 int rows,
410 int cols,
411 int flimit
414 #ifdef RELOCATEABLE
415 R4D2
416 #endif
418 __asm
// broadcast the low 16 bits of flimit into all eight words of xmm2
420 movd xmm2, flimit
421 punpcklwd xmm2, xmm2
422 punpckldq xmm2, xmm2
423 punpcklqdq xmm2, xmm2
425 mov esi, src_ptr
426 mov edi, dst_ptr
428 mov ecx, DWORD PTR rows
429 mov eax, src_pixels_per_line ;
430 destination pitch?
431 pxor xmm0, xmm0 ;
432 mm0 = 00000000
434 nextrow:
436 xor edx, edx ;
438 clear out edx for use as loop counter
439 nextcol:
// ---- down pass: xmm1 = unfiltered centre, xmm3 = 4*centre + neighbours,
//      xmm7 = accumulated "difference > flimit" mask ----
440 movq xmm3, QWORD PTR [esi] ;
442 mm4 = r0 p0..p7
443 punpcklbw xmm3, xmm0 ;
444 mm3 = p0..p3
445 movdqa xmm1, xmm3 ;
446 mm1 = p0..p3
447 psllw xmm3, 2 ;
449 movq xmm5, QWORD PTR [esi + eax] ;
450 mm4 = r1 p0..p7
451 punpcklbw xmm5, xmm0 ;
452 mm5 = r1 p0..p3
453 paddusw xmm3, xmm5 ;
454 mm3 += mm6
457 thresholding
458 movdqa xmm7, xmm1 ;
459 mm7 = r0 p0..p3
460 psubusw xmm7, xmm5 ;
461 mm7 = r0 p0..p3 - r1 p0..p3
462 psubusw xmm5, xmm1 ;
463 mm5 = r1 p0..p3 - r0 p0..p3
464 paddusw xmm7, xmm5 ;
465 mm7 = abs(r0 p0..p3 - r1 p0..p3)
466 pcmpgtw xmm7, xmm2
468 movq xmm5, QWORD PTR [esi + 2*eax] ;
469 mm4 = r2 p0..p7
470 punpcklbw xmm5, xmm0 ;
471 mm5 = r2 p0..p3
472 paddusw xmm3, xmm5 ;
473 mm3 += mm5
476 thresholding
477 movdqa xmm6, xmm1 ;
478 mm6 = r0 p0..p3
479 psubusw xmm6, xmm5 ;
480 mm6 = r0 p0..p3 - r2 p0..p3
481 psubusw xmm5, xmm1 ;
482 mm5 = r2 p0..p3 - r2 p0..p3
483 paddusw xmm6, xmm5 ;
484 mm6 = abs(r0 p0..p3 - r2 p0..p3)
485 pcmpgtw xmm6, xmm2
486 por xmm7, xmm6 ;
487 accumulate thresholds
// eax is negated here so [esi+2*eax] / [esi+eax] address rows -2 and -1
490 neg eax
491 movq xmm5, QWORD PTR [esi+2*eax] ;
492 mm4 = r-2 p0..p7
493 punpcklbw xmm5, xmm0 ;
494 mm5 = r-2 p0..p3
495 paddusw xmm3, xmm5 ;
496 mm3 += mm5
499 thresholding
500 movdqa xmm6, xmm1 ;
501 mm6 = r0 p0..p3
502 psubusw xmm6, xmm5 ;
503 mm6 = p0..p3 - r-2 p0..p3
504 psubusw xmm5, xmm1 ;
505 mm5 = r-2 p0..p3 - p0..p3
506 paddusw xmm6, xmm5 ;
507 mm6 = abs(r0 p0..p3 - r-2 p0..p3)
508 pcmpgtw xmm6, xmm2
509 por xmm7, xmm6 ;
510 accumulate thresholds
512 movq xmm4, QWORD PTR [esi+eax] ;
513 mm4 = r-1 p0..p7
514 punpcklbw xmm4, xmm0 ;
515 mm4 = r-1 p0..p3
516 paddusw xmm3, xmm4 ;
517 mm3 += mm5
520 thresholding
521 movdqa xmm6, xmm1 ;
522 mm6 = r0 p0..p3
523 psubusw xmm6, xmm4 ;
524 mm6 = p0..p3 - r-2 p0..p3
525 psubusw xmm4, xmm1 ;
526 mm5 = r-1 p0..p3 - p0..p3
527 paddusw xmm6, xmm4 ;
528 mm6 = abs(r0 p0..p3 - r-1 p0..p3)
529 pcmpgtw xmm6, xmm2
530 por xmm7, xmm6 ;
531 accumulate thresholds
// round and scale, then merge: source pixel where the mask is set, blurred
// pixel elsewhere; pack to bytes and store eight pixels to dst
534 paddusw xmm3, rd42 ;
535 mm3 += round value
536 psraw xmm3, 3 ;
537 mm3 /= 8
539 pand xmm1, xmm7 ;
540 mm1 select vals > thresh from source
541 pandn xmm7, xmm3 ;
542 mm7 select vals < thresh from blurred result
543 paddusw xmm1, xmm7 ;
544 combination
546 packuswb xmm1, xmm0 ;
547 pack to bytes
548 movq QWORD PTR [edi], xmm1 ;
550 neg eax ;
551 pitch is positive
552 add esi, 8
553 add edi, 8
555 add edx, 8
556 cmp edx, cols
558 jl nextcol
560 // done with all the cols; start the across filtering in place on the dst row
561 sub esi, edx
562 sub edi, edx
564 xor edx, edx
565 movq mm0, QWORD PTR [edi-8];
567 acrossnextcol:
// ---- across pass: assemble 12 bytes p-2..p9 in xmm4 from two loads, then
//      take byte-shifted copies for p0, p+1, p+2, p-2 and p-1 ----
568 movq xmm7, QWORD PTR [edi +edx -2]
569 movd xmm4, DWORD PTR [edi +edx +6]
571 pslldq xmm4, 8
572 por xmm4, xmm7
574 movdqa xmm3, xmm4
575 psrldq xmm3, 2
576 punpcklbw xmm3, xmm0 ;
577 mm3 = p0..p3
578 movdqa xmm1, xmm3 ;
579 mm1 = p0..p3
580 psllw xmm3, 2
583 movdqa xmm5, xmm4
584 psrldq xmm5, 3
585 punpcklbw xmm5, xmm0 ;
586 mm5 = p1..p4
587 paddusw xmm3, xmm5 ;
588 mm3 += mm6
591 thresholding
592 movdqa xmm7, xmm1 ;
593 mm7 = p0..p3
594 psubusw xmm7, xmm5 ;
595 mm7 = p0..p3 - p1..p4
596 psubusw xmm5, xmm1 ;
597 mm5 = p1..p4 - p0..p3
598 paddusw xmm7, xmm5 ;
599 mm7 = abs(p0..p3 - p1..p4)
600 pcmpgtw xmm7, xmm2
602 movdqa xmm5, xmm4
603 psrldq xmm5, 4
604 punpcklbw xmm5, xmm0 ;
605 mm5 = p2..p5
606 paddusw xmm3, xmm5 ;
607 mm3 += mm5
610 thresholding
611 movdqa xmm6, xmm1 ;
612 mm6 = p0..p3
613 psubusw xmm6, xmm5 ;
614 mm6 = p0..p3 - p1..p4
615 psubusw xmm5, xmm1 ;
616 mm5 = p1..p4 - p0..p3
617 paddusw xmm6, xmm5 ;
618 mm6 = abs(p0..p3 - p1..p4)
619 pcmpgtw xmm6, xmm2
620 por xmm7, xmm6 ;
621 accumulate thresholds
624 movdqa xmm5, xmm4 ;
625 mm5 = p-2..p5
626 punpcklbw xmm5, xmm0 ;
627 mm5 = p-2..p1
628 paddusw xmm3, xmm5 ;
629 mm3 += mm5
632 thresholding
633 movdqa xmm6, xmm1 ;
634 mm6 = p0..p3
635 psubusw xmm6, xmm5 ;
636 mm6 = p0..p3 - p1..p4
637 psubusw xmm5, xmm1 ;
638 mm5 = p1..p4 - p0..p3
639 paddusw xmm6, xmm5 ;
640 mm6 = abs(p0..p3 - p1..p4)
641 pcmpgtw xmm6, xmm2
642 por xmm7, xmm6 ;
643 accumulate thresholds
645 psrldq xmm4, 1 ;
646 mm4 = p-1..p5
647 punpcklbw xmm4, xmm0 ;
648 mm4 = p-1..p2
649 paddusw xmm3, xmm4 ;
650 mm3 += mm5
653 thresholding
654 movdqa xmm6, xmm1 ;
655 mm6 = p0..p3
656 psubusw xmm6, xmm4 ;
657 mm6 = p0..p3 - p1..p4
658 psubusw xmm4, xmm1 ;
659 mm5 = p1..p4 - p0..p3
660 paddusw xmm6, xmm4 ;
661 mm6 = abs(p0..p3 - p1..p4)
662 pcmpgtw xmm6, xmm2
663 por xmm7, xmm6 ;
664 accumulate thresholds
666 paddusw xmm3, rd42 ;
667 mm3 += round value
668 psraw xmm3, 3 ;
669 mm3 /= 8
671 pand xmm1, xmm7 ;
672 mm1 select vals > thresh from source
673 pandn xmm7, xmm3 ;
674 mm7 select vals < thresh from blurred result
675 paddusw xmm1, xmm7 ;
676 combination
678 packuswb xmm1, xmm0 ;
679 pack to bytes
// the store lags one group behind: mm0 holds the previous 8-byte result and
// is written only now, after this group has loaded its left neighbours
680 movq QWORD PTR [edi+edx-8], mm0 ;
681 store previous four bytes
682 movdq2q mm0, xmm1
684 add edx, 8
685 cmp edx, cols
686 jl acrossnextcol;
688 // last 8 pixels
689 movq QWORD PTR [edi+edx-8], mm0
691 // done with this row
692 add esi, eax ;
693 next line
694 mov eax, dst_pixels_per_line ;
695 destination pitch?
696 add edi, eax ;
697 next destination
698 mov eax, src_pixels_per_line ;
699 destination pitch?
701 dec ecx ;
702 decrement count
703 jnz nextrow ;
704 next row
/*
 * vp8_mbpost_proc_down_mmx
 *
 * Vertical, in-place post-filter over a 15-row sliding window, handled in
 * strips of four columns (c steps by 4 while c < cols).
 *
 *   dst, pitch : image pointer and row stride; rows from dst - 8*pitch onward
 *                are read, and rows from dst - 8*pitch onward are written.
 *   rows       : row count; 8 is added to it before the loop because every
 *                result is written back 8 iterations after it is computed.
 *   cols       : column bound for the strip loop.
 *   flimit     : mapped through vp8_q2mbl() and then used as the threshold.
 *
 * Per strip: mm5 holds the running sum (words) and mm6/mm7 the running sum of
 * squares (dwords) of the window. A pixel is replaced by
 * (pixel + sum + vp8_rv entry) >> 4 when 15*sumsq - sum*sum - flimit is
 * negative, otherwise it is kept. Results go through the ring buffer d, which
 * is indexed by (row & 15).
 *
 * NOTE(review): the write-back at [esi] is unconditional, so the first eight
 * iterations of the first strip copy d slots 8..15 that have not been written
 * yet into the eight rows above dst -- confirm those rows are scratch/border.
 */
709 void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
711 int c, i;
712 __declspec(align(16))
713 int flimit2[2];
714 __declspec(align(16))
715 unsigned char d[16][8];
717 flimit = vp8_q2mbl(flimit);
719 for (i = 0; i < 2; i++)
720 flimit2[i] = flimit;
722 rows += 8;
724 for (c = 0; c < cols; c += 4)
726 unsigned char *s = &dst[c];
728 __asm
730 mov esi, s ;
731 pxor mm0, mm0 ;
733 mov eax, pitch ;
734 neg eax // eax = -pitch
736 lea esi, [esi + eax*8]; // esi = s - pitch*8
737 neg eax
740 pxor mm5, mm5
741 pxor mm6, mm6 ;
743 pxor mm7, mm7 ;
744 mov edi, esi
746 mov ecx, 15 ;
// prime the window: accumulate sum and sum of squares over 15 rows starting
// at s - pitch*8; edi ends up at s + pitch*7
748 loop_initvar:
749 movd mm1, DWORD PTR [edi];
750 punpcklbw mm1, mm0 ;
752 paddw mm5, mm1 ;
753 pmullw mm1, mm1 ;
755 movq mm2, mm1 ;
756 punpcklwd mm1, mm0 ;
758 punpckhwd mm2, mm0 ;
759 paddd mm6, mm1 ;
761 paddd mm7, mm2 ;
762 lea edi, [edi+eax] ;
764 dec ecx
765 jne loop_initvar
766 //save the var and sum
767 xor edx, edx
768 loop_row:
// slide the window down one row: add the row at edi, drop the row at esi
769 movd mm1, DWORD PTR [esi] // [s-pitch*8]
770 movd mm2, DWORD PTR [edi] // [s+pitch*7]
772 punpcklbw mm1, mm0
773 punpcklbw mm2, mm0
775 paddw mm5, mm2
776 psubw mm5, mm1
778 pmullw mm2, mm2
779 movq mm4, mm2
781 punpcklwd mm2, mm0
782 punpckhwd mm4, mm0
784 paddd mm6, mm2
785 paddd mm7, mm4
787 pmullw mm1, mm1
788 movq mm2, mm1
790 punpcklwd mm1, mm0
791 psubd mm6, mm1
793 punpckhwd mm2, mm0
794 psubd mm7, mm2
// mm3/mm4 = 15*sumsq - sum*sum - flimit (x15 done as (x << 4) - x); the
// arithmetic shift by 31 turns the sign into an all-ones / all-zeros mask
797 movq mm3, mm6
798 pslld mm3, 4
800 psubd mm3, mm6
801 movq mm1, mm5
803 movq mm4, mm5
804 pmullw mm1, mm1
806 pmulhw mm4, mm4
807 movq mm2, mm1
809 punpcklwd mm1, mm4
810 punpckhwd mm2, mm4
812 movq mm4, mm7
813 pslld mm4, 4
815 psubd mm4, mm7
817 psubd mm3, mm1
818 psubd mm4, mm2
820 psubd mm3, flimit2
821 psubd mm4, flimit2
823 psrad mm3, 31
824 psrad mm4, 31
826 packssdw mm3, mm4
827 packsswb mm3, mm0
// centre row of the window is 8 rows below esi; mm2 keeps the original bytes
829 movd mm1, DWORD PTR [esi+eax*8]
831 movq mm2, mm1
832 punpcklbw mm1, mm0
834 paddw mm1, mm5
835 mov ecx, edx
// presumably loads four vp8_rv entries starting at index (edx & 127), since
// vp8_rv holds 2-byte shorts -- TODO confirm the inline-asm addressing
837 and ecx, 127
838 movq mm4, vp8_rv[ecx*2]
840 paddw mm1, mm4
841 //paddw xmm1, eight8s
842 psraw mm1, 4
844 packuswb mm1, mm0
845 pand mm1, mm3
847 pandn mm3, mm2
848 por mm1, mm3
// park the result in d slot (edx & 15); write back the slot from 8 rows ago
850 and ecx, 15
851 movd DWORD PTR d[ecx*4], mm1
853 mov ecx, edx
854 sub ecx, 8
856 and ecx, 15
857 movd mm1, DWORD PTR d[ecx*4]
859 movd [esi], mm1
860 lea esi, [esi+eax]
862 lea edi, [edi+eax]
863 add edx, 1
865 cmp edx, rows
866 jl loop_row
/*
 * vp8_mbpost_proc_down_xmm
 *
 * SSE2 counterpart of vp8_mbpost_proc_down_mmx: the same vertical, in-place
 * post-filter over a 15-row sliding window, handled in strips of eight
 * columns (c steps by 8 while c < cols).
 *
 *   dst, pitch : image pointer and row stride; rows from dst - 8*pitch onward
 *                are read and written.
 *   rows       : row count; 8 is added to it before the loop because every
 *                result is written back 8 iterations after it is computed.
 *   cols       : column bound for the strip loop.
 *   flimit     : mapped through vp8_q2mbl() and then used as the threshold.
 *
 * xmm5 holds the running sum (words), xmm6/xmm7 the running sum of squares
 * (dwords). A pixel is replaced by (pixel + sum + vp8_rv entry) >> 4 when
 * 15*sumsq - sum*sum - flimit is negative, otherwise it is kept. Results go
 * through the ring buffer d (8 bytes per slot, slot = row & 15) and are copied
 * back to the image through the MMX register mm0.
 *
 * NOTE(review): the write-back at [esi] is unconditional, so the first eight
 * iterations of the first strip copy d slots that have not been written yet
 * into the eight rows above dst -- confirm those rows are scratch/border.
 */
873 void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
875 int c, i;
876 __declspec(align(16))
877 int flimit4[4];
878 __declspec(align(16))
879 unsigned char d[16][8];
881 flimit = vp8_q2mbl(flimit);
883 for (i = 0; i < 4; i++)
884 flimit4[i] = flimit;
886 rows += 8;
888 for (c = 0; c < cols; c += 8)
890 unsigned char *s = &dst[c];
892 __asm
894 mov esi, s ;
895 pxor xmm0, xmm0 ;
897 mov eax, pitch ;
898 neg eax // eax = -pitch
900 lea esi, [esi + eax*8]; // esi = s - pitch*8
901 neg eax
904 pxor xmm5, xmm5
905 pxor xmm6, xmm6 ;
907 pxor xmm7, xmm7 ;
908 mov edi, esi
910 mov ecx, 15 ;
// prime the window: accumulate sum and sum of squares over 15 rows starting
// at s - pitch*8; edi ends up at s + pitch*7
912 loop_initvar:
913 movq xmm1, QWORD PTR [edi];
914 punpcklbw xmm1, xmm0 ;
916 paddw xmm5, xmm1 ;
917 pmullw xmm1, xmm1 ;
919 movdqa xmm2, xmm1 ;
920 punpcklwd xmm1, xmm0 ;
922 punpckhwd xmm2, xmm0 ;
923 paddd xmm6, xmm1 ;
925 paddd xmm7, xmm2 ;
926 lea edi, [edi+eax] ;
928 dec ecx
929 jne loop_initvar
930 //save the var and sum
931 xor edx, edx
932 loop_row:
// slide the window down one row: add the row at edi, drop the row at esi
933 movq xmm1, QWORD PTR [esi] // [s-pitch*8]
934 movq xmm2, QWORD PTR [edi] // [s+pitch*7]
936 punpcklbw xmm1, xmm0
937 punpcklbw xmm2, xmm0
939 paddw xmm5, xmm2
940 psubw xmm5, xmm1
942 pmullw xmm2, xmm2
943 movdqa xmm4, xmm2
945 punpcklwd xmm2, xmm0
946 punpckhwd xmm4, xmm0
948 paddd xmm6, xmm2
949 paddd xmm7, xmm4
951 pmullw xmm1, xmm1
952 movdqa xmm2, xmm1
954 punpcklwd xmm1, xmm0
955 psubd xmm6, xmm1
957 punpckhwd xmm2, xmm0
958 psubd xmm7, xmm2
// xmm3/xmm4 = 15*sumsq - sum*sum - flimit (x15 done as (x << 4) - x); the
// arithmetic shift by 31 turns the sign into an all-ones / all-zeros mask
961 movdqa xmm3, xmm6
962 pslld xmm3, 4
964 psubd xmm3, xmm6
965 movdqa xmm1, xmm5
967 movdqa xmm4, xmm5
968 pmullw xmm1, xmm1
970 pmulhw xmm4, xmm4
971 movdqa xmm2, xmm1
973 punpcklwd xmm1, xmm4
974 punpckhwd xmm2, xmm4
976 movdqa xmm4, xmm7
977 pslld xmm4, 4
979 psubd xmm4, xmm7
981 psubd xmm3, xmm1
982 psubd xmm4, xmm2
984 psubd xmm3, flimit4
985 psubd xmm4, flimit4
987 psrad xmm3, 31
988 psrad xmm4, 31
990 packssdw xmm3, xmm4
991 packsswb xmm3, xmm0
// centre row of the window is 8 rows below esi; xmm2 keeps the original bytes
993 movq xmm1, QWORD PTR [esi+eax*8]
995 movq xmm2, xmm1
996 punpcklbw xmm1, xmm0
998 paddw xmm1, xmm5
999 mov ecx, edx
// presumably loads eight vp8_rv entries starting at index (edx & 127), since
// vp8_rv holds 2-byte shorts -- TODO confirm the inline-asm addressing
1001 and ecx, 127
1002 movdqu xmm4, vp8_rv[ecx*2]
1004 paddw xmm1, xmm4
1005 //paddw xmm1, eight8s
1006 psraw xmm1, 4
1008 packuswb xmm1, xmm0
1009 pand xmm1, xmm3
1011 pandn xmm3, xmm2
1012 por xmm1, xmm3
// park the result in d slot (edx & 15); write back the slot from 8 rows ago
1014 and ecx, 15
1015 movq QWORD PTR d[ecx*8], xmm1
1017 mov ecx, edx
1018 sub ecx, 8
1020 and ecx, 15
1021 movq mm0, d[ecx*8]
1023 movq [esi], mm0
1024 lea esi, [esi+eax]
1026 lea edi, [edi+eax]
1027 add edx, 1
1029 cmp edx, rows
1030 jl loop_row
/*
 * Compiled out (#if 0): a version of vp8_plane_add_noise_wmt that builds its
 * noise distribution at call time from vp8_gaussian() and rand(). The version
 * that is compiled lives in the #else branch near the end of this file.
 */
1036 #if 0
1037 /****************************************************************************
1039 * ROUTINE : plane_add_noise_wmt
1041 * INPUTS : unsigned char *Start starting address of buffer to add gaussian
1042 * noise to
1043 * unsigned int Width width of plane
1044 * unsigned int Height height of plane
1045 * int Pitch distance between subsequent lines of frame
1046 * int q quantizer used to determine amount of noise
1047 * to add
1049 * OUTPUTS : None.
1051 * RETURNS : void.
1053 * FUNCTION : adds gaussian noise to a plane of pixels
1055 * SPECIAL NOTES : None.
1057 ****************************************************************************/
1058 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1060 unsigned int i;
1062 __declspec(align(16)) unsigned char blackclamp[16];
1063 __declspec(align(16)) unsigned char whiteclamp[16];
1064 __declspec(align(16)) unsigned char bothclamp[16];
1065 char char_dist[300];
1066 char Rand[2048];
1067 double sigma;
1068 // return;
1069 __asm emms
// sigma grows with the parameter a and shrinks as q approaches 63
1070 sigma = a + .5 + .6 * (63 - q) / 63.0;
1072 // set up a lookup table of 256 entries that matches
1073 // a gaussian distribution with sigma determined by q.
// each integer value i in [-32, 32) is repeated round(256 * gaussian(i))
// times; the local "double i" and "int a" below shadow the outer i and the
// parameter a
1076 double i;
1077 int next, j;
1079 next = 0;
1081 for (i = -32; i < 32; i++)
1083 double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
1084 int a = (int)(g + .5);
1086 if (a)
1088 for (j = 0; j < a; j++)
1090 char_dist[next+j] = (char) i;
1093 next = next + j;
// pad any remaining entries up to 256 with zero noise
1098 for (next = next; next < 256; next++)
1099 char_dist[next] = 0;
// Rand: 2048 noise bytes drawn from the table with rand()
1103 for (i = 0; i < 2048; i++)
1105 Rand[i] = char_dist[rand() & 0xff];
// clamp amounts are derived from char_dist[0], the first table entry
1108 for (i = 0; i < 16; i++)
1110 blackclamp[i] = -char_dist[0];
1111 whiteclamp[i] = -char_dist[0];
1112 bothclamp[i] = -2 * char_dist[0];
// one row per iteration; each row starts reading Rand at a random offset 0..255
1115 for (i = 0; i < Height; i++)
1117 unsigned char *Pos = Start + i * Pitch;
1118 char *Ref = Rand + (rand() & 0xff);
1120 __asm
1122 mov ecx, [Width]
1123 mov esi, Pos
1124 mov edi, Ref
1125 xor eax, eax
1127 nextset:
1128 movdqu xmm1, [esi+eax] // get the source
// with c = blackclamp = whiteclamp and bothclamp = 2c, the saturating
// subtract / add / subtract limits each byte to [c, 255 - c] (assuming 2c <= 255)
1130 psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
1131 paddusb xmm1, bothclamp
1132 psubusb xmm1, whiteclamp
1134 movdqu xmm2, [edi+eax] // get the noise for this line
1135 paddb xmm1, xmm2 // add it in
1136 movdqu [esi+eax], xmm1 // store the result
1138 add eax, 16 // advance to the next 16 bytes of this row
1140 cmp eax, ecx
1141 jl nextset
1148 #endif
/* Rounding term (8 per dword) added before the >> 4 in vp8_mbpost_proc_across_ip_xmm. */
1149 __declspec(align(16))
1150 static const int four8s[4] = { 8, 8, 8, 8};
/*
 * vp8_mbpost_proc_across_ip_xmm
 *
 * Horizontal, in-place post-filter over a 15-pixel sliding window.
 *
 *   src, pitch : first row and row stride; s advances by pitch per row.
 *   rows       : number of rows processed.
 *   cols       : the asm loop steps ecx by 4 from 0 while ecx < cols + 8.
 *   flimit     : mapped through vp8_q2mbl() and then used as the threshold.
 *
 * Per row the C code seeds sum and sumsq from the 15 pixels s[-8]..s[6]; the
 * asm then keeps them running in xmm6 (sum) and xmm7 (sumsq), loading four
 * pixels at [esi+ecx-8] and four at [esi+ecx+7] each iteration. A pixel is
 * replaced by (pixel + sum + 8) >> 4 when 15*sumsq - sum*sum - flimit is
 * negative, otherwise it is kept.
 *
 * Results are staged xmm5 -> mm1 -> mm0 and stored at [esi+ecx-8], i.e. two
 * iterations after they are computed. Because mm0 and mm1 start at zero, the
 * first two stores write zeros to s[-8..-1].
 */
1151 void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
1153 int r, i;
1154 __declspec(align(16))
1155 int flimit4[4];
1156 unsigned char *s = src;
1157 int sumsq;
1158 int sum;
1161 flimit = vp8_q2mbl(flimit);
1162 flimit4[0] =
1163 flimit4[1] =
1164 flimit4[2] =
1165 flimit4[3] = flimit;
1167 for (r = 0; r < rows; r++)
1171 sumsq = 0;
1172 sum = 0;
1174 for (i = -8; i <= 6; i++)
1176 sumsq += s[i] * s[i];
1177 sum += s[i];
1180 __asm
1182 mov eax, sumsq
1183 movd xmm7, eax
1185 mov eax, sum
1186 movd xmm6, eax
1188 mov esi, s
1189 xor ecx, ecx
1191 mov edx, cols
1192 add edx, 8
1193 pxor mm0, mm0
1194 pxor mm1, mm1
1196 pxor xmm0, xmm0
1197 nextcol4:
1199 movd xmm1, DWORD PTR [esi+ecx-8] // -8 -7 -6 -5
1200 movd xmm2, DWORD PTR [esi+ecx+7] // +7 +8 +9 +10
1202 punpcklbw xmm1, xmm0 // expanding
1203 punpcklbw xmm2, xmm0 // expanding
1205 punpcklwd xmm1, xmm0 // expanding to dwords
1206 punpcklwd xmm2, xmm0 // expanding to dwords
1208 psubd xmm2, xmm1 // 7--8 8--7 9--6 10--5
1209 paddd xmm1, xmm1 // -8*2 -7*2 -6*2 -5*2
1211 paddd xmm1, xmm2 // 7+-8 8+-7 9+-6 10+-5
1212 pmaddwd xmm1, xmm2 // squared of 7+-8 8+-7 9+-6 10+-5
1214 paddd xmm6, xmm2
1215 paddd xmm7, xmm1
1217 pshufd xmm6, xmm6, 0 // duplicate the last ones
1218 pshufd xmm7, xmm7, 0 // duplicate the last ones
1220 psrldq xmm1, 4 // 8--7 9--6 10--5 0000
1221 psrldq xmm2, 4 // 8--7 9--6 10--5 0000
1223 pshufd xmm3, xmm1, 3 // 0000 8--7 8--7 8--7 squared
1224 pshufd xmm4, xmm2, 3 // 0000 8--7 8--7 8--7 squared
1226 paddd xmm6, xmm4
1227 paddd xmm7, xmm3
1229 pshufd xmm3, xmm1, 01011111b // 0000 0000 9--6 9--6 squared
1230 pshufd xmm4, xmm2, 01011111b // 0000 0000 9--6 9--6 squared
1232 paddd xmm7, xmm3
1233 paddd xmm6, xmm4
1235 pshufd xmm3, xmm1, 10111111b // 0000 0000 8--7 8--7 squared
1236 pshufd xmm4, xmm2, 10111111b // 0000 0000 8--7 8--7 squared
1238 paddd xmm7, xmm3
1239 paddd xmm6, xmm4
// xmm5 = 15*sumsq - sum*sum - flimit; the sign becomes the per-pixel mask
1241 movdqa xmm3, xmm6
1242 pmaddwd xmm3, xmm3
1244 movdqa xmm5, xmm7
1245 pslld xmm5, 4
1247 psubd xmm5, xmm7
1248 psubd xmm5, xmm3
1250 psubd xmm5, flimit4
1251 psrad xmm5, 31
1253 packssdw xmm5, xmm0
1254 packsswb xmm5, xmm0
// candidate value = (pixel + sum + 8) >> 4; xmm2 keeps the original bytes
1256 movd xmm1, DWORD PTR [esi+ecx]
1257 movq xmm2, xmm1
1259 punpcklbw xmm1, xmm0
1260 punpcklwd xmm1, xmm0
1262 paddd xmm1, xmm6
1263 paddd xmm1, four8s
1265 psrad xmm1, 4
1266 packssdw xmm1, xmm0
1268 packuswb xmm1, xmm0
1269 pand xmm1, xmm5
1271 pandn xmm5, xmm2
1272 por xmm5, xmm1
// two-stage delay line: store the result from two iterations ago, then shift
// mm1 -> mm0 and the fresh result -> mm1
1274 movd [esi+ecx-8], mm0
1275 movq mm0, mm1
1277 movdq2q mm1, xmm5
// keep only the top dword of the running sums for the next iteration
1278 psrldq xmm7, 12
1280 psrldq xmm6, 12
1281 add ecx, 4
1283 cmp ecx, edx
1284 jl nextcol4
1287 s += pitch;
/*
 * Compiled out (#if 0): a version of vp8_plane_add_noise_mmx that builds its
 * noise distribution at call time from vp8_gaussian() and rand(). The matching
 * #else branch that follows holds the definitions that are compiled.
 */
1291 #if 0
1293 /****************************************************************************
1295 * ROUTINE : plane_add_noise_mmx
1297 * INPUTS : unsigned char *Start starting address of buffer to add gaussian
1298 * noise to
1299 * unsigned int Width width of plane
1300 * unsigned int Height height of plane
1301 * int Pitch distance between subsequent lines of frame
1302 * int q quantizer used to determine amount of noise
1303 * to add
1305 * OUTPUTS : None.
1307 * RETURNS : void.
1309 * FUNCTION : adds gaussian noise to a plane of pixels
1311 * SPECIAL NOTES : None.
1313 ****************************************************************************/
1314 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1316 unsigned int i;
// Pitch4 and noise_adder are not referenced again in the visible body;
// noise_amount is used only to initialise noise_adder
1317 int Pitch4 = Pitch * 4;
1318 const int noise_amount = 2;
1319 const int noise_adder = 2 * noise_amount + 1;
1321 __declspec(align(16)) unsigned char blackclamp[16];
1322 __declspec(align(16)) unsigned char whiteclamp[16];
1323 __declspec(align(16)) unsigned char bothclamp[16];
1325 char char_dist[300];
1326 char Rand[2048];
1328 double sigma;
1329 __asm emms
// sigma grows with the parameter a and shrinks as q approaches 63
1330 sigma = a + .5 + .6 * (63 - q) / 63.0;
1332 // set up a lookup table of 256 entries that matches
1333 // a gaussian distribution with sigma determined by q.
// each integer value i in [-32, 32) is repeated round(256 * gaussian(i))
// times; the local "double i" and "int a" below shadow the outer i and the
// parameter a, and "sum" is not referenced again in the visible body
1336 double i, sum = 0;
1337 int next, j;
1339 next = 0;
1341 for (i = -32; i < 32; i++)
1343 int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
1345 if (a)
1347 for (j = 0; j < a; j++)
1349 char_dist[next+j] = (char) i;
1352 next = next + j;
// pad any remaining entries up to 256 with zero noise
1357 for (next = next; next < 256; next++)
1358 char_dist[next] = 0;
// Rand: 2048 noise bytes drawn from the table with rand()
1362 for (i = 0; i < 2048; i++)
1364 Rand[i] = char_dist[rand() & 0xff];
// clamp amounts are derived from char_dist[0], the first table entry
1367 for (i = 0; i < 16; i++)
1369 blackclamp[i] = -char_dist[0];
1370 whiteclamp[i] = -char_dist[0];
1371 bothclamp[i] = -2 * char_dist[0];
// one row per iteration; each row starts reading Rand at a random offset 0..255
1374 for (i = 0; i < Height; i++)
1376 unsigned char *Pos = Start + i * Pitch;
1377 char *Ref = Rand + (rand() & 0xff);
1379 __asm
1381 mov ecx, [Width]
1382 mov esi, Pos
1383 mov edi, Ref
1384 xor eax, eax
1386 nextset:
1387 movq mm1, [esi+eax] // get the source
// with c = blackclamp = whiteclamp and bothclamp = 2c, the saturating
// subtract / add / subtract limits each byte to [c, 255 - c] (assuming 2c <= 255)
1389 psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
1390 paddusb mm1, bothclamp
1391 psubusb mm1, whiteclamp
1393 movq mm2, [edi+eax] // get the noise for this line
1394 paddb mm1, mm2 // add it in
1395 movq [esi+eax], mm1 // store the result
1397 add eax, 8 // advance to the next 8 bytes of this row
1399 cmp eax, ecx
1400 jl nextset
1407 #else
/*
 * Tables defined elsewhere and indexed here as [a][q]:
 *   an[a][q] : 3072 noise bytes; each row reads them from a random start
 *              offset in 0..255.
 *   cd[a][q] : value whose negation is used as the clamp amount below.
 */
1408 extern char an[8][64][3072];
1409 extern int cd[8][64];
/*
 * vp8_plane_add_noise_mmx
 *
 * Adds noise bytes from an[a][q] to a plane in place, 8 bytes per step (MMX).
 *
 *   Start  : first byte of the plane.
 *   Width  : bytes per row to process; the inner loop steps by 8 and tests
 *            "eax < Width" after the step, so at least 8 bytes are processed
 *            and a non-multiple of 8 is rounded up.
 *   Height : number of rows.
 *   Pitch  : distance in bytes between consecutive rows.
 *   q, a   : indices into an[][] and cd[][]; they are not range-checked here.
 *
 * Before the noise is added (paddb) each byte is limited to [c, 255 - c],
 * with c = -cd[a][q] stored as an unsigned char (assuming 2c <= 255).
 */
1411 void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1413 unsigned int i;
1414 __declspec(align(16)) unsigned char blackclamp[16];
1415 __declspec(align(16)) unsigned char whiteclamp[16];
1416 __declspec(align(16)) unsigned char bothclamp[16];
1419 __asm emms
// all 16 bytes of each clamp array are filled, although movq-sized operations
// below use only the first 8
1421 for (i = 0; i < 16; i++)
1423 blackclamp[i] = -cd[a][q];
1424 whiteclamp[i] = -cd[a][q];
1425 bothclamp[i] = -2 * cd[a][q];
1428 for (i = 0; i < Height; i++)
1430 unsigned char *Pos = Start + i * Pitch;
1431 char *Ref = an[a][q] + (rand() & 0xff);
1433 __asm
1435 mov ecx, [Width]
1436 mov esi, Pos
1437 mov edi, Ref
1438 xor eax, eax
1440 nextset:
1441 movq mm1, [esi+eax] // get the source
1443 psubusb mm1, blackclamp // clamp both sides so we don't outrange adding noise
1444 paddusb mm1, bothclamp
1445 psubusb mm1, whiteclamp
1447 movq mm2, [edi+eax] // get the noise for this line
1448 paddb mm1, mm2 // add it in
1449 movq [esi+eax], mm1 // store the result
1451 add eax, 8 // advance to the next 8 bytes of this row
1453 cmp eax, ecx
1454 jl nextset
/*
 * vp8_plane_add_noise_wmt
 *
 * Adds noise bytes from an[a][q] to a plane in place, 16 bytes per step
 * (SSE2, unaligned loads and stores).
 *
 *   Start  : first byte of the plane.
 *   Width  : bytes per row to process; the inner loop steps by 16 and tests
 *            "eax < Width" after the step, so at least 16 bytes are processed
 *            and a non-multiple of 16 is rounded up.
 *   Height : number of rows.
 *   Pitch  : distance in bytes between consecutive rows.
 *   q, a   : indices into an[][] and cd[][]; they are not range-checked here.
 *
 * Before the noise is added (paddb) each byte is limited to [c, 255 - c],
 * with c = -cd[a][q] stored as an unsigned char (assuming 2c <= 255). Each row
 * reads its noise from an[a][q] starting at a random offset in 0..255.
 */
1460 void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
1462 unsigned int i;
1464 __declspec(align(16)) unsigned char blackclamp[16];
1465 __declspec(align(16)) unsigned char whiteclamp[16];
1466 __declspec(align(16)) unsigned char bothclamp[16];
1468 __asm emms
1470 for (i = 0; i < 16; i++)
1472 blackclamp[i] = -cd[a][q];
1473 whiteclamp[i] = -cd[a][q];
1474 bothclamp[i] = -2 * cd[a][q];
1477 for (i = 0; i < Height; i++)
1479 unsigned char *Pos = Start + i * Pitch;
1480 char *Ref = an[a][q] + (rand() & 0xff);
1482 __asm
1484 mov ecx, [Width]
1485 mov esi, Pos
1486 mov edi, Ref
1487 xor eax, eax
1489 nextset:
1490 movdqu xmm1, [esi+eax] // get the source
1492 psubusb xmm1, blackclamp // clamp both sides so we don't outrange adding noise
1493 paddusb xmm1, bothclamp
1494 psubusb xmm1, whiteclamp
1496 movdqu xmm2, [edi+eax] // get the noise for this line
1497 paddb xmm1, xmm2 // add it in
1498 movdqu [esi+eax], xmm1 // store the result
1500 add eax, 16 // advance to the next 16 bytes of this row
1502 cmp eax, ecx
1503 jl nextset
1508 #endif