;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"
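
; arg(n), GLOBAL() and SHADOW_ARGS_TO_STACK below come from
; x86_abi_support.asm: SHADOW_ARGS_TO_STACK n copies the n register-passed
; arguments to the shadow/stack area on x86-64 so that arg(k) can address
; the k-th (0-based) argument the same way on every supported ABI.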
;void vp8_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
global sym(vp8_loop_filter_horizontal_edge_mmx)
sym(vp8_loop_filter_horizontal_edge_mmx):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step     ; destination pitch?

    movsxd      rcx, dword ptr arg(5)           ;count

    mov         rdx, arg(3)                     ;limit
    movq        mm7, [rdx]                      ; limit

    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions
    movq        mm2, [rdi+2*rax]                ; q3
    movq        mm1, [rsi+2*rax]                ; q2

    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)

    movq        mm4, [rsi+rax]                  ; q1

    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)

    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0

    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2

    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)

    movq        mm4, [rsi+2*rax]                ; p1

    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)

    movq        mm4, [rsi+rax]                  ; p0

    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1

    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit             ; get flimit
    movq        mm2, [rdx]                      ; flimit mm2
    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm7, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm5, mm7                        ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1
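
    ; A lane of the mask is 0xff only when every saturating subtraction
    ; above produced zero. Roughly, in the terms of the scalar reference
    ; filter:
    ;
    ;   mask = (abs(p3 - p2) <= limit) && (abs(p2 - p1) <= limit) &&
    ;          (abs(p1 - p0) <= limit) && (abs(q1 - q0) <= limit) &&
    ;          (abs(q2 - q1) <= limit) && (abs(q3 - q2) <= limit) &&
    ;          (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit)
    ;          ? 0xff : 0;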
    ; calculate high edge variance
    mov         rdx, arg(4)                     ;thresh             ; get thresh

    movq        mm4, t0                         ; get abs (q1 - q0)

    movq        mm3, t1                         ; get abs (p1 - p0)

    paddb       mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
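
    ; High-edge-variance test, roughly the scalar reference's hev mask:
    ;
    ;   hev = (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh) ? 0xff : 0;
    ;
    ; paddb stands in for the logical OR here: the preceding saturating
    ; subtractions leave zero in every lane that passed, so any nonzero
    ; byte marks a high-variance lane.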
    ; start work on filters
    movq        mm2, [rsi+2*rax]                ; p1

    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)
    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    paddsb      mm1, [GLOBAL(t4)]               ; 3 * (q0 - p0) + hvm(p1 - q1) + 4
    paddsb      mm2, [GLOBAL(t3)]               ; 3 * (q0 - p0) + hvm(p1 - q1) + 3

    movq        mm2, mm0                        ; (3 * (q0 - p0) + hvm(p1 - q1) + 3) >> 3

    movq        mm5, mm1                        ; abcdefgh
    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3

    punpckhbw   mm1, mm5                        ; a0b0c0d0
    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3 * (q0 - p0) + hvm(p1 - q1) + 4) >> 3
    paddsw      mm5, [GLOBAL(ones)]
    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap
    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3 * (q0 - p0) + hvm(p1 - q1) + 4) >> 4
    pandn       mm4, mm5                        ; high edge variance additive
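
    ; In C terms (clamp() being vp8_signed_char_clamp), the taps just
    ; built are, roughly:
    ;
    ;   vp8_filter = clamp(hvm(p1 - q1) + 3 * (q0 - p0)) & mask;
    ;   Filter1 = clamp(vp8_filter + 4) >> 3;             now in mm0, for q0
    ;   Filter2 = clamp(vp8_filter + 3) >> 3;             now in mm2, for p0
    ;   u = (Filter1 + 1) >> 1, kept only where !hev;     now in mm4, for p1/q1
    ;
    ; The write-backs below apply p0 += Filter2, q0 -= Filter1,
    ; p1 += u and q1 -= u.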
    paddsb      mm6, mm2                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+rax], mm6                  ; write back

    movq        mm6, [rsi+2*rax]                ; p1
    pxor        mm6, [GLOBAL(t80)]              ; reoffset
    paddsb      mm6, mm4                        ; p1+= p1 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+2*rax], mm6                ; write back

    psubsb      mm3, mm0                        ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset
    movq        [rsi], mm3                      ; write back

    psubsb      mm7, mm4                        ; q1-= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset
    movq        [rdi], mm7                      ; write back
;void vp8_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
global sym(vp8_loop_filter_vertical_edge_mmx)
sym(vp8_loop_filter_vertical_edge_mmx):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 64                         ; reserve 64 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[32];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step     ; destination pitch?

    lea         rsi, [rsi + rax*4 - 4]

    movsxd      rcx, dword ptr arg(5)           ;count

    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing

    ;transpose
    movq        mm6, [rsi+2*rax]                ; 67 66 65 64 63 62 61 60
    movq        mm7, mm6                        ; 67 66 65 64 63 62 61 60

    punpckhbw   mm7, [rdi+2*rax]                ; 77 67 76 66 75 65 74 64
    punpcklbw   mm6, [rdi+2*rax]                ; 73 63 72 62 71 61 70 60

    movq        mm4, [rsi]                      ; 47 46 45 44 43 42 41 40
    movq        mm5, mm4                        ; 47 46 45 44 43 42 41 40

    punpckhbw   mm5, [rsi+rax]                  ; 57 47 56 46 55 45 54 44
    punpcklbw   mm4, [rsi+rax]                  ; 53 43 52 42 51 41 50 40

    movq        mm3, mm5                        ; 57 47 56 46 55 45 54 44
    punpckhwd   mm5, mm7                        ; 77 67 57 47 76 66 56 46

    punpcklwd   mm3, mm7                        ; 75 65 55 45 74 64 54 44
    movq        mm2, mm4                        ; 53 43 52 42 51 41 50 40

    punpckhwd   mm4, mm6                        ; 73 63 53 43 72 62 52 42
    punpcklwd   mm2, mm6                        ; 71 61 51 41 70 60 50 40

    movq        mm6, [rsi+rax*2]                ; 27 26 25 24 23 22 21 20

    movq        mm1, mm6                        ; 27 26 25 24 23 22 21 20
    punpckhbw   mm6, [rsi+rax]                  ; 37 27 36 26 35 25 34 24

    punpcklbw   mm1, [rsi+rax]                  ; 33 23 32 22 31 21 30 20
    movq        mm7, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00

    punpckhbw   mm7, [rdi+rax*4]                ; 17 07 16 06 15 05 14 04
    movq        mm0, mm7                        ; 17 07 16 06 15 05 14 04

    punpckhwd   mm7, mm6                        ; 37 27 17 07 36 26 16 06
    punpcklwd   mm0, mm6                        ; 35 25 15 05 34 24 14 04

    movq        mm6, mm7                        ; 37 27 17 07 36 26 16 06
    punpckhdq   mm7, mm5                        ; 77 67 57 47 37 27 17 07 = q3

    punpckldq   mm6, mm5                        ; 76 66 56 46 36 26 16 06 = q2

    movq        mm5, mm6                        ; 76 66 56 46 36 26 16 06
    psubusb     mm5, mm7                        ; q2-q3

    psubusb     mm7, mm6                        ; q3-q2
    por         mm7, mm5                        ; mm7 = abs(q3-q2)

    movq        mm5, mm0                        ; 35 25 15 05 34 24 14 04
    punpckhdq   mm5, mm3                        ; 75 65 55 45 35 25 15 05 = q1

    punpckldq   mm0, mm3                        ; 74 64 54 44 34 24 14 04 = q0
    movq        mm3, mm5                        ; 75 65 55 45 35 25 15 05 = q1

    psubusb     mm3, mm6                        ; q1-q2
    psubusb     mm6, mm5                        ; q2-q1
    por         mm6, mm3                        ; mm6 = abs(q2-q1)

    lea         rdx, srct                       ; rdx points to srct
    movq        [rdx+24], mm5                   ; save q1
    movq        [rdx+16], mm0                   ; save q0

    movq        mm3, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpcklbw   mm3, [rdi+rax*4]                ; 13 03 12 02 11 01 10 00

    movq        mm0, mm3                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm1                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm3, mm1                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm2                        ; 70 60 50 40 30 20 10 00 = p3
    punpckhdq   mm1, mm2                        ; 71 61 51 41 31 21 11 01 = p2

    movq        mm2, mm1                        ; 71 61 51 41 31 21 11 01 = p2
    psubusb     mm2, mm0                        ; p2-p3

    psubusb     mm0, mm1                        ; p3-p2
    por         mm0, mm2                        ; mm0 = abs(p3-p2)

    movq        mm2, mm3                        ; 33 23 13 03 32 22 12 02
    punpckldq   mm2, mm4                        ; 72 62 52 42 32 22 12 02 = p1

    punpckhdq   mm3, mm4                        ; 73 63 53 43 33 23 13 03 = p0
    movq        [rdx+8], mm3                    ; save p0

    movq        [rdx], mm2                      ; save p1
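
    ; The punpckl/punpckh ladder above is the usual MMX 8x8 byte transpose:
    ; bytes into word pairs, words into dword quads, dwords into qwords,
    ; after which each MMX register holds one pixel column (p3..q3) across
    ; the eight rows. p1/p0/q0/q1 are spilled to srct because eight live
    ; columns plus the mask work do not fit in the eight MMX registers.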
    movq        mm5, mm2                        ; mm5 = p1
    psubusb     mm2, mm1                        ; p1-p2
    psubusb     mm1, mm5                        ; p2-p1
    por         mm1, mm2                        ; mm1 = abs(p2-p1)

    mov         rdx, arg(3)                     ;limit
    movq        mm4, [rdx]                      ; mm4 = limit

    por         mm0, mm7                        ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit

    movq        mm7, mm3                        ; mm3 = mm7 = p0
    psubusb     mm7, mm5                        ; p0 - p1
    psubusb     mm5, mm3                        ; p1 - p0
    por         mm5, mm7                        ; abs(p1-p0)

    movq        t0, mm5                         ; save abs(p1-p0)

    por         mm0, mm5                        ; mm0 = mask

    lea         rdx, srct                       ; rdx points back to srct
    movq        mm5, [rdx+16]                   ; mm5 = q0
    movq        mm7, [rdx+24]                   ; mm7 = q1

    movq        mm6, mm5                        ; mm6 = q0

    psubusb     mm5, mm7                        ; q0-q1

    psubusb     mm7, mm6                        ; q1-q0
    por         mm7, mm5                        ; abs(q1-q0)

    movq        t1, mm7                         ; save abs(q1-q0)

    psubusb     mm5, mm1                        ; q1-=p1
    psubusb     mm1, mm2                        ; p1-=q1
    por         mm5, mm1                        ; abs(p1-q1)
    pand        mm5, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm5, 1                          ; abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit

    movq        mm2, [rdx]                      ;flimit mm2
    movq        mm1, mm3                        ; mm1 = mm3 = p0

    movq        mm7, mm6                        ; mm7 = mm6 = q0
    psubusb     mm1, mm7                        ; p0-q0

    psubusb     mm7, mm3                        ; q0-p0
    por         mm1, mm7                        ; abs(q0-p0)
    paddusb     mm1, mm1                        ; abs(q0-p0)*2
    paddusb     mm1, mm5                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm4, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm1, mm4                        ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit

    ; calculate high edge variance
    mov         rdx, arg(4)                     ;thresh             ; get thresh

    movq        mm4, t0                         ; get abs (q1 - q0)

    movq        mm3, t1                         ; get abs (p1 - p0)

    por         mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    ; start work on filters
    lea         rdx, srct                       ; rdx points back to srct
    movq        mm7, [rdx+24]                   ; q1

    movq        mm6, [rdx+8]                    ; p0
    movq        mm0, [rdx+16]                   ; q0

    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)
    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    paddsb      mm1, [GLOBAL(t4)]               ; 3 * (q0 - p0) + hvm(p1 - q1) + 4
    paddsb      mm2, [GLOBAL(t3)]               ; 3 * (q0 - p0) + hvm(p1 - q1) + 3

    movq        mm2, mm0                        ; (3 * (q0 - p0) + hvm(p1 - q1) + 3) >> 3

    movq        mm5, mm1                        ; abcdefgh

    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3

    punpckhbw   mm1, mm5                        ; a0b0c0d0

    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3 * (q0 - p0) + hvm(p1 - q1) + 4) >> 3
    paddsw      mm5, [GLOBAL(ones)]

    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap

    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3 * (q0 - p0) + hvm(p1 - q1) + 4) >> 4

    pandn       mm4, mm5                        ; high edge variance additive

    paddsb      mm6, mm2                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset

    pxor        mm1, [GLOBAL(t80)]              ; reoffset
    paddsb      mm1, mm4                        ; p1+= p1 add
    pxor        mm1, [GLOBAL(t80)]              ; unoffset

    psubsb      mm3, mm0                        ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset

    psubsb      mm7, mm4                        ; q1-= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset

    ; transpose and write back
    ; mm1 = 72 62 52 42 32 22 12 02
    ; mm6 = 73 63 53 43 33 23 13 03
    ; mm3 = 74 64 54 44 34 24 14 04
    ; mm7 = 75 65 55 45 35 25 15 05

    movq        mm2, mm1                        ; 72 62 52 42 32 22 12 02
    punpcklbw   mm2, mm6                        ; 33 32 23 22 13 12 03 02

    movq        mm4, mm3                        ; 74 64 54 44 34 24 14 04
    punpckhbw   mm1, mm6                        ; 73 72 63 62 53 52 43 42

    punpcklbw   mm4, mm7                        ; 35 34 25 24 15 14 05 04
    punpckhbw   mm3, mm7                        ; 75 74 65 64 55 54 45 44

    movq        mm6, mm2                        ; 33 32 23 22 13 12 03 02
    punpcklwd   mm2, mm4                        ; 15 14 13 12 05 04 03 02

    punpckhwd   mm6, mm4                        ; 35 34 33 32 25 24 23 22
    movq        mm5, mm1                        ; 73 72 63 62 53 52 43 42

    punpcklwd   mm1, mm3                        ; 55 54 53 52 45 44 43 42
    punpckhwd   mm5, mm3                        ; 75 74 73 72 65 64 63 62

    ; mm2 = 15 14 13 12 05 04 03 02
    ; mm6 = 35 34 33 32 25 24 23 22
    ; mm5 = 55 54 53 52 45 44 43 42
    ; mm1 = 75 74 73 72 65 64 63 62

    movd        [rsi+rax*4+2], mm2
    psrlq       mm2, 32                         ; next row's four pixels
    movd        [rdi+rax*4+2], mm2
    movd        [rsi+rax*2+2], mm6

    movd        [rdi+rax*2+2], mm5
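
    ; Only the four middle columns (p1 p0 q0 q1) were modified, so the
    ; write-back stores one dword per row at byte offset +2 into the
    ; 8-pixel-wide strip instead of re-transposing all 64 bytes.
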
;void vp8_mbloop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
global sym(vp8_mbloop_filter_horizontal_edge_mmx)
sym(vp8_mbloop_filter_horizontal_edge_mmx):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step     ; destination pitch?

    movsxd      rcx, dword ptr arg(5)           ;count

    mov         rdx, arg(3)                     ;limit
    movq        mm7, [rdx]                      ; limit

    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions
    movq        mm2, [rdi+2*rax]                ; q3
    movq        mm1, [rsi+2*rax]                ; q2

    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)

    ; mm1 = abs(q3-q2), mm6 = q2, mm7 = limit
    movq        mm4, [rsi+rax]                  ; q1

    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)

    ; mm1 = mask, mm3 = q1, mm7 = limit

    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0

    ; mm1 = mask, mm0 = q0, mm7 = limit, t0 = abs(q0-q1)

    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2

    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)

    ; mm1 = mask, mm0 = q0, mm7 = limit, t0 = abs(q0-q1)

    movq        mm4, [rsi+2*rax]                ; p1

    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)

    ; mm1 = mask, mm0 = q0, mm7 = limit, t0 = abs(q0-q1)

    movq        mm4, [rsi+rax]                  ; p0

    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1

    ; mm1 = mask, mm0 = q0, mm7 = limit, t0 = abs(q0-q1), t1 = abs(p1-p0)

    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit             ; get flimit
    movq        mm2, [rdx]                      ; flimit mm2
    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm7, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm5, mm7                        ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1

    ; mm1 = mask, mm0 = q0, mm7 = flimit, t0 = abs(q0-q1), t1 = abs(p1-p0)

    ; calculate high edge variance
    mov         rdx, arg(4)                     ;thresh             ; get thresh

    movq        mm4, t0                         ; get abs (q1 - q0)

    movq        mm3, t1                         ; get abs (p1 - p0)

    paddb       mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    ; mm1 = mask, mm0 = q0, mm7 = thresh, t0 = abs(q0-q1), t1 = abs(p1-p0)

    ; start work on filters
    movq        mm2, [rsi+2*rax]                ; p1

    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + (p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    ; mm1 = vp8_filter, mm4 = hev, mm6 = ps0, mm3 = qs0
    movq        mm2, mm1                        ; vp8_filter
    pand        mm2, mm4                        ; Filter2 = vp8_filter & hev

    movq        mm5, mm2                        ; Filter2
    paddsb      mm5, [GLOBAL(t3)]               ; vp8_signed_char_clamp(Filter2 + 3)

    punpcklbw   mm0, mm5                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm5                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter2 >>= 3

    movq        mm5, mm0                        ; Filter2

    paddsb      mm2, [GLOBAL(t4)]               ; vp8_signed_char_clamp(Filter2 + 4)

    punpcklbw   mm0, mm2                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm2                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter1 >>= 3

    ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
    psubsb      mm3, mm0                        ; qs0 = qs0 - Filter1
    paddsb      mm6, mm5                        ; ps0 = ps0 + Filter2

    ; mm1 = vp8_filter, mm3 = qs0, mm4 = hev, mm6 = ps0
    ; vp8_filter &= ~hev;
    ; Filter2 = vp8_filter;
    pandn       mm4, mm1                        ; vp8_filter &= ~hev

    ; mm3 = qs0, mm4 = filter2, mm6 = ps0

    ; u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
    ; s = vp8_signed_char_clamp(qs0 - u);
    ; s = vp8_signed_char_clamp(ps0 + u);
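
    ; The taps below are evaluated in 16-bit fixed point: the masked filter
    ; is unpacked to words, pmulhw keeps the high 16 bits of the product
    ; with the s27/s18/s9 constants (27, 18 and 9, pre-scaled for the
    ; high-word multiply), s63 adds the +63 rounding bias, and the final
    ; shift and pack realize, per the C above,
    ; u = vp8_signed_char_clamp((63 + Filter2 * k) >> 7)
    ; for k = 27 (q0/p0), 18 (q1/p1) and 9 (q2/p2).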
    pmulhw      mm1, [GLOBAL(s27)]
    pmulhw      mm2, [GLOBAL(s27)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    ; roughly 2/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
    ; s = vp8_signed_char_clamp(qs1 - u);
    ; s = vp8_signed_char_clamp(ps1 + u);

    pmulhw      mm1, [GLOBAL(s18)]
    pmulhw      mm2, [GLOBAL(s18)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]

    movq        mm6, [rsi+rax*2]                ; p1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    movq        [rsi+rax*2], mm6

    ; roughly 1/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
    ; s = vp8_signed_char_clamp(qs2 - u);
    ; s = vp8_signed_char_clamp(ps2 + u);

    pmulhw      mm1, [GLOBAL(s9)]
    pmulhw      mm2, [GLOBAL(s9)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]

    movq        mm6, [rdi+rax*4]                ; p2

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    movq        [rdi+rax*4], mm6
;void vp8_mbloop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
global sym(vp8_mbloop_filter_vertical_edge_mmx)
sym(vp8_mbloop_filter_vertical_edge_mmx):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 96                         ; reserve 96 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[64];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step     ; destination pitch?

    lea         rsi, [rsi + rax*4 - 4]

    movsxd      rcx, dword ptr arg(5)           ;count

    lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing

    ;transpose
    movq        mm0, [rdi+2*rax]                ; 77 76 75 74 73 72 71 70
    movq        mm6, [rsi+2*rax]                ; 67 66 65 64 63 62 61 60

    movq        mm7, mm6                        ; 67 66 65 64 63 62 61 60
    punpckhbw   mm7, mm0                        ; 77 67 76 66 75 65 74 64

    punpcklbw   mm6, mm0                        ; 73 63 72 62 71 61 70 60
    movq        mm0, [rsi+rax]                  ; 57 56 55 54 53 52 51 50

    movq        mm4, [rsi]                      ; 47 46 45 44 43 42 41 40
    movq        mm5, mm4                        ; 47 46 45 44 43 42 41 40

    punpckhbw   mm5, mm0                        ; 57 47 56 46 55 45 54 44
    punpcklbw   mm4, mm0                        ; 53 43 52 42 51 41 50 40

    movq        mm3, mm5                        ; 57 47 56 46 55 45 54 44
    punpckhwd   mm5, mm7                        ; 77 67 57 47 76 66 56 46

    punpcklwd   mm3, mm7                        ; 75 65 55 45 74 64 54 44
    movq        mm2, mm4                        ; 53 43 52 42 51 41 50 40

    punpckhwd   mm4, mm6                        ; 73 63 53 43 72 62 52 42
    punpcklwd   mm2, mm6                        ; 71 61 51 41 70 60 50 40

    movq        mm7, [rsi+rax]                  ; 37 36 35 34 33 32 31 30
    movq        mm6, [rsi+rax*2]                ; 27 26 25 24 23 22 21 20

    movq        mm1, mm6                        ; 27 26 25 24 23 22 21 20
    punpckhbw   mm6, mm7                        ; 37 27 36 26 35 25 34 24

    punpcklbw   mm1, mm7                        ; 33 23 32 22 31 21 30 20

    movq        mm7, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpckhbw   mm7, [rdi+rax*4]                ; 17 07 16 06 15 05 14 04

    movq        mm0, mm7                        ; 17 07 16 06 15 05 14 04
    punpckhwd   mm7, mm6                        ; 37 27 17 07 36 26 16 06

    punpcklwd   mm0, mm6                        ; 35 25 15 05 34 24 14 04
    movq        mm6, mm7                        ; 37 27 17 07 36 26 16 06

    punpckhdq   mm7, mm5                        ; 77 67 57 47 37 27 17 07 = q3
    punpckldq   mm6, mm5                        ; 76 66 56 46 36 26 16 06 = q2

    movq        mm5, mm6                        ; 76 66 56 46 36 26 16 06

    psubusb     mm5, mm7                        ; q2-q3

    psubusb     mm7, mm6                        ; q3-q2

    por         mm7, mm5                        ; mm7 = abs(q3-q2)
    movq        mm5, mm0                        ; 35 25 15 05 34 24 14 04

    punpckhdq   mm5, mm3                        ; 75 65 55 45 35 25 15 05 = q1

    punpckldq   mm0, mm3                        ; 74 64 54 44 34 24 14 04 = q0
    movq        mm3, mm5                        ; 75 65 55 45 35 25 15 05 = q1

    psubusb     mm3, mm6                        ; q1-q2

    psubusb     mm6, mm5                        ; q2-q1

    por         mm6, mm3                        ; mm6 = abs(q2-q1)

    lea         rdx, srct                       ; rdx points to srct
    movq        [rdx+40], mm5                   ; save q1
    movq        [rdx+32], mm0                   ; save q0

    movq        mm3, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpcklbw   mm3, [rdi+rax*4]                ; 13 03 12 02 11 01 10 00

    movq        mm0, mm3                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm1                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm3, mm1                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm2                        ; 70 60 50 40 30 20 10 00 = p3
    punpckhdq   mm1, mm2                        ; 71 61 51 41 31 21 11 01 = p2

    movq        [rdx], mm0                      ; save p3
    movq        [rdx+8], mm1                    ; save p2

    movq        mm2, mm1                        ; 71 61 51 41 31 21 11 01 = p2
    psubusb     mm2, mm0                        ; p2-p3

    psubusb     mm0, mm1                        ; p3-p2

    por         mm0, mm2                        ; mm0 = abs(p3-p2)

    movq        mm2, mm3                        ; 33 23 13 03 32 22 12 02
    punpckldq   mm2, mm4                        ; 72 62 52 42 32 22 12 02 = p1

    punpckhdq   mm3, mm4                        ; 73 63 53 43 33 23 13 03 = p0
    movq        [rdx+24], mm3                   ; save p0

    movq        [rdx+16], mm2                   ; save p1
    movq        mm5, mm2                        ; mm5 = p1

    psubusb     mm2, mm1                        ; p1-p2
    psubusb     mm1, mm5                        ; p2-p1

    por         mm1, mm2                        ; mm1 = abs(p2-p1)
    mov         rdx, arg(3)                     ;limit

    movq        mm4, [rdx]                      ; mm4 = limit
    psubusb     mm7, mm4                        ; abs(q3-q2) > limit

    psubusb     mm0, mm4                        ; abs(p3-p2) > limit

    psubusb     mm1, mm4                        ; abs(p2-p1) > limit

    psubusb     mm6, mm4                        ; abs(q2-q1) > limit

    por         mm0, mm7                        ; abs(q3-q2) > limit || abs(p3-p2) > limit || abs(p2-p1) > limit || abs(q2-q1) > limit

    movq        mm7, mm3                        ; mm3 = mm7 = p0
    psubusb     mm7, mm5                        ; p0 - p1

    psubusb     mm5, mm3                        ; p1 - p0

    por         mm5, mm7                        ; abs(p1-p0)

    movq        t0, mm5                         ; save abs(p1-p0)

    psubusb     mm5, mm4                        ; mm5 = abs(p1-p0) > limit
    por         mm0, mm5                        ; mm0 = mask
    lea         rdx, srct                       ; rdx points back to srct
    movq        mm5, [rdx+32]                   ; mm5 = q0
    movq        mm7, [rdx+40]                   ; mm7 = q1

    movq        mm6, mm5                        ; mm6 = q0

    psubusb     mm5, mm7                        ; q0-q1

    psubusb     mm7, mm6                        ; q1-q0
    por         mm7, mm5                        ; abs(q1-q0)

    movq        t1, mm7                         ; save abs(q1-q0)
    psubusb     mm7, mm4                        ; mm7 = abs(q1-q0) > limit

    psubusb     mm5, mm1                        ; q1-=p1
    psubusb     mm1, mm2                        ; p1-=q1
    por         mm5, mm1                        ; abs(p1-q1)
    pand        mm5, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm5, 1                          ; abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit

    movq        mm2, [rdx]                      ;flimit mm2
    movq        mm1, mm3                        ; mm1 = mm3 = p0

    movq        mm7, mm6                        ; mm7 = mm6 = q0
    psubusb     mm1, mm7                        ; p0-q0

    psubusb     mm7, mm3                        ; q0-p0
    por         mm1, mm7                        ; abs(q0-p0)
    paddusb     mm1, mm1                        ; abs(q0-p0)*2
    paddusb     mm1, mm5                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    paddb       mm2, mm2                        ; flimit*2 (less than 255)
    paddb       mm4, mm2                        ; flimit*2 + limit (less than 255)

    psubusb     mm1, mm4                        ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    por         mm1, mm0                        ; mask

    ; calculate high edge variance
    mov         rdx, arg(4)                     ;thresh             ; get thresh

    movq        mm4, t0                         ; get abs (q1 - q0)
    psubusb     mm4, mm7                        ; abs(q1 - q0) > thresh

    movq        mm3, t1                         ; get abs (p1 - p0)
    psubusb     mm3, mm7                        ; abs(p1 - p0) > thresh

    por         mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    ; start work on filters
    lea         rdx, srct                       ; rdx points back to srct
    movq        mm2, [rdx+16]                   ; p1
    movq        mm7, [rdx+40]                   ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1

    movq        mm6, [rdx+24]                   ; p0
    movq        mm0, [rdx+32]                   ; q0
    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + (p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + (p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about

    ; mm1 = vp8_filter, mm4 = hev, mm6 = ps0, mm3 = qs0
    movq        mm2, mm1                        ; vp8_filter
    pand        mm2, mm4                        ; Filter2 = vp8_filter & hev

    movq        mm5, mm2                        ; Filter2
    paddsb      mm5, [GLOBAL(t3)]               ; vp8_signed_char_clamp(Filter2 + 3)

    punpcklbw   mm0, mm5                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm5                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter2 >>= 3

    movq        mm5, mm0                        ; Filter2

    paddsb      mm2, [GLOBAL(t4)]               ; vp8_signed_char_clamp(Filter2 + 4)

    punpcklbw   mm0, mm2                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    punpckhbw   mm7, mm2                        ; a0b0c0d0
    psraw       mm7, 11                         ; sign extended shift right by 3
    packsswb    mm0, mm7                        ; Filter1 >>= 3

    ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
    psubsb      mm3, mm0                        ; qs0 = qs0 - Filter1
    paddsb      mm6, mm5                        ; ps0 = ps0 + Filter2

    ; mm1 = vp8_filter, mm3 = qs0, mm4 = hev, mm6 = ps0
    ; vp8_filter &= ~hev;
    ; Filter2 = vp8_filter;
    pandn       mm4, mm1                        ; vp8_filter &= ~hev

    ; mm3 = qs0, mm4 = filter2, mm6 = ps0

    ; u = vp8_signed_char_clamp((63 + Filter2 * 27) >> 7);
    ; s = vp8_signed_char_clamp(qs0 - u);
    ; s = vp8_signed_char_clamp(ps0 + u);

    pmulhw      mm1, [GLOBAL(s27)]
    pmulhw      mm2, [GLOBAL(s27)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    ; roughly 2/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 18) >> 7);
    ; s = vp8_signed_char_clamp(qs1 - u);
    ; s = vp8_signed_char_clamp(ps1 + u);

    pmulhw      mm1, [GLOBAL(s18)]
    pmulhw      mm2, [GLOBAL(s18)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]

    movq        mm3, [rdx + 40]                 ; q1
    movq        mm6, [rdx + 16]                 ; p1

    pxor        mm3, [GLOBAL(t80)]
    pxor        mm6, [GLOBAL(t80)]

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]
    movq        [rdx + 40], mm3
    movq        [rdx + 16], mm6

    ; roughly 1/7th difference across boundary
    ; u = vp8_signed_char_clamp((63 + Filter2 * 9) >> 7);
    ; s = vp8_signed_char_clamp(qs2 - u);
    ; s = vp8_signed_char_clamp(ps2 + u);

    pmulhw      mm1, [GLOBAL(s9)]
    pmulhw      mm2, [GLOBAL(s9)]
    paddw       mm1, [GLOBAL(s63)]
    paddw       mm2, [GLOBAL(s63)]

    pxor        mm6, [GLOBAL(t80)]
    pxor        mm3, [GLOBAL(t80)]

    pxor        mm6, [GLOBAL(t80)]              ; mm6 = 71 61 51 41 31 21 11 01
    pxor        mm3, [GLOBAL(t80)]              ; mm3 = 76 66 56 46 36 26 16 06
    ; transpose and write back
    movq        mm0, [rdx]                      ; mm0 = 70 60 50 40 30 20 10 00
    movq        mm1, mm0                        ; mm1 = 70 60 50 40 30 20 10 00

    punpcklbw   mm0, mm6                        ; mm0 = 31 30 21 20 11 10 01 00
    punpckhbw   mm1, mm6                        ; mm1 = 71 70 61 60 51 50 41 40

    movq        mm2, [rdx+16]                   ; mm2 = 72 62 52 42 32 22 12 02
    movq        mm6, mm2                        ; mm6 = 72 62 52 42 32 22 12 02

    punpcklbw   mm2, [rdx+24]                   ; mm2 = 33 32 23 22 13 12 03 02
    punpckhbw   mm6, [rdx+24]                   ; mm6 = 73 72 63 62 53 52 43 42

    movq        mm5, mm0                        ; mm5 = 31 30 21 20 11 10 01 00
    punpcklwd   mm0, mm2                        ; mm0 = 13 12 11 10 03 02 01 00

    punpckhwd   mm5, mm2                        ; mm5 = 33 32 31 30 23 22 21 20
    movq        mm4, mm1                        ; mm4 = 71 70 61 60 51 50 41 40

    punpcklwd   mm1, mm6                        ; mm1 = 53 52 51 50 43 42 41 40
    punpckhwd   mm4, mm6                        ; mm4 = 73 72 71 70 63 62 61 60

    movq        mm2, [rdx+32]                   ; mm2 = 74 64 54 44 34 24 14 04
    punpcklbw   mm2, [rdx+40]                   ; mm2 = 35 34 25 24 15 14 05 04

    movq        mm6, mm3                        ; mm6 = 76 66 56 46 36 26 16 06
    punpcklbw   mm6, [rdx+56]                   ; mm6 = 37 36 27 26 17 16 07 06

    movq        mm7, mm2                        ; mm7 = 35 34 25 24 15 14 05 04
    punpcklwd   mm2, mm6                        ; mm2 = 17 16 15 14 07 06 05 04

    punpckhwd   mm7, mm6                        ; mm7 = 37 36 35 34 27 26 25 24
    movq        mm6, mm0                        ; mm6 = 13 12 11 10 03 02 01 00

    punpckldq   mm0, mm2                        ; mm0 = 07 06 05 04 03 02 01 00
    punpckhdq   mm6, mm2                        ; mm6 = 17 16 15 14 13 12 11 10

    movq        [rsi+rax*4], mm0                ; write out
    movq        [rdi+rax*4], mm6                ; write out

    movq        mm0, mm5                        ; mm0 = 33 32 31 30 23 22 21 20
    punpckldq   mm0, mm7                        ; mm0 = 27 26 25 24 23 22 21 20

    punpckhdq   mm5, mm7                        ; mm5 = 37 36 35 34 33 32 31 30
    movq        [rsi+rax*2], mm0                ; write out

    movq        [rdi+rax*2], mm5                ; write out

    movq        mm2, [rdx+32]                   ; mm2 = 74 64 54 44 34 24 14 04
    punpckhbw   mm2, [rdx+40]                   ; mm2 = 75 74 65 64 55 54 45 44

    punpckhbw   mm3, [rdx+56]                   ; mm3 = 77 76 67 66 57 56 47 46
    movq        mm5, mm2                        ; mm5 = 75 74 65 64 55 54 45 44

    punpcklwd   mm2, mm3                        ; mm2 = 57 56 55 54 47 46 45 44
    punpckhwd   mm5, mm3                        ; mm5 = 77 76 75 74 67 66 65 64

    movq        mm0, mm1                        ; mm0 = 53 52 51 50 43 42 41 40

    movq        mm3, mm4                        ; mm3 = 73 72 71 70 63 62 61 60
    punpckldq   mm0, mm2                        ; mm0 = 47 46 45 44 43 42 41 40

    punpckhdq   mm1, mm2                        ; mm1 = 57 56 55 54 53 52 51 50
    movq        [rsi], mm0                      ; write out

    movq        [rdi], mm1                      ; write out

    punpckldq   mm3, mm5                        ; mm3 = 67 66 65 64 63 62 61 60
    punpckhdq   mm4, mm5                        ; mm4 = 77 76 75 74 73 72 71 70

    movq        [rsi+rax*2], mm3
    movq        [rdi+rax*2], mm4

    lea         rsi, [rsi+rax*8]
;void vp8_loop_filter_simple_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
    SHADOW_ARGS_TO_STACK 6

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step     ; destination pitch?

    movsxd      rcx, dword ptr arg(5)           ;count

    mov         rdx, arg(3)                     ;limit
    movq        mm7, [rdx]                      ; limit

    mov         rdx, arg(2)                     ;flimit             ; get flimit
    movq        mm3, [rdx]                      ; flimit

    paddb       mm3, mm3                        ; flimit*2 (less than 255)
    paddb       mm3, mm7                        ; flimit*2 + limit (less than 255)

    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing

    movq        mm1, [rsi+2*rax]                ; p1
    movq        mm0, [rdi]                      ; q1

    psubusb     mm0, mm1                        ; q1-=p1
    psubusb     mm1, mm4                        ; p1-=q1
    por         mm1, mm0                        ; abs(p1-q1)
    pand        mm1, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm1, 1                          ; abs(p1-q1)/2

    movq        mm5, [rsi+rax]                  ; p0
    movq        mm4, [rsi]                      ; q0

    psubusb     mm5, mm4                        ; p0-=q0
    psubusb     mm4, mm6                        ; q0-=p0
    por         mm5, mm4                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm1                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    psubusb     mm5, mm3                        ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit

    ; start work on filters
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values

    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; p1 - q1 + 1 * (q0 - p0)
    paddsb      mm2, mm0                        ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm2, mm0                        ; p1 - q1 + 3 * (q0 - p0)
    pand        mm5, mm2                        ; mask filter values we don't care about

    paddsb      mm5, [GLOBAL(t4)]               ; 3 * (q0 - p0) + (p1 - q1) + 4

    movq        mm0, mm5                        ; get a copy of filters
    psllw       mm0, 8                          ; shift left 8
    psraw       mm0, 3                          ; arithmetic shift right 11

    movq        mm1, mm5                        ; get a copy of filters
    psraw       mm1, 11                         ; arithmetic shift right 11
    psllw       mm1, 8                          ; shift left 8 to put it back

    por         mm0, mm1                        ; put the two together to get result
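
    ; MMX has no per-byte arithmetic shift, so the signed >>3 of the eight
    ; filter bytes is synthesized on 16-bit lanes: one copy handles the low
    ; byte of each word (psllw 8 then an arithmetic shift), the other the
    ; high byte (psraw 11 then psllw 8), and por merges the halves. Per
    ; lane this realizes, roughly, the reference C:
    ;
    ;   Filter1 = vp8_signed_char_clamp(vp8_filter + 4) >> 3;    q0 -= Filter1
    ;   Filter2 = vp8_signed_char_clamp(vp8_filter + 3) >> 3;    p0 += Filter2
    ;
    ; where the psubsb by t1s below derives the second tap from the first
    ; ("+3 instead of +4") rather than recomputing the filter.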
    psubsb      mm3, mm0                        ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset
    movq        [rsi], mm3                      ; write back

    psubsb      mm5, [GLOBAL(t1s)]              ; +3 instead of +4

    movq        mm0, mm5                        ; get a copy of filters
    psllw       mm0, 8                          ; shift left 8
    psraw       mm0, 3                          ; arithmetic shift right 11

    psraw       mm5, 11                         ; arithmetic shift right 11
    psllw       mm5, 8                          ; shift left 8 to put it back
    por         mm0, mm5                        ; put the two together to get result

    paddsb      mm6, mm0                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+rax], mm6                  ; write back
;void vp8_loop_filter_simple_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;    int  count
;)
global sym(vp8_loop_filter_simple_vertical_edge_mmx)
sym(vp8_loop_filter_simple_vertical_edge_mmx):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, dword ptr arg(1)           ;src_pixel_step     ; destination pitch?

    lea         rsi, [rsi + rax*4 - 2]

    movsxd      rcx, dword ptr arg(5)           ;count

    lea         rdi, [rsi + rax]

    movd        mm0, [rdi + rax*2]              ; xx xx xx xx 73 72 71 70

    movd        mm6, [rsi + rax*2]              ; xx xx xx xx 63 62 61 60
    punpcklbw   mm6, mm0                        ; 73 63 72 62 71 61 70 60

    movd        mm0, [rsi + rax]                ; xx xx xx xx 53 52 51 50
    movd        mm4, [rsi]                      ; xx xx xx xx 43 42 41 40

    punpcklbw   mm4, mm0                        ; 53 43 52 42 51 41 50 40
    movq        mm5, mm4                        ; 53 43 52 42 51 41 50 40

    punpcklwd   mm4, mm6                        ; 71 61 51 41 70 60 50 40
    punpckhwd   mm5, mm6                        ; 73 63 53 43 72 62 52 42

    movd        mm7, [rsi + rax]                ; xx xx xx xx 33 32 31 30
    movd        mm6, [rsi + rax*2]              ; xx xx xx xx 23 22 21 20

    punpcklbw   mm6, mm7                        ; 33 23 32 22 31 21 30 20
    movd        mm1, [rdi + rax*4]              ; xx xx xx xx 13 12 11 10

    movd        mm0, [rsi + rax*4]              ; xx xx xx xx 03 02 01 00
    punpcklbw   mm0, mm1                        ; 13 03 12 02 11 01 10 00

    movq        mm2, mm0                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm6                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm2, mm6                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm4                        ; 70 60 50 40 30 20 10 00 = p1
    movq        mm3, mm2                        ; 33 23 13 03 32 22 12 02

    punpckhdq   mm1, mm4                        ; 71 61 51 41 31 21 11 01 = p0
    punpckldq   mm2, mm5                        ; 72 62 52 42 32 22 12 02 = q0

    punpckhdq   mm3, mm5                        ; 73 63 53 43 33 23 13 03 = q1

    psubusb     mm7, mm6                        ; q1-=p1
    psubusb     mm6, mm3                        ; p1-=q1
    por         mm6, mm7                        ; abs(p1-q1)
    pand        mm6, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm6, 1                          ; abs(p1-q1)/2

    psubusb     mm5, mm2                        ; p0-=q0
    psubusb     mm4, mm1                        ; q0-=p0

    por         mm5, mm4                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm6                        ; abs(p0 - q0)*2 + abs(p1-q1)/2

    mov         rdx, arg(2)                     ;flimit             ; get flimit
    movq        mm7, [rdx]                      ; flimit

    mov         rdx, arg(3)                     ; get limit
    movq        mm6, [rdx]                      ; limit

    paddb       mm7, mm7                        ; flimit*2 (less than 255)
    paddb       mm7, mm6                        ; flimit*2 + limit (less than 255)

    psubusb     mm5, mm7                        ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit
    pxor        mm7, mm7
    pcmpeqb     mm5, mm7                        ; mm5 = mask
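
    ; The simple filter keeps only the combined budget test; roughly:
    ;
    ;   mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= flimit * 2 + limit)
    ;          ? 0xff : 0;
    ;
    ; There are no per-pair limit or thresh tests here, which is what
    ; makes this the "simple" variant of the loop filter.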
    ; start work on filters
    pxor        mm0, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm3, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm0, mm3                        ; p1 - q1

    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values

    pxor        mm7, [GLOBAL(t80)]              ; offset to convert to signed values
    movq        mm3, mm7                        ; offset q0

    psubsb      mm7, mm6                        ; q0 - p0
    paddsb      mm0, mm7                        ; p1 - q1 + 1 * (q0 - p0)

    paddsb      mm0, mm7                        ; p1 - q1 + 2 * (q0 - p0)
    paddsb      mm0, mm7                        ; p1 - q1 + 3 * (q0 - p0)

    pand        mm5, mm0                        ; mask filter values we don't care about

    paddsb      mm5, [GLOBAL(t4)]               ; 3 * (q0 - p0) + (p1 - q1) + 4

    movq        mm0, mm5                        ; get a copy of filters
    psllw       mm0, 8                          ; shift left 8
    psraw       mm0, 3                          ; arithmetic shift right 11

    movq        mm7, mm5                        ; get a copy of filters
    psraw       mm7, 11                         ; arithmetic shift right 11
    psllw       mm7, 8                          ; shift left 8 to put it back

    por         mm0, mm7                        ; put the two together to get result

    psubsb      mm3, mm0                        ; q0-= q0sz add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset

    psubsb      mm5, [GLOBAL(t1s)]              ; +3 instead of +4

    movq        mm0, mm5                        ; get a copy of filters
    psllw       mm0, 8                          ; shift left 8
    psraw       mm0, 3                          ; arithmetic shift right 11

    psraw       mm5, 11                         ; arithmetic shift right 11
    psllw       mm5, 8                          ; shift left 8 to put it back
    por         mm0, mm5                        ; put the two together to get result

    paddsb      mm6, mm0                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset

    ; mm0 = 70 60 50 40 30 20 10 00
    ; mm6 = 71 61 51 41 31 21 11 01
    ; mm3 = 72 62 52 42 32 22 12 02
    ; mm4 = 73 63 53 43 33 23 13 03
    ; transpose back to write out

    punpcklbw   mm0, mm6                        ; 31 30 21 20 11 10 01 00

    punpckhbw   mm1, mm6                        ; 71 70 61 60 51 50 41 40

    punpcklbw   mm2, mm4                        ; 33 32 23 22 13 12 03 02
    movq        mm5, mm1                        ; 71 70 61 60 51 50 41 40

    punpckhbw   mm3, mm4                        ; 73 72 63 62 53 52 43 42
    movq        mm6, mm0                        ; 31 30 21 20 11 10 01 00

    punpcklwd   mm0, mm2                        ; 13 12 11 10 03 02 01 00
    punpckhwd   mm6, mm2                        ; 33 32 31 30 23 22 21 20

    movd        [rsi+rax*4], mm0                ; write 03 02 01 00
    punpcklwd   mm1, mm3                        ; 53 52 51 50 43 42 41 40

    psrlq       mm0, 32                         ; xx xx xx xx 13 12 11 10
    punpckhwd   mm5, mm3                        ; 73 72 71 70 63 62 61 60

    movd        [rdi+rax*4], mm0                ; write 13 12 11 10
    movd        [rsi+rax*2], mm6                ; write 23 22 21 20

    psrlq       mm6, 32                         ; 33 32 31 30
    movd        [rsi], mm1                      ; write 43 42 41 40

    movd        [rsi + rax], mm6                ; write 33 32 31 30

    movd        [rsi + rax*2], mm5              ; write 63 62 61 60
    psrlq       mm1, 32                         ; 53 52 51 50

    movd        [rdi], mm1                      ; write out 53 52 51 50
    psrlq       mm5, 32                         ; 73 72 71 70

    movd        [rdi + rax*2], mm5              ; write 73 72 71 70

    lea         rsi, [rsi+rax*8]                ; next 8
;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
;                                         int y_stride,
;                                         loop_filter_info *lfi)
;{
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2);
;}