2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_loop_filter_horizontal_edge_mmx(unsigned char *src_ptr,
;                                          int src_pixel_step,
;                                          const char *blimit,
;                                          const char *limit,
;                                          const char *thresh,
;                                          int count)
; Args (via x86_abi_support.asm arg() macro):
;   arg(0)=src_ptr  arg(1)=src_pixel_step  arg(2)=blimit
;   arg(3)=limit    arg(4)=thresh          arg(5)=count
; NOTE(review): recovered from a garbled extraction. Instructions present
; in upstream libvpx but not visible in the extracted text (limit loads &
; compares, the per-8-pixel loop label/counter, UNSHADOW_ARGS/ret
; epilogue) are NOT reproduced here — reconcile against upstream before
; assembling. Several mm registers below are consumed before any visible
; load for the same reason.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_horizontal_edge_mmx)
sym(vp8_loop_filter_horizontal_edge_mmx):
        SHADOW_ARGS_TO_STACK 6

        sub         rsp, 32                     ; reserve 32 bytes
        %define t0  [rsp + 0]                   ; char t0[8] (16-aligned)
        %define t1  [rsp + 16]                  ; char t1[8] (16-aligned)

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step (row pitch)
        movsxd      rcx, dword ptr arg(5)       ; count
        mov         rdx, arg(3)                 ; limit
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions
        movq        mm2, [rdi+2*rax]            ; q3
        movq        mm1, [rsi+2*rax]            ; q2
        psubusb     mm1, mm2                    ; q2 -= q3
        psubusb     mm2, mm6                    ; q3 -= q2
        por         mm1, mm2                    ; abs(q3-q2)

        movq        mm4, [rsi+rax]              ; q1
        psubusb     mm4, mm6                    ; q1 -= q2
        psubusb     mm6, mm3                    ; q2 -= q1
        por         mm4, mm6                    ; abs(q2-q1)

        psubusb     mm4, mm3                    ; q0 -= q1
        psubusb     mm3, mm0                    ; q1 -= q0
        por         mm4, mm3                    ; abs(q0-q1)
        movq        t0, mm4                     ; save to t0

        neg         rax                         ; negate pitch to address rows above
        movq        mm2, [rsi+4*rax]            ; p3
        movq        mm4, [rdi+4*rax]            ; p2
        psubusb     mm4, mm2                    ; p2 -= p3
        psubusb     mm2, mm5                    ; p3 -= p2
        por         mm4, mm2                    ; abs(p3-p2)

        movq        mm4, [rsi+2*rax]            ; p1
        psubusb     mm4, mm5                    ; p1 -= p2
        psubusb     mm5, mm3                    ; p2 -= p1
        por         mm4, mm5                    ; abs(p2-p1)

        movq        mm4, [rsi+rax]              ; p0
        psubusb     mm4, mm3                    ; p0 -= p1
        psubusb     mm3, mm5                    ; p1 -= p0
        por         mm4, mm3                    ; abs(p1-p0)
        movq        t1, mm4                     ; save to t1

        psubusb     mm3, mm2                    ; q1 -= p1
        psubusb     mm2, mm4                    ; p1 -= q1
        por         mm2, mm3                    ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]          ; clear lsb of each byte
        psrlw       mm2, 1                      ; abs(p1-q1)/2

        psubusb     mm5, mm3                    ; p0 -= q0
        psubusb     mm3, mm6                    ; q0 -= p0
        por         mm5, mm3                    ; abs(p0-q0)
        paddusb     mm5, mm5                    ; abs(p0-q0)*2
        paddusb     mm5, mm2                    ; abs(p0-q0)*2 + abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm7, [rdx]                  ; blimit
        psubusb     mm5, mm7                    ; > blimit ?
        pcmpeqb     mm1, mm5                    ; mask in mm1

        ; calculate high edge variance
        mov         rdx, arg(4)                 ; thresh
        movq        mm4, t0                     ; abs(q0-q1)
        movq        mm3, t1                     ; abs(p1-p0)
        paddb       mm4, mm3                    ; abs(q1-q0) > thresh || abs(p1-p0) > thresh

        ; start work on filters
        movq        mm2, [rsi+2*rax]            ; p1
        pxor        mm2, [GLOBAL(t80)]          ; p1: offset to signed
        pxor        mm7, [GLOBAL(t80)]          ; q1: offset to signed
        psubsb      mm2, mm7                    ; p1 - q1
        pand        mm2, mm4                    ; hvm(p1 - q1)
        pxor        mm6, [GLOBAL(t80)]          ; offset to signed
        pxor        mm0, [GLOBAL(t80)]          ; offset to signed
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1*(q0-p0) + hvm(p1-q1)
        paddsb      mm2, mm0                    ; 2*(q0-p0) + hvm(p1-q1)
        paddsb      mm2, mm0                    ; 3*(q0-p0) + hvm(p1-q1)
        pand        mm1, mm2                    ; mask filter values we don't care about
        paddsb      mm1, [GLOBAL(t4)]           ; 3*(q0-p0) + hvm(p1-q1) + 4
        paddsb      mm2, [GLOBAL(t3)]           ; 3*(q0-p0) + hvm(p1-q1) + 3

        movq        mm2, mm0                    ; (3*(q0-p0)+hvm(p1-q1)+3) >> 3
        movq        mm5, mm1                    ; abcdefgh
        punpcklbw   mm0, mm1                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign-extended shift right by 3
        punpckhbw   mm1, mm5                    ; a0b0c0d0
        psraw       mm1, 11                     ; sign-extended shift right by 3
        movq        mm5, mm0                    ; save results
        packsswb    mm0, mm1                    ; (... + 4) >> 3
        paddsw      mm5, [GLOBAL(ones)]
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                      ; shifted one more time for 2nd tap
        psraw       mm1, 1                      ; shifted one more time for 2nd tap
        packsswb    mm5, mm1                    ; (... + 4) >> 4
        pandn       mm4, mm5                    ; high edge variance additive

        paddsb      mm6, mm2                    ; p0 += p0 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+rax], mm6              ; write back p0

        movq        mm6, [rsi+2*rax]            ; p1
        pxor        mm6, [GLOBAL(t80)]          ; re-offset
        paddsb      mm6, mm4                    ; p1 += p1 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+2*rax], mm6            ; write back p1

        psubsb      mm3, mm0                    ; q0 -= q0 add
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        movq        [rsi], mm3                  ; write back q0

        psubsb      mm7, mm4                    ; q1 -= q1 add
        pxor        mm7, [GLOBAL(t80)]          ; unoffset
        movq        [rdi], mm7                  ; write back q1
;-----------------------------------------------------------------------
; void vp8_loop_filter_vertical_edge_mmx(unsigned char *src_ptr,
;                                        int src_pixel_step,
;                                        const char *blimit,
;                                        const char *limit,
;                                        const char *thresh,
;                                        int count)
; Reads a 4x8 neighbourhood across a vertical edge, transposes it into
; p3..q3 rows, filters, then transposes back and writes out.
; NOTE(review): recovered from a garbled extraction; upstream-only lines
; (limit compares, srct base setup in rdx, loop control, epilogue) are
; missing from the visible text and are not reproduced — reconcile with
; upstream libvpx before assembling.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_vertical_edge_mmx)
sym(vp8_loop_filter_vertical_edge_mmx):
        SHADOW_ARGS_TO_STACK 6

        sub         rsp, 64                     ; reserve 64 bytes
        %define t0   [rsp + 0]                  ; char t0[8]   (16-aligned)
        %define t1   [rsp + 16]                 ; char t1[8]   (16-aligned)
        %define srct [rsp + 32]                 ; char srct[32](16-aligned)

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        lea         rsi, [rsi + rax*4 - 4]      ; back up 4 cols, down 4 rows
        movsxd      rcx, dword ptr arg(5)       ; count
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing

        ; transpose 8x4 into q3..q0 (rows 4-7)
        movq        mm6, [rsi+2*rax]            ; 67 66 65 64 63 62 61 60
        movq        mm7, mm6                    ; copy row 6
        punpckhbw   mm7, [rdi+2*rax]            ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6, [rdi+2*rax]            ; 73 63 72 62 71 61 70 60
        movq        mm4, [rsi]                  ; 47 46 45 44 43 42 41 40
        movq        mm5, mm4
        punpckhbw   mm5, [rsi+rax]              ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4, [rsi+rax]              ; 53 43 52 42 51 41 50 40
        movq        mm3, mm5
        punpckhwd   mm5, mm7                    ; 77 67 57 47 76 66 56 46
        punpcklwd   mm3, mm7                    ; 75 65 55 45 74 64 54 44
        movq        mm2, mm4
        punpckhwd   mm4, mm6                    ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2, mm6                    ; 71 61 51 41 70 60 50 40

        movq        mm6, [rsi+rax*2]            ; 27 26 25 24 23 22 21 20
        movq        mm1, mm6
        punpckhbw   mm6, [rsi+rax]              ; 37 27 36 26 35 25 34 24
        punpcklbw   mm1, [rsi+rax]              ; 33 23 32 22 31 21 30 20
        movq        mm7, [rsi+rax*4]            ; 07 06 05 04 03 02 01 00
        punpckhbw   mm7, [rdi+rax*4]            ; 17 07 16 06 15 05 14 04
        movq        mm0, mm7
        punpckhwd   mm7, mm6                    ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0, mm6                    ; 35 25 15 05 34 24 14 04
        movq        mm6, mm7
        punpckhdq   mm7, mm5                    ; 77 67 57 47 37 27 17 07 = q3
        punpckldq   mm6, mm5                    ; 76 66 56 46 36 26 16 06 = q2

        movq        mm5, mm6
        psubusb     mm5, mm7                    ; q2-q3
        psubusb     mm7, mm6                    ; q3-q2
        por         mm7, mm5                    ; mm7 = abs(q3-q2)

        movq        mm5, mm0
        punpckhdq   mm5, mm3                    ; 75 65 55 45 35 25 15 05 = q1
        punpckldq   mm0, mm3                    ; 74 64 54 44 34 24 14 04 = q0
        movq        mm3, mm5
        psubusb     mm3, mm6                    ; q1-q2
        psubusb     mm6, mm5                    ; q2-q1
        por         mm6, mm3                    ; mm6 = abs(q2-q1)

        movq        [rdx+24], mm5               ; save q1 to srct area
        movq        [rdx+16], mm0               ; save q0 to srct area

        ; transpose rows 0-3 into p3..p0
        movq        mm3, [rsi+rax*4]            ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3, [rdi+rax*4]            ; 13 03 12 02 11 01 10 00
        movq        mm0, mm3
        punpcklwd   mm0, mm1                    ; 31 21 11 01 30 20 10 00
        punpckhwd   mm3, mm1                    ; 33 23 13 03 32 22 12 02
        movq        mm1, mm0
        punpckldq   mm0, mm2                    ; 70 60 50 40 30 20 10 00 = p3
        punpckhdq   mm1, mm2                    ; 71 61 51 41 31 21 11 01 = p2

        movq        mm2, mm1
        psubusb     mm2, mm0                    ; p2-p3
        psubusb     mm0, mm1                    ; p3-p2
        por         mm0, mm2                    ; mm0 = abs(p3-p2)

        movq        mm2, mm3
        punpckldq   mm2, mm4                    ; 72 62 52 42 32 22 12 02 = p1
        punpckhdq   mm3, mm4                    ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+8], mm3                ; save p0
        movq        [rdx], mm2                  ; save p1

        movq        mm5, mm2                    ; mm5 = p1
        psubusb     mm2, mm1                    ; p1-p2
        psubusb     mm1, mm5                    ; p2-p1
        por         mm1, mm2                    ; mm1 = abs(p2-p1)

        mov         rdx, arg(3)                 ; limit
        movq        mm4, [rdx]                  ; mm4 = limit
        por         mm0, mm7                    ; any(abs diffs) > limit

        movq        mm7, mm3                    ; p0
        psubusb     mm7, mm5                    ; p0 - p1
        psubusb     mm5, mm3                    ; p1 - p0
        por         mm5, mm7                    ; abs(p1-p0)
        movq        t0, mm5                     ; save abs(p1-p0)
        por         mm0, mm5                    ; mm0 = mask

        movq        mm5, [rdx+16]               ; mm5 = q0
        movq        mm7, [rdx+24]               ; mm7 = q1
        movq        mm6, mm5                    ; mm6 = q0
        psubusb     mm5, mm7                    ; q0-q1
        psubusb     mm7, mm6                    ; q1-q0
        por         mm7, mm5                    ; abs(q1-q0)
        movq        t1, mm7                     ; save abs(q1-q0)

        psubusb     mm5, mm1                    ; q1 -= p1
        psubusb     mm1, mm2                    ; p1 -= q1
        por         mm5, mm1                    ; abs(p1-q1)
        pand        mm5, [GLOBAL(tfe)]          ; clear lsb of each byte
        psrlw       mm5, 1                      ; abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm4, [rdx]                  ; blimit
        movq        mm1, mm3                    ; p0
        movq        mm7, mm6                    ; q0
        psubusb     mm1, mm7                    ; p0-q0
        psubusb     mm7, mm3                    ; q0-p0
        por         mm1, mm7                    ; abs(q0-p0)
        paddusb     mm1, mm1                    ; abs(q0-p0)*2
        paddusb     mm1, mm5                    ; abs(p0-q0)*2 + abs(p1-q1)/2
        psubusb     mm1, mm4                    ; > blimit ?

        ; calculate high edge variance
        mov         rdx, arg(4)                 ; thresh
        movq        mm4, t0                     ; abs(p1-p0)
        movq        mm3, t1                     ; abs(q1-q0)
        por         mm4, mm3                    ; abs(q1-q0) > thresh || abs(p1-p0) > thresh

        ; start work on filters
        movq        mm7, [rdx+24]               ; q1
        movq        mm6, [rdx+8]                ; p0
        movq        mm0, [rdx+16]               ; q0
        pxor        mm2, [GLOBAL(t80)]          ; p1: offset to signed
        pxor        mm7, [GLOBAL(t80)]          ; q1: offset to signed
        psubsb      mm2, mm7                    ; p1 - q1
        pand        mm2, mm4                    ; hvm(p1-q1)
        pxor        mm6, [GLOBAL(t80)]          ; offset to signed
        pxor        mm0, [GLOBAL(t80)]          ; offset to signed
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1*(q0-p0) + hvm(p1-q1)
        paddsb      mm2, mm0                    ; 2*(q0-p0) + hvm(p1-q1)
        paddsb      mm2, mm0                    ; 3*(q0-p0) + hvm(p1-q1)
        pand        mm1, mm2                    ; mask filter values we don't care about
        paddsb      mm1, [GLOBAL(t4)]           ; + 4
        paddsb      mm2, [GLOBAL(t3)]           ; + 3

        movq        mm2, mm0                    ; (... + 3) >> 3
        movq        mm5, mm1                    ; abcdefgh
        punpcklbw   mm0, mm1                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign-extended shift right by 3
        punpckhbw   mm1, mm5                    ; a0b0c0d0
        psraw       mm1, 11                     ; sign-extended shift right by 3
        movq        mm5, mm0                    ; save results
        packsswb    mm0, mm1                    ; (... + 4) >> 3
        paddsw      mm5, [GLOBAL(ones)]
        paddsw      mm1, [GLOBAL(ones)]
        psraw       mm5, 1                      ; one more shift for 2nd tap
        psraw       mm1, 1                      ; one more shift for 2nd tap
        packsswb    mm5, mm1                    ; (... + 4) >> 4
        pandn       mm4, mm5                    ; high edge variance additive

        paddsb      mm6, mm2                    ; p0 += p0 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        pxor        mm1, [GLOBAL(t80)]          ; re-offset p1
        paddsb      mm1, mm4                    ; p1 += p1 add
        pxor        mm1, [GLOBAL(t80)]          ; unoffset
        psubsb      mm3, mm0                    ; q0 -= q0 add
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        psubsb      mm7, mm4                    ; q1 -= q1 add
        pxor        mm7, [GLOBAL(t80)]          ; unoffset

        ; transpose and write back
        ; mm1 = 72 62 52 42 32 22 12 02
        ; mm6 = 73 63 53 43 33 23 13 03
        ; mm3 = 74 64 54 44 34 24 14 04
        ; mm7 = 75 65 55 45 35 25 15 05
        movq        mm2, mm1
        punpcklbw   mm2, mm6                    ; 33 32 23 22 13 12 03 02
        movq        mm4, mm3
        punpckhbw   mm1, mm6                    ; 73 72 63 62 53 52 43 42
        punpcklbw   mm4, mm7                    ; 35 34 25 24 15 14 05 04
        punpckhbw   mm3, mm7                    ; 75 74 65 64 55 54 45 44
        movq        mm6, mm2
        punpcklwd   mm2, mm4                    ; 15 14 13 12 05 04 03 02
        punpckhwd   mm6, mm4                    ; 35 34 33 32 25 24 23 22
        movq        mm5, mm1
        punpcklwd   mm1, mm3                    ; 55 54 53 52 45 44 43 42
        punpckhwd   mm5, mm3                    ; 75 74 73 72 65 64 63 62

        movd        [rsi+rax*4+2], mm2          ; row 0 (cols 2..5)
        movd        [rdi+rax*4+2], mm2          ; row 1
        movd        [rsi+rax*2+2], mm6          ; row 2
        movd        [rdi+rax*2+2], mm5          ; row 6/7 area
;-----------------------------------------------------------------------
; void vp8_mbloop_filter_horizontal_edge_mmx(unsigned char *src_ptr,
;                                            int src_pixel_step,
;                                            const char *blimit,
;                                            const char *limit,
;                                            const char *thresh,
;                                            int count)
; Macroblock-edge filter: like the normal edge filter but also applies
; the 27/18/9 (roughly 3/7, 2/7, 1/7) taps across the boundary.
; NOTE(review): recovered from a garbled extraction; several upstream
; instructions (limit compares, Filter2 unpack setup, the q-side loads/
; stores paired with the p-side ones below, loop control, epilogue) are
; missing from the visible text and are not reproduced — reconcile with
; upstream libvpx before assembling.
;-----------------------------------------------------------------------
global sym(vp8_mbloop_filter_horizontal_edge_mmx)
sym(vp8_mbloop_filter_horizontal_edge_mmx):
        SHADOW_ARGS_TO_STACK 6

        sub         rsp, 32                     ; reserve 32 bytes
        %define t0  [rsp + 0]                   ; char t0[8] (16-aligned)
        %define t1  [rsp + 16]                  ; char t1[8] (16-aligned)

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        movsxd      rcx, dword ptr arg(5)       ; count
        mov         rdx, arg(3)                 ; limit
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions
        movq        mm2, [rdi+2*rax]            ; q3
        movq        mm1, [rsi+2*rax]            ; q2
        psubusb     mm1, mm2                    ; q2 -= q3
        psubusb     mm2, mm6                    ; q3 -= q2
        por         mm1, mm2                    ; abs(q3-q2)

        ; mm1 = abs(q3-q2), mm6 = q2, mm7 = limit
        movq        mm4, [rsi+rax]              ; q1
        psubusb     mm4, mm6                    ; q1 -= q2
        psubusb     mm6, mm3                    ; q2 -= q1
        por         mm4, mm6                    ; abs(q2-q1)

        ; mm1 = mask, mm3 = q1, mm7 = limit
        psubusb     mm4, mm3                    ; q0 -= q1
        psubusb     mm3, mm0                    ; q1 -= q0
        por         mm4, mm3                    ; abs(q0-q1)
        movq        t0, mm4                     ; save to t0

        ; mm1 = mask, mm0 = q0, mm7 = limit, t0 = abs(q0-q1)
        neg         rax                         ; negate pitch to address rows above
        movq        mm2, [rsi+4*rax]            ; p3
        movq        mm4, [rdi+4*rax]            ; p2
        psubusb     mm4, mm2                    ; p2 -= p3
        psubusb     mm2, mm5                    ; p3 -= p2
        por         mm4, mm2                    ; abs(p3-p2)

        movq        mm4, [rsi+2*rax]            ; p1
        psubusb     mm4, mm5                    ; p1 -= p2
        psubusb     mm5, mm3                    ; p2 -= p1
        por         mm4, mm5                    ; abs(p2-p1)

        movq        mm4, [rsi+rax]              ; p0
        psubusb     mm4, mm3                    ; p0 -= p1
        psubusb     mm3, mm5                    ; p1 -= p0
        por         mm4, mm3                    ; abs(p1-p0)
        movq        t1, mm4                     ; save to t1

        ; mm1 = mask, mm0 = q0, t0 = abs(q0-q1), t1 = abs(p1-p0)
        psubusb     mm3, mm2                    ; q1 -= p1
        psubusb     mm2, mm4                    ; p1 -= q1
        por         mm2, mm3                    ; abs(p1-q1)
        pand        mm2, [GLOBAL(tfe)]          ; clear lsb of each byte
        psrlw       mm2, 1                      ; abs(p1-q1)/2

        psubusb     mm5, mm3                    ; p0 -= q0
        psubusb     mm3, mm6                    ; q0 -= p0
        por         mm5, mm3                    ; abs(p0-q0)
        paddusb     mm5, mm5                    ; abs(p0-q0)*2
        paddusb     mm5, mm2                    ; abs(p0-q0)*2 + abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm7, [rdx]                  ; blimit
        psubusb     mm5, mm7                    ; > blimit ?
        pcmpeqb     mm1, mm5                    ; mask in mm1

        ; calculate high edge variance
        mov         rdx, arg(4)                 ; thresh
        movq        mm4, t0                     ; abs(q0-q1)
        movq        mm3, t1                     ; abs(p1-p0)
        paddb       mm4, mm3                    ; abs(q1-q0) > thresh || abs(p1-p0) > thresh

        ; start work on filters
        movq        mm2, [rsi+2*rax]            ; p1
        pxor        mm2, [GLOBAL(t80)]          ; p1: offset to signed
        pxor        mm7, [GLOBAL(t80)]          ; q1: offset to signed
        psubsb      mm2, mm7                    ; p1 - q1
        pxor        mm6, [GLOBAL(t80)]          ; offset to signed
        pxor        mm0, [GLOBAL(t80)]          ; offset to signed
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1*(q0-p0) + (p1-q1)
        paddsb      mm2, mm0                    ; 2*(q0-p0) + (p1-q1)
        paddsb      mm2, mm0                    ; 3*(q0-p0) + (p1-q1)
        pand        mm1, mm2                    ; mask filter values we don't care about

        ; mm1 = vp8_filter, mm4 = hev, mm6 = ps0, mm3 = qs0
        movq        mm2, mm1                    ; vp8_filter
        pand        mm2, mm4                    ; Filter2 = vp8_filter & hev

        paddsb      mm5, [GLOBAL(t3)]           ; Filter2 + 3
        punpcklbw   mm0, mm5                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign-extended shift right by 3
        punpckhbw   mm7, mm5                    ; a0b0c0d0
        psraw       mm7, 11                     ; sign-extended shift right by 3
        packsswb    mm0, mm7                    ; Filter2 >>= 3
        movq        mm5, mm0                    ; Filter2

        paddsb      mm2, [GLOBAL(t4)]           ; vp8_signed_char_clamp(Filter2 + 4)
        punpcklbw   mm0, mm2                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign-extended shift right by 3
        punpckhbw   mm7, mm2                    ; a0b0c0d0
        psraw       mm7, 11                     ; sign-extended shift right by 3
        packsswb    mm0, mm7                    ; Filter1 >>= 3

        ; mm0 = Filter1, mm1 = vp8_filter, mm3 = qs0, mm5 = Filter2, mm4 = hev, mm6 = ps0
        psubsb      mm3, mm0                    ; qs0 = qs0 - Filter1
        paddsb      mm6, mm5                    ; ps0 = ps0 + Filter2

        ; vp8_filter &= ~hev
        pandn       mm4, mm1                    ; vp8_filter & ~hev

        ; u = vp8_signed_char_clamp((63 + Filter2*27) >> 7)
        ; s = vp8_signed_char_clamp(qs0 - u); s = vp8_signed_char_clamp(ps0 + u)
        pmulhw      mm1, [GLOBAL(s27)]
        pmulhw      mm2, [GLOBAL(s27)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]

        ; roughly 2/7th difference across boundary:
        ; u = vp8_signed_char_clamp((63 + Filter2*18) >> 7)
        ; s = vp8_signed_char_clamp(qs1 - u); s = vp8_signed_char_clamp(ps1 + u)
        pmulhw      mm1, [GLOBAL(s18)]
        pmulhw      mm2, [GLOBAL(s18)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        movq        mm6, [rsi+rax*2]            ; p1
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]
        movq        [rsi+rax*2], mm6            ; write back p1

        ; roughly 1/7th difference across boundary:
        ; u = vp8_signed_char_clamp((63 + Filter2*9) >> 7)
        ; s = vp8_signed_char_clamp(qs2 - u); s = vp8_signed_char_clamp(ps2 + u)
        pmulhw      mm1, [GLOBAL(s9)]
        pmulhw      mm2, [GLOBAL(s9)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        movq        mm6, [rdi+rax*4]            ; p2
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]
        movq        [rdi+rax*4], mm6            ; write back p2
;-----------------------------------------------------------------------
; void vp8_mbloop_filter_vertical_edge_mmx(unsigned char *src_ptr,
;                                          int src_pixel_step,
;                                          const char *blimit,
;                                          const char *limit,
;                                          const char *thresh,
;                                          int count)
; Vertical macroblock-edge filter: transposes an 8x8 neighbourhood into
; p3..q3 rows saved in a scratch area, applies the 27/18/9-tap filter,
; then transposes back and writes all 8 rows out.
; NOTE(review): recovered from a garbled extraction; upstream-only lines
; (srct base setup in rdx, some q3/q2 saves, Filter2 unpack setup, loop
; control, epilogue) are missing from the visible text and are not
; reproduced — reconcile with upstream libvpx before assembling.
;-----------------------------------------------------------------------
global sym(vp8_mbloop_filter_vertical_edge_mmx)
sym(vp8_mbloop_filter_vertical_edge_mmx):
        SHADOW_ARGS_TO_STACK 6

        sub         rsp, 96                     ; reserve 96 bytes
        %define t0   [rsp + 0]                  ; char t0[8]   (16-aligned)
        %define t1   [rsp + 16]                 ; char t1[8]   (16-aligned)
        %define srct [rsp + 32]                 ; char srct[64](16-aligned)

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        lea         rsi, [rsi + rax*4 - 4]      ; back up 4 cols, down 4 rows
        movsxd      rcx, dword ptr arg(5)       ; count
        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing

        ; transpose rows 4-7 into q-side halves
        movq        mm0, [rdi+2*rax]            ; 77 76 75 74 73 72 71 70
        movq        mm6, [rsi+2*rax]            ; 67 66 65 64 63 62 61 60
        movq        mm7, mm6
        punpckhbw   mm7, mm0                    ; 77 67 76 66 75 65 74 64
        punpcklbw   mm6, mm0                    ; 73 63 72 62 71 61 70 60
        movq        mm0, [rsi+rax]              ; 57 56 55 54 53 52 51 50
        movq        mm4, [rsi]                  ; 47 46 45 44 43 42 41 40
        movq        mm5, mm4
        punpckhbw   mm5, mm0                    ; 57 47 56 46 55 45 54 44
        punpcklbw   mm4, mm0                    ; 53 43 52 42 51 41 50 40
        movq        mm3, mm5
        punpckhwd   mm5, mm7                    ; 77 67 57 47 76 66 56 46
        punpcklwd   mm3, mm7                    ; 75 65 55 45 74 64 54 44
        movq        mm2, mm4
        punpckhwd   mm4, mm6                    ; 73 63 53 43 72 62 52 42
        punpcklwd   mm2, mm6                    ; 71 61 51 41 70 60 50 40

        ; transpose rows 0-3
        movq        mm7, [rsi+rax]              ; 37 36 35 34 33 32 31 30
        movq        mm6, [rsi+rax*2]            ; 27 26 25 24 23 22 21 20
        movq        mm1, mm6
        punpckhbw   mm6, mm7                    ; 37 27 36 26 35 25 34 24
        punpcklbw   mm1, mm7                    ; 33 23 32 22 31 21 30 20
        movq        mm7, [rsi+rax*4]            ; 07 06 05 04 03 02 01 00
        punpckhbw   mm7, [rdi+rax*4]            ; 17 07 16 06 15 05 14 04
        movq        mm0, mm7
        punpckhwd   mm7, mm6                    ; 37 27 17 07 36 26 16 06
        punpcklwd   mm0, mm6                    ; 35 25 15 05 34 24 14 04
        movq        mm6, mm7
        punpckhdq   mm7, mm5                    ; 77 67 57 47 37 27 17 07 = q3
        punpckldq   mm6, mm5                    ; 76 66 56 46 36 26 16 06 = q2

        movq        mm5, mm6
        psubusb     mm5, mm7                    ; q2-q3
        psubusb     mm7, mm6                    ; q3-q2
        por         mm7, mm5                    ; mm7 = abs(q3-q2)

        movq        mm5, mm0
        punpckhdq   mm5, mm3                    ; 75 65 55 45 35 25 15 05 = q1
        punpckldq   mm0, mm3                    ; 74 64 54 44 34 24 14 04 = q0
        movq        mm3, mm5
        psubusb     mm3, mm6                    ; q1-q2
        psubusb     mm6, mm5                    ; q2-q1
        por         mm6, mm3                    ; mm6 = abs(q2-q1)

        movq        [rdx+40], mm5               ; save q1
        movq        [rdx+32], mm0               ; save q0

        movq        mm3, [rsi+rax*4]            ; 07 06 05 04 03 02 01 00
        punpcklbw   mm3, [rdi+rax*4]            ; 13 03 12 02 11 01 10 00
        movq        mm0, mm3
        punpcklwd   mm0, mm1                    ; 31 21 11 01 30 20 10 00
        punpckhwd   mm3, mm1                    ; 33 23 13 03 32 22 12 02
        movq        mm1, mm0
        punpckldq   mm0, mm2                    ; 70 60 50 40 30 20 10 00 = p3
        punpckhdq   mm1, mm2                    ; 71 61 51 41 31 21 11 01 = p2
        movq        [rdx], mm0                  ; save p3
        movq        [rdx+8], mm1                ; save p2

        movq        mm2, mm1
        psubusb     mm2, mm0                    ; p2-p3
        psubusb     mm0, mm1                    ; p3-p2
        por         mm0, mm2                    ; mm0 = abs(p3-p2)

        movq        mm2, mm3
        punpckldq   mm2, mm4                    ; 72 62 52 42 32 22 12 02 = p1
        punpckhdq   mm3, mm4                    ; 73 63 53 43 33 23 13 03 = p0
        movq        [rdx+24], mm3               ; save p0
        movq        [rdx+16], mm2               ; save p1

        movq        mm5, mm2                    ; mm5 = p1
        psubusb     mm2, mm1                    ; p1-p2
        psubusb     mm1, mm5                    ; p2-p1
        por         mm1, mm2                    ; mm1 = abs(p2-p1)

        mov         rdx, arg(3)                 ; limit
        movq        mm4, [rdx]                  ; mm4 = limit
        psubusb     mm7, mm4                    ; abs(q3-q2) > limit
        psubusb     mm0, mm4                    ; abs(p3-p2) > limit
        psubusb     mm1, mm4                    ; abs(p2-p1) > limit
        psubusb     mm6, mm4                    ; abs(q2-q1) > limit
        por         mm0, mm7                    ; any of the above

        movq        mm7, mm3                    ; p0
        psubusb     mm7, mm5                    ; p0 - p1
        psubusb     mm5, mm3                    ; p1 - p0
        por         mm5, mm7                    ; abs(p1-p0)
        movq        t0, mm5                     ; save abs(p1-p0)
        psubusb     mm5, mm4                    ; abs(p1-p0) > limit
        por         mm0, mm5                    ; mm0 = mask

        movq        mm5, [rdx+32]               ; mm5 = q0
        movq        mm7, [rdx+40]               ; mm7 = q1
        movq        mm6, mm5                    ; mm6 = q0
        psubusb     mm5, mm7                    ; q0-q1
        psubusb     mm7, mm6                    ; q1-q0
        por         mm7, mm5                    ; abs(q1-q0)
        movq        t1, mm7                     ; save abs(q1-q0)
        psubusb     mm7, mm4                    ; abs(q1-q0) > limit

        psubusb     mm5, mm1                    ; q1 -= p1
        psubusb     mm1, mm2                    ; p1 -= q1
        por         mm5, mm1                    ; abs(p1-q1)
        pand        mm5, [GLOBAL(tfe)]          ; clear lsb of each byte
        psrlw       mm5, 1                      ; abs(p1-q1)/2

        mov         rdx, arg(2)                 ; blimit
        movq        mm4, [rdx]                  ; blimit
        movq        mm1, mm3                    ; p0
        movq        mm7, mm6                    ; q0
        psubusb     mm1, mm7                    ; p0-q0
        psubusb     mm7, mm3                    ; q0-p0
        por         mm1, mm7                    ; abs(q0-p0)
        paddusb     mm1, mm1                    ; abs(q0-p0)*2
        paddusb     mm1, mm5                    ; abs(p0-q0)*2 + abs(p1-q1)/2
        psubusb     mm1, mm4                    ; > blimit ?
        por         mm1, mm0                    ; combined mask

        ; calculate high edge variance
        mov         rdx, arg(4)                 ; thresh
        movq        mm4, t0                     ; abs(p1-p0)
        psubusb     mm4, mm7                    ; abs(q1-q0) > thresh
        movq        mm3, t1                     ; abs(q1-q0)
        psubusb     mm3, mm7                    ; abs(p1-p0) > thresh
        por         mm4, mm3                    ; either exceeds thresh

        ; start work on filters
        movq        mm2, [rdx+16]               ; p1
        movq        mm7, [rdx+40]               ; q1
        pxor        mm2, [GLOBAL(t80)]          ; p1: offset to signed
        pxor        mm7, [GLOBAL(t80)]          ; q1: offset to signed
        psubsb      mm2, mm7                    ; p1 - q1
        movq        mm6, [rdx+24]               ; p0
        movq        mm0, [rdx+32]               ; q0
        pxor        mm6, [GLOBAL(t80)]          ; offset to signed
        pxor        mm0, [GLOBAL(t80)]          ; offset to signed
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; 1*(q0-p0) + (p1-q1)
        paddsb      mm2, mm0                    ; 2*(q0-p0) + (p1-q1)
        paddsb      mm2, mm0                    ; 3*(q0-p0) + (p1-q1)
        pand        mm1, mm2                    ; mask filter values we don't care about

        ; mm1 = vp8_filter, mm4 = hev, mm6 = ps0, mm3 = qs0
        movq        mm2, mm1                    ; vp8_filter
        pand        mm2, mm4                    ; Filter2 = vp8_filter & hev

        paddsb      mm5, [GLOBAL(t3)]           ; Filter2 + 3
        punpcklbw   mm0, mm5                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign-extended shift right by 3
        punpckhbw   mm7, mm5                    ; a0b0c0d0
        psraw       mm7, 11                     ; sign-extended shift right by 3
        packsswb    mm0, mm7                    ; Filter2 >>= 3
        movq        mm5, mm0                    ; Filter2

        paddsb      mm2, [GLOBAL(t4)]           ; vp8_signed_char_clamp(Filter2 + 4)
        punpcklbw   mm0, mm2                    ; e0f0g0h0
        psraw       mm0, 11                     ; sign-extended shift right by 3
        punpckhbw   mm7, mm2                    ; a0b0c0d0
        psraw       mm7, 11                     ; sign-extended shift right by 3
        packsswb    mm0, mm7                    ; Filter1 >>= 3

        psubsb      mm3, mm0                    ; qs0 = qs0 - Filter1
        paddsb      mm6, mm5                    ; ps0 = ps0 + Filter2
        pandn       mm4, mm1                    ; vp8_filter & ~hev

        ; u = vp8_signed_char_clamp((63 + Filter2*27) >> 7)
        pmulhw      mm1, [GLOBAL(s27)]
        pmulhw      mm2, [GLOBAL(s27)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]

        ; roughly 2/7th: u = vp8_signed_char_clamp((63 + Filter2*18) >> 7)
        pmulhw      mm1, [GLOBAL(s18)]
        pmulhw      mm2, [GLOBAL(s18)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        movq        mm3, [rdx + 40]             ; q1
        movq        mm6, [rdx + 16]             ; p1
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]
        movq        [rdx + 40], mm3             ; store q1
        movq        [rdx + 16], mm6             ; store p1

        ; roughly 1/7th: u = vp8_signed_char_clamp((63 + Filter2*9) >> 7)
        pmulhw      mm1, [GLOBAL(s9)]
        pmulhw      mm2, [GLOBAL(s9)]
        paddw       mm1, [GLOBAL(s63)]
        paddw       mm2, [GLOBAL(s63)]
        pxor        mm6, [GLOBAL(t80)]
        pxor        mm3, [GLOBAL(t80)]
        pxor        mm6, [GLOBAL(t80)]          ; mm6 = 71 61 51 41 31 21 11 01
        pxor        mm3, [GLOBAL(t80)]          ; mm3 = 76 66 56 46 36 26 16 06

        ; transpose and write back
        movq        mm0, [rdx]                  ; mm0 = 70 60 50 40 30 20 10 00
        movq        mm1, mm0
        punpcklbw   mm0, mm6                    ; 31 30 21 20 11 10 01 00
        punpckhbw   mm1, mm6                    ; 71 70 61 60 51 50 41 40
        movq        mm2, [rdx+16]               ; 72 62 52 42 32 22 12 02
        movq        mm6, mm2
        punpcklbw   mm2, [rdx+24]               ; 33 32 23 22 13 12 03 02
        punpckhbw   mm6, [rdx+24]               ; 73 72 63 62 53 52 43 42
        movq        mm5, mm0
        punpcklwd   mm0, mm2                    ; 13 12 11 10 03 02 01 00
        punpckhwd   mm5, mm2                    ; 33 32 31 30 23 22 21 20
        movq        mm4, mm1
        punpcklwd   mm1, mm6                    ; 53 52 51 50 43 42 41 40
        punpckhwd   mm4, mm6                    ; 73 72 71 70 63 62 61 60
        movq        mm2, [rdx+32]               ; 74 64 54 44 34 24 14 04
        punpcklbw   mm2, [rdx+40]               ; 35 34 25 24 15 14 05 04
        movq        mm6, mm3                    ; 76 66 56 46 36 26 16 06
        punpcklbw   mm6, [rdx+56]               ; 37 36 27 26 17 16 07 06
        movq        mm7, mm2
        punpcklwd   mm2, mm6                    ; 17 16 15 14 07 06 05 04
        punpckhwd   mm7, mm6                    ; 37 36 35 34 27 26 25 24
        movq        mm6, mm0
        punpckldq   mm0, mm2                    ; 07 06 05 04 03 02 01 00
        punpckhdq   mm6, mm2                    ; 17 16 15 14 13 12 11 10
        movq        [rsi+rax*4], mm0            ; write row 0
        movq        [rdi+rax*4], mm6            ; write row 1
        movq        mm0, mm5
        punpckldq   mm0, mm7                    ; 27 26 25 24 23 22 21 20
        punpckhdq   mm5, mm7                    ; 37 36 35 34 33 32 31 30
        movq        [rsi+rax*2], mm0            ; write row 2
        movq        [rdi+rax*2], mm5            ; write row 3
        movq        mm2, [rdx+32]               ; 74 64 54 44 34 24 14 04
        punpckhbw   mm2, [rdx+40]               ; 75 74 65 64 55 54 45 44
        punpckhbw   mm3, [rdx+56]               ; 77 76 67 66 57 56 47 46
        movq        mm5, mm2
        punpcklwd   mm2, mm3                    ; 57 56 55 54 47 46 45 44
        punpckhwd   mm5, mm3                    ; 77 76 75 74 67 66 65 64
        movq        mm0, mm1                    ; 53 52 51 50 43 42 41 40
        movq        mm3, mm4                    ; 73 72 71 70 63 62 61 60
        punpckldq   mm0, mm2                    ; 47 46 45 44 43 42 41 40
        punpckhdq   mm1, mm2                    ; 57 56 55 54 53 52 51 50
        movq        [rsi], mm0                  ; write row 4
        movq        [rdi], mm1                  ; write row 5
        punpckldq   mm3, mm5                    ; 67 66 65 64 63 62 61 60
        punpckhdq   mm4, mm5                    ; 77 76 75 74 73 72 71 70
        movq        [rsi+rax*2], mm3            ; write row 6
        movq        [rdi+rax*2], mm4            ; write row 7

        lea         rsi, [rsi+rax*8]            ; advance to next 8 rows
;-----------------------------------------------------------------------
; void vp8_loop_filter_simple_horizontal_edge_mmx(unsigned char *src_ptr,
;                                                 int src_pixel_step,
;                                                 const char *blimit)
; Simple (two-pixel) horizontal loop filter: only p0/q0 are modified.
; NOTE(review): recovered from a garbled extraction; upstream-only lines
; (blimit load into mm3, the q1-row pointer adjustment, mask pcmpeqb,
; loop control, epilogue) are missing from the visible text and are not
; reproduced — reconcile with upstream libvpx before assembling.
;-----------------------------------------------------------------------
global sym(vp8_loop_filter_simple_horizontal_edge_mmx)
sym(vp8_loop_filter_simple_horizontal_edge_mmx):
        SHADOW_ARGS_TO_STACK 3

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step
        mov         rdx, arg(2)                 ; blimit
        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing

        movq        mm1, [rsi+2*rax]            ; p1
        movq        mm0, [rdi]                  ; q1
        psubusb     mm0, mm1                    ; q1 -= p1
        psubusb     mm1, mm4                    ; p1 -= q1
        por         mm1, mm0                    ; abs(p1-q1)
        pand        mm1, [GLOBAL(tfe)]          ; clear lsb of each byte
        psrlw       mm1, 1                      ; abs(p1-q1)/2

        movq        mm5, [rsi+rax]              ; p0
        movq        mm4, [rsi]                  ; q0
        psubusb     mm5, mm4                    ; p0 -= q0
        psubusb     mm4, mm6                    ; q0 -= p0
        por         mm5, mm4                    ; abs(p0-q0)
        paddusb     mm5, mm5                    ; abs(p0-q0)*2
        paddusb     mm5, mm1                    ; abs(p0-q0)*2 + abs(p1-q1)/2
        psubusb     mm5, mm3                    ; > blimit ?

        ; start work on filters
        pxor        mm2, [GLOBAL(t80)]          ; p1: offset to signed
        pxor        mm7, [GLOBAL(t80)]          ; q1: offset to signed
        psubsb      mm2, mm7                    ; p1 - q1
        pxor        mm6, [GLOBAL(t80)]          ; offset to signed
        pxor        mm0, [GLOBAL(t80)]          ; offset to signed
        psubsb      mm0, mm6                    ; q0 - p0
        paddsb      mm2, mm0                    ; (p1-q1) + 1*(q0-p0)
        paddsb      mm2, mm0                    ; (p1-q1) + 2*(q0-p0)
        paddsb      mm2, mm0                    ; (p1-q1) + 3*(q0-p0)
        pand        mm5, mm2                    ; mask filter values we don't care about

        paddsb      mm5, [GLOBAL(t4)]           ; 3*(q0-p0) + (p1-q1) + 4

        ; Filter1 = (filter + 4) >> 3, done per-byte via two word lanes
        movq        mm0, mm5                    ; copy of filters
        psllw       mm0, 8                      ; isolate low bytes
        psraw       mm0, 3                      ; arithmetic shift right 3 (low lane)
        movq        mm1, mm5                    ; copy of filters
        psraw       mm1, 11                     ; arithmetic shift right (high lane)
        psllw       mm1, 8                      ; put back in high byte
        por         mm0, mm1                    ; combined per-byte result

        psubsb      mm3, mm0                    ; q0 -= q0 add
        pxor        mm3, [GLOBAL(t80)]          ; unoffset
        movq        [rsi], mm3                  ; write back q0

        ; Filter2: same shift but on (filter + 3), i.e. subtract 1s
        psubsb      mm5, [GLOBAL(t1s)]          ; +3 instead of +4
        movq        mm0, mm5                    ; copy of filters
        psllw       mm0, 8                      ; isolate low bytes
        psraw       mm0, 3                      ; arithmetic shift right 3
        psraw       mm5, 11                     ; high lane
        psllw       mm5, 8                      ; put back
        por         mm0, mm5                    ; combined per-byte result

        paddsb      mm6, mm0                    ; p0 += p0 add
        pxor        mm6, [GLOBAL(t80)]          ; unoffset
        movq        [rsi+rax], mm6              ; write back p0
;void vp8_loop_filter_simple_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit
;)
1503 global sym
(vp8_loop_filter_simple_vertical_edge_mmx
)
1504 sym
(vp8_loop_filter_simple_vertical_edge_mmx
):
1507 SHADOW_ARGS_TO_STACK
3
1514 sub rsp
, 32 ; reserve 32 bytes
1515 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[8];
1516 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[8];
1518 mov rsi
, arg
(0) ;src_ptr
1519 movsxd rax
, dword ptr arg
(1) ;src_pixel_step ; destination pitch?
1521 lea rsi
, [rsi
+ rax
*4- 2]; ;
1525 lea rdi
, [rsi
+ rax
];
1526 movd mm0
, [rdi
+ rax
* 2] ; xx xx xx xx 73 72 71 70
1528 movd mm6
, [rsi
+ rax
* 2] ; xx xx xx xx 63 62 61 60
1529 punpcklbw mm6
, mm0
; 73 63 72 62 71 61 70 60
1531 movd mm0
, [rsi
+ rax
] ; xx xx xx xx 53 52 51 50
1532 movd mm4
, [rsi
] ; xx xx xx xx 43 42 41 40
1534 punpcklbw mm4
, mm0
; 53 43 52 42 51 41 50 40
1535 movq mm5
, mm4
; 53 43 52 42 51 41 50 40
1537 punpcklwd mm4
, mm6
; 71 61 51 41 70 60 50 40
1538 punpckhwd mm5
, mm6
; 73 63 53 43 72 62 52 42
1542 movd mm7
, [rsi
+ rax
] ; xx xx xx xx 33 32 31 30
1543 movd mm6
, [rsi
+ rax
* 2] ; xx xx xx xx 23 22 21 20
1545 punpcklbw mm6
, mm7
; 33 23 32 22 31 21 30 20
1546 movd mm1
, [rdi
+ rax
* 4] ; xx xx xx xx 13 12 11 10
1548 movd mm0
, [rsi
+ rax
* 4] ; xx xx xx xx 03 02 01 00
1549 punpcklbw mm0
, mm1
; 13 03 12 02 11 01 10 00
1551 movq mm2
, mm0
; 13 03 12 02 11 01 10 00
1552 punpcklwd mm0
, mm6
; 31 21 11 01 30 20 10 00
1554 punpckhwd mm2
, mm6
; 33 23 13 03 32 22 12 02
1555 movq mm1
, mm0
; 13 03 12 02 11 01 10 00
1557 punpckldq mm0
, mm4
; 70 60 50 40 30 20 10 00 = p1
1558 movq mm3
, mm2
; 33 23 13 03 32 22 12 02
1560 punpckhdq mm1
, mm4
; 71 61 51 41 31 21 11 01 = p0
1561 punpckldq mm2
, mm5
; 72 62 52 42 32 22 12 02 = q0
1563 punpckhdq mm3
, mm5
; 73 63 53 43 33 23 13 03 = q1
1569 psubusb mm7
, mm6
; q1-=p1
1570 psubusb mm6
, mm3
; p1-=q1
1571 por mm6
, mm7
; abs(p1-q1)
1572 pand mm6
, [GLOBAL(tfe
)] ; set lsb of each byte to zero
1573 psrlw mm6
, 1 ; abs(p1-q1)/2
1578 psubusb mm5
, mm2
; p0-=q0
1579 psubusb mm4
, mm1
; q0-=p0
1581 por mm5
, mm4
; abs(p0 - q0)
1582 paddusb mm5
, mm5
; abs(p0-q0)*2
1583 paddusb mm5
, mm6
; abs (p0 - q0) *2 + abs(p1-q1)/2
1585 mov rdx
, arg
(2) ;blimit ; get blimit
1588 psubusb mm5
, mm7
; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
1590 pcmpeqb mm5
, mm7
; mm5 = mask
1592 ; start work on filters
1596 pxor mm0
, [GLOBAL(t80
)] ; p1 offset to convert to signed values
1597 pxor mm3
, [GLOBAL(t80
)] ; q1 offset to convert to signed values
1599 psubsb mm0
, mm3
; p1 - q1
1603 pxor mm6
, [GLOBAL(t80
)] ; offset to convert to signed values
1605 pxor mm7
, [GLOBAL(t80
)] ; offset to convert to signed values
1606 movq mm3
, mm7
; offseted ; q0
1608 psubsb mm7
, mm6
; q0 - p0
1609 paddsb mm0
, mm7
; p1 - q1 + 1 * (q0 - p0)
1611 paddsb mm0
, mm7
; p1 - q1 + 2 * (q0 - p0)
1612 paddsb mm0
, mm7
; p1 - q1 + 3 * (q0 - p0)
1614 pand mm5
, mm0
; mask filter values we don't care about
1616 paddsb mm5
, [GLOBAL(t4
)] ; 3* (q0 - p0) + (p1 - q1) + 4
1618 movq mm0
, mm5
; get a copy of filters
1619 psllw mm0
, 8 ; shift left 8
1620 psraw mm0
, 3 ; arithmetic shift right 11
1623 movq mm7
, mm5
; get a copy of filters
1624 psraw mm7
, 11 ; arithmetic shift right 11
1625 psllw mm7
, 8 ; shift left 8 to put it back
1627 por mm0
, mm7
; put the two together to get result
1629 psubsb mm3
, mm0
; q0-= q0sz add
1630 pxor mm3
, [GLOBAL(t80
)] ; unoffset
1633 psubsb mm5
, [GLOBAL(t1s
)] ; +3 instead of +4
1635 movq mm0
, mm5
; get a copy of filters
1636 psllw mm0
, 8 ; shift left 8
1637 psraw mm0
, 3 ; arithmetic shift right 11
1640 psraw mm5
, 11 ; arithmetic shift right 11
1641 psllw mm5
, 8 ; shift left 8 to put it back
1642 por mm0
, mm5
; put the two together to get result
1644 paddsb mm6
, mm0
; p0+= p0 add
1645 pxor mm6
, [GLOBAL(t80
)] ; unoffset
1651 ; mm0 = 70 60 50 40 30 20 10 00
1652 ; mm6 = 71 61 51 41 31 21 11 01
1653 ; mm3 = 72 62 52 42 32 22 12 02
1654 ; mm4 = 73 63 53 43 33 23 13 03
1655 ; transpose back to write out
1658 punpcklbw mm0
, mm6
; 31 30 21 20 11 10 01 00
1660 punpckhbw mm1
, mm6
; 71 70 61 60 51 50 41 40
1663 punpcklbw mm2
, mm4
; 33 32 23 22 13 12 03 02
1664 movq mm5
, mm1
; 71 70 61 60 51 50 41 40
1666 punpckhbw mm3
, mm4
; 73 72 63 62 53 52 43 42
1667 movq mm6
, mm0
; 31 30 21 20 11 10 01 00
1669 punpcklwd mm0
, mm2
; 13 12 11 10 03 02 01 00
1670 punpckhwd mm6
, mm2
; 33 32 31 30 23 22 21 20
1672 movd
[rsi
+rax
*4], mm0
; write 03 02 01 00
1673 punpcklwd mm1
, mm3
; 53 52 51 50 43 42 41 40
1675 psrlq mm0
, 32 ; xx xx xx xx 13 12 11 10
1676 punpckhwd mm5
, mm3
; 73 72 71 70 63 62 61 60
1678 movd
[rdi
+rax
*4], mm0
; write 13 12 11 10
1679 movd
[rsi
+rax
*2], mm6
; write 23 22 21 20
1681 psrlq mm6
, 32 ; 33 32 31 30
1682 movd
[rsi
], mm1
; write 43 42 41 40
1684 movd
[rsi
+ rax
], mm6
; write 33 32 31 30
1687 movd
[rsi
+ rax
*2], mm5
; write 63 62 61 60
1688 psrlq mm1
, 32 ; 53 52 51 50
1690 movd
[rdi
], mm1
; write out 53 52 51 50
1691 psrlq mm5
, 32 ; 73 72 71 70
1693 movd
[rdi
+ rax
*2], mm5
; write 73 72 71 70
1695 lea rsi
, [rsi
+rax
*8] ; next 8
;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
;                                         int y_stride,
;                                         loop_filter_info *lfi)
;{
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;    vp8_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
;}