2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ; Use of pmaxub instead of psubusb to compute filter mask was seen
17 %macro LFH_FILTER_AND_HEV_MASK
1
19 movdqa xmm2
, [rdi
+2*rax
] ; q3
20 movdqa xmm1
, [rsi
+2*rax
] ; q2
21 movdqa xmm4
, [rsi
+rax
] ; q1
22 movdqa xmm5
, [rsi
] ; q0
23 neg rax
; negate pitch to deal with above border
25 movlps xmm2
, [rsi
+ rcx
*2] ; q3
26 movlps xmm1
, [rsi
+ rcx
] ; q2
27 movlps xmm4
, [rsi
] ; q1
28 movlps xmm5
, [rsi
+ rax
] ; q0
30 movhps xmm2
, [rdi
+ rcx
*2]
31 movhps xmm1
, [rdi
+ rcx
]
33 movhps xmm5
, [rdi
+ rax
]
35 lea rsi
, [rsi
+ rax
*4]
36 lea rdi
, [rdi
+ rax
*4]
38 movdqa XMMWORD
PTR [rsp
], xmm1
; store q2
39 movdqa XMMWORD
PTR [rsp
+ 16], xmm4
; store q1
42 movdqa xmm6
, xmm1
; q2
43 movdqa xmm3
, xmm4
; q1
45 psubusb xmm1
, xmm2
; q2-=q3
46 psubusb xmm2
, xmm6
; q3-=q2
48 psubusb xmm4
, xmm6
; q1-=q2
49 psubusb xmm6
, xmm3
; q2-=q1
51 por xmm4
, xmm6
; abs(q2-q1)
52 por xmm1
, xmm2
; abs(q3-q2)
54 movdqa xmm0
, xmm5
; q0
57 psubusb xmm5
, xmm3
; q0-=q1
58 psubusb xmm3
, xmm0
; q1-=q0
60 por xmm5
, xmm3
; abs(q0-q1)
61 movdqa t0
, xmm5
; save to t0
66 movdqa xmm2
, [rsi
+4*rax
] ; p3
67 movdqa xmm4
, [rdi
+4*rax
] ; p2
68 movdqa xmm6
, [rsi
+2*rax
] ; p1
70 movlps xmm2
, [rsi
+ rax
] ; p3
71 movlps xmm4
, [rsi
] ; p2
72 movlps xmm6
, [rsi
+ rcx
] ; p1
74 movhps xmm2
, [rdi
+ rax
]
76 movhps xmm6
, [rdi
+ rcx
]
78 movdqa XMMWORD
PTR [rsp
+ 32], xmm4
; store p2
79 movdqa XMMWORD
PTR [rsp
+ 48], xmm6
; store p1
82 movdqa xmm5
, xmm4
; p2
83 movdqa xmm3
, xmm6
; p1
85 psubusb xmm4
, xmm2
; p2-=p3
86 psubusb xmm2
, xmm5
; p3-=p2
88 psubusb xmm3
, xmm5
; p1-=p2
89 pmaxub xmm1
, xmm4
; abs(p3 - p2)
91 psubusb xmm5
, xmm6
; p2-=p1
92 pmaxub xmm1
, xmm2
; abs(p3 - p2)
94 pmaxub xmm1
, xmm5
; abs(p2 - p1)
95 movdqa xmm2
, xmm6
; p1
97 pmaxub xmm1
, xmm3
; abs(p2 - p1)
99 movdqa xmm4
, [rsi
+rax
] ; p0
100 movdqa xmm3
, [rdi
] ; q1
102 movlps xmm4
, [rsi
+ rcx
*2] ; p0
103 movhps xmm4
, [rdi
+ rcx
*2]
107 movdqa xmm5
, xmm4
; p0
108 psubusb xmm4
, xmm6
; p0-=p1
110 psubusb xmm6
, xmm5
; p1-=p0
112 por xmm6
, xmm4
; abs(p1 - p0)
113 mov rdx
, arg
(2) ; get flimit
115 movdqa t1
, xmm6
; save to t1
117 movdqa xmm4
, xmm3
; q1
120 psubusb xmm3
, xmm2
; q1-=p1
121 psubusb xmm2
, xmm4
; p1-=q1
124 por xmm2
, xmm3
; abs(p1-q1)
126 movdqa xmm4
, XMMWORD
PTR [rdx
] ; flimit
128 movdqa xmm3
, xmm0
; q0
129 pand xmm2
, [GLOBAL(tfe
)] ; set lsb of each byte to zero
131 mov rdx
, arg
(4) ; hev get thresh
133 movdqa xmm6
, xmm5
; p0
134 psrlw xmm2
, 1 ; abs(p1-q1)/2
136 psubusb xmm5
, xmm3
; p0-=q0
137 paddb xmm4
, xmm4
; flimit*2 (less than 255)
139 psubusb xmm3
, xmm6
; q0-=p0
140 por xmm5
, xmm3
; abs(p0 - q0)
142 paddusb xmm5
, xmm5
; abs(p0-q0)*2
143 paddb xmm7
, xmm4
; flimit * 2 + limit (less than 255)
145 movdqa xmm4
, t0
; hev get abs (q1 - q0)
147 movdqa xmm3
, t1
; get abs (p1 - p0)
149 paddusb xmm5
, xmm2
; abs (p0 - q0) *2 + abs(p1-q1)/2
151 movdqa xmm2
, XMMWORD
PTR [rdx
] ; hev
153 psubusb xmm5
, xmm7
; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
154 psubusb xmm4
, xmm2
; hev
156 psubusb xmm3
, xmm2
; hev
160 paddb xmm4
, xmm3
; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
162 pcmpeqb xmm4
, xmm5
; hev
163 pcmpeqb xmm3
, xmm3
; hev
165 pcmpeqb xmm1
, xmm7
; mask xmm1
166 pxor xmm4
, xmm3
; hev
174 movdqa xmm2
, [rsi
+2*rax
] ; p1
175 movdqa xmm7
, [rdi
] ; q1
179 movdqa xmm2
, [rdx
] ; p1
180 movdqa xmm7
, [rdx
+48] ; q1
181 movdqa xmm6
, [rdx
+16] ; p0
182 movdqa xmm0
, [rdx
+32] ; q0
185 pxor xmm2
, [GLOBAL(t80
)] ; p1 offset to convert to signed values
186 pxor xmm7
, [GLOBAL(t80
)] ; q1 offset to convert to signed values
188 psubsb xmm2
, xmm7
; p1 - q1
189 pxor xmm6
, [GLOBAL(t80
)] ; offset to convert to signed values
191 pand xmm2
, xmm4
; high var mask (hvm)(p1 - q1)
192 pxor xmm0
, [GLOBAL(t80
)] ; offset to convert to signed values
194 movdqa xmm3
, xmm0
; q0
195 psubsb xmm0
, xmm6
; q0 - p0
197 paddsb xmm2
, xmm0
; 1 * (q0 - p0) + hvm(p1 - q1)
199 paddsb xmm2
, xmm0
; 2 * (q0 - p0) + hvm(p1 - q1)
201 paddsb xmm2
, xmm0
; 3 * (q0 - p0) + hvm(p1 - q1)
203 pand xmm1
, xmm2
; mask filter values we don't care about
207 paddsb xmm1
, [GLOBAL(t4
)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
208 paddsb xmm2
, [GLOBAL(t3
)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
210 punpckhbw xmm5
, xmm2
; axbxcxdx
211 punpcklbw xmm2
, xmm2
; exfxgxhx
213 punpcklbw xmm0
, xmm1
; exfxgxhx
214 psraw xmm5
, 11 ; sign extended shift right by 3
216 punpckhbw xmm1
, xmm1
; axbxcxdx
217 psraw xmm2
, 11 ; sign extended shift right by 3
219 packsswb xmm2
, xmm5
; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
220 psraw xmm0
, 11 ; sign extended shift right by 3
222 psraw xmm1
, 11 ; sign extended shift right by 3
223 movdqa xmm5
, xmm0
; save results
225 packsswb xmm0
, xmm1
; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
226 paddsw xmm5
, [GLOBAL(ones
)]
228 paddsw xmm1
, [GLOBAL(ones
)]
229 psraw xmm5
, 1 ; partial shifted one more time for 2nd tap
231 psraw xmm1
, 1 ; partial shifted one more time for 2nd tap
233 paddsb xmm6
, xmm2
; p0+= p0 add
234 packsswb xmm5
, xmm1
; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
239 movdqa xmm1
, [rsi
+2*rax
] ; p1
241 movdqa xmm1
, [rdx
] ; p1
243 pandn xmm4
, xmm5
; high edge variance additive
244 pxor xmm6
, [GLOBAL(t80
)] ; unoffset
246 pxor xmm1
, [GLOBAL(t80
)] ; reoffset
247 psubsb xmm3
, xmm0
; q0-= q0 add
249 paddsb xmm1
, xmm4
; p1+= p1 add
250 pxor xmm3
, [GLOBAL(t80
)] ; unoffset
252 pxor xmm1
, [GLOBAL(t80
)] ; unoffset
253 psubsb xmm7
, xmm4
; q1-= q1 add
255 pxor xmm7
, [GLOBAL(t80
)] ; unoffset
257 lea rsi
, [rsi
+ rcx
*2]
258 lea rdi
, [rdi
+ rcx
*2]
259 movq MMWORD
PTR [rsi
], xmm6
; p0
260 movhps MMWORD
PTR [rdi
], xmm6
261 movq MMWORD
PTR [rsi
+ rax
], xmm1
; p1
262 movhps MMWORD
PTR [rdi
+ rax
], xmm1
263 movq MMWORD
PTR [rsi
+ rcx
], xmm3
; q0
264 movhps MMWORD
PTR [rdi
+ rcx
], xmm3
265 movq MMWORD
PTR [rsi
+ rcx
*2],xmm7
; q1
266 movhps MMWORD
PTR [rdi
+ rcx
*2],xmm7
268 movdqa
[rsi
+rax
], xmm6
; write back
269 movdqa
[rsi
+2*rax
], xmm1
; write back
270 movdqa
[rsi
], xmm3
; write back
271 movdqa
[rdi
], xmm7
; write back
277 ;void vp8_loop_filter_horizontal_edge_sse2
279 ; unsigned char *src_ptr,
280 ; int src_pixel_step,
281 ; const char *flimit,
283 ; const char *thresh,
286 global sym
(vp8_loop_filter_horizontal_edge_sse2
)
287 sym
(vp8_loop_filter_horizontal_edge_sse2
):
290 SHADOW_ARGS_TO_STACK
6
298 sub rsp
, 32 ; reserve 32 bytes
299 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[16];
300 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[16];
302 mov rsi
, arg
(0) ;src_ptr
303 movsxd rax
, dword ptr arg
(1) ;src_pixel_step
305 mov rdx
, arg
(3) ;limit
306 movdqa xmm7
, XMMWORD
PTR [rdx
]
308 lea rdi
, [rsi
+rax
] ; rdi points to row +1 for indirect addressing
310 ; calculate breakout conditions and high edge variance
311 LFH_FILTER_AND_HEV_MASK
1
312 ; filter and write back the result
327 ;void vp8_loop_filter_horizontal_edge_uv_sse2
329 ; unsigned char *src_ptr,
330 ; int src_pixel_step,
331 ; const char *flimit,
333 ; const char *thresh,
336 global sym
(vp8_loop_filter_horizontal_edge_uv_sse2
)
337 sym
(vp8_loop_filter_horizontal_edge_uv_sse2
):
340 SHADOW_ARGS_TO_STACK
6
348 sub rsp
, 96 ; reserve 96 bytes
349 %define q2
[rsp
+ 0] ;__declspec(align(16)) char q2[16];
350 %define q1
[rsp
+ 16] ;__declspec(align(16)) char q1[16];
351 %define p2
[rsp
+ 32] ;__declspec(align(16)) char p2[16];
352 %define p1
[rsp
+ 48] ;__declspec(align(16)) char p1[16];
353 %define t0
[rsp
+ 64] ;__declspec(align(16)) char t0[16];
354 %define t1
[rsp
+ 80] ;__declspec(align(16)) char t1[16];
358 movsxd rax
, dword ptr arg
(1) ; src_pixel_step
360 neg rax
; negate pitch to deal with above border
362 mov rdx
, arg
(3) ;limit
363 movdqa xmm7
, XMMWORD
PTR [rdx
]
368 ; calculate breakout conditions and high edge variance
369 LFH_FILTER_AND_HEV_MASK
0
370 ; filter and write back the result
385 %macro MB_FILTER_AND_WRITEBACK
1
390 movdqa xmm2
, [rsi
+2*rax
] ; p1
391 movdqa xmm7
, [rdi
] ; q1
398 movdqa xmm2
, [rdx
+32] ; p1
399 movdqa xmm7
, [rdx
+80] ; q1
400 movdqa xmm6
, [rdx
+48] ; p0
401 movdqa xmm0
, [rdx
+64] ; q0
404 pxor xmm2
, [GLOBAL(t80
)] ; p1 offset to convert to signed values
405 pxor xmm7
, [GLOBAL(t80
)] ; q1 offset to convert to signed values
406 pxor xmm6
, [GLOBAL(t80
)] ; offset to convert to signed values
407 pxor xmm0
, [GLOBAL(t80
)] ; offset to convert to signed values
409 psubsb xmm2
, xmm7
; p1 - q1
410 movdqa xmm3
, xmm0
; q0
412 psubsb xmm0
, xmm6
; q0 - p0
414 paddsb xmm2
, xmm0
; 1 * (q0 - p0) + (p1 - q1)
416 paddsb xmm2
, xmm0
; 2 * (q0 - p0)
418 paddsb xmm2
, xmm0
; 3 * (q0 - p0) + (p1 - q1)
420 pand xmm1
, xmm2
; mask filter values we don't care about
422 movdqa xmm2
, xmm1
; vp8_filter
424 pand xmm2
, xmm4
; Filter2 = vp8_filter & hev
427 pandn xmm4
, xmm1
; vp8_filter&=~hev
430 punpcklbw xmm0
, xmm4
; Filter 2 (hi)
433 punpckhbw xmm1
, xmm4
; Filter 2 (lo)
434 paddsb xmm5
, [GLOBAL(t3
)] ; vp8_signed_char_clamp(Filter2 + 3)
436 pmulhw xmm1
, [GLOBAL(s9
)] ; Filter 2 (lo) * 9
438 pmulhw xmm0
, [GLOBAL(s9
)] ; Filter 2 (hi) * 9
440 punpckhbw xmm7
, xmm5
; axbxcxdx
441 paddsb xmm2
, [GLOBAL(t4
)] ; vp8_signed_char_clamp(Filter2 + 4)
443 punpcklbw xmm5
, xmm5
; exfxgxhx
444 psraw xmm7
, 11 ; sign extended shift right by 3
446 psraw xmm5
, 11 ; sign extended shift right by 3
447 punpckhbw xmm4
, xmm2
; axbxcxdx
449 punpcklbw xmm2
, xmm2
; exfxgxhx
450 psraw xmm4
, 11 ; sign extended shift right by 3
452 packsswb xmm5
, xmm7
; Filter2 >>=3;
453 psraw xmm2
, 11 ; sign extended shift right by 3
455 packsswb xmm2
, xmm4
; Filter1 >>=3;
458 paddsb xmm6
, xmm5
; ps0 =ps0 + Fitler2
461 psubsb xmm3
, xmm2
; qs0 =qs0 - Filter1
465 paddw xmm0
, [GLOBAL(s63
)] ; Filter 2 (hi) * 9 + 63
467 paddw xmm1
, [GLOBAL(s63
)] ; Filter 2 (lo) * 9 + 63
468 paddw xmm5
, xmm5
; Filter 2 (hi) * 18
470 paddw xmm7
, xmm7
; Filter 2 (lo) * 18
471 paddw xmm5
, xmm0
; Filter 2 (hi) * 27 + 63
473 paddw xmm7
, xmm1
; Filter 2 (lo) * 27 + 63
474 paddw xmm2
, xmm0
; Filter 2 (hi) * 18 + 63
476 paddw xmm4
, xmm1
; Filter 2 (lo) * 18 + 63
477 psraw xmm0
, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
479 psraw xmm1
, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
480 psraw xmm2
, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
482 packsswb xmm0
, xmm1
; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
483 psraw xmm4
, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
485 psraw xmm5
, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
486 packsswb xmm2
, xmm4
; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
488 psraw xmm7
, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
490 packsswb xmm5
, xmm7
; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
492 psubsb xmm3
, xmm5
; sq = vp8_signed_char_clamp(qs0 - u3)
493 paddsb xmm6
, xmm5
; sp = vp8_signed_char_clamp(ps0 - u3)
502 movdqa xmm5
, XMMWORD
PTR [rdi
+rcx
] ; q2
503 movdqa xmm1
, XMMWORD
PTR [rdi
] ; q1
504 movdqa xmm4
, XMMWORD
PTR [rsi
+rax
*2] ; p1
505 movdqa xmm7
, XMMWORD
PTR [rdi
+rax
*4] ; p2
507 movdqa xmm5
, XMMWORD
PTR [rdx
+96] ; q2
508 movdqa xmm1
, XMMWORD
PTR [rdx
+80] ; q1
509 movdqa xmm4
, XMMWORD
PTR [rdx
+32] ; p1
510 movdqa xmm7
, XMMWORD
PTR [rdx
+16] ; p2
513 pxor xmm3
, [GLOBAL(t80
)] ; *oq0 = sq^0x80
514 pxor xmm6
, [GLOBAL(t80
)] ; *oq0 = sp^0x80
516 pxor xmm1
, [GLOBAL(t80
)]
517 pxor xmm4
, [GLOBAL(t80
)]
519 psubsb xmm1
, xmm2
; sq = vp8_signed_char_clamp(qs1 - u2)
520 paddsb xmm4
, xmm2
; sp = vp8_signed_char_clamp(ps1 - u2)
522 pxor xmm1
, [GLOBAL(t80
)] ; *oq1 = sq^0x80;
523 pxor xmm4
, [GLOBAL(t80
)] ; *op1 = sp^0x80;
525 pxor xmm7
, [GLOBAL(t80
)]
526 pxor xmm5
, [GLOBAL(t80
)]
528 paddsb xmm7
, xmm0
; sp = vp8_signed_char_clamp(ps2 - u)
529 psubsb xmm5
, xmm0
; sq = vp8_signed_char_clamp(qs2 - u)
531 pxor xmm7
, [GLOBAL(t80
)] ; *op2 = sp^0x80;
532 pxor xmm5
, [GLOBAL(t80
)] ; *oq2 = sq^0x80;
538 movq MMWORD
PTR [rsi
], xmm6
; p0
539 movhps MMWORD
PTR [rdi
], xmm6
540 movq MMWORD
PTR [rsi
+ rcx
], xmm3
; q0
541 movhps MMWORD
PTR [rdi
+ rcx
], xmm3
543 movq MMWORD
PTR [rsi
+rcx
*2], xmm1
; q1
544 movhps MMWORD
PTR [rdi
+rcx
*2], xmm1
546 movq MMWORD
PTR [rsi
+ rax
], xmm4
; p1
547 movhps MMWORD
PTR [rdi
+ rax
], xmm4
549 movq MMWORD
PTR [rsi
+rax
*2], xmm7
; p2
550 movhps MMWORD
PTR [rdi
+rax
*2], xmm7
554 movq MMWORD
PTR [rsi
+rcx
*2], xmm5
; q2
555 movhps MMWORD
PTR [rdi
+rcx
*2], xmm5
557 movdqa XMMWORD
PTR [rdi
+rcx
], xmm5
; q2
558 movdqa XMMWORD
PTR [rdi
], xmm1
; q1
559 movdqa XMMWORD
PTR [rsi
], xmm3
; q0
560 movdqa XMMWORD
PTR [rsi
+rax
],xmm6
; p0
561 movdqa XMMWORD
PTR [rsi
+rax
*2],xmm4
; p1
562 movdqa XMMWORD
PTR [rdi
+rax
*4],xmm7
; p2
564 movdqa XMMWORD
PTR [rdx
+80], xmm1
; q1
565 movdqa XMMWORD
PTR [rdx
+64], xmm3
; q0
566 movdqa XMMWORD
PTR [rdx
+48], xmm6
; p0
567 movdqa XMMWORD
PTR [rdx
+32], xmm4
; p1
573 ;void vp8_mbloop_filter_horizontal_edge_sse2
575 ; unsigned char *src_ptr,
576 ; int src_pixel_step,
577 ; const char *flimit,
579 ; const char *thresh,
582 global sym
(vp8_mbloop_filter_horizontal_edge_sse2
)
583 sym
(vp8_mbloop_filter_horizontal_edge_sse2
):
586 SHADOW_ARGS_TO_STACK
6
594 sub rsp
, 32 ; reserve 32 bytes
595 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[16];
596 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[16];
598 mov rsi
, arg
(0) ;src_ptr
599 movsxd rax
, dword ptr arg
(1) ;src_pixel_step
601 mov rdx
, arg
(3) ;limit
602 movdqa xmm7
, XMMWORD
PTR [rdx
]
604 lea rdi
, [rsi
+rax
] ; rdi points to row +1 for indirect addressing
606 ; calculate breakout conditions and high edge variance
607 LFH_FILTER_AND_HEV_MASK
1
608 ; filter and write back the results
609 MB_FILTER_AND_WRITEBACK
1
623 ;void vp8_mbloop_filter_horizontal_edge_uv_sse2
626 ; int src_pixel_step,
627 ; const char *flimit,
629 ; const char *thresh,
632 global sym
(vp8_mbloop_filter_horizontal_edge_uv_sse2
)
633 sym
(vp8_mbloop_filter_horizontal_edge_uv_sse2
):
636 SHADOW_ARGS_TO_STACK
6
644 sub rsp
, 96 ; reserve 96 bytes
645 %define q2
[rsp
+ 0] ;__declspec(align(16)) char q2[16];
646 %define q1
[rsp
+ 16] ;__declspec(align(16)) char q1[16];
647 %define p2
[rsp
+ 32] ;__declspec(align(16)) char p2[16];
648 %define p1
[rsp
+ 48] ;__declspec(align(16)) char p1[16];
649 %define t0
[rsp
+ 64] ;__declspec(align(16)) char t0[16];
650 %define t1
[rsp
+ 80] ;__declspec(align(16)) char t1[16];
654 movsxd rax
, dword ptr arg
(1) ; src_pixel_step
656 neg rax
; negate pitch to deal with above border
658 mov rdx
, arg
(3) ;limit
659 movdqa xmm7
, XMMWORD
PTR [rdx
]
664 ; calculate breakout conditions and high edge variance
665 LFH_FILTER_AND_HEV_MASK
0
666 ; filter and write back the results
667 MB_FILTER_AND_WRITEBACK
0
681 %macro TRANSPOSE_16X8
2
682 movq xmm4
, QWORD PTR [rsi
] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
683 movq xmm1
, QWORD PTR [rdi
] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
684 movq xmm0
, QWORD PTR [rsi
+2*rax
] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
685 movq xmm7
, QWORD PTR [rdi
+2*rax
] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
686 movq xmm5
, QWORD PTR [rsi
+4*rax
] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
687 movq xmm2
, QWORD PTR [rdi
+4*rax
] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
689 punpcklbw xmm4
, xmm1
; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
691 movq xmm1
, QWORD PTR [rdi
+2*rcx
] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
693 movdqa xmm3
, xmm4
; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
694 punpcklbw xmm0
, xmm7
; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
696 movq xmm7
, QWORD PTR [rsi
+2*rcx
] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
698 punpcklbw xmm5
, xmm2
; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
702 mov rsi
, arg
(5) ; v_ptr
705 movdqa xmm6
, xmm5
; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
706 punpcklbw xmm7
, xmm1
; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
708 punpcklwd xmm5
, xmm7
; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
710 punpckhwd xmm6
, xmm7
; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
717 punpcklwd xmm3
, xmm0
; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
721 lea rdi
, [rsi
+ rax
] ; rdi points to row +1 for indirect addressing
724 movdqa xmm2
, xmm3
; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
725 punpckhwd xmm4
, xmm0
; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
727 movdqa xmm7
, xmm4
; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
728 punpckhdq xmm3
, xmm5
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
730 punpckhdq xmm7
, xmm6
; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
732 punpckldq xmm4
, xmm6
; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
734 punpckldq xmm2
, xmm5
; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
736 movdqa t0
, xmm2
; save to free XMM2
737 movq xmm2
, QWORD PTR [rsi
] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
738 movq xmm6
, QWORD PTR [rdi
] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
739 movq xmm0
, QWORD PTR [rsi
+2*rax
] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
740 movq xmm5
, QWORD PTR [rdi
+2*rax
] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
741 movq xmm1
, QWORD PTR [rsi
+4*rax
] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
743 punpcklbw xmm2
, xmm6
; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
745 movq xmm6
, QWORD PTR [rdi
+4*rax
] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
747 punpcklbw xmm0
, xmm5
; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
749 movq xmm5
, QWORD PTR [rsi
+2*rcx
] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
751 punpcklbw xmm1
, xmm6
; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
753 movq xmm6
, QWORD PTR [rdi
+2*rcx
] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
755 punpcklbw xmm5
, xmm6
; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
758 punpckhwd xmm6
, xmm5
; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
760 punpcklwd xmm1
, xmm5
; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
761 movdqa xmm5
, xmm2
; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
763 punpcklwd xmm5
, xmm0
; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
765 punpckhwd xmm2
, xmm0
; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
768 punpckldq xmm0
, xmm1
; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
770 punpckhdq xmm5
, xmm1
; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
771 movdqa xmm1
, xmm2
; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84
773 punpckldq xmm1
, xmm6
; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84
775 punpckhdq xmm2
, xmm6
; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
776 movdqa xmm6
, xmm7
; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06
778 punpcklqdq xmm6
, xmm2
; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
780 punpckhqdq xmm7
, xmm2
; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
782 movdqa xmm2
, xmm3
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
783 punpcklqdq xmm2
, xmm5
; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
785 punpckhqdq xmm3
, xmm5
; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
787 movdqa
[rdx
], xmm2
; save 2
789 movdqa xmm5
, xmm4
; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
790 punpcklqdq xmm4
, xmm1
; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
792 movdqa
[rdx
+16], xmm3
; save 3
794 punpckhqdq xmm5
, xmm1
; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
796 movdqa
[rdx
+32], xmm4
; save 4
797 movdqa
[rdx
+48], xmm5
; save 5
798 movdqa xmm1
, t0
; get
801 punpckhqdq xmm1
, xmm0
; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
803 punpcklqdq xmm2
, xmm0
; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
805 movdqa
[rdx
+112], xmm7
; save 7
807 movdqa
[rdx
+96], xmm6
; save 6
809 movdqa xmm2
, xmm3
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
810 punpckhqdq xmm3
, xmm5
; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
812 punpcklqdq xmm2
, xmm5
; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
814 movdqa
[rdx
+32], xmm2
; save 2
816 movdqa xmm5
, xmm4
; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
817 punpcklqdq xmm4
, xmm1
; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
819 movdqa
[rdx
+48], xmm3
; save 3
821 punpckhqdq xmm5
, xmm1
; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
823 movdqa
[rdx
+64], xmm4
; save 4
824 movdqa
[rdx
+80], xmm5
; save 5
825 movdqa xmm1
, t0
; get
828 punpckhqdq xmm1
, xmm0
; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
830 punpcklqdq xmm2
, xmm0
; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
832 movdqa
[rdx
+16], xmm1
838 %macro LFV_FILTER_MASK_HEV_MASK
1
839 movdqa xmm0
, xmm6
; q2
840 psubusb xmm0
, xmm7
; q2-q3
842 psubusb xmm7
, xmm6
; q3-q2
843 movdqa xmm4
, xmm5
; q1
845 por xmm7
, xmm0
; abs (q3-q2)
846 psubusb xmm4
, xmm6
; q1-q2
849 psubusb xmm6
, xmm5
; q2-q1
851 por xmm6
, xmm4
; abs (q2-q1)
852 psubusb xmm0
, xmm2
; p2 - p3;
854 psubusb xmm2
, xmm1
; p3 - p2;
855 por xmm0
, xmm2
; abs(p2-p3)
857 movdqa xmm2
, [rdx
] ; p1
859 movdqa xmm2
, [rdx
+32] ; p1
861 movdqa xmm5
, xmm2
; p1
864 psubusb xmm5
, xmm1
; p1-p2
865 psubusb xmm1
, xmm2
; p2-p1
867 movdqa xmm7
, xmm3
; p0
868 psubusb xmm7
, xmm2
; p0-p1
870 por xmm1
, xmm5
; abs(p2-p1)
874 movdqa xmm1
, xmm2
; p1
876 psubusb xmm2
, xmm3
; p1-p0
879 por xmm2
, xmm7
; abs(p1-p0)
881 movdqa t0
, xmm2
; save abs(p1-p0)
886 movdqa xmm5
, [rdx
+32] ; q0
887 movdqa xmm7
, [rdx
+48] ; q1
889 movdqa xmm5
, [rdx
+64] ; q0
890 movdqa xmm7
, [rdx
+80] ; q1
892 mov rdx
, arg
(3) ; limit
894 movdqa xmm6
, xmm5
; q0
895 movdqa xmm2
, xmm7
; q1
897 psubusb xmm5
, xmm7
; q0-q1
898 psubusb xmm7
, xmm6
; q1-q0
900 por xmm7
, xmm5
; abs(q1-q0)
902 movdqa t1
, xmm7
; save abs(q1-q0)
904 movdqa xmm4
, XMMWORD
PTR [rdx
]; limit
907 mov rdx
, arg
(2) ; flimit
910 movdqa xmm5
, xmm2
; q1
912 psubusb xmm5
, xmm1
; q1-=p1
913 psubusb xmm1
, xmm2
; p1-=q1
915 por xmm5
, xmm1
; abs(p1-q1)
916 movdqa xmm1
, xmm3
; p0
918 pand xmm5
, [GLOBAL(tfe
)] ; set lsb of each byte to zero
919 psubusb xmm1
, xmm6
; p0-q0
921 psrlw xmm5
, 1 ; abs(p1-q1)/2
922 psubusb xmm6
, xmm3
; q0-p0
924 movdqa xmm2
, XMMWORD
PTR [rdx
]; flimit
926 mov rdx
, arg
(4) ; get thresh
928 por xmm1
, xmm6
; abs(q0-p0)
929 paddb xmm2
, xmm2
; flimit*2 (less than 255)
931 movdqa xmm6
, t0
; get abs (q1 - q0)
933 paddusb xmm1
, xmm1
; abs(q0-p0)*2
935 movdqa xmm3
, t1
; get abs (p1 - p0)
937 movdqa xmm7
, XMMWORD
PTR [rdx
]
939 paddusb xmm1
, xmm5
; abs (p0 - q0) *2 + abs(p1-q1)/2
940 psubusb xmm6
, xmm7
; abs(q1 - q0) > thresh
942 paddb xmm4
, xmm2
; flimit * 2 + limit (less than 255)
943 psubusb xmm3
, xmm7
; abs(p1 - p0)> thresh
945 psubusb xmm1
, xmm4
; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
946 por xmm6
, xmm3
; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
948 por xmm1
, xmm0
; mask
958 %macro BV_TRANSPOSE
0
959 ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
960 ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
961 ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
962 ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
963 movdqa xmm2
, xmm1
; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
964 punpcklbw xmm2
, xmm6
; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
966 movdqa xmm4
, xmm3
; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
967 punpckhbw xmm1
, xmm6
; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
969 punpcklbw xmm4
, xmm7
; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
971 punpckhbw xmm3
, xmm7
; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
973 movdqa xmm6
, xmm2
; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
974 punpcklwd xmm2
, xmm4
; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
976 punpckhwd xmm6
, xmm4
; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
977 movdqa xmm5
, xmm1
; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
979 punpcklwd xmm1
, xmm3
; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
981 punpckhwd xmm5
, xmm3
; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
982 ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
983 ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
984 ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
985 ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
988 %macro BV_WRITEBACK
2
995 movd
[rsi
+2*rax
+2], %1
998 movd
[rdi
+2*rax
+2], %1
1000 movd
[rsi
+4*rax
+2], %2
1003 movd
[rdi
+4*rax
+2], %2
1006 movd
[rsi
+2*rcx
+2], %2
1009 movd
[rdi
+2*rcx
+2], %2
1013 ;void vp8_loop_filter_vertical_edge_sse2
1015 ; unsigned char *src_ptr,
1016 ; int src_pixel_step,
1017 ; const char *flimit,
1018 ; const char *limit,
1019 ; const char *thresh,
1022 global sym
(vp8_loop_filter_vertical_edge_sse2
)
1023 sym
(vp8_loop_filter_vertical_edge_sse2
):
1026 SHADOW_ARGS_TO_STACK
6
1034 sub rsp
, 96 ; reserve 96 bytes
1035 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[16];
1036 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[16];
1037 %define srct
[rsp
+ 32] ;__declspec(align(16)) char srct[64];
1039 mov rsi
, arg
(0) ; src_ptr
1040 movsxd rax
, dword ptr arg
(1) ; src_pixel_step
1043 lea rdi
, [rsi
+ rax
] ; rdi points to row +1 for indirect addressing
1044 lea rcx
, [rax
*2+rax
]
1046 ;transpose 16x8 to 8x16, and store the 8-line result on stack.
1049 ; calculate filter mask and high edge variance
1050 LFV_FILTER_MASK_HEV_MASK
1
1052 ; start work on filters
1055 ; tranpose and write back - only work on q1, q0, p0, p1
1057 ; store 16-line result
1062 BV_WRITEBACK xmm1
, xmm5
1064 lea rsi
, [rsi
+rdx
*8]
1065 lea rdi
, [rdi
+rdx
*8]
1066 BV_WRITEBACK xmm2
, xmm6
1080 ;void vp8_loop_filter_vertical_edge_uv_sse2
1083 ; int src_pixel_step,
1084 ; const char *flimit,
1085 ; const char *limit,
1086 ; const char *thresh,
1089 global sym
(vp8_loop_filter_vertical_edge_uv_sse2
)
1090 sym
(vp8_loop_filter_vertical_edge_uv_sse2
):
1093 SHADOW_ARGS_TO_STACK
6
1101 sub rsp
, 96 ; reserve 96 bytes
1102 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[16];
1103 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[16];
1104 %define srct
[rsp
+ 32] ;__declspec(align(16)) char srct[64];
1106 mov rsi
, arg
(0) ; u_ptr
1107 movsxd rax
, dword ptr arg
(1) ; src_pixel_step
1110 lea rdi
, [rsi
+ rax
] ; rdi points to row +1 for indirect addressing
1111 lea rcx
, [rax
+2*rax
]
1115 ;transpose 16x8 to 8x16, and store the 8-line result on stack.
1118 ; calculate filter mask and high edge variance
1119 LFV_FILTER_MASK_HEV_MASK
1
1121 ; start work on filters
1124 ; tranpose and write back - only work on q1, q0, p0, p1
1127 lea rdi
, [rsi
+ rax
] ; rdi points to row +1 for indirect addressing
1129 ; store 16-line result
1130 BV_WRITEBACK xmm1
, xmm5
1132 mov rsi
, arg
(0) ; u_ptr
1134 lea rdi
, [rsi
+ rax
] ; rdi points to row +1 for indirect addressing
1135 BV_WRITEBACK xmm2
, xmm6
1148 %macro MBV_TRANSPOSE
0
1149 movdqa xmm0
, [rdx
] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1150 movdqa xmm1
, xmm0
; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1152 punpcklbw xmm0
, xmm7
; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
1153 punpckhbw xmm1
, xmm7
; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
1155 movdqa xmm2
, [rdx
+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1156 movdqa xmm6
, xmm2
; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1158 punpcklbw xmm2
, [rdx
+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
1159 punpckhbw xmm6
, [rdx
+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
1161 movdqa xmm3
, xmm0
; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
1162 punpcklwd xmm0
, xmm2
; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
1164 punpckhwd xmm3
, xmm2
; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1165 movdqa xmm4
, xmm1
; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
1167 punpcklwd xmm1
, xmm6
; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1168 punpckhwd xmm4
, xmm6
; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1170 movdqa xmm2
, [rdx
+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
1171 punpcklbw xmm2
, [rdx
+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
1173 movdqa xmm6
, xmm5
; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
1174 punpcklbw xmm6
, [rdx
+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
1176 movdqa xmm7
, xmm2
; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
1177 punpcklwd xmm2
, xmm6
; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
1179 punpckhwd xmm7
, xmm6
; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
1180 movdqa xmm6
, xmm0
; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
1182 punpckldq xmm0
, xmm2
; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
1183 punpckhdq xmm6
, xmm2
; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
1186 %macro MBV_WRITEBACK_1
0
1187 movq
QWORD PTR [rsi
], xmm0
1188 movhps MMWORD
PTR [rdi
], xmm0
1190 movq
QWORD PTR [rsi
+2*rax
], xmm6
1191 movhps MMWORD
PTR [rdi
+2*rax
], xmm6
1193 movdqa xmm0
, xmm3
; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1194 punpckldq xmm0
, xmm7
; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
1196 punpckhdq xmm3
, xmm7
; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
1198 movq
QWORD PTR [rsi
+4*rax
], xmm0
1199 movhps MMWORD
PTR [rdi
+4*rax
], xmm0
1201 movq
QWORD PTR [rsi
+2*rcx
], xmm3
1202 movhps MMWORD
PTR [rdi
+2*rcx
], xmm3
1204 movdqa xmm2
, [rdx
+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
1205 punpckhbw xmm2
, [rdx
+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
1207 punpckhbw xmm5
, [rdx
+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
1210 punpcklwd xmm0
, xmm5
; b7 b6 b4 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
1211 punpckhwd xmm2
, xmm5
; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
1213 movdqa xmm5
, xmm1
; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1214 punpckldq xmm1
, xmm0
; 97 96 95 94 93 92 91 90 87 86 85 83 84 82 81 80
1216 punpckhdq xmm5
, xmm0
; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
1219 %macro MBV_WRITEBACK_2
0
1220 movq
QWORD PTR [rsi
], xmm1
1221 movhps MMWORD
PTR [rdi
], xmm1
1223 movq
QWORD PTR [rsi
+2*rax
], xmm5
1224 movhps MMWORD
PTR [rdi
+2*rax
], xmm5
1226 movdqa xmm1
, xmm4
; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1227 punpckldq xmm1
, xmm2
; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
1228 punpckhdq xmm4
, xmm2
; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
1230 movq
QWORD PTR [rsi
+4*rax
], xmm1
1231 movhps MMWORD
PTR [rdi
+4*rax
], xmm1
1233 movq
QWORD PTR [rsi
+2*rcx
], xmm4
1234 movhps MMWORD
PTR [rdi
+2*rcx
], xmm4
1238 ;void vp8_mbloop_filter_vertical_edge_sse2
1240 ; unsigned char *src_ptr,
1241 ; int src_pixel_step,
1242 ; const char *flimit,
1243 ; const char *limit,
1244 ; const char *thresh,
1247 global sym
(vp8_mbloop_filter_vertical_edge_sse2
)
1248 sym
(vp8_mbloop_filter_vertical_edge_sse2
):
1251 SHADOW_ARGS_TO_STACK
6
1259 sub rsp
, 160 ; reserve 160 bytes
1260 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[16];
1261 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[16];
1262 %define srct
[rsp
+ 32] ;__declspec(align(16)) char srct[128];
1264 mov rsi
, arg
(0) ; src_ptr
1265 movsxd rax
, dword ptr arg
(1) ; src_pixel_step
1268 lea rdi
, [rsi
+ rax
] ; rdi points to row +1 for indirect addressing
1269 lea rcx
, [rax
*2+rax
]
1274 ; calculate filter mask and high edge variance
1275 LFV_FILTER_MASK_HEV_MASK
0
1278 ; start work on filters
1279 MB_FILTER_AND_WRITEBACK
2
1281 lea rsi
, [rsi
+rax
*8]
1282 lea rdi
, [rdi
+rax
*8]
1284 ; transpose and write back
1291 lea rsi
, [rsi
+rax
*8]
1292 lea rdi
, [rdi
+rax
*8]
;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    int         src_pixel_step,
;    const char *flimit,
;    const char *limit,
;    const char *thresh,
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    ; NOTE(review): the usual prologue (push rbp / mov rbp, rsp / GET_GOT /
    ; xmm-save) is not visible in this chunk — confirm against the full file.
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 160                    ; reserve 160 bytes of scratch
    %define t0   [rsp +   0]                ;__declspec(align(16)) char t0[16];
    %define t1   [rsp +  16]                ;__declspec(align(16)) char t1[16];
    %define srct [rsp +  32]                ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                 ; u_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax + 2*rax]          ; rcx = 3 * src_pixel_step

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 0

    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    ; transpose and write back
    mov         rsi, arg(0)                 ; u_ptr
    lea         rdi, [rsi + rax]
    ; NOTE(review): the U-plane write-back macro invocation is not visible in
    ; this chunk (extraction gap after original line 1356) — confirm.

    mov         rsi, arg(5)                 ; v_ptr
    lea         rdi, [rsi + rax]
    ; NOTE(review): the V-plane write-back and the function epilogue
    ; (add rsp / UNSHADOW_ARGS / pop rbp / ret) are not visible in this
    ; chunk — confirm against the full file.
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    ; NOTE(review): the usual prologue (push rbp / mov rbp, rsp / GET_GOT)
    ; is not visible in this chunk — confirm against the full file.
    SHADOW_ARGS_TO_STACK 6

    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step (destination pitch?)

    ; build threshold: flimit * 2 + limit (saturating, stays below 255)
    mov         rdx, arg(2)                 ; flimit
    movdqa      xmm3, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                 ; limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    paddb       xmm3, xmm3                  ; flimit*2 (less than 255)
    paddb       xmm3, xmm7                  ; flimit * 2 + limit (less than 255)

    mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing
    ; NOTE(review): lines are missing here (extraction gap, original lines
    ; 1406-1409 and 1412-1414) — presumably "neg rax" and the movdqa copies
    ; that load xmm2 (p1), xmm7 (q1) and xmm4 (q1) used below. Confirm.

    ; mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2) <= flimit*2 + limit
    movdqu      xmm1, [rsi + 2*rax]         ; p1
    movdqu      xmm0, [rdi]                 ; q1
    psubusb     xmm0, xmm1                  ; q1 -= p1
    psubusb     xmm1, xmm4                  ; p1 -= q1
    por         xmm1, xmm0                  ; abs(p1-q1)
    pand        xmm1, [GLOBAL(tfe)]         ; set lsb of each byte to zero
    psrlw       xmm1, 1                     ; abs(p1-q1)/2

    movdqu      xmm5, [rsi + rax]           ; p0
    movdqu      xmm4, [rsi]                 ; q0
    movdqa      xmm0, xmm4                  ; keep q0
    movdqa      xmm6, xmm5                  ; keep p0
    psubusb     xmm5, xmm4                  ; p0 -= q0
    psubusb     xmm4, xmm6                  ; q0 -= p0
    por         xmm5, xmm4                  ; abs(p0 - q0)
    paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
    paddusb     xmm5, xmm1                  ; abs(p0 - q0)*2 + abs(p1-q1)/2

    psubusb     xmm5, xmm3                  ; nonzero where the edge exceeds flimit*2 + limit
    ; NOTE(review): the pcmpeqb step that turns this into the 0/FF mask is not
    ; visible in this chunk (extraction gap, original lines 1432-1434) — confirm.

    ; start work on filters
    pxor        xmm2, [GLOBAL(t80)]         ; p1: offset to convert to signed values
    pxor        xmm7, [GLOBAL(t80)]         ; q1: offset to convert to signed values
    psubsb      xmm2, xmm7                  ; p1 - q1

    pxor        xmm6, [GLOBAL(t80)]         ; p0: offset to convert to signed values
    pxor        xmm0, [GLOBAL(t80)]         ; q0: offset to convert to signed values
    movdqa      xmm3, xmm0                  ; keep signed q0 for the write-back
    psubsb      xmm0, xmm6                  ; q0 - p0
    paddsb      xmm2, xmm0                  ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm2, xmm0                  ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm2, xmm0                  ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5, xmm2                  ; mask filter values we don't care about

    ; q0 side: Filter1 = (filter + 4) >> 3, done per 16-bit lane in two halves
    paddsb      xmm5, [GLOBAL(t4)]          ; 3*(q0 - p0) + (p1 - q1) + 4

    movdqa      xmm0, xmm5                  ; get a copy of the filters
    psllw       xmm0, 8                     ; isolate low bytes in the high half
    psraw       xmm0, 3                     ; arithmetic shift right 3
    movdqa      xmm1, xmm5                  ; get a copy of the filters
    psraw       xmm1, 11                    ; high bytes: shift right 8, then >> 3
    psllw       xmm1, 8                     ; shift left 8 to put them back
    por         xmm0, xmm1                  ; recombine the two byte lanes

    psubsb      xmm3, xmm0                  ; q0 -= Filter1
    pxor        xmm3, [GLOBAL(t80)]         ; unoffset back to unsigned
    movdqu      [rsi], xmm3                 ; write back q0

    ; p0 side: Filter2 = (filter + 3) >> 3
    psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4

    movdqa      xmm0, xmm5                  ; get a copy of the filters
    psllw       xmm0, 8                     ; isolate low bytes in the high half
    psraw       xmm0, 3                     ; arithmetic shift right 3
    psraw       xmm5, 11                    ; high bytes: shift right 8, then >> 3
    psllw       xmm5, 8                     ; shift left 8 to put them back
    por         xmm0, xmm5                  ; recombine the two byte lanes

    paddsb      xmm6, xmm0                  ; p0 += Filter2
    pxor        xmm6, [GLOBAL(t80)]         ; unoffset back to unsigned
    movdqu      [rsi + rax], xmm6           ; write back p0
    ; NOTE(review): function epilogue (UNSHADOW_ARGS / pop rbp / ret) is not
    ; visible in this chunk — confirm against the full file.
1492 ;void vp8_loop_filter_simple_vertical_edge_sse2
1494 ; unsigned char *src_ptr,
1495 ; int src_pixel_step,
1496 ; const char *flimit,
1497 ; const char *limit,
1498 ; const char *thresh,
1501 global sym
(vp8_loop_filter_simple_vertical_edge_sse2
)
1502 sym
(vp8_loop_filter_simple_vertical_edge_sse2
):
1503 push rbp
; save old base pointer value.
1504 mov rbp
, rsp
; set new base pointer value.
1505 SHADOW_ARGS_TO_STACK
6
1507 GET_GOT rbx
; save callee-saved reg
1513 sub rsp
, 32 ; reserve 32 bytes
1514 %define t0
[rsp
+ 0] ;__declspec(align(16)) char t0[16];
1515 %define t1
[rsp
+ 16] ;__declspec(align(16)) char t1[16];
1517 mov rsi
, arg
(0) ;src_ptr
1518 movsxd rax
, dword ptr arg
(1) ;src_pixel_step ; destination pitch?
1521 lea rdi
, [rsi
+ rax
]
1522 lea rdx
, [rsi
+ rax
*4]
1523 lea rcx
, [rdx
+ rax
]
1525 movdqu xmm0
, [rsi
] ; (high 96 bits unused) 03 02 01 00
1526 movdqu xmm1
, [rdx
] ; (high 96 bits unused) 43 42 41 40
1527 movdqu xmm2
, [rdi
] ; 13 12 11 10
1528 movdqu xmm3
, [rcx
] ; 53 52 51 50
1529 punpckldq xmm0
, xmm1
; (high 64 bits unused) 43 42 41 40 03 02 01 00
1530 punpckldq xmm2
, xmm3
; 53 52 51 50 13 12 11 10
1532 movdqu xmm4
, [rsi
+ rax
*2] ; 23 22 21 20
1533 movdqu xmm5
, [rdx
+ rax
*2] ; 63 62 61 60
1534 movdqu xmm6
, [rdi
+ rax
*2] ; 33 32 31 30
1535 movdqu xmm7
, [rcx
+ rax
*2] ; 73 72 71 70
1536 punpckldq xmm4
, xmm5
; 63 62 61 60 23 22 21 20
1537 punpckldq xmm6
, xmm7
; 73 72 71 70 33 32 31 30
1539 punpcklbw xmm0
, xmm2
; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
1540 punpcklbw xmm4
, xmm6
; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
1543 punpcklwd xmm0
, xmm4
; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
1544 punpckhwd xmm1
, xmm4
; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
1547 punpckldq xmm0
, xmm1
; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
1548 punpckhdq xmm2
, xmm1
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
1550 movdqa t0
, xmm0
; save to t0
1551 movdqa t1
, xmm2
; save to t1
1553 lea rsi
, [rsi
+ rax
*8]
1554 lea rdi
, [rsi
+ rax
]
1555 lea rdx
, [rsi
+ rax
*4]
1556 lea rcx
, [rdx
+ rax
]
1558 movdqu xmm4
, [rsi
] ; 83 82 81 80
1559 movdqu xmm1
, [rdx
] ; c3 c2 c1 c0
1560 movdqu xmm6
, [rdi
] ; 93 92 91 90
1561 movdqu xmm3
, [rcx
] ; d3 d2 d1 d0
1562 punpckldq xmm4
, xmm1
; c3 c2 c1 c0 83 82 81 80
1563 punpckldq xmm6
, xmm3
; d3 d2 d1 d0 93 92 91 90
1565 movdqu xmm0
, [rsi
+ rax
*2] ; a3 a2 a1 a0
1566 movdqu xmm5
, [rdx
+ rax
*2] ; e3 e2 e1 e0
1567 movdqu xmm2
, [rdi
+ rax
*2] ; b3 b2 b1 b0
1568 movdqu xmm7
, [rcx
+ rax
*2] ; f3 f2 f1 f0
1569 punpckldq xmm0
, xmm5
; e3 e2 e1 e0 a3 a2 a1 a0
1570 punpckldq xmm2
, xmm7
; f3 f2 f1 f0 b3 b2 b1 b0
1572 punpcklbw xmm4
, xmm6
; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
1573 punpcklbw xmm0
, xmm2
; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
1576 punpcklwd xmm4
, xmm0
; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
1577 punpckhwd xmm1
, xmm0
; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
1580 punpckldq xmm4
, xmm1
; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
1581 punpckhdq xmm6
, xmm1
; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
1583 movdqa xmm0
, t0
; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
1584 movdqa xmm2
, t1
; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
1588 punpcklqdq xmm0
, xmm4
; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1589 punpckhqdq xmm1
, xmm4
; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
1590 punpcklqdq xmm2
, xmm6
; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1591 punpckhqdq xmm3
, xmm6
; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
1594 movdqa xmm6
, xmm0
; p1
1595 movdqa xmm7
, xmm3
; q1
1596 psubusb xmm7
, xmm0
; q1-=p1
1597 psubusb xmm6
, xmm3
; p1-=q1
1598 por xmm6
, xmm7
; abs(p1-q1)
1599 pand xmm6
, [GLOBAL(tfe
)] ; set lsb of each byte to zero
1600 psrlw xmm6
, 1 ; abs(p1-q1)/2
1602 movdqa xmm5
, xmm1
; p0
1603 movdqa xmm4
, xmm2
; q0
1604 psubusb xmm5
, xmm2
; p0-=q0
1605 psubusb xmm4
, xmm1
; q0-=p0
1606 por xmm5
, xmm4
; abs(p0 - q0)
1607 paddusb xmm5
, xmm5
; abs(p0-q0)*2
1608 paddusb xmm5
, xmm6
; abs (p0 - q0) *2 + abs(p1-q1)/2
1610 mov rdx
, arg
(2) ;flimit
1611 movdqa xmm7
, XMMWORD
PTR [rdx
]
1612 mov rdx
, arg
(3) ; get limit
1613 movdqa xmm6
, XMMWORD
PTR [rdx
]
1614 paddb xmm7
, xmm7
; flimit*2 (less than 255)
1615 paddb xmm7
, xmm6
; flimit * 2 + limit (less than 255)
1617 psubusb xmm5
, xmm7
; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
1619 pcmpeqb xmm5
, xmm7
; mm5 = mask
1621 ; start work on filters
1625 pxor xmm0
, [GLOBAL(t80
)] ; p1 offset to convert to signed values
1626 pxor xmm3
, [GLOBAL(t80
)] ; q1 offset to convert to signed values
1628 psubsb xmm0
, xmm3
; p1 - q1
1629 movdqa xmm6
, xmm1
; p0
1631 movdqa xmm7
, xmm2
; q0
1632 pxor xmm6
, [GLOBAL(t80
)] ; offset to convert to signed values
1634 pxor xmm7
, [GLOBAL(t80
)] ; offset to convert to signed values
1635 movdqa xmm3
, xmm7
; offseted ; q0
1637 psubsb xmm7
, xmm6
; q0 - p0
1638 paddsb xmm0
, xmm7
; p1 - q1 + 1 * (q0 - p0)
1640 paddsb xmm0
, xmm7
; p1 - q1 + 2 * (q0 - p0)
1641 paddsb xmm0
, xmm7
; p1 - q1 + 3 * (q0 - p0)
1643 pand xmm5
, xmm0
; mask filter values we don't care about
1646 paddsb xmm5
, [GLOBAL(t4
)] ; 3* (q0 - p0) + (p1 - q1) + 4
1648 movdqa xmm0
, xmm5
; get a copy of filters
1649 psllw xmm0
, 8 ; shift left 8
1651 psraw xmm0
, 3 ; arithmetic shift right 11
1654 movdqa xmm7
, xmm5
; get a copy of filters
1655 psraw xmm7
, 11 ; arithmetic shift right 11
1657 psllw xmm7
, 8 ; shift left 8 to put it back
1658 por xmm0
, xmm7
; put the two together to get result
1660 psubsb xmm3
, xmm0
; q0-= q0sz add
1661 pxor xmm3
, [GLOBAL(t80
)] ; unoffset q0
1664 psubsb xmm5
, [GLOBAL(t1s
)] ; +3 instead of +4
1665 movdqa xmm0
, xmm5
; get a copy of filters
1667 psllw xmm0
, 8 ; shift left 8
1668 psraw xmm0
, 3 ; arithmetic shift right 11
1671 psraw xmm5
, 11 ; arithmetic shift right 11
1673 psllw xmm5
, 8 ; shift left 8 to put it back
1674 por xmm0
, xmm5
; put the two together to get result
1676 paddsb xmm6
, xmm0
; p0+= p0 add
1677 pxor xmm6
, [GLOBAL(t80
)] ; unoffset p0
1679 movdqa xmm0
, t0
; p1
1680 movdqa xmm4
, t1
; q1
1682 ; transpose back to write out
1683 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
1684 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
1685 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
1686 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
1688 punpcklbw xmm0
, xmm6
; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
1689 punpckhbw xmm1
, xmm6
; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
1692 punpcklbw xmm3
, xmm4
; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
1693 punpckhbw xmm5
, xmm4
; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
1696 punpcklwd xmm0
, xmm3
; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
1697 punpckhwd xmm2
, xmm3
; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
1700 punpcklwd xmm1
, xmm5
; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
1701 punpckhwd xmm3
, xmm5
; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
1703 ; write out order: xmm0 xmm2 xmm1 xmm3
1704 lea rdx
, [rsi
+ rax
*4]
1706 movd
[rsi
], xmm1
; write the second 8-line result
1710 movd
[rsi
+ rax
*2], xmm1
1712 movd
[rdi
+ rax
*2], xmm1
1718 movd
[rdx
+ rax
*2], xmm3
1720 movd
[rcx
+ rax
*2], xmm3
1723 lea rsi
, [rsi
+ rax
*8]
1725 lea rdi
, [rsi
+ rax
]
1726 lea rdx
, [rsi
+ rax
*4]
1727 lea rcx
, [rdx
+ rax
]
1729 movd
[rsi
], xmm0
; write the first 8-line result
1733 movd
[rsi
+ rax
*2], xmm0
1735 movd
[rdi
+ rax
*2], xmm0
1741 movd
[rdx
+ rax
*2], xmm2
1743 movd
[rcx
+ rax
*2], xmm2