;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen in ffvp8
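
; psubusb clamps at zero, so abs(a-b) normally costs psubusb a,b / psubusb b,a
; / por.  Because the filter mask only needs the running maximum of several
; absolute differences, each clamped difference can instead be folded straight
; into that maximum with pmaxub, saving the por for every pair.  Per byte
; lane, roughly:
;     mask = max(mask, max(a - b, 0));
;     mask = max(mask, max(b - a, 0));   // i.e. mask = max(mask, |a - b|)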

%macro LFH_FILTER_AND_HEV_MASK 1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border

        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp], xmm1         ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4    ; store q1

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      t0, xmm5                ; save to t0
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1

        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm6, [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4    ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6    ; store p1

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)

        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1

        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)

        mov         rdx, arg(2)             ; get blimit

        movdqa      t1, xmm6                ; save to t1
        movdqa      xmm4, xmm3              ; q1

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0

        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2

        movdqa      xmm4, t0                ; hev get abs (q1 - q0)

        movdqa      xmm3, t1                ; get abs (p1 - p0)

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, XMMWORD PTR [rdx] ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev

        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
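
; Per-pixel sketch of what the two masks mean (illustrative only; this mirrors
; the scalar VP8 loop-filter logic, applied independently to each of the 16
; byte lanes along the edge):
;
;     mask = (abs(p3-p2) <= limit) && (abs(p2-p1) <= limit) &&
;            (abs(p1-p0) <= limit) && (abs(q1-q0) <= limit) &&
;            (abs(q2-q1) <= limit) && (abs(q3-q2) <= limit) &&
;            (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit)  ? 0xff : 0x00;
;     hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh) ? 0xff : 0x00;
;
; mask gates whether any filtering is applied at all; hev ("high edge
; variance") decides whether p1/q1 feed the filter directly or are themselves
; adjusted by the outer tap.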
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        movdqa      xmm2, [rdx]             ; p1
        movdqa      xmm7, [rdx+48]          ; q1
        movdqa      xmm6, [rdx+16]          ; p0
        movdqa      xmm0, [rdx+32]          ; q0

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5, [GLOBAL(ones)]

        paddsw      xmm1, [GLOBAL(ones)]
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
        paddsb      xmm6, xmm2              ; p0+= p0 add

        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

        movdqa      xmm1, [rsi+2*rax]       ; p1

        movdqa      xmm1, [rdx]             ; p1

        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3, xmm0              ; q0-= q0 add

        paddsb      xmm1, xmm4              ; p1+= p1 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7, xmm4              ; q1-= q1 add

        pxor        xmm7, [GLOBAL(t80)]     ; unoffset

        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]

        movq        MMWORD PTR [rsi], xmm6          ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rax], xmm1    ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3    ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7  ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7

        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
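
; Illustrative C for the filter step above (a sketch, not part of the build;
; signed saturating 8-bit arithmetic throughout, as in the scalar VP8 filter):
;
;     f   = clamp(ps1 - qs1) & hev;            // hvm(p1 - q1)
;     f   = clamp(f + 3 * (qs0 - ps0)) & mask;
;     F1  = clamp(f + 4) >> 3;                 // "q0 add"
;     F2  = clamp(f + 3) >> 3;                 // "p0 add"
;     qs0 = clamp(qs0 - F1);
;     ps0 = clamp(ps0 + F2);
;     u   = ((F1 + 1) >> 1) & ~hev;            // outer tap, "p1/q1 add"
;     qs1 = clamp(qs1 - u);
;     ps1 = clamp(ps1 + u);
;
; followed by the xor with 0x80 ("unoffset") to return to unsigned pixels.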

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]     ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]    ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)                 ;src_ptr
        movsxd      rax, dword ptr arg(1)       ;src_pixel_step

        mov         rdx, arg(3)                 ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v_ptr
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 96                         ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        neg         rax                         ; negate pitch to deal with above border

        mov         rdx, arg(3)                 ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result

%macro MB_FILTER_AND_WRITEBACK 1

        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        movdqa      xmm2, [rdx+32]          ; p1
        movdqa      xmm7, [rdx+80]          ; q1
        movdqa      xmm6, [rdx+48]          ; p0
        movdqa      xmm0, [rdx+64]          ; q0

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        movdqa      xmm3, xmm0              ; q0

        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev

        pandn       xmm4, xmm1              ; vp8_filter&=~hev

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)

        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)
        pmulhw      xmm1, [GLOBAL(s9)]      ; Filter 2 (lo) * 9

        pmulhw      xmm0, [GLOBAL(s9)]      ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5, xmm5              ; exfxgxhx
        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>=3;
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>=3;

        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1

        paddw       xmm0, [GLOBAL(s63)]     ; Filter 2 (hi) * 9 + 63

        paddw       xmm1, [GLOBAL(s63)]     ; Filter 2 (lo) * 9 + 63
        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)
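
; Scalar sketch of the wide (macroblock) filter above (illustrative only).
; With f the masked filter value, fh = f & hev and fl = f & ~hev:
;
;     F1  = clamp(fh + 4) >> 3;    F2 = clamp(fh + 3) >> 3;
;     qs0 = clamp(qs0 - F1);       ps0 = clamp(ps0 + F2);
;
;     u3  = clamp((63 + fl * 27) >> 7);    // applied to q0/p0 above
;     u2  = clamp((63 + fl * 18) >> 7);    // applied to q1/p1 below
;     u1  = clamp((63 + fl *  9) >> 7);    // applied to q2/p2 below
;
; (the comments above label fl "Filter 2"; the 18x and 27x terms are built
; from the 9x pmulhw result by doubling and re-adding before the +63 and >>7).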

        movdqa      xmm5, XMMWORD PTR [rdi+rcx]     ; q2
        movdqa      xmm1, XMMWORD PTR [rdi]         ; q1
        movdqa      xmm4, XMMWORD PTR [rsi+rax*2]   ; p1
        movdqa      xmm7, XMMWORD PTR [rdi+rax*4]   ; p2

        movdqa      xmm5, XMMWORD PTR [rdx+96]      ; q2
        movdqa      xmm1, XMMWORD PTR [rdx+80]      ; q1
        movdqa      xmm4, XMMWORD PTR [rdx+32]      ; p1
        movdqa      xmm7, XMMWORD PTR [rdx+16]      ; p2

        pxor        xmm3, [GLOBAL(t80)]     ; *oq0 = sq^0x80
        pxor        xmm6, [GLOBAL(t80)]     ; *op0 = sp^0x80

        pxor        xmm1, [GLOBAL(t80)]
        pxor        xmm4, [GLOBAL(t80)]

        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1, [GLOBAL(t80)]     ; *oq1 = sq^0x80;
        pxor        xmm4, [GLOBAL(t80)]     ; *op1 = sp^0x80;

        pxor        xmm7, [GLOBAL(t80)]
        pxor        xmm5, [GLOBAL(t80)]

        paddsb      xmm7, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u)

        pxor        xmm7, [GLOBAL(t80)]     ; *op2 = sp^0x80;
        pxor        xmm5, [GLOBAL(t80)]     ; *oq2 = sq^0x80;

        movq        MMWORD PTR [rsi], xmm6          ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3    ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1    ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4    ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7    ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        movq        MMWORD PTR [rsi+rcx*2], xmm5    ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5

        movdqa      XMMWORD PTR [rdi+rcx], xmm5     ; q2
        movdqa      XMMWORD PTR [rdi], xmm1         ; q1
        movdqa      XMMWORD PTR [rsi], xmm3         ; q0
        movdqa      XMMWORD PTR [rsi+rax], xmm6     ; p0
        movdqa      XMMWORD PTR [rsi+rax*2], xmm4   ; p1
        movdqa      XMMWORD PTR [rdi+rax*4], xmm7   ; p2

        movdqa      XMMWORD PTR [rdx+80], xmm1      ; q1
        movdqa      XMMWORD PTR [rdx+64], xmm3      ; q0
        movdqa      XMMWORD PTR [rdx+48], xmm6      ; p0
        movdqa      XMMWORD PTR [rdx+32], xmm4      ; p1

;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]     ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]    ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)                 ;src_ptr
        movsxd      rax, dword ptr arg(1)       ;src_pixel_step

        mov         rdx, arg(3)                 ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v_ptr
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 96                         ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        neg         rax                         ; negate pitch to deal with above border

        mov         rdx, arg(3)                 ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                      ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40

        mov         rsi, arg(5)                     ; v_ptr

        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44

        punpcklwd   xmm3, xmm0                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing

        movdqa      xmm2, xmm3                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                        ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6                      ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
        punpckhwd   xmm6, xmm5                      ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm0, xmm1                      ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                      ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                      ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                      ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                      ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                      ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07

        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                     ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4                  ; save 4
        movdqa      [rdx+48], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get

        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+112], xmm7                 ; save 7

        movdqa      [rdx+96], xmm6                  ; save 6

        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2                  ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4                  ; save 4
        movdqa      [rdx+80], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get

        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1
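
; The macro above gathers 16 rows of 8 pixels that straddle the vertical edge
; and transposes them into 8 rows of 16 pixels: punpcklbw/punpckhbw interleave
; bytes of neighbouring rows, punpckl/hwd then 16-bit pairs, punpckl/hdq
; 32-bit pairs, and punpckl/hqdq assembles the final 16-wide rows, which are
; spilled to the scratch area addressed by rdx for the filter macros to use.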

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3;

        psubusb     xmm2, xmm1              ; p3 - p2;
        por         xmm0, xmm2              ; abs(p2-p3)

        movdqa      xmm2, [rdx]             ; p1

        movdqa      xmm2, [rdx+32]          ; p1

        movdqa      xmm5, xmm2              ; p1

        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)

        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0

        por         xmm2, xmm7              ; abs(p1-p0)

        movdqa      t0, xmm2                ; save abs(p1-p0)

        movdqa      xmm5, [rdx+32]          ; q0
        movdqa      xmm7, [rdx+48]          ; q1

        movdqa      xmm5, [rdx+64]          ; q0
        movdqa      xmm7, [rdx+80]          ; q1

        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm2, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        movdqa      t1, xmm7                ; save abs(q1-q0)

        movdqa      xmm4, XMMWORD PTR [rdx] ; limit

        mov         rdx, arg(2)             ; blimit

        movdqa      xmm5, xmm2              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm2              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        movdqa      xmm4, XMMWORD PTR [rdx] ; blimit

        mov         rdx, arg(4)             ; get thresh

        por         xmm1, xmm6              ; abs(q0-p0)

        movdqa      xmm6, t0                ; get abs (p1 - p0)

        paddusb     xmm1, xmm1              ; abs(q0-p0)*2

        movdqa      xmm3, t1                ; get abs (q1 - q0)

        movdqa      xmm7, XMMWORD PTR [rdx] ; thresh

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6, xmm7              ; abs(p1 - p0) > thresh

        psubusb     xmm3, xmm7              ; abs(q1 - q0) > thresh
        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit

        por         xmm6, xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
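
; This is the vertical-edge counterpart of LFH_FILTER_AND_HEV_MASK: it builds
; the same per-pixel filter mask and hev condition, but from the transposed
; rows left in registers and in the scratch area at rdx by TRANSPOSE_16X8.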

%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2

%macro BV_WRITEBACK 2
        movd        [rsi+2*rax+2], %1
        movd        [rdi+2*rax+2], %1

        movd        [rsi+4*rax+2], %2
        movd        [rdi+4*rax+2], %2

        movd        [rsi+2*rcx+2], %2
        movd        [rdi+2*rcx+2], %2
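
; After BV_TRANSPOSE, each dword of %1/%2 holds one row's filtered p1 p0 q0 q1,
; so every movd above writes those four bytes back at column offset +2 of the
; corresponding row (leaving p3/p2 and q2/q3 untouched, as the normal loop
; filter only modifies the inner four pixels).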

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 96                         ; reserve 96 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters

        ; transpose and write back - only work on q1, q0, p0, p1

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        lea         rsi, [rsi+rdx*8]
        lea         rdi, [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v_ptr
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 96                         ; reserve 96 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)                 ; u_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        ;transpose 16x8 to 8x16, and store the 8-line result on stack.

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters

        ; transpose and write back - only work on q1, q0, p0, p1

        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi, arg(0)                 ; u_ptr
        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]             ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm7              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm7              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]          ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]          ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]          ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm2              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]          ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20

%macro MBV_WRITEBACK_1 0
        movq        QWORD  PTR [rsi], xmm0
        movhps      MMWORD PTR [rdi], xmm0

        movq        QWORD  PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD  PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD  PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5, [rdx+112]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80

        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0

%macro MBV_WRITEBACK_2 0
        movq        QWORD  PTR [rsi], xmm1
        movhps      MMWORD PTR [rdi], xmm1

        movq        QWORD  PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm2              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        QWORD  PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD  PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4

;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 160                        ; reserve 160 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)                 ; src_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]

        ; transpose and write back

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]

;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v_ptr
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    SHADOW_ARGS_TO_STACK 6

    sub         rsp, 160                        ; reserve 160 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)                 ; u_ptr
        movsxd      rax, dword ptr arg(1)       ; src_pixel_step

        lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        mov         rsi, arg(0)                 ;u_ptr
        lea         rdi, [rsi + rax]

        mov         rsi, arg(5)                 ;v_ptr
        lea         rdi, [rsi + rax]

;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    SHADOW_ARGS_TO_STACK 3

        mov         rsi, arg(0)                 ;src_ptr
        movsxd      rax, dword ptr arg(1)       ;src_pixel_step     ; destination pitch?
        mov         rdx, arg(2)                 ;blimit
        movdqa      xmm3, XMMWORD PTR [rdx]

        mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing

        movdqa      xmm1, [rsi+2*rax]           ; p1
        movdqa      xmm0, [rdi]                 ; q1

        psubusb     xmm0, xmm1                  ; q1-=p1
        psubusb     xmm1, xmm4                  ; p1-=q1
        por         xmm1, xmm0                  ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]         ; set lsb of each byte to zero
        psrlw       xmm1, 1                     ; abs(p1-q1)/2

        movdqa      xmm5, [rsi+rax]             ; p0
        movdqa      xmm4, [rsi]                 ; q0
        movdqa      xmm0, xmm4                  ; q0
        movdqa      xmm6, xmm5                  ; p0
        psubusb     xmm5, xmm4                  ; p0-=q0
        psubusb     xmm4, xmm6                  ; q0-=p0
        por         xmm5, xmm4                  ; abs(p0 - q0)
        paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
        paddusb     xmm5, xmm1                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     xmm5, xmm3                  ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        ; start work on filters
        pxor        xmm2, [GLOBAL(t80)]         ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]         ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7                  ; p1 - q1

        pxor        xmm6, [GLOBAL(t80)]         ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]         ; offset to convert to signed values
        movdqa      xmm3, xmm0                  ; q0
        psubsb      xmm0, xmm6                  ; q0 - p0
        paddsb      xmm2, xmm0                  ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0                  ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0                  ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2                  ; mask filter values we don't care about

        paddsb      xmm5, [GLOBAL(t4)]          ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5                  ; get a copy of filters
        psllw       xmm0, 8                     ; shift left 8
        psraw       xmm0, 3                     ; arithmetic shift right 11

        movdqa      xmm1, xmm5                  ; get a copy of filters
        psraw       xmm1, 11                    ; arithmetic shift right 11
        psllw       xmm1, 8                     ; shift left 8 to put it back

        por         xmm0, xmm1                  ; put the two together to get result

        psubsb      xmm3, xmm0                  ; q0-= q0 add
        pxor        xmm3, [GLOBAL(t80)]         ; unoffset
        movdqa      [rsi], xmm3                 ; write back

        psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4

        movdqa      xmm0, xmm5                  ; get a copy of filters
        psllw       xmm0, 8                     ; shift left 8
        psraw       xmm0, 3                     ; arithmetic shift right 11

        psraw       xmm5, 11                    ; arithmetic shift right 11
        psllw       xmm5, 8                     ; shift left 8 to put it back
        por         xmm0, xmm5                  ; put the two together to get result

        paddsb      xmm6, xmm0                  ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]         ; unoffset
        movdqa      [rsi+rax], xmm6             ; write back
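
; Note on the shift pairs above: SSE2 has no per-byte arithmetic shift, so the
; signed ">> 3" of the byte-wide filter values is synthesised from word
; shifts.  One copy moves each word's low byte into the high half and shifts
; it arithmetically there (psllw 8 / psraw); the other handles the high byte
; directly (psraw 11 / psllw 8); por then recombines the two halves so each
; byte lane ends up arithmetically shifted right by 3.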

;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                             ; save old base pointer value.
    mov         rbp, rsp                        ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx                             ; save callee-saved reg

    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]     ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]    ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)                 ;src_ptr
        movsxd      rax, dword ptr arg(1)       ;src_pixel_step     ; destination pitch?

        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm0, [rsi]                 ; (high 96 bits unused) 03 02 01 00
        movd        xmm1, [rdx]                 ; (high 96 bits unused) 43 42 41 40
        movd        xmm2, [rdi]                 ; 13 12 11 10
        movd        xmm3, [rcx]                 ; 53 52 51 50
        punpckldq   xmm0, xmm1                  ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2, xmm3                  ; 53 52 51 50 13 12 11 10

        movd        xmm4, [rsi + rax*2]         ; 23 22 21 20
        movd        xmm5, [rdx + rax*2]         ; 63 62 61 60
        movd        xmm6, [rdi + rax*2]         ; 33 32 31 30
        movd        xmm7, [rcx + rax*2]         ; 73 72 71 70
        punpckldq   xmm4, xmm5                  ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6, xmm7                  ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0, xmm2                  ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4, xmm6                  ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        punpcklwd   xmm0, xmm4                  ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1, xmm4                  ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckldq   xmm0, xmm1                  ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2, xmm1                  ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        movdqa      t0, xmm0                    ; save to t0
        movdqa      t1, xmm2                    ; save to t1
        lea         rsi, [rsi + rax*8]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm4, [rsi]                 ; 83 82 81 80
        movd        xmm1, [rdx]                 ; c3 c2 c1 c0
        movd        xmm6, [rdi]                 ; 93 92 91 90
        movd        xmm3, [rcx]                 ; d3 d2 d1 d0
        punpckldq   xmm4, xmm1                  ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6, xmm3                  ; d3 d2 d1 d0 93 92 91 90

        movd        xmm0, [rsi + rax*2]         ; a3 a2 a1 a0
        movd        xmm5, [rdx + rax*2]         ; e3 e2 e1 e0
        movd        xmm2, [rdi + rax*2]         ; b3 b2 b1 b0
        movd        xmm7, [rcx + rax*2]         ; f3 f2 f1 f0
        punpckldq   xmm0, xmm5                  ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm2, xmm7                  ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4, xmm6                  ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm0, xmm2                  ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        punpcklwd   xmm4, xmm0                  ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm1, xmm0                  ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        punpckldq   xmm4, xmm1                  ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6, xmm1                  ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm0, t0                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        movdqa      xmm2, t1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpcklqdq  xmm0, xmm4                  ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1, xmm4                  ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm6                  ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm6                  ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      xmm6, xmm0                  ; p1
        movdqa      xmm7, xmm3                  ; q1
        psubusb     xmm7, xmm0                  ; q1-=p1
        psubusb     xmm6, xmm3                  ; p1-=q1
        por         xmm6, xmm7                  ; abs(p1-q1)
        pand        xmm6, [GLOBAL(tfe)]         ; set lsb of each byte to zero
        psrlw       xmm6, 1                     ; abs(p1-q1)/2

        movdqa      xmm5, xmm1                  ; p0
        movdqa      xmm4, xmm2                  ; q0
        psubusb     xmm5, xmm2                  ; p0-=q0
        psubusb     xmm4, xmm1                  ; q0-=p0
        por         xmm5, xmm4                  ; abs(p0 - q0)
        paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
        paddusb     xmm5, xmm6                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)                 ;blimit
        movdqa      xmm7, XMMWORD PTR [rdx]

        psubusb     xmm5, xmm7                  ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pcmpeqb     xmm5, xmm7                  ; mm5 = mask
        ; start work on filters
        pxor        xmm0, [GLOBAL(t80)]         ; p1 offset to convert to signed values
        pxor        xmm3, [GLOBAL(t80)]         ; q1 offset to convert to signed values

        psubsb      xmm0, xmm3                  ; p1 - q1
        movdqa      xmm6, xmm1                  ; p0

        movdqa      xmm7, xmm2                  ; q0
        pxor        xmm6, [GLOBAL(t80)]         ; offset to convert to signed values

        pxor        xmm7, [GLOBAL(t80)]         ; offset to convert to signed values
        movdqa      xmm3, xmm7                  ; offseted ; q0

        psubsb      xmm7, xmm6                  ; q0 - p0
        paddsb      xmm0, xmm7                  ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm0, xmm7                  ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0, xmm7                  ; p1 - q1 + 3 * (q0 - p0)

        pand        xmm5, xmm0                  ; mask filter values we don't care about

        paddsb      xmm5, [GLOBAL(t4)]          ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5                  ; get a copy of filters
        psllw       xmm0, 8                     ; shift left 8
        psraw       xmm0, 3                     ; arithmetic shift right 11

        movdqa      xmm7, xmm5                  ; get a copy of filters
        psraw       xmm7, 11                    ; arithmetic shift right 11
        psllw       xmm7, 8                     ; shift left 8 to put it back

        por         xmm0, xmm7                  ; put the two together to get result

        psubsb      xmm3, xmm0                  ; q0-= q0sz add
        pxor        xmm3, [GLOBAL(t80)]         ; unoffset q0

        psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4
        movdqa      xmm0, xmm5                  ; get a copy of filters
        psllw       xmm0, 8                     ; shift left 8
        psraw       xmm0, 3                     ; arithmetic shift right 11

        psraw       xmm5, 11                    ; arithmetic shift right 11
        psllw       xmm5, 8                     ; shift left 8 to put it back
        por         xmm0, xmm5                  ; put the two together to get result

        paddsb      xmm6, xmm0                  ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]         ; unoffset p0

        movdqa      xmm0, t0                    ; p1
        movdqa      xmm4, t1                    ; q1
        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklbw   xmm0, xmm6                  ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm6                  ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklbw   xmm3, xmm4                  ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5, xmm4                  ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm0, xmm3                  ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2, xmm3                  ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        punpcklwd   xmm1, xmm5                  ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3, xmm5                  ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx, [rsi + rax*4]

        movd        [rsi], xmm1                 ; write the second 8-line result
        movd        [rsi + rax*2], xmm1
        movd        [rdi + rax*2], xmm1

        movd        [rdx + rax*2], xmm3
        movd        [rcx + rax*2], xmm3

        lea         rsi, [rsi + rax*8]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        [rsi], xmm0                 ; write the first 8-line result
        movd        [rsi + rax*2], xmm0
        movd        [rdi + rax*2], xmm0

        movd        [rdx + rax*2], xmm2
        movd        [rcx + rax*2], xmm2