;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
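
; LFH_FILTER_AND_HEV_MASK computes the filter mask and the high-edge-variance
; (hev) flags for a horizontal edge.  As a reading aid, a C-like sketch of
; the tests it performs, pieced together from the comments below (not
; generated from this file):
;
;   mask = abs(q3-q2) <= limit && abs(q2-q1) <= limit && abs(q1-q0) <= limit
;       && abs(p3-p2) <= limit && abs(p2-p1) <= limit && abs(p1-p0) <= limit
;       && abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
;   hev  = abs(p1-p0) > thresh || abs(q1-q0) > thresh
;
; %1 == 1 reads whole rows from rsi/rdi (Y plane); %1 == 0 gathers 8 pixels
; each from the U (rsi) and V (rdi) planes and spills q2/q1/p2/p1 to the
; stack for later use.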
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp], xmm1      ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4 ; store q1
%endif

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0
        pmaxub      xmm1, xmm4

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      t0, xmm5                ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1
%else
        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4 ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6 ; store p1
%endif

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1
%else
        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, q1                ; q1
%endif

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)
        mov         rdx, arg(2)             ; get blimit

        movdqa      t1, xmm6                ; save to t1

        movdqa      xmm4, xmm3              ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        psubusb     xmm1, xmm7
        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm7, XMMWORD PTR [rdx] ; blimit

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0

        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2

        movdqa      xmm4, t0                ; hev get abs (q1 - q0)

        movdqa      xmm3, t1                ; get abs (p1 - p0)

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, XMMWORD PTR [rdx] ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev
        por         xmm1, xmm5

        pxor        xmm7, xmm7
        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
%endmacro
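
; B_FILTER applies the normal (non-macroblock) loop filter to p1/p0/q0/q1.
; A C-like sketch of the arithmetic, following the comments below; clamp()
; is the signed-byte saturation the paddsb/psubsb instructions provide:
;
;   F  = clamp(3 * (q0 - p0) + ((p1 - q1) & hev)) & mask
;   Filter1 = clamp(F + 4) >> 3;    q0 = clamp(q0 - Filter1)
;   Filter2 = clamp(F + 3) >> 3;    p0 = clamp(p0 + Filter2)
;   u  = (Filter1 + 1) >> 1;  u &= ~hev
;   q1 = clamp(q1 - u);             p1 = clamp(p1 + u)
;
; %1 selects the source: 0 = UV rows spilled on the stack, 1 = Y rows at
; rsi/rdi, 2 = transposed columns in srct (used by the vertical filters).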
%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx]             ; p1
        movdqa      xmm7, [rdx+48]          ; q1
        movdqa      xmm6, [rdx+16]          ; p0
        movdqa      xmm0, [rdx+32]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1

        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5, [GLOBAL(ones)]

        paddsw      xmm1, [GLOBAL(ones)]
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6, xmm2              ; p0+= p0 add
        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1, p1                ; p1
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1, [rdx]             ; p1
%endif
        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3, xmm0              ; q0-= q0 add

        paddsb      xmm1, xmm4              ; p1+= p1 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7, xmm4              ; q1-= q1 add

        pxor        xmm7, [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]
        movq        MMWORD PTR [rsi], xmm6        ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rax], xmm1  ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3  ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7 ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
%endif

%endmacro
;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the result
        B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the result
        B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
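
; MB_FILTER_AND_WRITEBACK applies the macroblock-edge filter, which also
; adjusts p2/q2.  Sketch of the arithmetic, following the comments below
; (vp8_signed_char_clamp is abbreviated to clamp):
;
;   F = clamp(3 * (q0 - p0) + (p1 - q1)) & mask
;   Filter2 = F & hev                          ; sharp 2-tap part
;   Filter1 = clamp(Filter2 + 4) >> 3;  q0 -= Filter1
;   Filter2 = clamp(Filter2 + 3) >> 3;  p0 += Filter2
;   F &= ~hev                                  ; wide taps where hev is off
;   u1 = clamp((63 + F *  9) >> 7);  p2 += u1;  q2 -= u1
;   u2 = clamp((63 + F * 18) >> 7);  p1 += u2;  q1 -= u2
;   u3 = clamp((63 + F * 27) >> 7);  p0 += u3;  q0 -= u3
;
; %1: 0 = UV rows (stack spill), 1 = Y rows at rsi/rdi, 2 = transposed
; columns in srct.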
%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx+32]          ; p1
        movdqa      xmm7, [rdx+80]          ; q1
        movdqa      xmm6, [rdx+48]          ; p0
        movdqa      xmm0, [rdx+64]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        movdqa      xmm3, xmm0              ; q0

        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)
        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
        pxor        xmm0, xmm0

        pandn       xmm4, xmm1              ; vp8_filter&=~hev
        pxor        xmm1, xmm1

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        movdqa      xmm5, xmm2

        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)
        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1, [GLOBAL(s9)]      ; Filter 2 (lo) * 9

        pmulhw      xmm0, [GLOBAL(s9)]      ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5, xmm5              ; exfxgxhx
        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>=3;
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>=3;
        movdqa      xmm7, xmm1
        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2
        movdqa      xmm4, xmm1

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
        movdqa      xmm5, xmm0

        movdqa      xmm2, xmm5
        paddw       xmm0, [GLOBAL(s63)]     ; Filter 2 (hi) * 9 + 63

        paddw       xmm1, [GLOBAL(s63)]     ; Filter 2 (lo) * 9 + 63
        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)
%if %1 == 0
        movdqa      xmm5, q2                ; q2
        movdqa      xmm1, q1                ; q1
        movdqa      xmm4, p1                ; p1
        movdqa      xmm7, p2                ; p2

%elif %1 == 1
        movdqa      xmm5, XMMWORD PTR [rdi+rcx]   ; q2
        movdqa      xmm1, XMMWORD PTR [rdi]       ; q1
        movdqa      xmm4, XMMWORD PTR [rsi+rax*2] ; p1
        movdqa      xmm7, XMMWORD PTR [rdi+rax*4] ; p2
%elif %1 == 2
        movdqa      xmm5, XMMWORD PTR [rdx+96]    ; q2
        movdqa      xmm1, XMMWORD PTR [rdx+80]    ; q1
        movdqa      xmm4, XMMWORD PTR [rdx+32]    ; p1
        movdqa      xmm7, XMMWORD PTR [rdx+16]    ; p2
%endif

        pxor        xmm3, [GLOBAL(t80)]     ; *oq0 = sq^0x80
        pxor        xmm6, [GLOBAL(t80)]     ; *op0 = sp^0x80
        pxor        xmm1, [GLOBAL(t80)]
        pxor        xmm4, [GLOBAL(t80)]

        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)
        pxor        xmm1, [GLOBAL(t80)]     ; *oq1 = sq^0x80;
        pxor        xmm4, [GLOBAL(t80)]     ; *op1 = sp^0x80;

        pxor        xmm7, [GLOBAL(t80)]
        pxor        xmm5, [GLOBAL(t80)]
        paddsb      xmm7, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u)

        pxor        xmm7, [GLOBAL(t80)]     ; *op2 = sp^0x80;
        pxor        xmm5, [GLOBAL(t80)]     ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi, [rsi+rcx*2]
        lea         rdi, [rdi+rcx*2]

        movq        MMWORD PTR [rsi], xmm6        ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3  ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1  ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4  ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7  ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5  ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx], xmm5   ; q2
        movdqa      XMMWORD PTR [rdi], xmm1       ; q1
        movdqa      XMMWORD PTR [rsi], xmm3       ; q0
        movdqa      XMMWORD PTR [rsi+rax], xmm6   ; p0
        movdqa      XMMWORD PTR [rsi+rax*2], xmm4 ; p1
        movdqa      XMMWORD PTR [rdi+rax*4], xmm7 ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80], xmm1    ; q1
        movdqa      XMMWORD PTR [rdx+64], xmm3    ; q0
        movdqa      XMMWORD PTR [rdx+48], xmm6    ; p0
        movdqa      XMMWORD PTR [rdx+32], xmm4    ; p1
%endif

%endmacro
;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 1
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

        mov         rsi, arg(0)             ; u
        mov         rdi, arg(5)             ; v
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step
        mov         rcx, rax
        neg         rax                     ; negate pitch to deal with above border

        mov         rdx, arg(3)             ;limit
        movdqa      xmm7, XMMWORD PTR [rdx]

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]

        ; calculate breakout conditions and high edge variance
        LFH_FILTER_AND_HEV_MASK 0
        ; filter and write back the results
        MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
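
; TRANSPOSE_16X8 reads 16 rows of the 8 pixels straddling a vertical edge
; and transposes them into 8 rows of 16 bytes so the horizontal filter
; logic can be reused.  %1: 1 = stay on one plane (step rsi/rdi down by
; 8 rows), 0 = fetch the second 8 rows from the v_ptr in arg(5).
; %2: 1 = spill only rows 2-5 (p1 p0 q0 q1) to srct for the normal filter,
; 0 = spill all 8 transposed rows to srct for the macroblock filter.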
%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                   ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                   ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20
        movq        xmm7, QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]
%else
        mov         rsi, arg(5)                  ; v_ptr
%endif

        movdqa      xmm6, xmm5                   ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                   ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                   ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                   ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi, [rdi+rax*8]
%else
        lea         rsi, [rsi - 4]
%endif

        punpcklwd   xmm3, xmm0                   ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx, srct
%else
        lea         rdi, [rsi + rax]             ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2, xmm3                   ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                   ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                   ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                   ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                   ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                   ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                   ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                     ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                   ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
        punpcklbw   xmm1, xmm6                   ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0
        movq        xmm6, QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                   ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1
        punpckhwd   xmm6, xmm5                   ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                   ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                   ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                   ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                   ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1                   ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                   ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                   ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                   ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                   ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                   ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                   ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                   ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2, xmm3                   ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                   ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                   ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                  ; save 2

        movdqa      xmm5, xmm4                   ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                   ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3               ; save 3

        punpckhqdq  xmm5, xmm1                   ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4               ; save 4
        movdqa      [rdx+48], xmm5               ; save 5
        movdqa      xmm1, t0                     ; get

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                   ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                   ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112], xmm7              ; save 7

        movdqa      [rdx+96], xmm6               ; save 6

        movdqa      xmm2, xmm3                   ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                   ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                   ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2               ; save 2

        movdqa      xmm5, xmm4                   ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                   ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3               ; save 3

        punpckhqdq  xmm5, xmm1                   ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4               ; save 4
        movdqa      [rdx+80], xmm5               ; save 5
        movdqa      xmm1, t0                     ; get

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                   ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                   ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1

        movdqa      [rdx], xmm2
%endif
%endmacro
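
; LFV_FILTER_MASK_HEV_MASK computes the same mask and hev tests as
; LFH_FILTER_AND_HEV_MASK, but on the transposed rows left in registers
; and in srct by TRANSPOSE_16X8.  %1 selects the srct layout it was stored
; with: 1 = normal-filter offsets, 0 = macroblock-filter offsets.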
%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        movdqa      xmm0, xmm1
        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3;

        psubusb     xmm2, xmm1              ; p3 - p2;
        por         xmm0, xmm2              ; abs(p2-p3)
%if %1
        movdqa      xmm2, [rdx]             ; p1
%else
        movdqa      xmm2, [rdx+32]          ; p1
%endif
        movdqa      xmm5, xmm2              ; p1
        pmaxub      xmm0, xmm7

        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0
        lea         rdx, srct

        por         xmm2, xmm7              ; abs(p1-p0)

        movdqa      t0, xmm2                ; save abs(p1-p0)

        pmaxub      xmm0, xmm2

%if %1
        movdqa      xmm5, [rdx+32]          ; q0
        movdqa      xmm7, [rdx+48]          ; q1
%else
        movdqa      xmm5, [rdx+64]          ; q0
        movdqa      xmm7, [rdx+80]          ; q1
%endif
        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm2, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        movdqa      t1, xmm7                ; save abs(q1-q0)

        movdqa      xmm4, XMMWORD PTR [rdx] ; limit

        pmaxub      xmm0, xmm7
        mov         rdx, arg(2)             ; blimit

        psubusb     xmm0, xmm4
        movdqa      xmm5, xmm2              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm2              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        movdqa      xmm4, XMMWORD PTR [rdx] ; blimit

        mov         rdx, arg(4)             ; get thresh

        por         xmm1, xmm6              ; abs(q0-p0)
        movdqa      xmm6, t0                ; get abs (p1 - p0)

        paddusb     xmm1, xmm1              ; abs(q0-p0)*2

        movdqa      xmm3, t1                ; get abs (q1 - q0)

        movdqa      xmm7, XMMWORD PTR [rdx]

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6, xmm7              ; abs(p1 - p0) > thresh

        psubusb     xmm3, xmm7              ; abs(q1 - q0) > thresh
        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
        por         xmm6, xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
        pcmpeqb     xmm6, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4

        pcmpeqb     xmm1, xmm0
        pxor        xmm4, xmm6
%endmacro
%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro
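
; BV_WRITEBACK scatters the re-transposed columns back into the image: the
; +2 byte offset skips p3/p2 of each 8-pixel row (rsi points at the row
; start, src - 4), so each movd stores the filtered p1 p0 q0 q1.  %1 holds
; four rows starting at rsi, %2 the following four.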
%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        psrldq      %1, 4

        movd        [rdi+2], %1
        psrldq      %1, 4

        movd        [rsi+2*rax+2], %1
        psrldq      %1, 4

        movd        [rdi+2*rax+2], %1

        movd        [rsi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rdi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rsi+2*rcx+2], %2
        psrldq      %2, 4

        movd        [rdi+2*rcx+2], %2
%endmacro
;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 1, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2
        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE
        ; store 16-line result

        lea         rdx, [rax]
        neg         rdx

        BV_WRITEBACK xmm1, xmm5

        lea         rsi, [rsi+rdx*8]
        lea         rdi, [rdi+rdx*8]
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        lea         rdx, srct

        ; transpose 16x8 to 8x16, and store the 8-line result on stack.
        TRANSPOSE_16X8 0, 1

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 1

        ; start work on filters
        B_FILTER 2
        ; transpose and write back - only work on q1, q0, p0, p1
        BV_TRANSPOSE

        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing

        ; store 16-line result
        BV_WRITEBACK xmm1, xmm5

        mov         rsi, arg(0)             ; u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]             ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm7              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm7              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]          ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]          ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]          ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm2              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]          ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
%macro MBV_WRITEBACK_1 0
        movq        QWORD PTR [rsi], xmm0
        movhps      MMWORD PTR [rdi], xmm0

        movq        QWORD PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5, [rdx+112]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0, xmm2
        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        QWORD PTR [rsi], xmm1
        movhps      MMWORD PTR [rdi], xmm1

        movq        QWORD PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm2              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
        movq        QWORD PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4
%endmacro
;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)             ; src_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax*2+rax]

        ; Transpose
        TRANSPOSE_16X8 1, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        neg         rax
        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]

        ; transpose and write back
        MBV_TRANSPOSE

        neg         rax

        MBV_WRITEBACK_1

        lea         rsi, [rsi+rax*8]
        lea         rdi, [rdi+rax*8]
        MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]  ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16] ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];

        mov         rsi, arg(0)             ; u_ptr
        movsxd      rax, dword ptr arg(1)   ; src_pixel_step

        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
        lea         rcx, [rax+2*rax]

        lea         rdx, srct

        ; Transpose
        TRANSPOSE_16X8 0, 0

        ; calculate filter mask and high edge variance
        LFV_FILTER_MASK_HEV_MASK 0

        ; start work on filters
        MB_FILTER_AND_WRITEBACK 2

        ; transpose and write back
        MBV_TRANSPOSE

        mov         rsi, arg(0)             ;u_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_1

        mov         rsi, arg(5)             ;v_ptr
        lea         rsi, [rsi - 4]
        lea         rdi, [rsi + rax]
        MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
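
; The "simple" loop filter below adjusts only p0/q0 and uses a single
; blimit threshold.  Sketch of the arithmetic, mirroring the comments in
; the body (clamp() is signed-byte saturation):
;
;   mask = abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
;   F    = clamp(3 * (q0 - p0) + (p1 - q1)) & mask
;   q0   = clamp(q0 - (clamp(F + 4) >> 3))
;   p0   = clamp(p0 + (clamp(F + 3) >> 3))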
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?
        mov         rdx, arg(2)             ;blimit
        movdqa      xmm3, XMMWORD PTR [rdx]

        mov         rdi, rsi                ; rdi points to row +1 for indirect addressing
        add         rdi, rax
        neg         rax

        ; calculate mask
        movdqa      xmm1, [rsi+2*rax]       ; p1
        movdqa      xmm0, [rdi]             ; q1
        movdqa      xmm2, xmm1
        movdqa      xmm7, xmm0
        movdqa      xmm4, xmm0
        psubusb     xmm0, xmm1              ; q1-=p1
        psubusb     xmm1, xmm4              ; p1-=q1
        por         xmm1, xmm0              ; abs(p1-q1)
        pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm1, 1                 ; abs(p1-q1)/2

        movdqa      xmm5, [rsi+rax]         ; p0
        movdqa      xmm4, [rsi]             ; q0
        movdqa      xmm0, xmm4              ; q0
        movdqa      xmm6, xmm5              ; p0
        psubusb     xmm5, xmm4              ; p0-=q0
        psubusb     xmm4, xmm6              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pxor        xmm3, xmm3
        pcmpeqb     xmm5, xmm3

        ; start work on filters
        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        psubsb      xmm2, xmm7              ; p1 - q1

        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0
        paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
        pand        xmm5, xmm2              ; mask filter values we don't care about

        ; do + 4 side
        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4
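
        ; SSE2 has no per-byte arithmetic shift, so the F >> 3 below is
        ; synthesized from word shifts: one copy moves each low byte into
        ; the high half (psllw 8), sign-shifts (psraw 3) and moves it back
        ; (psrlw 8); the other copy handles the high bytes with psraw 11 /
        ; psllw 8, and por merges the two halves.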
        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        movdqa      xmm1, xmm5              ; get a copy of filters
        psraw       xmm1, 11                ; arithmetic shift right 11
        psllw       xmm1, 8                 ; shift left 8 to put it back

        por         xmm0, xmm1              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0-= q0 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset
        movdqa      [rsi], xmm3             ; write back

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11
        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset
        movdqa      [rsi+rax], xmm6         ; write back

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                         ; save old base pointer value.
    mov         rbp, rsp                    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx                         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];
        mov         rsi, arg(0)             ;src_ptr
        movsxd      rax, dword ptr arg(1)   ;src_pixel_step     ; destination pitch?

        lea         rsi, [rsi - 2]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm0, [rsi]             ; (high 96 bits unused) 03 02 01 00
        movd        xmm1, [rdx]             ; (high 96 bits unused) 43 42 41 40
        movd        xmm2, [rdi]             ; 13 12 11 10
        movd        xmm3, [rcx]             ; 53 52 51 50
        punpckldq   xmm0, xmm1              ; (high 64 bits unused) 43 42 41 40 03 02 01 00
        punpckldq   xmm2, xmm3              ; 53 52 51 50 13 12 11 10

        movd        xmm4, [rsi + rax*2]     ; 23 22 21 20
        movd        xmm5, [rdx + rax*2]     ; 63 62 61 60
        movd        xmm6, [rdi + rax*2]     ; 33 32 31 30
        movd        xmm7, [rcx + rax*2]     ; 73 72 71 70
        punpckldq   xmm4, xmm5              ; 63 62 61 60 23 22 21 20
        punpckldq   xmm6, xmm7              ; 73 72 71 70 33 32 31 30

        punpcklbw   xmm0, xmm2              ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
        punpcklbw   xmm4, xmm6              ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

        movdqa      xmm1, xmm0
        punpcklwd   xmm0, xmm4              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm1, xmm4              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        movdqa      xmm2, xmm0
        punpckldq   xmm0, xmm1              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        punpckhdq   xmm2, xmm1              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        movdqa      t0, xmm0                ; save to t0
        movdqa      t1, xmm2                ; save to t1

        lea         rsi, [rsi + rax*8]
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        xmm4, [rsi]             ; 83 82 81 80
        movd        xmm1, [rdx]             ; c3 c2 c1 c0
        movd        xmm6, [rdi]             ; 93 92 91 90
        movd        xmm3, [rcx]             ; d3 d2 d1 d0
        punpckldq   xmm4, xmm1              ; c3 c2 c1 c0 83 82 81 80
        punpckldq   xmm6, xmm3              ; d3 d2 d1 d0 93 92 91 90

        movd        xmm0, [rsi + rax*2]     ; a3 a2 a1 a0
        movd        xmm5, [rdx + rax*2]     ; e3 e2 e1 e0
        movd        xmm2, [rdi + rax*2]     ; b3 b2 b1 b0
        movd        xmm7, [rcx + rax*2]     ; f3 f2 f1 f0
        punpckldq   xmm0, xmm5              ; e3 e2 e1 e0 a3 a2 a1 a0
        punpckldq   xmm2, xmm7              ; f3 f2 f1 f0 b3 b2 b1 b0

        punpcklbw   xmm4, xmm6              ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
        punpcklbw   xmm0, xmm2              ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

        movdqa      xmm1, xmm4
        punpcklwd   xmm4, xmm0              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
        punpckhwd   xmm1, xmm0              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

        movdqa      xmm6, xmm4
        punpckldq   xmm4, xmm1              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
        punpckhdq   xmm6, xmm1              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

        movdqa      xmm0, t0                ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
        movdqa      xmm2, t1                ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        movdqa      xmm1, xmm0
        movdqa      xmm3, xmm2

        punpcklqdq  xmm0, xmm4              ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        punpckhqdq  xmm1, xmm4              ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        punpcklqdq  xmm2, xmm6              ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm6              ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        ; calculate mask
        movdqa      xmm6, xmm0              ; p1
        movdqa      xmm7, xmm3              ; q1
        psubusb     xmm7, xmm0              ; q1-=p1
        psubusb     xmm6, xmm3              ; p1-=q1
        por         xmm6, xmm7              ; abs(p1-q1)
        pand        xmm6, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psrlw       xmm6, 1                 ; abs(p1-q1)/2

        movdqa      xmm5, xmm1              ; p0
        movdqa      xmm4, xmm2              ; q0
        psubusb     xmm5, xmm2              ; p0-=q0
        psubusb     xmm4, xmm1              ; q0-=p0
        por         xmm5, xmm4              ; abs(p0 - q0)
        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddusb     xmm5, xmm6              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        mov         rdx, arg(2)             ;blimit
        movdqa      xmm7, XMMWORD PTR [rdx]

        psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
        pxor        xmm7, xmm7
        pcmpeqb     xmm5, xmm7              ; xmm5 = mask

        ; start work on filters
        movdqa      t0, xmm0
        movdqa      t1, xmm3

        pxor        xmm0, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm3, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm0, xmm3              ; p1 - q1
        movdqa      xmm6, xmm1              ; p0

        movdqa      xmm7, xmm2              ; q0
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pxor        xmm7, [GLOBAL(t80)]     ; offset to convert to signed values
        movdqa      xmm3, xmm7              ; offset q0
        psubsb      xmm7, xmm6              ; q0 - p0
        paddsb      xmm0, xmm7              ; p1 - q1 + 1 * (q0 - p0)

        paddsb      xmm0, xmm7              ; p1 - q1 + 2 * (q0 - p0)
        paddsb      xmm0, xmm7              ; p1 - q1 + 3 * (q0 - p0)

        pand        xmm5, xmm0              ; mask filter values we don't care about

        paddsb      xmm5, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

        movdqa      xmm0, xmm5              ; get a copy of filters
        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8

        movdqa      xmm7, xmm5              ; get a copy of filters
        psraw       xmm7, 11                ; arithmetic shift right 11

        psllw       xmm7, 8                 ; shift left 8 to put it back
        por         xmm0, xmm7              ; put the two together to get result

        psubsb      xmm3, xmm0              ; q0-= q0sz add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset q0

        ; now do +3 side
        psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
        movdqa      xmm0, xmm5              ; get a copy of filters

        psllw       xmm0, 8                 ; shift left 8
        psraw       xmm0, 3                 ; arithmetic shift right 3
        psrlw       xmm0, 8
        psraw       xmm5, 11                ; arithmetic shift right 11

        psllw       xmm5, 8                 ; shift left 8 to put it back
        por         xmm0, xmm5              ; put the two together to get result

        paddsb      xmm6, xmm0              ; p0+= p0 add
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset p0

        movdqa      xmm0, t0                ; p1
        movdqa      xmm4, t1                ; q1

        ; transpose back to write out
        ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
        ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, xmm6              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm6              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm5, xmm3
        punpcklbw   xmm3, xmm4              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm5, xmm4              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm2, xmm0
        punpcklwd   xmm0, xmm3              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
        punpckhwd   xmm2, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

        movdqa      xmm3, xmm1
        punpcklwd   xmm1, xmm5              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm3, xmm5              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        ; write out order: xmm0 xmm2 xmm1 xmm3
        lea         rdx, [rsi + rax*4]

        movd        [rsi], xmm1             ; write the second 8-line result
        psrldq      xmm1, 4
        movd        [rdi], xmm1
        psrldq      xmm1, 4
        movd        [rsi + rax*2], xmm1
        psrldq      xmm1, 4
        movd        [rdi + rax*2], xmm1

        movd        [rdx], xmm3
        psrldq      xmm3, 4
        movd        [rcx], xmm3
        psrldq      xmm3, 4
        movd        [rdx + rax*2], xmm3
        psrldq      xmm3, 4
        movd        [rcx + rax*2], xmm3

        neg         rax
        lea         rsi, [rsi + rax*8]
        neg         rax
        lea         rdi, [rsi + rax]
        lea         rdx, [rsi + rax*4]
        lea         rcx, [rdx + rax]

        movd        [rsi], xmm0             ; write the first 8-line result
        psrldq      xmm0, 4
        movd        [rdi], xmm0
        psrldq      xmm0, 4
        movd        [rsi + rax*2], xmm0
        psrldq      xmm0, 4
        movd        [rdi + rax*2], xmm0

        movd        [rdx], xmm2
        psrldq      xmm2, 4
        movd        [rcx], xmm2
        psrldq      xmm2, 4
        movd        [rdx + rax*2], xmm2
        psrldq      xmm2, 4
        movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret
SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f