Merge "vp8_rd_pick_best_mbsegmentation code restructure"
[libvpx.git] / vp8 / common / x86 / loopfilter_sse2.asm
blob849133dc42ae1b5aee3214ff8cab46ddab9c18e3
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
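
; In scalar terms the mask built below is roughly (a sketch, not code from
; this file):
;
;     m = max(abs(p3-p2), abs(p2-p1), abs(p1-p0),
;             abs(q1-q0), abs(q2-q1), abs(q3-q2));
;     filter this edge only if m <= limit
;
; Folding every difference into one running byte-wise maximum with pmaxub
; lets a single saturating subtract against the limit decide the whole
; mask, instead of one psubusb/por pair per difference.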

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2, [rdi+2*rax]       ; q3
        movdqa      xmm1, [rsi+2*rax]       ; q2
        movdqa      xmm4, [rsi+rax]         ; q1
        movdqa      xmm5, [rsi]             ; q0
        neg         rax                     ; negate pitch to deal with above border
%else
        movlps      xmm2, [rsi + rcx*2]     ; q3
        movlps      xmm1, [rsi + rcx]       ; q2
        movlps      xmm4, [rsi]             ; q1
        movlps      xmm5, [rsi + rax]       ; q0

        movhps      xmm2, [rdi + rcx*2]
        movhps      xmm1, [rdi + rcx]
        movhps      xmm4, [rdi]
        movhps      xmm5, [rdi + rax]

        lea         rsi, [rsi + rax*4]
        lea         rdi, [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp], xmm1         ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4    ; store q1
%endif

        movdqa      xmm6, xmm1              ; q2
        movdqa      xmm3, xmm4              ; q1

        psubusb     xmm1, xmm2              ; q2-=q3
        psubusb     xmm2, xmm6              ; q3-=q2

        psubusb     xmm4, xmm6              ; q1-=q2
        psubusb     xmm6, xmm3              ; q2-=q1

        por         xmm4, xmm6              ; abs(q2-q1)
        por         xmm1, xmm2              ; abs(q3-q2)

        movdqa      xmm0, xmm5              ; q0
        pmaxub      xmm1, xmm4

        psubusb     xmm5, xmm3              ; q0-=q1
        psubusb     xmm3, xmm0              ; q1-=q0

        por         xmm5, xmm3              ; abs(q0-q1)
        movdqa      t0, xmm5                ; save to t0

        pmaxub      xmm1, xmm5

%if %1
        movdqa      xmm2, [rsi+4*rax]       ; p3
        movdqa      xmm4, [rdi+4*rax]       ; p2
        movdqa      xmm6, [rsi+2*rax]       ; p1
%else
        movlps      xmm2, [rsi + rax]       ; p3
        movlps      xmm4, [rsi]             ; p2
        movlps      xmm6, [rsi + rcx]       ; p1

        movhps      xmm2, [rdi + rax]
        movhps      xmm4, [rdi]
        movhps      xmm6, [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4    ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6    ; store p1
%endif

        movdqa      xmm5, xmm4              ; p2
        movdqa      xmm3, xmm6              ; p1

        psubusb     xmm4, xmm2              ; p2-=p3
        psubusb     xmm2, xmm5              ; p3-=p2

        psubusb     xmm3, xmm5              ; p1-=p2
        pmaxub      xmm1, xmm4              ; abs(p3 - p2)

        psubusb     xmm5, xmm6              ; p2-=p1
        pmaxub      xmm1, xmm2              ; abs(p3 - p2)

        pmaxub      xmm1, xmm5              ; abs(p2 - p1)
        movdqa      xmm2, xmm6              ; p1

        pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4, [rsi+rax]         ; p0
        movdqa      xmm3, [rdi]             ; q1
%else
        movlps      xmm4, [rsi + rcx*2]     ; p0
        movhps      xmm4, [rdi + rcx*2]
        movdqa      xmm3, q1                ; q1
%endif

        movdqa      xmm5, xmm4              ; p0
        psubusb     xmm4, xmm6              ; p0-=p1

        psubusb     xmm6, xmm5              ; p1-=p0

        por         xmm6, xmm4              ; abs(p1 - p0)
        mov         rdx, arg(2)             ; get flimit

        movdqa      t1, xmm6                ; save to t1

        movdqa      xmm4, xmm3              ; q1
        pmaxub      xmm1, xmm6

        psubusb     xmm3, xmm2              ; q1-=p1
        psubusb     xmm2, xmm4              ; p1-=q1

        psubusb     xmm1, xmm7
        por         xmm2, xmm3              ; abs(p1-q1)

        movdqa      xmm4, XMMWORD PTR [rdx] ; flimit

        movdqa      xmm3, xmm0              ; q0
        pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx, arg(4)             ; hev get thresh

        movdqa      xmm6, xmm5              ; p0
        psrlw       xmm2, 1                 ; abs(p1-q1)/2

        psubusb     xmm5, xmm3              ; p0-=q0
        paddb       xmm4, xmm4              ; flimit*2 (less than 255)

        psubusb     xmm3, xmm6              ; q0-=p0
        por         xmm5, xmm3              ; abs(p0 - q0)

        paddusb     xmm5, xmm5              ; abs(p0-q0)*2
        paddb       xmm7, xmm4              ; flimit * 2 + limit (less than 255)

        movdqa      xmm4, t0                ; hev get abs (q1 - q0)

        movdqa      xmm3, t1                ; get abs (p1 - p0)

        paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2, XMMWORD PTR [rdx] ; hev

        psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        psubusb     xmm4, xmm2              ; hev

        psubusb     xmm3, xmm2              ; hev
        por         xmm1, xmm5

        pxor        xmm7, xmm7
        paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4, xmm5              ; hev
        pcmpeqb     xmm3, xmm3              ; hev

        pcmpeqb     xmm1, xmm7              ; mask xmm1
        pxor        xmm4, xmm3              ; hev
%endmacro
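
; For reference, the macro above corresponds roughly to this scalar decision
; (a sketch of the VP8 filter-mask logic, not code from this file; blimit
; stands for flimit*2 + limit):
;
;     mask = (abs(p3-p2) <= limit) && (abs(p2-p1) <= limit)
;         && (abs(p1-p0) <= limit) && (abs(q1-q0) <= limit)
;         && (abs(q2-q1) <= limit) && (abs(q3-q2) <= limit)
;         && (abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit);
;     hev  = (abs(p1-p0) > thresh) || (abs(q1-q0) > thresh);
;
; On exit each byte of xmm1 is 0xff where the edge should be filtered, and
; each byte of xmm4 is 0xff where the high-edge-variance path applies.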

%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx]             ; p1
        movdqa      xmm7, [rdx+48]          ; q1
        movdqa      xmm6, [rdx+16]          ; p0
        movdqa      xmm0, [rdx+32]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3, xmm0              ; q0
        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1

        paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5, xmm2              ; axbxcxdx
        punpcklbw   xmm2, xmm2              ; exfxgxhx

        punpcklbw   xmm0, xmm1              ; exfxgxhx
        psraw       xmm5, 11                ; sign extended shift right by 3

        punpckhbw   xmm1, xmm1              ; axbxcxdx
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0, 11                ; sign extended shift right by 3

        psraw       xmm1, 11                ; sign extended shift right by 3
        movdqa      xmm5, xmm0              ; save results

        packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5, [GLOBAL(ones)]

        paddsw      xmm1, [GLOBAL(ones)]
        psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
        paddsb      xmm6, xmm2              ; p0+= p0 add

        packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1, p1                ; p1
%elif %1 == 1
        movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1, [rdx]             ; p1
%endif
        pandn       xmm4, xmm5              ; high edge variance additive
        pxor        xmm6, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3, xmm0              ; q0-= q0 add

        paddsb      xmm1, xmm4              ; p1+= p1 add
        pxor        xmm3, [GLOBAL(t80)]     ; unoffset

        pxor        xmm1, [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7, xmm4              ; q1-= q1 add

        pxor        xmm7, [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi, [rsi + rcx*2]
        lea         rdi, [rdi + rcx*2]
        movq        MMWORD PTR [rsi], xmm6              ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rax], xmm1        ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3        ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2], xmm7      ; q1
        movhps      MMWORD PTR [rdi + rcx*2], xmm7
%elif %1 == 1
        movdqa      [rsi+rax], xmm6         ; write back
        movdqa      [rsi+2*rax], xmm1       ; write back
        movdqa      [rsi], xmm3             ; write back
        movdqa      [rdi], xmm7             ; write back
%endif

%endmacro
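
; The filter arithmetic above is, in rough scalar form (a sketch, not code
; from this file):
;
;     F  = clamp(3*(q0-p0) + (hev ? p1-q1 : 0)) & mask;   signed-char clamp
;     F1 = clamp(F + 4) >> 3;    q0 -= F1
;     F2 = clamp(F + 3) >> 3;    p0 += F2
;     u  = (F1 + 1) >> 1;        p1 += u, q1 -= u   (only where !hev)
;
; Bytes are widened to words by interleaving each register with itself so
; psraw can do the sign-extended >>3, then packsswb re-narrows the lanes.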

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the result
    B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2)
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ; u
    mov         rdi, arg(5)                 ; v
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step
    mov         rcx, rax
    neg         rax                         ; negate pitch to deal with above border

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rsi, [rsi + rcx]
    lea         rdi, [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the result
    B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MB_FILTER_AND_WRITEBACK 1
%if %1 == 0
        movdqa      xmm2, p1                ; p1
        movdqa      xmm7, q1                ; q1
%elif %1 == 1
        movdqa      xmm2, [rsi+2*rax]       ; p1
        movdqa      xmm7, [rdi]             ; q1

        mov         rcx, rax
        neg         rcx
%elif %1 == 2
        lea         rdx, srct

        movdqa      xmm2, [rdx+32]          ; p1
        movdqa      xmm7, [rdx+80]          ; q1
        movdqa      xmm6, [rdx+48]          ; p0
        movdqa      xmm0, [rdx+64]          ; q0
%endif

        pxor        xmm2, [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7, [GLOBAL(t80)]     ; q1 offset to convert to signed values
        pxor        xmm6, [GLOBAL(t80)]     ; offset to convert to signed values
        pxor        xmm0, [GLOBAL(t80)]     ; offset to convert to signed values

        psubsb      xmm2, xmm7              ; p1 - q1
        movdqa      xmm3, xmm0              ; q0

        psubsb      xmm0, xmm6              ; q0 - p0

        paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + (p1 - q1)

        paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)

        pand        xmm1, xmm2              ; mask filter values we don't care about

        movdqa      xmm2, xmm1              ; vp8_filter

        pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
        pxor        xmm0, xmm0

        pandn       xmm4, xmm1              ; vp8_filter&=~hev
        pxor        xmm1, xmm1

        punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
        movdqa      xmm5, xmm2

        punpckhbw   xmm1, xmm4              ; Filter 2 (lo)
        paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)

        pmulhw      xmm1, [GLOBAL(s9)]      ; Filter 2 (lo) * 9

        pmulhw      xmm0, [GLOBAL(s9)]      ; Filter 2 (hi) * 9

        punpckhbw   xmm7, xmm5              ; axbxcxdx
        paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

        punpcklbw   xmm5, xmm5              ; exfxgxhx
        psraw       xmm7, 11                ; sign extended shift right by 3

        psraw       xmm5, 11                ; sign extended shift right by 3
        punpckhbw   xmm4, xmm2              ; axbxcxdx

        punpcklbw   xmm2, xmm2              ; exfxgxhx
        psraw       xmm4, 11                ; sign extended shift right by 3

        packsswb    xmm5, xmm7              ; Filter2 >>=3;
        psraw       xmm2, 11                ; sign extended shift right by 3

        packsswb    xmm2, xmm4              ; Filter1 >>=3;
        movdqa      xmm7, xmm1

        paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2
        movdqa      xmm4, xmm1

        psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
        movdqa      xmm5, xmm0

        movdqa      xmm2, xmm5
        paddw       xmm0, [GLOBAL(s63)]     ; Filter 2 (hi) * 9 + 63

        paddw       xmm1, [GLOBAL(s63)]     ; Filter 2 (lo) * 9 + 63
        paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

        paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
        paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

        paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
        paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63

        paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
        psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

        psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
        psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

        packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
        psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7

        psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
        packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)

        psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

        packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)

        psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
        paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)

%if %1 == 0
        movdqa      xmm5, q2                ; q2
        movdqa      xmm1, q1                ; q1
        movdqa      xmm4, p1                ; p1
        movdqa      xmm7, p2                ; p2

%elif %1 == 1
        movdqa      xmm5, XMMWORD PTR [rdi+rcx]     ; q2
        movdqa      xmm1, XMMWORD PTR [rdi]         ; q1
        movdqa      xmm4, XMMWORD PTR [rsi+rax*2]   ; p1
        movdqa      xmm7, XMMWORD PTR [rdi+rax*4]   ; p2
%elif %1 == 2
        movdqa      xmm5, XMMWORD PTR [rdx+96]      ; q2
        movdqa      xmm1, XMMWORD PTR [rdx+80]      ; q1
        movdqa      xmm4, XMMWORD PTR [rdx+32]      ; p1
        movdqa      xmm7, XMMWORD PTR [rdx+16]      ; p2
%endif

        pxor        xmm3, [GLOBAL(t80)]     ; *oq0 = sq^0x80
        pxor        xmm6, [GLOBAL(t80)]     ; *op0 = sp^0x80

        pxor        xmm1, [GLOBAL(t80)]
        pxor        xmm4, [GLOBAL(t80)]

        psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
        paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)

        pxor        xmm1, [GLOBAL(t80)]     ; *oq1 = sq^0x80;
        pxor        xmm4, [GLOBAL(t80)]     ; *op1 = sp^0x80;

        pxor        xmm7, [GLOBAL(t80)]
        pxor        xmm5, [GLOBAL(t80)]

        paddsb      xmm7, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u1)
        psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u1)

        pxor        xmm7, [GLOBAL(t80)]     ; *op2 = sp^0x80;
        pxor        xmm5, [GLOBAL(t80)]     ; *oq2 = sq^0x80;

%if %1 == 0
        lea         rsi, [rsi+rcx*2]
        lea         rdi, [rdi+rcx*2]

        movq        MMWORD PTR [rsi], xmm6          ; p0
        movhps      MMWORD PTR [rdi], xmm6
        movq        MMWORD PTR [rsi + rcx], xmm3    ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3

        movq        MMWORD PTR [rsi+rcx*2], xmm1    ; q1
        movhps      MMWORD PTR [rdi+rcx*2], xmm1

        movq        MMWORD PTR [rsi + rax], xmm4    ; p1
        movhps      MMWORD PTR [rdi + rax], xmm4

        movq        MMWORD PTR [rsi+rax*2], xmm7    ; p2
        movhps      MMWORD PTR [rdi+rax*2], xmm7

        lea         rsi, [rsi + rcx]
        lea         rdi, [rdi + rcx]
        movq        MMWORD PTR [rsi+rcx*2], xmm5    ; q2
        movhps      MMWORD PTR [rdi+rcx*2], xmm5
%elif %1 == 1
        movdqa      XMMWORD PTR [rdi+rcx], xmm5     ; q2
        movdqa      XMMWORD PTR [rdi], xmm1         ; q1
        movdqa      XMMWORD PTR [rsi], xmm3         ; q0
        movdqa      XMMWORD PTR [rsi+rax], xmm6     ; p0
        movdqa      XMMWORD PTR [rsi+rax*2], xmm4   ; p1
        movdqa      XMMWORD PTR [rdi+rax*4], xmm7   ; p2
%elif %1 == 2
        movdqa      XMMWORD PTR [rdx+80], xmm1      ; q1
        movdqa      XMMWORD PTR [rdx+64], xmm3      ; q0
        movdqa      XMMWORD PTR [rdx+48], xmm6      ; p0
        movdqa      XMMWORD PTR [rdx+32], xmm4      ; p1
%endif

%endmacro
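
; In scalar form the macroblock filter above is roughly (a sketch, not code
; from this file):
;
;     F  = clamp(3*(q0-p0) + (p1-q1)) & mask;
;     Fh = F & hev;
;     Filter1 = clamp(Fh + 4) >> 3;    q0 -= Filter1
;     Filter2 = clamp(Fh + 3) >> 3;    p0 += Filter2
;     F &= ~hev;                       wide taps only where !hev
;     u3 = clamp((27*F + 63) >> 7);    p0 += u3, q0 -= u3
;     u2 = clamp((18*F + 63) >> 7);    p1 += u2, q1 -= u2
;     u1 = clamp(( 9*F + 63) >> 7);    p2 += u1, q2 -= u1
;
; The 9/18/27 multiples come from one pmulhw against s9 (9 << 8, so the
; high half of the product is 9*F) plus word additions, avoiding three
; separate multiplies.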

;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rdi, [rsi+rax]              ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the results
    MB_FILTER_AND_WRITEBACK 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2)
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define q2 [rsp + 0]    ;__declspec(align(16)) char q2[16];
    %define q1 [rsp + 16]   ;__declspec(align(16)) char q1[16];
    %define p2 [rsp + 32]   ;__declspec(align(16)) char p2[16];
    %define p1 [rsp + 48]   ;__declspec(align(16)) char p1[16];
    %define t0 [rsp + 64]   ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 80]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ; u
    mov         rdi, arg(5)                 ; v
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step
    mov         rcx, rax
    neg         rax                         ; negate pitch to deal with above border

    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    lea         rsi, [rsi + rcx]
    lea         rdi, [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the results
    MB_FILTER_AND_WRITEBACK 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro TRANSPOSE_16X8 2
        movq        xmm4, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
        movq        xmm1, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
        movq        xmm7, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
        movq        xmm5, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
        movq        xmm2, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

        punpcklbw   xmm4, xmm1                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

        movq        xmm1, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
        punpcklbw   xmm0, xmm7                      ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

        movq        xmm7, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

        punpcklbw   xmm5, xmm2                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
        lea         rsi, [rsi+rax*8]
%else
        mov         rsi, arg(5)                     ; v_ptr
%endif

        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
        punpcklbw   xmm7, xmm1                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
        lea         rdi, [rdi+rax*8]
%else
        lea         rsi, [rsi - 4]
%endif

        punpcklwd   xmm3, xmm0                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
%if %1
        lea         rdx, srct
%else
        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
%endif

        movdqa      xmm2, xmm3                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
        punpckhwd   xmm4, xmm0                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

        movdqa      xmm7, xmm4                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
        punpckhdq   xmm3, xmm5                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

        punpckhdq   xmm7, xmm6                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpckldq   xmm4, xmm6                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

        punpckldq   xmm2, xmm5                      ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

        movdqa      t0, xmm2                        ; save to free XMM2
        movq        xmm2, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
        movq        xmm6, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
        movq        xmm5, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
        movq        xmm1, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

        punpcklbw   xmm2, xmm6                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        movq        xmm6, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

        movq        xmm5, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

        punpcklbw   xmm1, xmm6                      ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

        movq        xmm6, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

        movdqa      xmm6, xmm1
        punpckhwd   xmm6, xmm5                      ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

        punpcklwd   xmm1, xmm5                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
        movdqa      xmm5, xmm2                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

        punpcklwd   xmm5, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

        punpckhwd   xmm2, xmm0                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        movdqa      xmm0, xmm5
        punpckldq   xmm0, xmm1                      ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

        punpckhdq   xmm5, xmm1                      ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
        movdqa      xmm1, xmm2                      ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

        punpckldq   xmm1, xmm6                      ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

        punpckhdq   xmm2, xmm6                      ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
        movdqa      xmm6, xmm7                      ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

        punpcklqdq  xmm6, xmm2                      ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

        punpckhqdq  xmm7, xmm2                      ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
%if %2
        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        movdqa      [rdx], xmm2                     ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+16], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+32], xmm4                  ; save 4
        movdqa      [rdx+48], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get saved columns 0 and 1

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
%else
        movdqa      [rdx+112], xmm7                 ; save 7

        movdqa      [rdx+96], xmm6                  ; save 6

        movdqa      xmm2, xmm3                      ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
        punpckhqdq  xmm3, xmm5                      ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

        punpcklqdq  xmm2, xmm5                      ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        movdqa      [rdx+32], xmm2                  ; save 2

        movdqa      xmm5, xmm4                      ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
        punpcklqdq  xmm4, xmm1                      ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04

        movdqa      [rdx+48], xmm3                  ; save 3

        punpckhqdq  xmm5, xmm1                      ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

        movdqa      [rdx+64], xmm4                  ; save 4
        movdqa      [rdx+80], xmm5                  ; save 5
        movdqa      xmm1, t0                        ; get saved columns 0 and 1

        movdqa      xmm2, xmm1
        punpckhqdq  xmm1, xmm0                      ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01

        punpcklqdq  xmm2, xmm0                      ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        movdqa      [rdx+16], xmm1

        movdqa      [rdx], xmm2
%endif
%endmacro
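
; The transpose works by doubling the interleave width at each stage:
; punpcklbw pairs bytes from adjacent rows, punpck[lh]wd pairs those
; 16-bit groups, punpck[lh]dq pairs 32-bit groups, and punpck[lh]qdq
; finally assembles whole 16-byte columns, turning a 16x8 block of rows
; into eight 16-wide column registers (spilled to srct as needed).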

%macro LFV_FILTER_MASK_HEV_MASK 1
        movdqa      xmm0, xmm6              ; q2
        psubusb     xmm0, xmm7              ; q2-q3

        psubusb     xmm7, xmm6              ; q3-q2
        movdqa      xmm4, xmm5              ; q1

        por         xmm7, xmm0              ; abs (q3-q2)
        psubusb     xmm4, xmm6              ; q1-q2

        movdqa      xmm0, xmm1
        psubusb     xmm6, xmm5              ; q2-q1

        por         xmm6, xmm4              ; abs (q2-q1)
        psubusb     xmm0, xmm2              ; p2 - p3;

        psubusb     xmm2, xmm1              ; p3 - p2;
        por         xmm0, xmm2              ; abs(p2-p3)
%if %1
        movdqa      xmm2, [rdx]             ; p1
%else
        movdqa      xmm2, [rdx+32]          ; p1
%endif
        movdqa      xmm5, xmm2              ; p1
        pmaxub      xmm0, xmm7

        psubusb     xmm5, xmm1              ; p1-p2
        psubusb     xmm1, xmm2              ; p2-p1

        movdqa      xmm7, xmm3              ; p0
        psubusb     xmm7, xmm2              ; p0-p1

        por         xmm1, xmm5              ; abs(p2-p1)
        pmaxub      xmm0, xmm6

        pmaxub      xmm0, xmm1
        movdqa      xmm1, xmm2              ; p1

        psubusb     xmm2, xmm3              ; p1-p0
        lea         rdx, srct

        por         xmm2, xmm7              ; abs(p1-p0)

        movdqa      t0, xmm2                ; save abs(p1-p0)

        pmaxub      xmm0, xmm2

%if %1
        movdqa      xmm5, [rdx+32]          ; q0
        movdqa      xmm7, [rdx+48]          ; q1
%else
        movdqa      xmm5, [rdx+64]          ; q0
        movdqa      xmm7, [rdx+80]          ; q1
%endif
        mov         rdx, arg(3)             ; limit

        movdqa      xmm6, xmm5              ; q0
        movdqa      xmm2, xmm7              ; q1

        psubusb     xmm5, xmm7              ; q0-q1
        psubusb     xmm7, xmm6              ; q1-q0

        por         xmm7, xmm5              ; abs(q1-q0)

        movdqa      t1, xmm7                ; save abs(q1-q0)

        movdqa      xmm4, XMMWORD PTR [rdx] ; limit

        pmaxub      xmm0, xmm7
        mov         rdx, arg(2)             ; flimit

        psubusb     xmm0, xmm4
        movdqa      xmm5, xmm2              ; q1

        psubusb     xmm5, xmm1              ; q1-=p1
        psubusb     xmm1, xmm2              ; p1-=q1

        por         xmm5, xmm1              ; abs(p1-q1)
        movdqa      xmm1, xmm3              ; p0

        pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
        psubusb     xmm1, xmm6              ; p0-q0

        psrlw       xmm5, 1                 ; abs(p1-q1)/2
        psubusb     xmm6, xmm3              ; q0-p0

        movdqa      xmm2, XMMWORD PTR [rdx] ; flimit

        mov         rdx, arg(4)             ; get thresh

        por         xmm1, xmm6              ; abs(q0-p0)
        paddb       xmm2, xmm2              ; flimit*2 (less than 255)

        movdqa      xmm6, t0                ; get abs (p1 - p0)

        paddusb     xmm1, xmm1              ; abs(q0-p0)*2

        movdqa      xmm3, t1                ; get abs (q1 - q0)

        movdqa      xmm7, XMMWORD PTR [rdx]

        paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
        psubusb     xmm6, xmm7              ; abs(p1 - p0) > thresh

        paddb       xmm4, xmm2              ; flimit * 2 + limit (less than 255)
        psubusb     xmm3, xmm7              ; abs(q1 - q0) > thresh

        psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
        por         xmm6, xmm3              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        por         xmm1, xmm0              ; mask
        pcmpeqb     xmm6, xmm0

        pxor        xmm0, xmm0
        pcmpeqb     xmm4, xmm4

        pcmpeqb     xmm1, xmm0
        pxor        xmm4, xmm6
%endmacro
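
; This is the same mask/hev decision as LFH_FILTER_AND_HEV_MASK, but applied
; to the output of TRANSPOSE_16X8: p3..q3 arrive as 16-pixel columns (in
; registers and the srct scratch area), so one pass covers a 16-row
; vertical edge.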

%macro BV_TRANSPOSE 0
        ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
        ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
        movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

        movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

        punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

        punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
        ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
        ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
        ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
        ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
        movd        [rsi+2], %1
        psrldq      %1, 4

        movd        [rdi+2], %1
        psrldq      %1, 4

        movd        [rsi+2*rax+2], %1
        psrldq      %1, 4

        movd        [rdi+2*rax+2], %1

        movd        [rsi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rdi+4*rax+2], %2
        psrldq      %2, 4

        movd        [rsi+2*rcx+2], %2
        psrldq      %2, 4

        movd        [rdi+2*rcx+2], %2
%endmacro
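
; Only p1, p0, q0 and q1 change in the normal filter, so each row writes
; back just the middle four bytes of its transposed column; the +2 in every
; address skips the untouched p3/p2 bytes of the 8-pixel line that was read
; starting at src_ptr - 4.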

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[64];

    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]

    ;transpose 16x8 to 8x16, and store the 8-line result on stack.
    TRANSPOSE_16X8 1, 1

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 1

    ; start work on filters
    B_FILTER 2

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE
    ; store 16-line result

    lea         rdx, [rax]
    neg         rdx

    BV_WRITEBACK xmm1, xmm5

    lea         rsi, [rsi+rdx*8]
    lea         rdi, [rdi+rdx*8]
    BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2)
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                     ; reserve 96 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[64];

    mov         rsi, arg(0)                 ; u_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]

    lea         rdx, srct

    ;transpose 16x8 to 8x16, and store the 8-line result on stack.
    TRANSPOSE_16X8 0, 1

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 1

    ; start work on filters
    B_FILTER 2

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE

    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing

    ; store 16-line result
    BV_WRITEBACK xmm1, xmm5

    mov         rsi, arg(0)                 ; u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    BV_WRITEBACK xmm2, xmm6

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MBV_TRANSPOSE 0
        movdqa      xmm0, [rdx]             ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
        movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

        punpcklbw   xmm0, xmm7              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpckhbw   xmm1, xmm7              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        movdqa      xmm2, [rdx+32]          ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
        movdqa      xmm6, xmm2              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

        punpcklbw   xmm2, [rdx+48]          ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
        punpckhbw   xmm6, [rdx+48]          ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

        movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
        punpcklwd   xmm0, xmm2              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckhwd   xmm3, xmm2              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

        punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpcklbw   xmm2, [rdx+80]          ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

        movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
        punpcklbw   xmm6, [rdx+112]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

        movdqa      xmm7, xmm2              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
        punpcklwd   xmm2, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

        punpckhwd   xmm7, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
        movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

        punpckldq   xmm0, xmm2              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
        punpckhdq   xmm6, xmm2              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro

%macro MBV_WRITEBACK_1 0
        movq        QWORD PTR [rsi], xmm0
        movhps      MMWORD PTR [rdi], xmm0

        movq        QWORD PTR [rsi+2*rax], xmm6
        movhps      MMWORD PTR [rdi+2*rax], xmm6

        movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
        punpckldq   xmm0, xmm7              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40

        punpckhdq   xmm3, xmm7              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

        movq        QWORD PTR [rsi+4*rax], xmm0
        movhps      MMWORD PTR [rdi+4*rax], xmm0

        movq        QWORD PTR [rsi+2*rcx], xmm3
        movhps      MMWORD PTR [rdi+2*rcx], xmm3

        movdqa      xmm2, [rdx+64]          ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
        punpckhbw   xmm2, [rdx+80]          ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

        punpckhbw   xmm5, [rdx+112]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
        movdqa      xmm0, xmm2

        punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
        punpckhwd   xmm2, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

        movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
        punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80

        punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
        movq        QWORD PTR [rsi], xmm1
        movhps      MMWORD PTR [rdi], xmm1

        movq        QWORD PTR [rsi+2*rax], xmm5
        movhps      MMWORD PTR [rdi+2*rax], xmm5

        movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
        punpckldq   xmm1, xmm2              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
        punpckhdq   xmm4, xmm2              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

        movq        QWORD PTR [rsi+4*rax], xmm1
        movhps      MMWORD PTR [rdi+4*rax], xmm1

        movq        QWORD PTR [rsi+2*rcx], xmm4
        movhps      MMWORD PTR [rdi+2*rcx], xmm4
%endmacro

;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]

    ; Transpose
    TRANSPOSE_16X8 1, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 0

    neg         rax
    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]

    ; transpose and write back
    MBV_TRANSPOSE

    neg         rax

    MBV_WRITEBACK_1

    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]
    MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2)
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 160                    ; reserve 160 bytes
    %define t0   [rsp + 0]   ;__declspec(align(16)) char t0[16];
    %define t1   [rsp + 16]  ;__declspec(align(16)) char t1[16];
    %define srct [rsp + 32]  ;__declspec(align(16)) char srct[128];

    mov         rsi, arg(0)                 ; u_ptr
    movsxd      rax, dword ptr arg(1)       ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]            ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]

    lea         rdx, srct

    ; Transpose
    TRANSPOSE_16X8 0, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK 0

    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    ; transpose and write back
    MBV_TRANSPOSE

    mov         rsi, arg(0)                 ;u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_1

    mov         rsi, arg(5)                 ;v_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_2

    add rsp, 160
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2)
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step
    mov         rdx, arg(2)                 ;flimit
    movdqa      xmm3, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                 ;limit
    movdqa      xmm7, XMMWORD PTR [rdx]

    paddb       xmm3, xmm3                  ; flimit*2 (less than 255)
    paddb       xmm3, xmm7                  ; flimit * 2 + limit (less than 255)

    mov         rdi, rsi                    ; rdi points to row +1 for indirect addressing
    add         rdi, rax
    neg         rax

    ; calculate mask
    movdqu      xmm1, [rsi+2*rax]           ; p1
    movdqu      xmm0, [rdi]                 ; q1
    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm0
    movdqa      xmm4, xmm0
    psubusb     xmm0, xmm1                  ; q1-=p1
    psubusb     xmm1, xmm4                  ; p1-=q1
    por         xmm1, xmm0                  ; abs(p1-q1)
    pand        xmm1, [GLOBAL(tfe)]         ; set lsb of each byte to zero
    psrlw       xmm1, 1                     ; abs(p1-q1)/2

    movdqu      xmm5, [rsi+rax]             ; p0
    movdqu      xmm4, [rsi]                 ; q0
    movdqa      xmm0, xmm4                  ; q0
    movdqa      xmm6, xmm5                  ; p0
    psubusb     xmm5, xmm4                  ; p0-=q0
    psubusb     xmm4, xmm6                  ; q0-=p0
    por         xmm5, xmm4                  ; abs(p0 - q0)
    paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
    paddusb     xmm5, xmm1                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

    psubusb     xmm5, xmm3                  ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        xmm3, xmm3
    pcmpeqb     xmm5, xmm3

    ; start work on filters
    pxor        xmm2, [GLOBAL(t80)]         ; p1 offset to convert to signed values
    pxor        xmm7, [GLOBAL(t80)]         ; q1 offset to convert to signed values
    psubsb      xmm2, xmm7                  ; p1 - q1

    pxor        xmm6, [GLOBAL(t80)]         ; offset to convert to signed values
    pxor        xmm0, [GLOBAL(t80)]         ; offset to convert to signed values
    movdqa      xmm3, xmm0                  ; q0
    psubsb      xmm0, xmm6                  ; q0 - p0
    paddsb      xmm2, xmm0                  ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm2, xmm0                  ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm2, xmm0                  ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5, xmm2                  ; mask filter values we don't care about

    ; do + 4 side
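    ; SSE2 has no per-byte arithmetic shift, so the sequence below emulates
    ; one per 16-bit lane (a sketch of the idea): psllw 8 / psraw 3 / psrlw 8
    ; yields the low byte's signed >>3, psraw 11 / psllw 8 yields the high
    ; byte's, and por merges the two results back into whole bytes.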
    paddsb      xmm5, [GLOBAL(t4)]          ; 3* (q0 - p0) + (p1 - q1) + 4

    movdqa      xmm0, xmm5                  ; get a copy of filters
    psllw       xmm0, 8                     ; shift left 8
    psraw       xmm0, 3                     ; arithmetic shift right 3
    psrlw       xmm0, 8
    movdqa      xmm1, xmm5                  ; get a copy of filters
    psraw       xmm1, 11                    ; arithmetic shift right 11
    psllw       xmm1, 8                     ; shift left 8 to put it back

    por         xmm0, xmm1                  ; put the two together to get result

    psubsb      xmm3, xmm0                  ; q0-= q0 add
    pxor        xmm3, [GLOBAL(t80)]         ; unoffset
    movdqu      [rsi], xmm3                 ; write back

    ; now do +3 side
    psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4

    movdqa      xmm0, xmm5                  ; get a copy of filters
    psllw       xmm0, 8                     ; shift left 8
    psraw       xmm0, 3                     ; arithmetic shift right 3
    psrlw       xmm0, 8
    psraw       xmm5, 11                    ; arithmetic shift right 11
    psllw       xmm5, 8                     ; shift left 8 to put it back
    por         xmm0, xmm5                  ; put the two together to get result

    paddsb      xmm6, xmm0                  ; p0+= p0 add
    pxor        xmm6, [GLOBAL(t80)]         ; unoffset
    movdqu      [rsi+rax], xmm6             ; write back

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *flimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2)
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                         ; save old base pointer value.
    mov         rbp, rsp                    ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx                         ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                     ; reserve 32 bytes
    %define t0 [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1 [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixel_step

    lea         rsi, [rsi - 2]
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movdqu      xmm0, [rsi]                 ; (high 96 bits unused) 03 02 01 00
    movdqu      xmm1, [rdx]                 ; (high 96 bits unused) 43 42 41 40
    movdqu      xmm2, [rdi]                 ; 13 12 11 10
    movdqu      xmm3, [rcx]                 ; 53 52 51 50
    punpckldq   xmm0, xmm1                  ; (high 64 bits unused) 43 42 41 40 03 02 01 00
    punpckldq   xmm2, xmm3                  ; 53 52 51 50 13 12 11 10

    movdqu      xmm4, [rsi + rax*2]         ; 23 22 21 20
    movdqu      xmm5, [rdx + rax*2]         ; 63 62 61 60
    movdqu      xmm6, [rdi + rax*2]         ; 33 32 31 30
    movdqu      xmm7, [rcx + rax*2]         ; 73 72 71 70
    punpckldq   xmm4, xmm5                  ; 63 62 61 60 23 22 21 20
    punpckldq   xmm6, xmm7                  ; 73 72 71 70 33 32 31 30

    punpcklbw   xmm0, xmm2                  ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
    punpcklbw   xmm4, xmm6                  ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

    movdqa      xmm1, xmm0
    punpcklwd   xmm0, xmm4                  ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm4                  ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                  ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    punpckhdq   xmm2, xmm1                  ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

    movdqa      t0, xmm0                    ; save to t0
    movdqa      t1, xmm2                    ; save to t1

    lea         rsi, [rsi + rax*8]
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movdqu      xmm4, [rsi]                 ; 83 82 81 80
    movdqu      xmm1, [rdx]                 ; c3 c2 c1 c0
    movdqu      xmm6, [rdi]                 ; 93 92 91 90
    movdqu      xmm3, [rcx]                 ; d3 d2 d1 d0
    punpckldq   xmm4, xmm1                  ; c3 c2 c1 c0 83 82 81 80
    punpckldq   xmm6, xmm3                  ; d3 d2 d1 d0 93 92 91 90

    movdqu      xmm0, [rsi + rax*2]         ; a3 a2 a1 a0
    movdqu      xmm5, [rdx + rax*2]         ; e3 e2 e1 e0
    movdqu      xmm2, [rdi + rax*2]         ; b3 b2 b1 b0
    movdqu      xmm7, [rcx + rax*2]         ; f3 f2 f1 f0
    punpckldq   xmm0, xmm5                  ; e3 e2 e1 e0 a3 a2 a1 a0
    punpckldq   xmm2, xmm7                  ; f3 f2 f1 f0 b3 b2 b1 b0

    punpcklbw   xmm4, xmm6                  ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
    punpcklbw   xmm0, xmm2                  ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

    movdqa      xmm1, xmm4
    punpcklwd   xmm4, xmm0                  ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
    punpckhwd   xmm1, xmm0                  ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

    movdqa      xmm6, xmm4
    punpckldq   xmm4, xmm1                  ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
    punpckhdq   xmm6, xmm1                  ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

    movdqa      xmm0, t0                    ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    movdqa      xmm2, t1                    ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm2

    punpcklqdq  xmm0, xmm4                  ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    punpckhqdq  xmm1, xmm4                  ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    punpcklqdq  xmm2, xmm6                  ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    punpckhqdq  xmm3, xmm6                  ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03

    ; calculate mask
    movdqa      xmm6, xmm0                  ; p1
    movdqa      xmm7, xmm3                  ; q1
    psubusb     xmm7, xmm0                  ; q1-=p1
    psubusb     xmm6, xmm3                  ; p1-=q1
    por         xmm6, xmm7                  ; abs(p1-q1)
    pand        xmm6, [GLOBAL(tfe)]         ; set lsb of each byte to zero
    psrlw       xmm6, 1                     ; abs(p1-q1)/2

    movdqa      xmm5, xmm1                  ; p0
    movdqa      xmm4, xmm2                  ; q0
    psubusb     xmm5, xmm2                  ; p0-=q0
    psubusb     xmm4, xmm1                  ; q0-=p0
    por         xmm5, xmm4                  ; abs(p0 - q0)
    paddusb     xmm5, xmm5                  ; abs(p0-q0)*2
    paddusb     xmm5, xmm6                  ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2)                 ;flimit
    movdqa      xmm7, XMMWORD PTR [rdx]
    mov         rdx, arg(3)                 ; get limit
    movdqa      xmm6, XMMWORD PTR [rdx]
    paddb       xmm7, xmm7                  ; flimit*2 (less than 255)
    paddb       xmm7, xmm6                  ; flimit * 2 + limit (less than 255)

    psubusb     xmm5, xmm7                  ; abs(p0 - q0) *2 + abs(p1-q1)/2 > flimit * 2 + limit
    pxor        xmm7, xmm7
    pcmpeqb     xmm5, xmm7                  ; xmm5 = mask

    ; start work on filters
    movdqa      t0, xmm0
    movdqa      t1, xmm3

    pxor        xmm0, [GLOBAL(t80)]         ; p1 offset to convert to signed values
    pxor        xmm3, [GLOBAL(t80)]         ; q1 offset to convert to signed values

    psubsb      xmm0, xmm3                  ; p1 - q1
    movdqa      xmm6, xmm1                  ; p0

    movdqa      xmm7, xmm2                  ; q0
    pxor        xmm6, [GLOBAL(t80)]         ; offset to convert to signed values

    pxor        xmm7, [GLOBAL(t80)]         ; offset to convert to signed values
    movdqa      xmm3, xmm7                  ; copy of offset q0

    psubsb      xmm7, xmm6                  ; q0 - p0
    paddsb      xmm0, xmm7                  ; p1 - q1 + 1 * (q0 - p0)

    paddsb      xmm0, xmm7                  ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm0, xmm7                  ; p1 - q1 + 3 * (q0 - p0)

    pand        xmm5, xmm0                  ; mask filter values we don't care about

    ; do + 4 side
    paddsb      xmm5, [GLOBAL(t4)]          ; 3* (q0 - p0) + (p1 - q1) + 4

    movdqa      xmm0, xmm5                  ; get a copy of filters
    psllw       xmm0, 8                     ; shift left 8

    psraw       xmm0, 3                     ; arithmetic shift right 3
    psrlw       xmm0, 8

    movdqa      xmm7, xmm5                  ; get a copy of filters
    psraw       xmm7, 11                    ; arithmetic shift right 11

    psllw       xmm7, 8                     ; shift left 8 to put it back
    por         xmm0, xmm7                  ; put the two together to get result

    psubsb      xmm3, xmm0                  ; q0-= q0sz add
    pxor        xmm3, [GLOBAL(t80)]         ; unoffset q0

    ; now do +3 side
    psubsb      xmm5, [GLOBAL(t1s)]         ; +3 instead of +4
    movdqa      xmm0, xmm5                  ; get a copy of filters

    psllw       xmm0, 8                     ; shift left 8
    psraw       xmm0, 3                     ; arithmetic shift right 3

    psrlw       xmm0, 8
    psraw       xmm5, 11                    ; arithmetic shift right 11

    psllw       xmm5, 8                     ; shift left 8 to put it back
    por         xmm0, xmm5                  ; put the two together to get result

    paddsb      xmm6, xmm0                  ; p0+= p0 add
    pxor        xmm6, [GLOBAL(t80)]         ; unoffset p0

    movdqa      xmm0, t0                    ; p1
    movdqa      xmm4, t1                    ; q1

    ; transpose back to write out
    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    movdqa      xmm1, xmm0
    punpcklbw   xmm0, xmm6                  ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    punpckhbw   xmm1, xmm6                  ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

    movdqa      xmm5, xmm3
    punpcklbw   xmm3, xmm4                  ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpckhbw   xmm5, xmm4                  ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                  ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                  ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

    movdqa      xmm3, xmm1
    punpcklwd   xmm1, xmm5                  ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    punpckhwd   xmm3, xmm5                  ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

    ; write out order: xmm0 xmm2 xmm1 xmm3
    lea         rdx, [rsi + rax*4]

    movd        [rsi], xmm1                 ; write the second 8-line result
    psrldq      xmm1, 4
    movd        [rdi], xmm1
    psrldq      xmm1, 4
    movd        [rsi + rax*2], xmm1
    psrldq      xmm1, 4
    movd        [rdi + rax*2], xmm1

    movd        [rdx], xmm3
    psrldq      xmm3, 4
    movd        [rcx], xmm3
    psrldq      xmm3, 4
    movd        [rdx + rax*2], xmm3
    psrldq      xmm3, 4
    movd        [rcx + rax*2], xmm3

    neg         rax
    lea         rsi, [rsi + rax*8]
    neg         rax
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movd        [rsi], xmm0                 ; write the first 8-line result
    psrldq      xmm0, 4
    movd        [rdi], xmm0
    psrldq      xmm0, 4
    movd        [rsi + rax*2], xmm0
    psrldq      xmm0, 4
    movd        [rdi + rax*2], xmm0

    movd        [rdx], xmm2
    psrldq      xmm2, 4
    movd        [rcx], xmm2
    psrldq      xmm2, 4
    movd        [rdx + rax*2], xmm2
    psrldq      xmm2, 4
    movd        [rcx + rax*2], xmm2

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f