Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
[libvpx.git] / vp8 / common / x86 / recon_sse2.asm
blobf54cc4e7e7fd38316beaca560d1fbd66a1e8e38e
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; void vp8_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Reconstructs four rows of eight pixels:
;     d[row][0..7] = saturate_u8(s[row][0..7] + q[row][0..7])
;
;   s      : 8-bit input bytes, consecutive rows 8 bytes apart
;   q      : 16-bit values added to s, consecutive rows 16 bytes apart;
;            read as a legacy-SSE memory operand, so q must be 16-byte
;            aligned
;   d      : output, consecutive rows `stride` bytes apart
;
; Scratch: rax, rdx, xmm0-xmm4, flags (rsi/rdi are saved and restored).
;-----------------------------------------------------------------------
global sym(vp8_recon2b_sse2)
sym(vp8_recon2b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm0, xmm0                      ; zero: unpack filler / pack partner
    movsxd      rax, dword ptr arg(3)           ; stride
    mov         rdx, arg(1)                     ; q
    mov         rdi, arg(2)                     ; d
    mov         rsi, arg(0)                     ; s

    ; row 0
    movq        xmm1, MMWORD PTR [rsi]
    punpcklbw   xmm1, xmm0                      ; widen 8 bytes to 8 words
    paddsw      xmm1, XMMWORD PTR [rdx]
    packuswb    xmm1, xmm0                      ; clamp to 0..255 while narrowing
    movq        MMWORD PTR [rdi], xmm1

    ; row 1
    movq        xmm2, MMWORD PTR [rsi+8]
    punpcklbw   xmm2, xmm0
    paddsw      xmm2, XMMWORD PTR [rdx+16]
    packuswb    xmm2, xmm0
    movq        MMWORD PTR [rdi+rax], xmm2

    ; row 2
    movq        xmm3, MMWORD PTR [rsi+16]
    punpcklbw   xmm3, xmm0
    paddsw      xmm3, XMMWORD PTR [rdx+32]
    packuswb    xmm3, xmm0
    movq        MMWORD PTR [rdi+rax*2], xmm3

    ; row 3: bump d by one stride so [rdi+rax*2] addresses d + 3*stride
    add         rdi, rax
    movq        xmm4, MMWORD PTR [rsi+24]
    punpcklbw   xmm4, xmm0
    paddsw      xmm4, XMMWORD PTR [rdx+48]
    packuswb    xmm4, xmm0
    movq        MMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
;
; Reconstructs four rows of sixteen pixels:
;     d[row][0..15] = saturate_u8(s[row][0..15] + q[row][0..15])
;
;   s      : 8-bit input bytes, consecutive rows 16 bytes apart
;   q      : 16-bit values added to s, consecutive rows 32 bytes apart
;   d      : output, consecutive rows `stride` bytes apart
;
; Every row of s and d is accessed with movdqa and q is a legacy-SSE
; memory operand, so s, q, d and d + k*stride must all be 16-byte
; aligned.
;
; xmm6/xmm7 are used, hence the SAVE_XMM / RESTORE_XMM pair.
;-----------------------------------------------------------------------
global sym(vp8_recon4b_sse2)
sym(vp8_recon4b_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; s
    mov         rdi, arg(2)                     ; d
    mov         rdx, arg(1)                     ; q
    movsxd      rax, dword ptr arg(3)           ; stride
    pxor        xmm0, xmm0                      ; zero used to widen bytes to words

    ; row 0
    movdqa      xmm1, XMMWORD PTR [rsi]
    movdqa      xmm5, xmm1
    punpcklbw   xmm1, xmm0                      ; pixels 0..7  as words
    punpckhbw   xmm5, xmm0                      ; pixels 8..15 as words
    paddsw      xmm1, XMMWORD PTR [rdx]
    paddsw      xmm5, XMMWORD PTR [rdx+16]
    packuswb    xmm1, xmm5                      ; narrow with unsigned saturation
    movdqa      XMMWORD PTR [rdi], xmm1

    ; row 1
    movdqa      xmm2, XMMWORD PTR [rsi+16]
    movdqa      xmm6, xmm2
    punpcklbw   xmm2, xmm0
    punpckhbw   xmm6, xmm0
    paddsw      xmm2, XMMWORD PTR [rdx+32]
    paddsw      xmm6, XMMWORD PTR [rdx+48]
    packuswb    xmm2, xmm6
    movdqa      XMMWORD PTR [rdi+rax], xmm2

    ; row 2
    movdqa      xmm3, XMMWORD PTR [rsi+32]
    movdqa      xmm7, xmm3
    punpcklbw   xmm3, xmm0
    punpckhbw   xmm7, xmm0
    paddsw      xmm3, XMMWORD PTR [rdx+64]
    paddsw      xmm7, XMMWORD PTR [rdx+80]
    packuswb    xmm3, xmm7
    movdqa      XMMWORD PTR [rdi+rax*2], xmm3

    ; row 3: advance d by one stride so [rdi+rax*2] addresses d + 3*stride
    add         rdi, rax
    movdqa      xmm4, XMMWORD PTR [rsi+48]
    movdqa      xmm5, xmm4                      ; xmm5 is free again after row 0
    punpcklbw   xmm4, xmm0
    punpckhbw   xmm5, xmm0
    paddsw      xmm4, XMMWORD PTR [rdx+96]
    paddsw      xmm5, XMMWORD PTR [rdx+112]
    packuswb    xmm4, xmm5
    movdqa      XMMWORD PTR [rdi+rax*2], xmm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int src_stride,
;     unsigned char *dst,
;     int dst_stride
;     )
;
; Copies a block of 16 rows x 16 bytes from src to dst.
;
; Source rows are read with movdqu (any alignment); destination rows are
; written with movdqa, so dst and dst + k*dst_stride must be 16-byte
; aligned. Rows are moved three at a time through xmm0-xmm2 / xmm3-xmm5,
; with pointer updates interleaved between the loads and stores.
;
; Scratch: rax, rcx, xmm0-xmm5, flags (rsi/rdi are saved and restored).
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2)
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                     ; src
    movdqu      xmm0, [rsi]                     ; row 0

    movsxd      rax, dword ptr arg(1)           ; src_stride
    mov         rdi, arg(2)                     ; dst

    movdqu      xmm1, [rsi+rax]                 ; row 1
    movdqu      xmm2, [rsi+rax*2]               ; row 2

    movsxd      rcx, dword ptr arg(3)           ; dst_stride
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm0
    add         rsi, rax                        ; rsi -> src row 3

    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm3, [rsi]                     ; row 3

    add         rdi, rcx                        ; rdi -> dst row 3
    movdqu      xmm4, [rsi+rax]                 ; row 4

    movdqu      xmm5, [rsi+rax*2]               ; row 5
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm3
    add         rsi, rax                        ; rsi -> src row 6

    movdqa      [rdi+rcx], xmm4
    movdqa      [rdi+rcx*2], xmm5

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm0, [rsi]                     ; row 6

    add         rdi, rcx                        ; rdi -> dst row 6
    movdqu      xmm1, [rsi+rax]                 ; row 7

    movdqu      xmm2, [rsi+rax*2]               ; row 8
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm0
    add         rsi, rax                        ; rsi -> src row 9

    movdqa      [rdi+rcx], xmm1

    movdqa      [rdi+rcx*2], xmm2
    movdqu      xmm3, [rsi]                     ; row 9

    movdqu      xmm4, [rsi+rax]                 ; row 10
    lea         rdi, [rdi+rcx*2]

    add         rdi, rcx                        ; rdi -> dst row 9
    movdqu      xmm5, [rsi+rax*2]               ; row 11

    lea         rsi, [rsi+rax*2]
    movdqa      [rdi], xmm3

    add         rsi, rax                        ; rsi -> src row 12
    movdqa      [rdi+rcx], xmm4

    movdqa      [rdi+rcx*2], xmm5
    movdqu      xmm0, [rsi]                     ; row 12

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm1, [rsi+rax]                 ; row 13

    add         rdi, rcx                        ; rdi -> dst row 12
    movdqu      xmm2, [rsi+rax*2]               ; row 14

    lea         rsi, [rsi+rax*2]                ; rsi -> src row 14
    movdqa      [rdi], xmm0

    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2

    movdqu      xmm3, [rsi+rax]                 ; row 15
    lea         rdi, [rdi+rcx*2]                ; rdi -> dst row 14

    movdqa      [rdi+rcx], xmm3                 ; dst row 15

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_dc_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Fills an 8x8 block at dst with a single value:
;     (sum of the 8 bytes in the row above src
;      + sum of the 8 bytes in the column left of src + 8) >> 4
;
; The row above is read at src - src_stride; the left column is read at
; src - 1 + k*src_stride for k = 0..7.
;
; Scratch: rax, rcx, rdx, mm0, mm1, flags (rsi/rdi are saved/restored).
; No emms is executed here.
; NOTE(review): presumably callers clear the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc_mmx2)
sym(vp8_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    sub         rsi, rax                        ; rsi -> row above the block
    pxor        mm0, mm0
    movq        mm1, [rsi]
    psadbw      mm1, mm0                        ; word 0 of mm1 = sum of the 8 top bytes

    ; from left (rsi now points at the top-left corner byte)
    dec         rsi
    lea         rdi, [rax*3]                    ; 3 * src_stride
    movzx       ecx, byte [rsi+rax]             ; left of row 0
    movzx       edx, byte [rsi+rax*2]           ; row 1
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; row 2
    add         ecx, edx
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]                 ; row 3
    add         ecx, edx
    movzx       edx, byte [rsi+rax]             ; row 4
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]           ; row 5
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; row 6
    add         ecx, edx
    movzx       edx, byte [rsi+rax*4]           ; row 7
    add         ecx, edx

    ; add up
    pextrw      edx, mm1, 0x0                   ; top sum
    lea         edx, [edx+ecx+8]                ; top + left + rounding term
    sar         edx, 4                          ; divide by 16 samples
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0                   ; broadcast word 0
    packuswb    mm1, mm1                        ; 8 identical bytes

    ; write out
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
    lea         rax, [rcx*3]                    ; 3 * dst_stride

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_dctop_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Fills an 8x8 block at dst with
;     (sum of the 8 bytes at src - src_stride + 4) >> 3
; i.e. only the row above the block contributes.
;
; Reads the dc_4 constant through GLOBAL(), hence GET_GOT/RESTORE_GOT.
; Scratch: rax, rcx, mm0, mm1, flags (rsi/rdi are saved/restored).
; No emms is executed here.
; NOTE(review): presumably callers clear the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dctop_mmx2)
sym(vp8_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    sub         rsi, rax                        ; rsi -> row above the block
    pxor        mm0, mm0
    movq        mm1, [rsi]
    psadbw      mm1, mm0                        ; word 0 = sum of the 8 top bytes

    ; add up
    paddw       mm1, [GLOBAL(dc_4)]             ; rounding term
    psraw       mm1, 3                          ; divide by 8 samples
    pshufw      mm1, mm1, 0x0                   ; broadcast word 0
    packuswb    mm1, mm1                        ; 8 identical bytes

    ; write out
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
    lea         rax, [rcx*3]                    ; 3 * dst_stride

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_dcleft_mmx2(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Fills an 8x8 block at dst with
;     (sum of the 8 bytes at src - 1 + k*src_stride, k = 0..7, + 4) >> 3
; i.e. only the column left of the block contributes.
;
; Scratch: rax, rcx, rdx, mm1, flags (rsi/rdi are saved/restored).
; No emms is executed here.
; NOTE(review): presumably callers clear the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dcleft_mmx2)
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    ; from left
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    dec         rsi                             ; rsi -> byte left of row 0
    lea         rdi, [rax*3]                    ; 3 * src_stride
    movzx       ecx, byte [rsi]                 ; row 0
    movzx       edx, byte [rsi+rax]             ; row 1
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]           ; row 2
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; row 3
    add         ecx, edx
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]                 ; row 4
    add         ecx, edx
    movzx       edx, byte [rsi+rax]             ; row 5
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]           ; row 6
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]             ; row 7
    lea         edx, [ecx+edx+4]                ; final add folded with rounding term

    ; add up
    shr         edx, 3                          ; divide by 8 samples
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0                   ; broadcast word 0
    packuswb    mm1, mm1                        ; 8 identical bytes

    ; write out
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
    lea         rax, [rcx*3]                    ; 3 * dst_stride

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_dc128_mmx(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Fills an 8x8 block at dst with the byte value 128 (the dc_128
; constant). src and src_stride are not read.
;
; Reads dc_128 through GLOBAL(), hence GET_GOT/RESTORE_GOT.
; Scratch: rax, rcx, rdx, mm1.
; No emms is executed here.
; NOTE(review): presumably callers clear the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc128_mmx)
sym(vp8_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    ; write out
    movq        mm1, [GLOBAL(dc_128)]           ; 8 bytes of 128
    mov         rax, arg(0)                     ; dst
    movsxd      rdx, dword ptr arg(1)           ; dst_stride
    lea         rcx, [rdx*3]                    ; 3 * dst_stride

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax, [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_tm_sse2(
; void vp8_intra_pred_uv_tm_ssse3(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Macro parameter %1 selects the variant (sse2 or ssse3) and becomes the
; suffix of the generated symbol.
;
; Writes an 8x8 block at dst where
;     dst[r][c] = saturate_u8(left[r] + top[c] - topleft)
; with top[c]  = src[c - src_stride],
;      left[r] = src[r*src_stride - 1],
;      topleft = src[-src_stride - 1].
;
; Both variants do the same arithmetic; they differ only in how a single
; byte is broadcast to eight words: unpack + pshuflw + punpcklqdq for
; sse2, one pshufb with the dc_1024 mask for ssse3.
;
; Scratch: rax, rcx, rdx, xmm0-xmm3, xmm5, flags (rsi/rdi saved/restored).
;-----------------------------------------------------------------------
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1)
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; read top row
    mov         edx, 4                          ; loop count: 4 iterations x 2 rows
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    sub         rsi, rax                        ; rsi -> row above the block
    pxor        xmm0, xmm0
%ifidn %1, ssse3
    movdqa      xmm2, [GLOBAL(dc_1024)]         ; pshufb mask: every word = {byte 0, byte 4}
%endif
    movq        xmm1, [rsi]
    punpcklbw   xmm1, xmm0                      ; top row as 8 words

    ; set up left ptrs and subtract topleft
    movd        xmm3, [rsi-1]                   ; byte 0 = topleft
    lea         rsi, [rsi+rax-1]                ; rsi -> left neighbour of row 0
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    pshuflw     xmm3, xmm3, 0x0
    punpcklqdq  xmm3, xmm3
%else
    pshufb      xmm3, xmm2                      ; byte 4 is zero after movd, so words = topleft
%endif
    psubw       xmm1, xmm3                      ; xmm1 = top[c] - topleft, reused every row

    ; set up dest ptrs
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride

.vp8_intra_pred_uv_tm_%1_loop:
    movd        xmm3, [rsi]                     ; left pixel of the even row
    movd        xmm5, [rsi+rax]                 ; left pixel of the odd row
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm5, xmm0
    pshuflw     xmm3, xmm3, 0x0
    pshuflw     xmm5, xmm5, 0x0
    punpcklqdq  xmm3, xmm3
    punpcklqdq  xmm5, xmm5
%else
    pshufb      xmm3, xmm2
    pshufb      xmm5, xmm2
%endif
    paddw       xmm3, xmm1
    paddw       xmm5, xmm1
    packuswb    xmm3, xmm5                      ; low 8 bytes = even row, high 8 = odd row
    movq        [rdi    ], xmm3
    movhps      [rdi+rcx], xmm3
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_tm_%1_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_ve_mmx(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Copies the 8 bytes at src - src_stride (the row above the block) into
; each of the 8 rows of the 8x8 block at dst.
;
; Scratch: rax, rcx, rdx, mm1.
; No emms is executed here.
; NOTE(review): presumably callers clear the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_ve_mmx)
sym(vp8_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    ; end prolog

    ; read from top
    mov         rax, arg(2)                     ; src
    movsxd      rdx, dword ptr arg(3)           ; src_stride
    sub         rax, rdx                        ; rax -> row above the block
    movq        mm1, [rax]

    ; write out (rax/rdx are reused for the destination)
    mov         rax, arg(0)                     ; dst
    movsxd      rdx, dword ptr arg(1)           ; dst_stride
    lea         rcx, [rdx*3]                    ; 3 * dst_stride

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax, [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
;-----------------------------------------------------------------------
; void vp8_intra_pred_uv_ho_mmx2(
; void vp8_intra_pred_uv_ho_ssse3(
;     unsigned char *dst,
;     int dst_stride,
;     unsigned char *src,
;     int src_stride
;     )
;
; Macro parameter %1 selects the variant (mmx2 or ssse3) and becomes the
; suffix of the generated symbol.
;
; For each of the 8 rows r, replicates the byte at src[r*src_stride - 1]
; (the pixel left of the row) across the 8 bytes of dst row r.
;
;   mmx2 : loop of 4 iterations, two rows each, using pshufw
;   ssse3: straight-line, four rows per group, using pshufb with the
;          dc_00001111 mask; rbx is reused as 3*src_stride once the
;          constant has been loaded, so it is saved/restored explicitly
;          when GET_GOT_SAVE_ARG is not defined
;          NOTE(review): presumably GET_GOT/RESTORE_GOT preserve rbx
;          themselves when GET_GOT_SAVE_ARG is defined -- confirm in
;          x86_abi_support.asm
;
; The mmx2 variant executes no emms.
; NOTE(review): presumably callers clear the MMX state -- confirm.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1)
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
%ifidn %1, ssse3
%ifndef GET_GOT_SAVE_ARG
    push        rbx
%endif
    GET_GOT     rbx
%endif
    ; end prolog

    ; read from left and write out
%ifidn %1, mmx2
    mov         edx, 4                          ; loop count: 4 iterations x 2 rows
%endif
    mov         rsi, arg(2)                     ; src
    movsxd      rax, dword ptr arg(3)           ; src_stride
    mov         rdi, arg(0)                     ; dst
    movsxd      rcx, dword ptr arg(1)           ; dst_stride
%ifidn %1, ssse3
    lea         rdx, [rcx*3]                    ; 3 * dst_stride
    movdqa      xmm2, [GLOBAL(dc_00001111)]     ; mask: byte 0 x8, then byte 1 x8
    lea         rbx, [rax*3]                    ; 3 * src_stride (GOT use is finished)
%endif
    dec         rsi                             ; rsi -> left neighbour of row 0
%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
    movd        mm0, [rsi]
    movd        mm1, [rsi+rax]
    punpcklbw   mm0, mm0                        ; word 0 = left byte doubled
    punpcklbw   mm1, mm1
    pshufw      mm0, mm0, 0x0                   ; broadcast word 0 -> 8 equal bytes
    pshufw      mm1, mm1, 0x0
    movq        [rdi    ], mm0
    movq        [rdi+rcx], mm1
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_ho_%1_loop
%else
    ; rows 0..3
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3                      ; byte 0 = row 0 pixel, byte 1 = row 1 pixel
    punpcklbw   xmm1, xmm4                      ; byte 0 = row 2 pixel, byte 1 = row 3 pixel
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi      ], xmm0
    movhps      [rdi+rcx  ], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx  ], xmm1
    lea         rsi, [rsi+rax*4]
    lea         rdi, [rdi+rcx*4]
    ; rows 4..7
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3
    punpcklbw   xmm1, xmm4
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi      ], xmm0
    movhps      [rdi+rcx  ], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx  ], xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%ifndef GET_GOT_SAVE_ARG
    pop         rbx
%endif
%endif
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3
SECTION_RODATA
; Fill value for the dc128 predictor: eight bytes of 128.
dc_128:
    times 8 db 0x80
; Rounding term added before the >>3 in the dctop predictor: four words of 4.
dc_4:
    times 4 dw 0x0004
; pshufb mask for the ssse3 TM predictor: each word selects source
; byte 0 (low half) and source byte 4 (high half).
align 16
dc_1024:
    times 8 dw 0x0400
; pshufb mask for the ssse3 HO predictor: source byte 0 replicated over
; the low 8 bytes, source byte 1 over the high 8 bytes.
align 16
dc_00001111:
    times 8 db 0x00
    times 8 db 0x01