Merge "documentation: minor cosmetics"
[libvpx.git] / vp8 / common / x86 / iwalsh_sse2.asm
blob83c97df7d905288f10d45ad6584030aa9e17d6ae
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
;void vp8_short_inv_walsh4x4_sse2(short *input, short *output)
;
; Inverse 4x4 Walsh transform (per the routine name) on sixteen 16-bit
; coefficients, done as two identical butterfly passes with a 4x4 word
; transpose after each pass, followed by (x + 3) >> 3 on every result.
;
; In:    arg(0) = input  - 16 shorts, read with movdqa  => must be 16-byte aligned
;        arg(1) = output - 16 shorts, written with movdqa => must be 16-byte aligned
; Out:   output[0..15]
; Uses:  rax, rsi, rdi (rsi/rdi saved and restored), xmm0-xmm5, xmm7, flags
; Note:  SHADOW_ARGS_TO_STACK / SAVE_XMM / RESTORE_XMM / UNSHADOW_ARGS / arg()
;        come from vpx_ports/x86_abi_support.asm; presumably they hide the
;        per-ABI argument and XMM save conventions - confirm there.
;
; Register comments list 16-bit lanes from most- to least-significant.
global sym(vp8_short_inv_walsh4x4_sse2)
sym(vp8_short_inv_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    SAVE_XMM
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; rsi = input
    mov         rdi, arg(1)             ; rdi = output
    mov         eax, 00030003h          ; two packed words of 3 (rounding bias)

    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ; first pass: combine row 0 with row 3 and row 1 with row 2
    movdqa      xmm0, [rsi + 0]         ;ip[4]  ip[0]   (rows 1 and 0)
    movdqa      xmm1, [rsi + 16]        ;ip[12] ip[8]   (rows 3 and 2)

    pshufd      xmm2, xmm1, 4eh         ;ip[8]  ip[12]  (qwords swapped)
    movdqa      xmm3, xmm0              ;ip[4]  ip[0]

    paddw       xmm0, xmm2              ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
    psubw       xmm3, xmm2              ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1

    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm3              ;d1 a1
    punpckhqdq  xmm4, xmm3              ;c1 b1
    movd        xmm7, eax               ; low dword = 00030003h

    movdqa      xmm1, xmm4              ;c1 b1
    paddw       xmm4, xmm0              ;d1+c1 a1+b1 aka op[4]  op[0]
    psubw       xmm0, xmm1              ;d1-c1 a1-b1 aka op[12] op[8]

    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ; transpose the 4x4 block of words held in xmm4 (rows 1,0) / xmm0 (rows 3,2)
    ; xmm4:  13 12 11 10 03 02 01 00
    ; xmm0:  33 32 31 30 23 22 21 20
    movdqa      xmm3, xmm4              ; 13 12 11 10 03 02 01 00
    punpcklwd   xmm4, xmm0              ; 23 03 22 02 21 01 20 00
    punpckhwd   xmm3, xmm0              ; 33 13 32 12 31 11 30 10
    movdqa      xmm1, xmm4              ; 23 03 22 02 21 01 20 00
    punpcklwd   xmm4, xmm3              ; 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm3              ; 33 23 13 03 32 22 12 02

    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ; second pass: same butterfly on the transposed data, i.e. it now
    ; combines column 0 with column 3 and column 1 with column 2
    pshufd      xmm2, xmm1, 4eh         ;col2 col3 (qwords swapped)
    movdqa      xmm3, xmm4              ;col1 col0

    pshufd      xmm7, xmm7, 0           ;03 03 03 03 03 03 03 03

    paddw       xmm4, xmm2              ;col1+col2 col0+col3 aka b1 a1
    psubw       xmm3, xmm2              ;col1-col2 col0-col3 aka c1 d1

    movdqa      xmm5, xmm4
    punpcklqdq  xmm4, xmm3              ;d1 a1
    punpckhqdq  xmm5, xmm3              ;c1 b1

    movdqa      xmm1, xmm5              ;c1 b1
    paddw       xmm5, xmm4              ;d1+c1 a1+b1
    psubw       xmm4, xmm1              ;d1-c1 a1-b1

    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ; transpose back so each register again holds two output rows
    ; xmm5:  13 12 11 10 03 02 01 00
    ; xmm4:  33 32 31 30 23 22 21 20
    movdqa      xmm0, xmm5              ; 13 12 11 10 03 02 01 00
    punpcklwd   xmm5, xmm4              ; 23 03 22 02 21 01 20 00
    punpckhwd   xmm0, xmm4              ; 33 13 32 12 31 11 30 10
    movdqa      xmm1, xmm5              ; 23 03 22 02 21 01 20 00
    punpcklwd   xmm5, xmm0              ; 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm0              ; 33 23 13 03 32 22 12 02

    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    ; round and scale: (x + 3) >> 3, arithmetic shift on signed words
    paddw       xmm5, xmm7
    paddw       xmm1, xmm7

    psraw       xmm5, 3
    psraw       xmm1, 3

    movdqa      [rdi + 0],  xmm5        ; output[0..7]
    movdqa      [rdi + 16], xmm1        ; output[8..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; without this, execution runs on into the data below
SECTION_RODATA

; Read-only tables of four identical 16-bit words (8 bytes each), every
; table starting on a 16-byte boundary. None of them is referenced by the
; code visible alongside them.

align 16
x_s1sqr2:
    ; 0x8A8C = 35468; 35468 / 65536 ~= 0.5412. The label suggests
    ; sin(pi/8) * sqrt(2) in 16-bit fixed point - TODO confirm against users.
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

align 16
x_c1sqr2less1:
    ; 0x4E7B = 20091; 20091 / 65536 ~= 0.3066. The label suggests
    ; cos(pi/8) * sqrt(2) - 1 in 16-bit fixed point - TODO confirm against users.
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

align 16
fours:
    ; four words of 4
    dw 0x0004, 0x0004, 0x0004, 0x0004