arm: remove duplicate functions
[libvpx.git] / vp8 / encoder / x86 / fwalsh_sse2.asm
blobffc9b3dcae66ee31438a7f6b97c2599ce3d6ebea
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform on 16-bit samples, SSE2, straight-line code.
;
; In:   arg(0) input  - four rows of four 16-bit values; consecutive
;                       rows are `pitch` BYTES apart (pitch is added
;                       directly to the byte address). 8 bytes are read
;                       from each row.
;       arg(1) output - receives 16 words via two movdqa stores, so it
;                       must be 16-byte aligned.
;       arg(2) pitch  - 32-bit value, sign-extended before use.
; Out:  output[0..15] (nothing is returned in a register).
; Uses: rsi, rdi (saved/restored here), rdx, xmm0-xmm7.
;
; Pass 1, per input row ip[0..3] (16-bit arithmetic):
;       a1 = (ip[0] + ip[2]) << 2      d1 = (ip[1] + ip[3]) << 2
;       b1 = (ip[0] - ip[2]) << 2      c1 = (ip[1] - ip[3]) << 2
;       t[0] = a1 + d1 + (a1 != 0)     t[1] = b1 + c1
;       t[2] = b1 - c1                 t[3] = a1 - d1
; Pass 2, per column of t (32-bit arithmetic), with x being each of
;       a1+d1, b1+c1, b1-c1, a1-d1 built from
;       a1 = t[0]+t[8], d1 = t[4]+t[12], b1 = t[0]-t[8], c1 = t[4]-t[12]:
;       out = (x + (x < 0) + 3) >> 3, then packed to 16 bits with
;       signed saturation.
;
; NOTE(review): xmm6/xmm7 are written here without any visible save.
; They are callee-saved under the Microsoft x64 ABI - confirm whether
; the build for that target needs an explicit save/restore.
;-----------------------------------------------------------------------
global sym(vp8_short_walsh4x4_sse2)
sym(vp8_short_walsh4x4_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; input
    mov         rdi, arg(1)                 ; output
    movsxd      rdx, dword ptr arg(2)       ; pitch (bytes between rows)

    ; ---- pass 1: load the four rows (4 words each) ----
    movq        xmm0, MMWORD PTR [rsi]          ; row 0
    movq        xmm1, MMWORD PTR [rsi + rdx]    ; row 1
    lea         rsi,  [rsi + rdx*2]
    movq        xmm2, MMWORD PTR [rsi]          ; row 2
    movq        xmm3, MMWORD PTR [rsi + rdx]    ; row 3

    ; transpose so that each 64-bit half holds one input column
    punpcklwd   xmm0, xmm1
    punpcklwd   xmm2, xmm3

    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                  ; ip[1] ip[0]
    punpckhdq   xmm1, xmm2                  ; ip[3] ip[2]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1                  ; ip[1]+ip[3]  ip[0]+ip[2]
    psubw       xmm2, xmm1                  ; ip[1]-ip[3]  ip[0]-ip[2]

    psllw       xmm0, 2                     ; d1  a1
    psllw       xmm2, 2                     ; c1  b1

    movdqa      xmm1, xmm0
    punpcklqdq  xmm0, xmm2                  ; b1  a1
    punpckhqdq  xmm1, xmm2                  ; c1  d1

    ; xmm7 = (a1 != 0) in the low four words, 0 in the high four:
    ; pcmpeqw yields -1 where the word is zero (the upper half of xmm6
    ; is zero, so it is always -1 there); adding +1 turns -1 into 0
    ; and 0 into 1.
    pxor        xmm6, xmm6
    movq        xmm6, xmm0                  ; a1 only, upper half zero
    pxor        xmm7, xmm7
    pcmpeqw     xmm7, xmm6
    paddw       xmm7, [GLOBAL(c1)]

    movdqa      xmm2, xmm0
    paddw       xmm0, xmm1                  ; b1+c1  a1+d1
    psubw       xmm2, xmm1                  ; b1-c1  a1-d1
    paddw       xmm0, xmm7                  ; b1+c1  a1+d1+(a1!=0)

    ; ---- pass 2 ----
    ; Numbers are indices into the 4x4 pass-1 result, high word first.
    ; input:        13  9  5  1 12  8  4  0 (xmm0)
    ;               14 10  6  2 15 11  7  3 (xmm2)
    ; after shuffle:
    ;               13  5  9  1 12  4  8  0 (xmm0)
    ;               14  6 10  2 15  7 11  3 (xmm1)
    ; so that each adjacent word pair is (i, i+8) or (i+4, i+12).
    pshuflw     xmm3, xmm0, 0xd8
    pshufhw     xmm0, xmm3, 0xd8
    pshuflw     xmm3, xmm2, 0xd8
    pshufhw     xmm1, xmm3, 0xd8

    ; pmaddwd against (+1,+1) gives the pair sums, against (+1,-1) the
    ; pair differences, both widened to 32 bits.
    movdqa      xmm2, xmm0
    pmaddwd     xmm0, [GLOBAL(c1)]          ; d11 a11 d10 a10
    pmaddwd     xmm2, [GLOBAL(cn1)]         ; c11 b11 c10 b10
    movdqa      xmm3, xmm1
    pmaddwd     xmm1, [GLOBAL(c1)]          ; d12 a12 d13 a13
    pmaddwd     xmm3, [GLOBAL(cn1)]         ; c12 b12 c13 b13

    ; group like terms and put the columns in ascending order
    pshufd      xmm4, xmm0, 0xd8            ; d11 d10 a11 a10
    pshufd      xmm5, xmm2, 0xd8            ; c11 c10 b11 b10
    pshufd      xmm6, xmm1, 0x72            ; d13 d12 a13 a12
    pshufd      xmm7, xmm3, 0x72            ; c13 c12 b13 b12

    movdqa      xmm0, xmm4
    punpcklqdq  xmm0, xmm5                  ; b11 b10 a11 a10
    punpckhqdq  xmm4, xmm5                  ; c11 c10 d11 d10
    movdqa      xmm1, xmm6
    punpcklqdq  xmm1, xmm7                  ; b13 b12 a13 a12
    punpckhqdq  xmm6, xmm7                  ; c13 c12 d13 d12

    movdqa      xmm2, xmm0
    paddd       xmm0, xmm4                  ; b21 b20 a21 a20
    psubd       xmm2, xmm4                  ; c21 c20 d21 d20
    movdqa      xmm3, xmm1
    paddd       xmm1, xmm6                  ; b23 b22 a23 a22
    psubd       xmm3, xmm6                  ; c23 c22 d23 d22

    ; xmm4..xmm7 = 1 where the matching lane is negative, else 0
    ; (0 > x gives an all-ones mask, which is then reduced to 1)
    pxor        xmm4, xmm4
    movdqa      xmm5, xmm4
    pcmpgtd     xmm4, xmm0
    pcmpgtd     xmm5, xmm2
    pand        xmm4, [GLOBAL(cd1)]
    pand        xmm5, [GLOBAL(cd1)]

    pxor        xmm6, xmm6
    movdqa      xmm7, xmm6
    pcmpgtd     xmm6, xmm1
    pcmpgtd     xmm7, xmm3
    pand        xmm6, [GLOBAL(cd1)]
    pand        xmm7, [GLOBAL(cd1)]

    ; x = (x + (x < 0) + 3) >> 3
    paddd       xmm0, xmm4
    paddd       xmm2, xmm5
    paddd       xmm0, [GLOBAL(cd3)]
    paddd       xmm2, [GLOBAL(cd3)]
    paddd       xmm1, xmm6
    paddd       xmm3, xmm7
    paddd       xmm1, [GLOBAL(cd3)]
    paddd       xmm3, [GLOBAL(cd3)]

    psrad       xmm0, 3
    psrad       xmm1, 3
    psrad       xmm2, 3
    psrad       xmm3, 3

    ; gather each output row into one register half
    movdqa      xmm4, xmm0
    punpcklqdq  xmm0, xmm1                  ; a23 a22 a21 a20
    punpckhqdq  xmm4, xmm1                  ; b23 b22 b21 b20
    movdqa      xmm5, xmm2
    punpckhqdq  xmm2, xmm3                  ; c23 c22 c21 c20
    punpcklqdq  xmm5, xmm3                  ; d23 d22 d21 d20

    ; narrow to 16 bits with signed saturation
    packssdw    xmm0, xmm4                  ; b23 b22 b21 b20 a23 a22 a21 a20
    packssdw    xmm2, xmm5                  ; d23 d22 d21 d20 c23 c22 c21 c20

    movdqa      XMMWORD PTR [rdi], xmm0         ; output[0..7]
    movdqa      XMMWORD PTR [rdi + 16], xmm2    ; output[8..15]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                     ; without this, execution runs into the data below
; Read-only constants, one 16-byte-aligned vector each. They are used
; as memory operands of SSE2 instructions in vp8_short_walsh4x4_sse2.
SECTION_RODATA
align 16
c1:         ; eight words of +1: the paddw "+1" operand, and the pmaddwd
            ; operand that sums each adjacent word pair
    dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
align 16
cn1:        ; alternating +1, -1 words: pmaddwd operand that subtracts
            ; the odd word of each pair from the even one
    dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
align 16
cd1:        ; four dwords of 1: reduces a pcmpgtd all-ones mask to 1
    dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
align 16
cd3:        ; four dwords of 3: bias added before the >> 3
    dd 0x00000003, 0x00000003, 0x00000003, 0x00000003