2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Loads four 8-byte rows (four shorts each, per the prototype above) from
; `input`, with consecutive rows `pitch` BYTES apart (pitch is added to the
; pointer unscaled), and writes 32 bytes to `output` using two aligned
; 16-byte stores.
;
; NOTE(review): the number leading each statement looks like a line number
; carried over from another copy of this file, and that numbering has gaps
; (e.g. 17-18, 34-38, 41-49, 52-56, 100-104, 140-150). In the text shown here
; there is no `ret`, no write to xmm7 before it is first read, and no label on
; the constant tables, so statements appear to be missing. The trailing
; operand-layout comments are the original author's and cannot be fully
; checked against the visible instructions - confirm against the complete file.
15 global sym
(vp8_short_walsh4x4_sse2
)
16 sym
(vp8_short_walsh4x4_sse2
):
; SHADOW_ARGS_TO_STACK is a macro from the included x86_abi_support.asm;
; presumably it makes the 3 arguments reachable through arg(n) - confirm there.
19 SHADOW_ARGS_TO_STACK
3
; rsi = input pointer, rdi = output pointer, rdx = pitch sign-extended from
; 32 to 64 bits so it can be used directly in address arithmetic.
24 mov rsi
, arg
(0) ; input
25 mov rdi
, arg
(1) ; output
26 movsxd rdx
, dword ptr arg
(2) ; pitch
; Row loads: each movq reads 8 bytes into the low half of an xmm register.
; Rows 0 and 1 come from rsi and rsi+rdx; rsi is then advanced by 2*rdx and
; rows 2 and 3 are read the same way.
29 movq xmm0
, MMWORD
PTR [rsi
] ; load input
30 movq xmm1
, MMWORD
PTR [rsi
+ rdx
]
31 lea rsi
, [rsi
+ rdx
*2]
32 movq xmm2
, MMWORD
PTR [rsi
]
33 movq xmm3
, MMWORD
PTR [rsi
+ rdx
]
; Interleave 32-bit lanes of xmm0/xmm1 with xmm2 (low halves, then high halves).
; NOTE(review): numbering jumps 33 -> 39 here; any instructions in between
; are not part of this listing.
39 punpckldq xmm0
, xmm2
; ip[1] ip[0]
40 punpckhdq xmm1
, xmm2
; ip[3] ip[2]
; Combine 64-bit halves: low qwords of xmm0 and xmm2, high qwords of xmm1 and xmm2.
50 punpcklqdq xmm0
, xmm2
; b1 a1
51 punpckhqdq xmm1
, xmm2
; c1 d1
; NOTE(review): xmm7 is read here but no earlier visible instruction writes
; it; the setup that produces the "(a1!=0)" term mentioned below is not shown.
57 paddw xmm7
, [GLOBAL(c1
)]
; 16-bit lane-wise add/subtract of xmm1, then add the xmm7 correction to xmm0.
60 paddw xmm0
, xmm1
; b1+c1 a1+d1
61 psubw xmm2
, xmm1
; b1-c1 a1-d1
62 paddw xmm0
, xmm7
; b1+c1 a1+d1+(a1!=0)
65 ; input: 13 9 5 1 12 8 4 0 (xmm0)
66 ; 14 10 6 2 15 11 7 3 (xmm2)
68 ; 13 5 9 1 12 4 8 0 (xmm0)
69 ; 14 6 10 2 15 7 11 3 (xmm1)
; Immediate 0xd8 selects elements in the order 0,2,1,3, i.e. it swaps the two
; middle words. pshuflw applies it to the low four words (high half copied),
; pshufhw to the high four words (low half copied); xmm3 is the temporary.
; Results: permuted xmm0 stays in xmm0, permuted xmm2 goes to xmm1.
70 pshuflw xmm3
, xmm0
, 0xd8
71 pshufhw xmm0
, xmm3
, 0xd8
72 pshuflw xmm3
, xmm2
, 0xd8
73 pshufhw xmm1
, xmm3
, 0xd8
; pmaddwd multiplies signed 16-bit lanes pairwise and sums each adjacent pair
; of products into one 32-bit lane. Presumably c1 is the all-ones word table
; (pair sum) and cn1 the +1/-1 table (even lane minus odd lane); the table
; labels are not visible in this listing - confirm.
76 pmaddwd xmm0
, [GLOBAL(c1
)] ; d11 a11 d10 a10
77 pmaddwd xmm2
, [GLOBAL(cn1
)] ; c11 b11 c10 b10
79 pmaddwd xmm1
, [GLOBAL(c1
)] ; d12 a12 d13 a13
80 pmaddwd xmm3
, [GLOBAL(cn1
)] ; c12 b12 c13 b13
; 32-bit lane reorders into xmm4..xmm7. 0xd8 picks source dwords 0,2,1,3
; (low to high); 0x72 picks source dwords 2,0,3,1 (low to high).
82 pshufd xmm4
, xmm0
, 0xd8 ; d11 d10 a11 a10
83 pshufd xmm5
, xmm2
, 0xd8 ; c11 c10 b11 b10
84 pshufd xmm6
, xmm1
, 0x72 ; d13 d12 a13 a12
85 pshufd xmm7
, xmm3
, 0x72 ; c13 c12 b13 b12
; Regroup 64-bit halves (low with low, high with high) of the shuffled values.
88 punpcklqdq xmm0
, xmm5
; b11 b10 a11 a10
89 punpckhqdq xmm4
, xmm5
; c11 c10 d11 d10
91 punpcklqdq xmm1
, xmm7
; b13 b12 a13 a12
92 punpckhqdq xmm6
, xmm7
; c13 c12 d13 d12
; 32-bit lane-wise sums (into xmm0, xmm1) and differences (into xmm2, xmm3)
; against xmm4 and xmm6.
95 paddd xmm0
, xmm4
; b21 b20 a21 a20
96 psubd xmm2
, xmm4
; c21 c20 d21 d20
98 paddd xmm1
, xmm6
; b23 b22 a23 a22
99 psubd xmm3
, xmm6
; c23 c22 d23 d22
; Mask xmm4..xmm7 with the cd1 table.
; NOTE(review): numbering jumps 99 -> 105 and 106 -> 112; whatever prepares
; xmm4..xmm7 for this masking, and whatever consumes the masked values, is
; not visible here.
105 pand xmm4
, [GLOBAL(cd1
)]
106 pand xmm5
, [GLOBAL(cd1
)]
112 pand xmm6
, [GLOBAL(cd1
)]
113 pand xmm7
, [GLOBAL(cd1
)]
; Add the cd3 table to every 32-bit lane of the four result registers
; (presumably a rounding bias ahead of a shift that is not shown - confirm).
117 paddd xmm0
, [GLOBAL(cd3
)]
118 paddd xmm2
, [GLOBAL(cd3
)]
121 paddd xmm1
, [GLOBAL(cd3
)]
122 paddd xmm3
, [GLOBAL(cd3
)]
; Gather 64-bit halves so that each register holds one group of four dwords.
129 punpcklqdq xmm0
, xmm1
; a23 a22 a21 a20
130 punpckhqdq xmm4
, xmm1
; b23 b22 b21 b20
132 punpckhqdq xmm2
, xmm3
; c23 c22 c21 c20
133 punpcklqdq xmm5
, xmm3
; d23 d22 d21 d20
; packssdw narrows signed 32-bit lanes to 16 bits with signed saturation:
; the destination's lanes end up in the low half, the source's in the high half.
135 packssdw xmm0
, xmm4
; b23 b22 b21 b20 a23 a22 a21 a20
136 packssdw xmm2
, xmm5
; d23 d22 d21 d20 c23 c22 c21 c20
; Store 32 bytes to the output buffer. movdqa is the aligned form, so rdi must
; be 16-byte aligned or these stores fault.
138 movdqa XMMWORD
PTR [rdi
], xmm0
139 movdqa XMMWORD
PTR [rdi
+ 16], xmm2
; NOTE(review): the four 16-byte tables below carry no labels (and no section
; or alignment directives) in this listing. Judging by element width and
; value they presumably are, in order, the c1, cn1, cd1 and cd3 tables that
; the code references through GLOBAL() - confirm against the complete file.
; Eight 16-bit words, all +1.
151 dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
; Eight 16-bit words alternating +1 and 0xffff (-1 as a signed word).
154 dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
; Four 32-bit dwords, all 1.
157 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
; Four 32-bit dwords, all 3.
160 dd 0x00000003, 0x00000003, 0x00000003, 0x00000003