2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
;
; Purpose (from the visible code): loads four 8-byte rows from `input`
; (four shorts each, per the prototype), rows `pitch` bytes apart, runs them
; through two SSE2 arithmetic passes, and stores 32 bytes of packed 16-bit
; results at `output`. Judging by the name this is a 4x4 Walsh transform --
; TODO confirm against the C reference.
;
; In:   arg(0) = input, arg(1) = output, arg(2) = pitch (32-bit, sign-extended)
; Out:  32 bytes written at [output]; stored with movdqa, so `output` must be
;       16-byte aligned.
; Uses: rsi, rdi, rdx, xmm0-xmm7 in the visible statements.
;
; NOTE(review): the numbers embedded at the start of statements in this
; listing skip values (e.g. 19 -> 26, 35 -> 41, 101 -> 107), so some
; statements of the routine are presumably not shown here (register copies,
; shifts, compares, prologue/epilogue). The comments below describe only the
; instructions that are visible; do not treat this listing as complete.
; `sym`, `arg`, `GLOBAL` and SHADOW_ARGS_TO_STACK are not defined in this
; listing -- presumably they come from the included x86_abi_support.asm.
15 global sym
(vp8_short_walsh4x4_sse2
)
16 sym
(vp8_short_walsh4x4_sse2
):
; Macro invoked with the argument count (3); presumably makes the three
; arguments addressable through arg(n) on every ABI -- verify in the include.
19 SHADOW_ARGS_TO_STACK
3
; ---- fetch arguments: rsi = input, rdi = output, rdx = pitch ----
26 mov rsi
, arg
(0) ; input
27 mov rdi
, arg
(1) ; output
; pitch is a 32-bit int; sign-extend it so it can be used in 64-bit addressing.
28 movsxd rdx
, dword ptr arg
(2) ; pitch
; ---- load four rows, 64 bits each: [rsi], [rsi+pitch], then advance rsi by
; ---- 2*pitch and load [rsi], [rsi+pitch]. pitch is used as a byte offset.
31 movq xmm0
, MMWORD
PTR [rsi
] ; load input
32 movq xmm1
, MMWORD
PTR [rsi
+ rdx
]
33 lea rsi
, [rsi
+ rdx
*2]
34 movq xmm2
, MMWORD
PTR [rsi
]
35 movq xmm3
, MMWORD
PTR [rsi
+ rdx
]
; ---- first pass ----
; Interleave 32-bit lanes of the low (punpckldq) and high (punpckhdq) halves
; with xmm2.
; NOTE(review): xmm1/xmm2 contents here depend on statements not visible above.
41 punpckldq xmm0
, xmm2
; ip[1] ip[0]
42 punpckhdq xmm1
, xmm2
; ip[3] ip[2]
; Combine 64-bit halves: low halves into xmm0, high halves into xmm1.
52 punpcklqdq xmm0
, xmm2
; b1 a1
53 punpckhqdq xmm1
, xmm2
; c1 d1
; Add the word constant c1 to xmm7.
; NOTE(review): no earlier write to xmm7 is visible in this listing; per the
; "(a1!=0)" remark below it presumably holds a compare mask at this point.
59 paddw xmm7
, [GLOBAL(c1
)]
; 16-bit sums and differences of the two halves, then fold xmm7 into the sums.
62 paddw xmm0
, xmm1
; b1+c1 a1+d1
63 psubw xmm2
, xmm1
; b1-c1 a1-d1
64 paddw xmm0
, xmm7
; b1+c1 a1+d1+(a1!=0)
; ---- second pass ----
; Immediate 0xd8 selects elements 0,2,1,3, i.e. swaps the two middle words of
; each 64-bit half. xmm3 is the temporary between the low-half (pshuflw) and
; high-half (pshufhw) shuffles; results land in xmm0 and xmm1.
67 ; input: 13 9 5 1 12 8 4 0 (xmm0)
68 ; 14 10 6 2 15 11 7 3 (xmm2)
70 ; 13 5 9 1 12 4 8 0 (xmm0)
71 ; 14 6 10 2 15 7 11 3 (xmm1)
72 pshuflw xmm3
, xmm0
, 0xd8
73 pshufhw xmm0
, xmm3
, 0xd8
74 pshuflw xmm3
, xmm2
, 0xd8
75 pshufhw xmm1
, xmm3
, 0xd8
; pmaddwd multiplies signed words pairwise and adds each adjacent pair into a
; 32-bit lane. Assuming c1 is all +1 words and cn1 alternates +1/-1 (labels
; are not visible here -- TODO confirm), this yields pair sums and pair
; differences respectively.
; NOTE(review): xmm2 and xmm3 are presumably refreshed from xmm0/xmm1 by
; statements not shown in this listing.
78 pmaddwd xmm0
, [GLOBAL(c1
)] ; d11 a11 d10 a10
79 pmaddwd xmm2
, [GLOBAL(cn1
)] ; c11 b11 c10 b10
81 pmaddwd xmm1
, [GLOBAL(c1
)] ; d12 a12 d13 a13
82 pmaddwd xmm3
, [GLOBAL(cn1
)] ; c12 b12 c13 b13
; Reorder 32-bit lanes so like terms sit in the same 64-bit half.
; 0xd8 picks source lanes 0,2,1,3; 0x72 picks source lanes 2,0,3,1
; (listed from destination lane 0 upward).
84 pshufd xmm4
, xmm0
, 0xd8 ; d11 d10 a11 a10
85 pshufd xmm5
, xmm2
, 0xd8 ; c11 c10 b11 b10
86 pshufd xmm6
, xmm1
, 0x72 ; d13 d12 a13 a12
87 pshufd xmm7
, xmm3
, 0x72 ; c13 c12 b13 b12
; Merge 64-bit halves so each register pairs two term types (see lane notes).
90 punpcklqdq xmm0
, xmm5
; b11 b10 a11 a10
91 punpckhqdq xmm4
, xmm5
; c11 c10 d11 d10
93 punpcklqdq xmm1
, xmm7
; b13 b12 a13 a12
94 punpckhqdq xmm6
, xmm7
; c13 c12 d13 d12
; 32-bit butterflies: sums into xmm0/xmm1, differences into xmm2/xmm3.
97 paddd xmm0
, xmm4
; b21 b20 a21 a20
98 psubd xmm2
, xmm4
; c21 c20 d21 d20
100 paddd xmm1
, xmm6
; b23 b22 a23 a22
101 psubd xmm3
, xmm6
; c23 c22 d23 d22
; Mask xmm4..xmm7 with the dword constant cd1.
; NOTE(review): what xmm4..xmm7 hold before these pand instructions is set by
; statements not visible in this listing.
107 pand xmm4
, [GLOBAL(cd1
)]
108 pand xmm5
, [GLOBAL(cd1
)]
114 pand xmm6
, [GLOBAL(cd1
)]
115 pand xmm7
, [GLOBAL(cd1
)]
; Add the dword constant cd3 to every 32-bit result lane (presumably a
; rounding bias ahead of a shift that is not visible here -- TODO confirm).
119 paddd xmm0
, [GLOBAL(cd3
)]
120 paddd xmm2
, [GLOBAL(cd3
)]
123 paddd xmm1
, [GLOBAL(cd3
)]
124 paddd xmm3
, [GLOBAL(cd3
)]
; Regroup 64-bit halves so each register holds one row of four 32-bit results.
131 punpcklqdq xmm0
, xmm1
; a23 a22 a21 a20
132 punpckhqdq xmm4
, xmm1
; b23 b22 b21 b20
134 punpckhqdq xmm2
, xmm3
; c23 c22 c21 c20
135 punpcklqdq xmm5
, xmm3
; d23 d22 d21 d20
; Narrow 32-bit lanes to 16-bit with signed saturation, two rows per register.
137 packssdw xmm0
, xmm4
; b23 b22 b21 b20 a23 a22 a21 a20
138 packssdw xmm2
, xmm5
; d23 d22 d21 d20 c23 c22 c21 c20
; Store the 16 results: two aligned 16-byte stores at output and output+16.
140 movdqa XMMWORD
PTR [rdi
], xmm0
141 movdqa XMMWORD
PTR [rdi
+ 16], xmm2
; Constant vectors, 16 bytes each.
; NOTE(review): the labels (and any section/alignment directives) for these
; rows are not visible in this listing. The code references c1 and cn1 with
; word instructions and cd1 and cd3 with dword instructions, so the rows are
; presumably c1, cn1, cd1, cd3 in that order -- TODO confirm.
; Eight 16-bit words, all 1.
155 dw 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
; Eight 16-bit words alternating 1 and 0xffff (-1 when read as signed).
158 dw 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff, 0x0001, 0xffff
; Four 32-bit dwords, all 1.
161 dd 0x00000001, 0x00000001, 0x00000001, 0x00000001
; Four 32-bit dwords, all 3.
164 dd 0x00000003, 0x00000003, 0x00000003, 0x00000003