2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
; vp8_short_inv_walsh4x4_1_mmx -- symbol exported through global sym(...).
; From the instructions visible here: the memory operand at [rsi] is added
; into rax, the low 16-bit word of mm0 is replicated into all four word
; lanes, and every lane is then arithmetically shifted right by 3, which the
; inline comments describe as (input[0] + 3) >> 3.
; NOTE(review): this copy does not show the prologue, the loads that set up
; rsi/rax/mm0, any store to output, or the epilogue/ret -- confirm against
; the complete file before relying on this description.
14 ;void vp8_short_inv_walsh4x4_1_mmx(short *input, short *output)
15 global sym
(vp8_short_inv_walsh4x4_1_mmx
)
16 sym
(vp8_short_inv_walsh4x4_1_mmx
):
; The count 2 matches the two arguments of the C prototype above; the macro is
; presumably supplied by the included x86_abi_support.asm -- TODO confirm.
19 SHADOW_ARGS_TO_STACK
2
; Presumably rax already holds the rounding constant 3 and rsi points at
; input (their setup is not visible in this copy).
28 add rax
, [rsi
] ;input[0] + 3
; Interleave mm0's low words with themselves: the low word now fills the two
; low word lanes.
32 punpcklwd mm0
, mm0
;x x val val
; Interleave the low dword with itself: all four word lanes hold the value.
34 punpckldq mm0
, mm0
;val val val val
; Arithmetic (sign-preserving) right shift of each 16-bit lane.
36 psraw mm0
, 3 ;(input[0] + 3) >> 3
50 ;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
51 global sym
(vp8_short_inv_walsh4x4_mmx
)
52 sym
(vp8_short_inv_walsh4x4_mmx
):
55 SHADOW_ARGS_TO_STACK
2
65 movq mm0
, [rsi
+ 0] ;ip[0]
66 movq mm1
, [rsi
+ 8] ;ip[4]
69 movq mm2
, [rsi
+ 16] ;ip[8]
70 movq mm3
, [rsi
+ 24] ;ip[12]
75 punpcklwd mm7
, mm7
;0003000300030003h
78 paddw mm4
, mm3
;ip[0] + ip[12] aka a1
79 paddw mm5
, mm2
;ip[4] + ip[8] aka b1
81 movq mm6
, mm4
;temp a1
83 paddw mm4
, mm5
;a1 + b1
84 psubw mm6
, mm5
;a1 - b1
86 psubw mm0
, mm3
;ip[0] - ip[12] aka d1
87 psubw mm1
, mm2
;ip[4] - ip[8] aka c1
89 movq mm5
, mm0
;temp d1
91 paddw mm0
, mm1
;d1 + c1
92 psubw mm5
, mm1
;d1 - c1
99 movq mm3
, mm4
; 03 02 01 00
100 punpcklwd mm4
, mm0
; 11 01 10 00
101 punpckhwd mm3
, mm0
; 13 03 12 02
103 movq mm1
, mm6
; 23 22 21 20
104 punpcklwd mm6
, mm5
; 31 21 30 20
105 punpckhwd mm1
, mm5
; 33 23 32 22
107 movq mm0
, mm4
; 11 01 10 00
108 movq mm2
, mm3
; 13 03 12 02
110 punpckldq mm0
, mm6
; 30 20 10 00 aka ip[0]
111 punpckhdq mm4
, mm6
; 31 21 11 01 aka ip[4]
113 punpckldq mm2
, mm1
; 32 22 12 02 aka ip[8]
114 punpckhdq mm3
, mm1
; 33 23 13 03 aka ip[12]
115 ;~~~~~~~~~~~~~~~~~~~~~
119 paddw mm1
, mm3
;ip[0] + ip[12] aka a1
120 paddw mm5
, mm2
;ip[4] + ip[8] aka b1
122 movq mm6
, mm1
;temp a1
124 paddw mm1
, mm5
;a1 + b1
125 psubw mm6
, mm5
;a1 - b1
127 psubw mm0
, mm3
;ip[0] - ip[12] aka d1
128 psubw mm4
, mm2
;ip[4] - ip[8] aka c1
130 movq mm5
, mm0
;temp d1
132 paddw mm0
, mm4
;d1 + c1
133 psubw mm5
, mm4
;d1 - c1
134 ;~~~~~~~~~~~~~~~~~~~~~
135 movq mm3
, mm1
; 03 02 01 00
136 punpcklwd mm1
, mm0
; 11 01 10 00
137 punpckhwd mm3
, mm0
; 13 03 12 02
139 movq mm4
, mm6
; 23 22 21 20
140 punpcklwd mm6
, mm5
; 31 21 30 20
141 punpckhwd mm4
, mm5
; 33 23 32 22
143 movq mm0
, mm1
; 11 01 10 00
144 movq mm2
, mm3
; 13 03 12 02
146 punpckldq mm0
, mm6
; 30 20 10 00 aka ip[0]
147 punpckhdq mm1
, mm6
; 31 21 11 01 aka ip[4]
149 punpckldq mm2
, mm4
; 32 22 12 02 aka ip[8]
150 punpckhdq mm3
, mm4
; 33 23 13 03 aka ip[12]