1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
3 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
5 ;* This file is part of Libav.
7 ;* Libav is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* Libav is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with Libav; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
; Constant tables read by the row/column transform macros in this file.
;
; pw_row_coeffs: 16-bit multipliers for the row pass. The row shown here is
; four words of 13 (read at offset +0). ROW_TRANSFORM also reads offsets +8
; and +16, which its comments describe as the 17 and 7 factors; those rows
; are not visible in this excerpt.
25 pw_row_coeffs: times
4 dw 13
; pd_512: two dwords of 0x200; ROW_TRANSFORM loads this into mm5 as the rounder.
28 pd_512: times
2 dd 0x200
; pw_col_coeffs: word pairs consumed by pmaddwd in the column pass. The row
; shown (offset +0) is applied to columns 0,2,0,2 and yields
; 13*c0+13*c2 | 13*c0-13*c2. Offset +8 is also referenced by the code but its
; row is not visible in this excerpt.
29 pw_col_coeffs: dw 13, 13, 13, -13
36 %macro IDCT_DC_NOROUND
1
41 %macro IDCT_DC_ROUND
1
48 cglobal rv34_idct_
%1, 1, 2, 0
61 %define IDCT_DC IDCT_DC_ROUND
63 %define IDCT_DC IDCT_DC_NOROUND
66 ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
68 cglobal rv34_idct_dc_add
, 3, 3
101 ; Load coeffs and perform row transform
102 ; Output: coeffs in mm[0467], rounder in mm5
;
; The macro is declared with one parameter (%1); its use is not visible in
; this excerpt.
; Register roles, as implied by the per-instruction comments below:
;   mm0, mm4 = b0      mm2 = b2
;   mm1, mm5 = b1      mm3, mm7 = b3
; NOTE(review): the instructions that load these registers are not part of
; this excerpt - confirm against the full file.
; Word additions/subtractions use the signed-saturating forms
; (paddsw/psubsw); the multiplies are pmullw (low 16 bits of each product).
103 %macro ROW_TRANSFORM
1
; Even part: z0 = 13*(b0 + b2), z1 = 13*(b0 - b2).
114 mova mm6
, [pw_row_coeffs
+ 0]
115 paddsw mm0
, mm2
; b0 + b2
116 psubsw mm4
, mm2
; b0 - b2
117 pmullw mm0
, mm6
; *13 = z0
118 pmullw mm4
, mm6
; *13 = z1
; Odd part: scale b1 and b3 by the factors at offsets +8 and +16, then
; combine into z3 (in mm1) and z2 (in mm5).
120 pmullw mm1
, [pw_row_coeffs
+ 8] ; b1*17
121 pmullw mm5
, [pw_row_coeffs
+16] ; b1* 7
123 pmullw mm3
, [pw_row_coeffs
+ 8] ; b3*17
124 pmullw mm7
, [pw_row_coeffs
+16] ; b3* 7
125 paddsw mm1
, mm7
; z3 = b1*17 + b3* 7
126 psubsw mm5
, mm3
; z2 = b1* 7 - b3*17
; Butterfly producing the four outputs in mm0, mm7, mm4, mm6.
; NOTE(review): the comments below treat mm7 as z0 and mm6 as z1, so copies
; of mm0/mm4 into mm7/mm6 presumably happen at this point; those
; instructions are not visible in this excerpt - confirm.
129 paddsw mm0
, mm1
; z0 + z3
130 psubsw mm7
, mm1
; z0 - z3
131 paddsw mm4
, mm5
; z1 + z2
132 psubsw mm6
, mm5
; z1 - z2
; mm5 is free once z2 has been consumed; reuse it for the rounding constant.
133 mova mm5
, [pd_512
] ; 0x200
136 ; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
; (The prototype above documents rv34_idct_add, which is defined after this
; macro and invokes it once per row register.)
;
; COL_TRANSFORM %1, %2, %3, %4
;   %1 - memory operand; call sites pass [dq] or [dq+sq]. Its use lies in
;        the part of the macro that is not visible in this excerpt.
;   %2 - MMX register holding four 16-bit values c0..c3; overwritten.
;   %3 - multipliers for columns 0 and 2 (call sites pass pw_col_coeffs+0,
;        either from memory or cached in a register).
;   %4 - multipliers for columns 1 and 3 (call sites pass pw_col_coeffs+8).
; Clobbers mm1, mm2, mm3 in the visible part.
; After the visible part: %2 = z0+z3 | z1+z2 and mm1 = z1-z2 | z0-z3, as
; 32-bit lanes.
137 %macro COL_TRANSFORM
4
; Split the input into odd columns (mm3) and even columns (%2), each pair
; duplicated so that one pmaddwd yields both a sum and a difference.
138 pshufw mm3
, %2, 0xDD ; col. 1,3,1,3
139 pshufw
%2, %2, 0x88 ; col. 0,2,0,2
140 pmaddwd
%2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
141 pmaddwd mm3
, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
; Shuffle immediate 01001110b swaps the two 32-bit halves of the source.
143 pshufw mm1
, %2, 01001110b ; z1 | z0
144 pshufw mm2
, mm3
, 01001110b ; z2 | z3
145 paddd
%2, mm3
; z0+z3 | z1+z2
146 psubd mm1
, mm2
; z1-z2 | z0-z3
; rv34_idct_add: declared with 3 arguments named d, s and b (used below as
; dq/sq), 3 general-purpose registers and 0 XMM registers - assuming the
; usual x86inc cglobal argument order; confirm against x86inc.asm.
; The visible body runs COL_TRANSFORM once for each of the row registers
; mm0, mm4, mm6, mm7 - the registers that ROW_TRANSFORM's header comment
; lists as its outputs.
; NOTE(review): the step that fills those registers, the advance of dq
; between the second and third invocation (both pairs address [dq] and
; [dq+sq]), and the function's return are not visible in this excerpt.
158 cglobal rv34_idct_add
, 3,3,0, d
, s
, b
; First row: mm0 still holds data, so both coefficient rows come from memory.
160 COL_TRANSFORM
[dq], mm0
, [pw_col_coeffs
+ 0], [pw_col_coeffs
+ 8]
; mm0 has been consumed; cache the columns-0/2 coefficients in it.
161 mova mm0
, [pw_col_coeffs
+ 0]
162 COL_TRANSFORM
[dq+sq
], mm4
, mm0
, [pw_col_coeffs
+ 8]
; mm4 has been consumed; cache the columns-1/3 coefficients in it.
163 mova mm4
, [pw_col_coeffs
+ 8]
; Remaining two rows use both cached coefficient registers.
165 COL_TRANSFORM
[dq], mm6
, mm0
, mm4
166 COL_TRANSFORM
[dq+sq
], mm7
, mm0
, mm4
169 ; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
; Declared with 3 arguments, 3 general-purpose registers and 6 XMM
; registers - assuming the usual x86inc cglobal argument order.
; NOTE(review): only two stores of the body are visible in this excerpt;
; the computation that fills m2 and any other stores are not shown.
171 cglobal rv34_idct_dc_add
, 3, 3, 6
; Store 32-bit lane 1 of m2 to [r0+r1].
193 pextrd
[r0
+r1
], m2
, 1
; Store 32-bit lane 3 of m2 to [r2+r1].
; NOTE(review): r2 is the dc argument on entry per the prototype, so it is
; presumably repurposed as a row pointer by instructions not shown - confirm.
195 pextrd
[r2
+r1
], m2
, 3