1 ;******************************************************************************
2 ;* MMX/SSE2-optimized functions for the RV40 decoder
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5 ;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
7 ;* This file is part of Libav.
9 ;* Libav is free software; you can redistribute it and/or
10 ;* modify it under the terms of the GNU Lesser General Public
11 ;* License as published by the Free Software Foundation; either
12 ;* version 2.1 of the License, or (at your option) any later version.
14 ;* Libav is distributed in the hope that it will be useful,
15 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 ;* Lesser General Public License for more details.
19 ;* You should have received a copy of the GNU Lesser General Public
20 ;* License along with Libav; if not, write to the Free Software
21 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Eight words of 1 << (16 - 6) = 1024 (one 16-byte vector of identical words).
28 pw_1024: times
8 dw 1 << (16 - 6) ; pw_1024
; Byte-sized filter table referenced through sixtap_filter_hb. Only the
; start of the table is visible here: eight (1, -5) byte pairs.
30 sixtap_filter_hb_m: times
8 db 1, -5
32 ; multiplied by 2 to have the same shift
; Word-sized filter table referenced through sixtap_filter_v. Only the
; start of the table is visible here: eight words of 1.
39 sixtap_filter_v_m: times
8 dw 1
43 ; multiplied by 2 to have the same shift
; Two alternative alias sets for the filter-table base names.
; First set: every table name resolves to picregq, a register the functions
; fill with "lea picregq, [<table>_m]" before use.
; NOTE(review): the conditional that selects between the two sets is not
; visible here -- presumably a PIC / non-PIC switch; confirm.
55 %define sixtap_filter_hw picregq
56 %define sixtap_filter_hb picregq
57 %define sixtap_filter_v picregq
; Second set: every table name resolves directly to its *_m data label.
60 %define sixtap_filter_hw sixtap_filter_hw_m
61 %define sixtap_filter_hb sixtap_filter_hb_m
62 %define sixtap_filter_v sixtap_filter_v_m
; 16-byte index tables loaded by the SSSE3 %1_rv40_qpel_h function.
; Each table lists eight byte-index pairs, one pair per output position i = 0..7.
; NOTE(review): presumably consumed as pshufb masks; the shuffle instructions
; themselves are not visible here.
; shuf1: pairs (i, i+1)
66 filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
; shuf2: pairs (i+2, i+3)
67 filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
; shuf3: pairs (i+5, i+4) -- note the higher index comes first in each pair
68 filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
76 ;-----------------------------------------------------------------------------
77 ; subpel MC functions:
79 ; void ff_[put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int dststride,
80 ; uint8_t *src, int srcstride, int height, int m);
82 ;----------------------------------------------------------------------
; Vertical qpel filter entry point, word-coefficient variant.
; cglobal declares 6 named arguments (dst, dststride, src, srcstride, height,
; my), 6+npicregs general-purpose registers (picreg names the extra one) and
; 12 vector registers.
106 cglobal
%1_rv40_qpel_v
, 6,6+npicregs
,12, dst
, dststride
, src
, srcstride
, height
, my
, picreg
; Point picregq at the word-sized filter table.
; NOTE(review): presumably only assembled for PIC builds; the guard is not
; visible here.
108 lea picregq
, [sixtap_filter_v_m
]
; LOAD is a helper macro defined outside this view; it is given the my
; argument and the filter-table base.
111 LOAD my
, sixtap_filter_v
; Preload source rows: src + stride and src + 2*stride, then advance src by
; two rows and load the row one stride past the new src.
117 movh m1
, [srcq
+srcstrideq
]
118 movh m2
, [srcq
+srcstrideq
*2]
119 lea srcq
, [srcq
+srcstrideq
*2]
122 movh m4
, [srcq
+srcstrideq
]
; Coefficient operands: four 16-byte-spaced vectors addressed from myq.
; NOTE(review): the names suggest COEFF05 serves taps 0 and 5 and COEFF14
; serves taps 1 and 4 -- confirm against the filter body.
139 %define COEFF05
[myq
+ 0]
140 %define COEFF14
[myq
+16]
141 %define COEFF2
[myq
+32]
142 %define COEFF3
[myq
+48]
; Per-row work: fetch the row two strides below the current src.
146 movh m5
, [srcq
+2*srcstrideq
] ; read new row
; height counts the rows still to be produced.
171 dec heightd
; next row
; Horizontal qpel filter entry point, word-coefficient variant.
; cglobal declares 6 named arguments (dst, dststride, src, srcstride, height,
; mx), 6+npicregs general-purpose registers (picreg names the extra one) and
; 12 vector registers.
177 cglobal
%1_rv40_qpel_h
, 6, 6+npicregs
, 12, dst
, dststride
, src
, srcstride
, height
, mx
, picreg
; The horizontal variant addresses the same word-sized table
; (sixtap_filter_v_m) as the vertical one.
179 lea picregq
, [sixtap_filter_v_m
]
; LOAD is a helper macro defined outside this view; it is given the mx
; argument and the filter-table base.
182 LOAD mx
, sixtap_filter_v
; Coefficient operands: four 16-byte-spaced vectors addressed from mxq.
194 %define COEFF05
[mxq
+ 0]
195 %define COEFF14
[mxq
+16]
196 %define COEFF2
[mxq
+32]
197 %define COEFF3
[mxq
+48]
; height counts the rows still to be produced.
228 dec heightd
; next row
; FILTER_SSSE3 takes one parameter (%1), used as the prefix of the function
; names it emits: %1_rv40_qpel_v and %1_rv40_qpel_h.
253 %macro FILTER_SSSE3
1
; Vertical qpel filter, byte-coefficient variant: 6 named arguments,
; 6+npicregs general-purpose registers, 8 vector registers.
254 cglobal
%1_rv40_qpel_v
, 6,6+npicregs
,8, dst
, dststride
, src
, srcstride
, height
, my
, picreg
; Point picregq at the byte-sized filter table.
; NOTE(review): presumably only assembled for PIC builds; the guard is not
; visible here.
256 lea picregq
, [sixtap_filter_hb_m
]
; LOAD is a helper macro defined outside this view; it is given the my
; argument and the filter-table base.
261 LOAD my
, sixtap_filter_hb
; Preload source rows: src + stride and src + 2*stride, then advance src by
; two rows, load the row one stride past it, and advance src by two more rows.
264 movh m1
, [srcq
+srcstrideq
]
265 movh m2
, [srcq
+srcstrideq
*2]
266 lea srcq
, [srcq
+srcstrideq
*2]
270 movh m4
, [srcq
+srcstrideq
]
271 lea srcq
, [srcq
+2*srcstrideq
]
; pmaddubsw multiplies the unsigned bytes of m6 by the signed bytes of the
; second coefficient vector (myq + 16) and adds adjacent products into words.
278 pmaddubsw m6
, [myq
+16]
279 movh m7
, [srcq
] ; read new row
; Rounded scale-down of the word sums. Assuming pw_512 (defined outside this
; view) holds words of 512, pmulhrsw yields (x + 32) >> 6.
289 pmulhrsw m6
, [pw_512
]
; height counts the rows still to be produced.
295 dec heightd
; next row
; Horizontal qpel filter, byte-coefficient variant: 6 named arguments,
; 6+npicregs general-purpose registers, 8 vector registers.
299 cglobal
%1_rv40_qpel_h
, 6,6+npicregs
,8, dst
, dststride
, src
, srcstride
, height
, mx
, picreg
; Point picregq at the byte-sized filter table.
; NOTE(review): presumably only assembled for PIC builds; the guard is not
; visible here.
301 lea picregq
, [sixtap_filter_hb_m
]
; Keep the byte-index tables resident in registers:
; m3 = shuf2 (pairs i+2, i+3), m4 = shuf3 (pairs i+5, i+4).
303 mova m3
, [filter_h6_shuf2
]
304 mova m4
, [filter_h6_shuf3
]
; LOAD is a helper macro defined outside this view; it is given the mx
; argument and the filter-table base.
305 LOAD mx
, sixtap_filter_hb
306 mova m5
, [mxq
] ; set up 6tap filter in bytes
; m7 = shuf1 (pairs i, i+1).
308 mova m7
, [filter_h6_shuf1
]
; Rounded scale-down of the word sums. Assuming pw_512 (defined outside this
; view) holds words of 512, pmulhrsw yields (x + 32) >> 6.
322 pmulhrsw m0
, [pw_512
]
; height counts the rows still to be produced.
328 dec heightd
; next row
337 ; %1=5-bit weights?, %2=dst %3=src1 %4=src2 %5=stride if SSE2
; RV40_WCORE accepts 4 or 5 parameters; the fifth is optional.
338 %macro RV40_WCORE
4-5
; Load the low half-vector of each source (%3 and %4) at running offset r6.
339 movh m4
, [%3 + r6
+ 0]
340 movh m5
, [%4 + r6
+ 0]
; OFFSET addresses the next half-vector: r6 plus half the vector size in bytes.
; NOTE(review): the existing comment below describes an alternative OFFSET
; definition for the stride case, which is not visible here.
342 %define
OFFSET r6
+ mmsize
/ 2
344 ; 8x8 block and SSE2, stride was provided
; Load the second half-vector of each source from OFFSET.
348 movh m6
, [%3 + OFFSET]
349 movh m7
, [%4 + OFFSET]
393 ; bias and shift down
406 ; Only called for 8x8 blocks and SSE2
; Four-parameter form (no stride): operate on r0/r1/r2 at the current offset.
419 RV40_WCORE
%2, r0
, r1
, r2
; Same again for the data 8 bytes further on in all three buffers.
421 RV40_WCORE
%2, r0
+ 8, r1
+ 8, r2
+ 8
424 ; Prepare for next loop
; Five-parameter form: r5 is passed as the optional stride parameter.
428 RV40_WCORE
%2, r0
, r1
, r2
, r5
429 ; Prepare 2 next lines
; Four-parameter form (no stride).
432 RV40_WCORE
%2, r0
, r1
, r2
433 ; Prepare single next line
440 ; void ff_rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
441 ; %1=rnd|nornd %2=block size (8|16) %3=log2(block size)
442 ; The weights are FP0.14 notation of fractions depending on pts.
443 ; For timebases without rounding error (e.g. PAL), the fractions
444 ; can be simplified, and several operations can be avoided.
445 ; Therefore, we check here whether they are multiples of 2^9 for
446 ; those simplifications to occur.
; Declares rv40_weight_func_%1_%2 with 6 arguments, 7 general-purpose
; registers and 8 vector registers.
448 cglobal rv40_weight_func_
%1_
%2, 6, 7, 8
455 ; Set loop counter and increments
; Instantiate RV40_WEIGHT for both rounding modes (rnd, nornd) and both block
; sizes (8, 16); the third parameter is 3 for size 8 and 4 for size 16.
; NOTE(review): the same four instantiations appear three times. The
; directives that distinguish the groups are not visible here -- presumably
; each group targets a different instruction set; confirm.
485 RV40_WEIGHT rnd
, 8, 3
486 RV40_WEIGHT rnd
, 16, 4
487 RV40_WEIGHT nornd
, 8, 3
488 RV40_WEIGHT nornd
, 16, 4
; Second group of the same four instantiations.
491 RV40_WEIGHT rnd
, 8, 3
492 RV40_WEIGHT rnd
, 16, 4
493 RV40_WEIGHT nornd
, 8, 3
494 RV40_WEIGHT nornd
, 16, 4
; Third group of the same four instantiations.
497 RV40_WEIGHT rnd
, 8, 3
498 RV40_WEIGHT rnd
, 16, 4
499 RV40_WEIGHT nornd
, 8, 3
500 RV40_WEIGHT nornd
, 16, 4