1 ;*****************************************************************************
2 ;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
3 ;*****************************************************************************
4 ;* Copyright (C) 2011 x264 project
6 ;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
8 ;* This file is part of Libav.
10 ;* Libav is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* Libav is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with Libav; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
33 pw_pixel_max: times
8 dw ((1 << 10)-1) ; 8 words of 1023, the largest 10-bit sample value (upper bound passed to CLIPW)
35 pad10: times
8 dw 10*1023 ; 8 words of 10 * 1023 (= 10230)
36 pad20: times
8 dw 20*1023 ; 8 words of 20 * 1023 (= 20460)
37 pad30: times
8 dw 30*1023 ; 8 words of 30 * 1023 (= 30690, still below 32767)
38 depad: times
4 dd 32*20*1023 + 512 ; 4 dwords of 655232 (32 * pad20 value, plus 512)
39 depad2: times
8 dw 20*1023 + 16*1022 + 16 ; 8 words of 36828; exceeds 32767, so only valid as an unsigned word
40 unpad: times
8 dw 16*1022/32 ; needs to be mod 16 (expression evaluates to 511)
42 tap1: times
4 dw 1, -5 ; word pair (1, -5) repeated 4 times
43 tap2: times
4 dw 20, 20 ; word pair (20, 20) repeated 4 times
44 tap3: times
4 dw -5, 1 ; word pair (-5, 1) repeated 4 times; tap1..tap3 together spell 1,-5,20,20,-5,1
45 pd_0f: times
4 dd 0xffff ; 4 dwords with only the low 16 bits set (0x0000ffff)
68 psubw
%1, %2 ; (a-b)/4-b
69 paddw
%1, %3 ; (a-b)/4-b+c
70 psraw
%1, 2 ; ((a-b)/4-b+c)/4
71 paddw
%1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
93 FILT_H
%1, %7, %8, [pw_16
]
95 CLIPW
%1, [pb_0
], [pw_pixel_max
]
105 %define OP_MOV AVG_MOV
114 cglobal
%1_h264_qpel
%4_
%2_10, %5,%6,%7
115 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
120 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
125 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
128 lea r0
, [r0
+r2
*%3+%3*2]
129 lea r1
, [r1
+r2
*%3+%3*2]
130 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
133 cglobal
%1_h264_qpel
%4_
%2_10, %5,%6 + 2,%7
137 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
139 lea r1
, [r
%+ p1
+%3*2]
140 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
142 lea r1
, [r
%+ p1
+r2
*%3]
143 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
144 lea r0
, [r
%6+r2
*%3+%3*2]
145 lea r1
, [r
%+ p1
+r2
*%3+%3*2]
146 %if UNIX64
== 0 ; fall through to function
147 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
153 ;cpu, put/avg, mc, 4/8, ...
156 %if ARCH_X86_32 || cpuflag
(sse2
)
157 MCAxA_OP
%1, %2, %3, i
, %4,%5,%6
160 cglobal
%1_h264_qpel
%3_
%2_10, %4,%5,%6
161 %if UNIX64
== 0 ; no prologue or epilogue for UNIX64
162 call stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX
166 stub_
%1_h264_qpel
%3_
%2_10 %+ SUFFIX:
169 ;-----------------------------------------------------------------------------
170 ; void ff_h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
171 ;-----------------------------------------------------------------------------
185 cglobal_mc
%1, mc00
, 4, 3,4,0
191 cglobal
%1_h264_qpel8_mc00_10
, 3,4
199 cglobal
%1_h264_qpel16_mc00_10
, 3,4
209 OP_MOV
[r0
+r2
+16], m1
220 %define OP_MOV AVG_MOV
223 ;-----------------------------------------------------------------------------
224 ; void ff_h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
225 ;-----------------------------------------------------------------------------
230 INIT_XMM sse2
, cache64
232 INIT_XMM ssse3
, cache64
237 %define OP_MOV AVG_MOV
240 INIT_XMM sse2
, cache64
242 INIT_XMM ssse3
, cache64
249 cglobal_mc
%1, mc20
, %2, 3,4,9
251 mova m1
, [pw_pixel_max
]
266 %else
; movu is slow on these processors
274 PALIGNR m3
, m0
, m2
, 2, m5
275 PALIGNR m7
, m0
, m2
, 8, m5
277 PALIGNR m4
, m0
, m2
, 4, m5
278 PALIGNR m7
, m0
, m2
, 6, m5
284 PALIGNR m3
, m6
, m2
, 2, m5
286 PALIGNR m4
, m6
, m2
, 4, m5
287 PALIGNR m7
, m6
, m2
, 6, m5
293 FILT_H m2
, m3
, m4
, p16
307 ;-----------------------------------------------------------------------------
308 ; void ff_h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
309 ;-----------------------------------------------------------------------------
311 cglobal_mc
%1, mc30
, %2, 3,5,9
313 jmp stub_
%1_h264_qpel
%2_mc10_10
%+ SUFFIX
%+ .body
318 ;-----------------------------------------------------------------------------
319 ; void ff_h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
320 ;-----------------------------------------------------------------------------
322 cglobal_mc
%1, mc10
, %2, 3,5,9
326 mova m1
, [pw_pixel_max
]
341 %else
; movu is slow on these processors
349 PALIGNR m3
, m0
, m2
, 2, m5
350 PALIGNR m7
, m0
, m2
, 8, m5
352 PALIGNR m4
, m0
, m2
, 4, m5
353 PALIGNR m7
, m0
, m2
, 6, m5
359 PALIGNR m3
, m6
, m2
, 2, m5
361 PALIGNR m4
, m6
, m2
, 4, m5
362 PALIGNR m7
, m6
, m2
, 6, m5
368 FILT_H m2
, m3
, m4
, p16
385 ;-----------------------------------------------------------------------------
386 ; void ff_h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
387 ;-----------------------------------------------------------------------------
392 FILT_V m0
, m1
, m2
, m3
, m4
, m5
, m6
, m7
402 V_FILT m0
, m1
, m2
, m3
, m4
, m5
, m6
, m7
, 4, i
411 V_FILT m0
, m1
, m2
, m3
, m4
, m5
, m6
, m7
, 8, i
417 cglobal_mc
%1, mc02
, %2, 3,4,8
424 call v_filt
%2_
%+ i
%+ _10.no_addr4
434 ;-----------------------------------------------------------------------------
435 ; void ff_h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
436 ;-----------------------------------------------------------------------------
438 cglobal_mc
%1, mc01
, %2, 3,5,8
448 call v_filt
%2_
%+ i
%+ _10
460 ;-----------------------------------------------------------------------------
461 ; void ff_h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
462 ;-----------------------------------------------------------------------------
464 cglobal_mc
%1, mc03
, %2, 3,5,8
466 jmp stub_
%1_h264_qpel
%2_mc01_10
%+ SUFFIX
%+ .body
471 ;-----------------------------------------------------------------------------
472 ; void ff_h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
473 ;-----------------------------------------------------------------------------
474 %macro H_FILT_AVG
2-3
476 ;FILT_H with fewer registers and averaged with the FILT_V result
477 ;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are kept for use in the next iteration
478 ;unfortunately I need three registers, so m5 will have to be re-read from memory
485 psraw m5
, 2 ; (a-b)/4
486 psubw m5
, m6
; (a-b)/4-b
489 paddw m5
, m6
; (a-b)/4-b+c
490 psraw m5
, 2 ; ((a-b)/4-b+c)/4
491 paddw m5
, m6
; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
493 CLIPW m5
, [pb_0
], [pw_pixel_max
]
526 ; this REALLY needs x86_64
527 cglobal_mc
%1, mc11
, %2, 3,6,8
539 call v_filt
%2_
%+ i
%+ _10
540 call h_filt
%2_
%+ i
%+ _10
553 ;-----------------------------------------------------------------------------
554 ; void ff_h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
555 ;-----------------------------------------------------------------------------
557 cglobal_mc
%1, mc31
, %2, 3,6,8
560 jmp stub_
%1_h264_qpel
%2_mc11_10
%+ SUFFIX
%+ .body
565 ;-----------------------------------------------------------------------------
566 ; void ff_h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
567 ;-----------------------------------------------------------------------------
569 cglobal_mc
%1, mc13
, %2, 3,7,12
571 jmp stub_
%1_h264_qpel
%2_mc11_10
%+ SUFFIX
%+ .body
576 ;-----------------------------------------------------------------------------
577 ; void ff_h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
578 ;-----------------------------------------------------------------------------
580 cglobal_mc
%1, mc33
, %2, 3,6,8
583 jmp stub_
%1_h264_qpel
%2_mc11_10
%+ SUFFIX
%+ .body
588 ;-----------------------------------------------------------------------------
589 ; void ff_h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
590 ;-----------------------------------------------------------------------------
595 psubw
%1, %2 ; a-5*b+4*c
597 paddw
%1, %3 ; a-5*b+20*c
619 neg r2
; This actually saves instructions
620 lea r1
, [r1
+r2
*2-mmsize
+PAD
]
621 lea r4
, [rsp
+PAD
+gprsize
]
636 FILT_VNRD m0
, m1
, m2
, m3
, m4
, m5
, m6
, m7
638 movu
[r4
+i
*mmsize
*3], m0
643 FILT_VNRD m0
, m1
, m2
, m3
, m4
, m5
, m6
, m7
645 movu
[r4
+i
*mmsize
*3], m0
647 lea r1
, [r1
+r2
*8+mmsize
]
675 movu m1
, [r1
+mmsize
-4]
676 movu m2
, [r1
+mmsize
-2]
677 mova m3
, [r1
+mmsize
+0]
678 movu m4
, [r1
+mmsize
+2]
679 movu m5
, [r1
+mmsize
+4]
680 movu m6
, [r1
+mmsize
+6]
727 cglobal_mc
%1, mc22
, %2, 3,7,12
728 %define PAD mmsize
*8*4*2 ; SIZE*16*4*sizeof(pixel)
729 mov r6
, rsp
; backup stack pointer
730 and rsp
, ~
(mmsize
-1) ; align stack
736 mova m7
, [pw_pixel_max
]
753 mov rsp
, r6
; restore stack pointer
759 ;-----------------------------------------------------------------------------
760 ; void ff_h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
761 ;-----------------------------------------------------------------------------
763 cglobal_mc
%1, mc12
, %2, 3,7,12
764 %define PAD mmsize
*8*4*2 ; SIZE*16*4*sizeof(pixel)
765 mov r6
, rsp
; backup stack pointer
766 and rsp
, ~
(mmsize
-1) ; align stack
775 mova m7
, [pw_pixel_max
]
786 movu m3
, [r1
+r4
-2*mmsize
] ; movu needed for mc32, etc
798 mov rsp
, r6
; restore stack pointer
804 ;-----------------------------------------------------------------------------
805 ; void ff_h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
806 ;-----------------------------------------------------------------------------
808 cglobal_mc
%1, mc32
, %2, 3,7,12
809 %define PAD mmsize
*8*3*2 ; SIZE*16*3*sizeof(pixel) -- factor is 3 here, not the 4 used by the mc22/mc12 PAD
810 mov r6
, rsp
; backup stack pointer
811 and rsp
, ~
(mmsize
-1) ; align stack
816 mov r4d
, 2 ; sizeof(pixel)
817 jmp stub_
%1_h264_qpel
%2_mc12_10
%+ SUFFIX
%+ .body
822 ;-----------------------------------------------------------------------------
823 ; void ff_h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
824 ;-----------------------------------------------------------------------------
856 cglobal_mc
%1, mc21
, %2, 3,7,12
859 %define PAD mmsize
*8*3*2 ; SIZE*16*3*sizeof(pixel) -- factor is 3 here, not the 4 used by the mc22/mc12 PAD
860 mov r6
, rsp
; backup stack pointer
861 and rsp
, ~
(mmsize
-1) ; align stack
869 mov r4d
, PAD
-mmsize
; H buffer
870 jmp stub_
%1_h264_qpel
%2_mc12_10
%+ SUFFIX
%+ .body
875 ;-----------------------------------------------------------------------------
876 ; void ff_h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
877 ;-----------------------------------------------------------------------------
879 cglobal_mc
%1, mc23
, %2, 3,7,12
881 jmp stub_
%1_h264_qpel
%2_mc21_10
%+ SUFFIX
%+ .body