; *****************************************************************************
; * Provide SIMD optimizations for add_residual functions for HEVC decoding
; * Copyright (c) 2014 Pierre-Edouard LEPERE
; *
; * This file is part of Libav.
; *
; * Libav is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * Libav is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with Libav; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; ******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

max_pixels_10: times 16 dw ((1 << 10)-1)
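; (each word lane of max_pixels_10 is 1023, the largest 10-bit sample value;
;  the 10-bit functions below load it and clamp their results against it)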

SECTION .text

; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project
%macro ADD_RES_MMX_4_8 0

; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_4_8, 3, 3, 6
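; For reference, the scalar operation the 8-bit kernels implement is roughly
; the following (a minimal C sketch, not part of this file; it assumes
; av_clip_uint8() from libavutil and a contiguous 4x4 residual block):
;
;     static void add_residual_4_8_c(uint8_t *dst, const int16_t *res,
;                                    ptrdiff_t stride)
;     {
;         for (int y = 0; y < 4; y++) {
;             for (int x = 0; x < 4; x++)
;                 dst[x] = av_clip_uint8(dst[x] + res[x]);
;             res += 4;
;             dst += stride;
;         }
;     }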
%macro ADD_RES_SSE_8_8 0

%macro ADD_RES_SSE_16_32_8 3
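; %1 is the byte offset into the residual buffer (r1); %2 and %3 are the two
; destination addresses written per expansion (two separate rows, or the two
; 16-byte halves of one row in the SSE 32-wide case).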
    vinserti128       m2, m2, [r1+%1+32], 1
    vinserti128       m6, m6, [r1+%1+48], 1
    mova             xm4, [r1+%1+mmsize*2]
    mova             xm6, [r1+%1+mmsize*2+16]
    vinserti128       m4, m4, [r1+%1+96 ], 1
    vinserti128       m6, m6, [r1+%1+112], 1
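; In the AVX2 expansion, the mova xm* loads fill the low 128-bit lanes and
; vinserti128 pulls the next 16 bytes of coefficients into the upper lane, so
; each ymm register carries twice as many residual words as the SSE version;
; mmsize*2 evaluates to 32 (SSE) or 64 (AVX2), so the second pair of loads
; continues where the first pair stopped.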
%macro TRANSFORM_ADD_8 0
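; TRANSFORM_ADD_8 groups the 8-, 16- and 32-pixel-wide 8-bit functions so the
; whole set can be emitted once per SIMD flavour (presumably selected by the
; usual INIT_XMM sse2/avx instantiations).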
; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_8_8, 3, 4, 8

; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_16_8, 3, 5, 7
    ADD_RES_SSE_16_32_8  0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 64, r0+r2*2, r0+r3
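; The two calls above cover four consecutive rows of a 16-wide block
; (destinations r0, r0+r2, r0+r2*2 and r0+r3, with r3 holding 3*stride),
; consuming 128 bytes of residual data.
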
; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    ADD_RES_SSE_16_32_8  0, r0,    r0+16
    ADD_RES_SSE_16_32_8 64, r0+r2, r0+r2+16

%if HAVE_AVX2_EXTERNAL

; void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
cglobal hevc_add_residual_32_8, 3, 5, 7
    ADD_RES_SSE_16_32_8   0, r0,      r0+r2
    ADD_RES_SSE_16_32_8 128, r0+r2*2, r0+r3
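; With ymm registers one expansion spans two full 32-pixel rows (128 bytes of
; residuals), which is why the offsets here step by 128 rather than 64.
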
%endif ;HAVE_AVX2_EXTERNAL

%macro ADD_RES_SSE_8_10 4
%macro ADD_RES_MMX_4_10 3
%macro ADD_RES_SSE_16_10 3
%macro ADD_RES_SSE_32_10 2
%macro ADD_RES_AVX2_16_10 4
%macro ADD_RES_AVX2_32_10 3
; void ff_hevc_add_residual_<4|8|16|32>_10(pixel *dst, int16_t *block, ptrdiff_t stride)
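; The 10-bit variants differ in that pixels are 16 bits wide and the sum is
; clamped to [0, 1023], the value held in max_pixels_10. A minimal C sketch,
; not part of this file, assuming av_clip() from libavutil and a byte stride:
;
;     static void add_residual_10_c(uint8_t *_dst, const int16_t *res,
;                                   ptrdiff_t stride, int size)
;     {
;         uint16_t *dst = (uint16_t *)_dst;
;         stride /= sizeof(uint16_t);
;         for (int y = 0; y < size; y++) {
;             for (int x = 0; x < size; x++)
;                 dst[x] = av_clip(dst[x] + res[x], 0, (1 << 10) - 1);
;             res += size;
;             dst += stride;
;         }
;     }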
cglobal hevc_add_residual_4_10, 3, 3, 6
    mova              m3, [max_pixels_10]
    ADD_RES_MMX_4_10  r0, r2, r1
    ADD_RES_MMX_4_10  r0, r2, r1

cglobal hevc_add_residual_8_10, 3, 4, 6
    mova              m5, [max_pixels_10]
    ADD_RES_SSE_8_10  r0, r2, r3, r1
    ADD_RES_SSE_8_10  r0, r2, r3, r1

cglobal hevc_add_residual_16_10, 3, 5, 6
    mova              m5, [max_pixels_10]
    ADD_RES_SSE_16_10 r0, r2, r1

cglobal hevc_add_residual_32_10, 3, 5, 6
    mova              m5, [max_pixels_10]
    ADD_RES_SSE_32_10 r0, r1

%if HAVE_AVX2_EXTERNAL

cglobal hevc_add_residual_16_10, 3, 5, 6
    mova               m5, [max_pixels_10]
    ADD_RES_AVX2_16_10 r0, r2, r3, r1

cglobal hevc_add_residual_32_10, 3, 5, 6
    mova               m5, [max_pixels_10]
    ADD_RES_AVX2_32_10 r0, r2, r1

%endif ;HAVE_AVX2_EXTERNAL