1 ;******************************************************************************
3 ;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
4 ;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
5 ;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
6 ;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
7 ;* Copyright (c) 2013 Daniel Kang
9 ;* SIMD-optimized halfpel functions
11 ;* This file is part of FFmpeg.
13 ;* FFmpeg is free software; you can redistribute it and/or
14 ;* modify it under the terms of the GNU Lesser General Public
15 ;* License as published by the Free Software Foundation; either
16 ;* version 2.1 of the License, or (at your option) any later version.
18 ;* FFmpeg is distributed in the hope that it will be useful,
19 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21 ;* Lesser General Public License for more details.
23 ;* You should have received a copy of the GNU Lesser General Public
24 ;* License along with FFmpeg; if not, write to the Free Software
25 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 ;******************************************************************************
28 %include "libavutil/x86/x86util.asm"
33 pb_interleave16: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
34 pb_interleave8: db 0, 4, 1, 5, 2, 6, 3, 7
40 ; void ff_put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
41 %macro PUT_PIXELS8_X2
0
43 cglobal put_pixels16_x2
, 4,5,4
45 cglobal put_pixels8_x2
, 4,5
88 ; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
89 %macro PUT_PIXELS_16
0
90 cglobal put_pixels16_x2
, 4,5
128 ; The 8_X2 macro can easily be used here
133 ; void ff_put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
135 cglobal put_no_rnd_pixels8_x2
, 4,5
168 ; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
170 cglobal put_no_rnd_pixels8_x2_exact
, 4,5
209 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
210 %macro PUT_PIXELS8_Y2
0
212 cglobal put_pixels16_y2
, 4,5,3
214 cglobal put_pixels8_y2
, 4,5
243 ; actually, put_pixels16_y2_sse2
248 ; void ff_put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
250 cglobal put_no_rnd_pixels8_y2
, 4,5
279 ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
281 cglobal put_no_rnd_pixels8_y2_exact
, 4,5
315 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
316 %macro AVG_PIXELS8_X2
0
318 cglobal avg_pixels16_x2
, 4,5,4
320 cglobal avg_pixels8_x2
, 4,5
332 PAVGB m0
, [r1
+1], m3
, m5
333 PAVGB m2
, [r1
+r2
+1], m4
, m5
335 PAVGB m0
, [r0
], m3
, m5
336 PAVGB m2
, [r0
+r2
], m4
, m5
348 PAVGB m0
, [r1
+1], m3
, m5
349 PAVGB m2
, [r1
+r2
+1], m4
, m5
353 PAVGB m0
, [r0
], m3
, m5
354 PAVGB m2
, [r0
+r2
], m4
, m5
365 ; actually avg_pixels16_x2
370 ; void ff_avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
371 %macro AVG_PIXELS8_Y2
0
373 cglobal avg_pixels16_y2
, 4,5,3
375 cglobal avg_pixels8_y2
, 4,5
408 ; actually avg_pixels16_y2
413 ; void ff_avg_approx_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
414 ; Note this is not correctly rounded, and is therefore used for
415 ; not-bitexact output
417 cglobal avg_approx_pixels8_xy2
, 4,5
453 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
454 %macro SET_PIXELS_XY2
1
456 cglobal
%1_pixels16_xy2
, 4,5,8
458 cglobal
%1_pixels8_xy2
, 4,5
537 %macro SSSE3_PIXELS_XY2
1-2
539 cglobal
%1_pixels16_xy2
, 4,5,%2
540 mova m4
, [pb_interleave16
]
542 cglobal
%1_pixels8_xy2
, 4,5
543 mova m4
, [pb_interleave8
]
559 pmulhrsw m0
, [pw_8192
]
560 pmulhrsw m1
, [pw_8192
]
579 pmulhrsw m2
, [pw_8192
]
580 pmulhrsw m3
, [pw_8192
]
601 SSSE3_PIXELS_XY2 put
, 6
602 SSSE3_PIXELS_XY2 avg
, 7