1 ;*****************************************************************************
2 ;* x86-optimized functions for nlmeans filter
4 ;* This file is part of FFmpeg.
6 ;* FFmpeg is free software; you can redistribute it and/or
7 ;* modify it under the terms of the GNU Lesser General Public
8 ;* License as published by the Free Software Foundation; either
9 ;* version 2.1 of the License, or (at your option) any later version.
11 ;* FFmpeg is distributed in the hope that it will be useful,
12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;* Lesser General Public License for more details.
16 ;* You should have received a copy of the GNU Lesser General Public
17 ;* License along with FFmpeg; if not, write to the Free Software
18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
24 %if HAVE_AVX2_EXTERNAL
&& ARCH_X86_64
; ending_lut: lane-mask table, 9 rows of 8 dwords (32 bytes per row).
; Row k (k = 0..8) holds k zero dwords followed by 8 - k dwords of -1
; (all bits set).
; compute_weights_line loads the table base into elutq and ORs the row found
; at byte offset startxq into the dword vector in m0: every lane that lines up
; with a -1 entry is forced to 0xFFFFFFFF, every lane that lines up with a 0
; entry passes through unchanged.
; NOTE(review): how startxq is converted into a row offset is not visible in
; this excerpt -- presumably k is the number of still-valid lanes at the end
; of a line; confirm against the full loop.
28 ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
29 0, -1, -1, -1, -1, -1, -1, -1,\
30 0, 0, -1, -1, -1, -1, -1, -1,\
31 0, 0, 0, -1, -1, -1, -1, -1,\
32 0, 0, 0, 0, -1, -1, -1, -1,\
33 0, 0, 0, 0, 0, -1, -1, -1,\
34 0, 0, 0, 0, 0, 0, -1, -1,\
35 0, 0, 0, 0, 0, 0, 0, -1,\
36 0, 0, 0, 0, 0, 0, 0, 0
40 ; void ff_compute_weights_line(const uint32_t *const iia,
41 ; const uint32_t *const iib,
42 ; const uint32_t *const iid,
43 ; const uint32_t *const iie,
44 ; const uint8_t *const src,
45 ; float *total,
46 ; float *sum,
47 ; const float *const lut,
48 ; int max_meaningful_diff,
49 ; int startx, int endx);
52 cglobal compute_weights_line
, 8, 13, 5, 0, iia
, iib
, iid
, iie
, src
, total
, sum
, lut
, x
, startx
, endx
, mod, elut
53 movsxd startxq
, dword startxm
54 movsxd endxq
, dword endxm
59 lea elutq
, [ending_lut
]
70 movu m0
, [iieq
+ xq
* 4]
72 psubd m0
, [iidq
+ xq
* 4]
73 psubd m0
, [iibq
+ xq
* 4]
74 paddd m0
, [iiaq
+ xq
* 4]
75 por m0
, [elutq
+ startxq
]
79 vgatherdps m1
, [lutq
+ m0
], m3
81 pmovzxbd m0
, [srcq
+ xq
]
86 addps m1
, [totalq
+ xq
* 4]
87 addps m0
, [sumq
+ xq
* 4]
89 movups
[totalq
+ xq
* 4], m1
90 movups
[sumq
+ xq
* 4], m0