libavfilter/x86/vf_nlmeans.asm

   1 ;*****************************************************************************
   2 ;* x86-optimized functions for nlmeans filter
   3 ;*
   4 ;* This file is part of FFmpeg.
   5 ;*
   6 ;* FFmpeg is free software; you can redistribute it and/or
   7 ;* modify it under the terms of the GNU Lesser General Public
   8 ;* License as published by the Free Software Foundation; either
   9 ;* version 2.1 of the License, or (at your option) any later version.
  10 ;*
  11 ;* FFmpeg is distributed in the hope that it will be useful,
  12 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 ;* Lesser General Public License for more details.
  15 ;*
  16 ;* You should have received a copy of the GNU Lesser General Public
  17 ;* License along with FFmpeg; if not, write to the Free Software
  18 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19 ;******************************************************************************
  20
  21
  22 %include "libavutil/x86/x86util.asm"
  23
  24 %if HAVE_AVX2_EXTERNAL && ARCH_X86_64
  25
  26 SECTION_RODATA 32
  27
  28 ending_lut: dd -1, -1, -1, -1, -1, -1, -1, -1,\
  29                 0, -1, -1, -1, -1, -1, -1, -1,\
  30                 0,  0, -1, -1, -1, -1, -1, -1,\
  31                 0,  0,  0, -1, -1, -1, -1, -1,\
  32                 0,  0,  0,  0, -1, -1, -1, -1,\
  33                 0,  0,  0,  0,  0, -1, -1, -1,\
  34                 0,  0,  0,  0,  0,  0, -1, -1,\
  35                 0,  0,  0,  0,  0,  0,  0, -1,\
  36                 0,  0,  0,  0,  0,  0,  0,  0
  37
  38 SECTION .text
  39
  40 ; void ff_compute_weights_line(const uint32_t *const iia,
  41 ;                              const uint32_t *const iib,
  42 ;                              const uint32_t *const iid,
  43 ;                              const uint32_t *const iie,
  44 ;                              const uint8_t *const src,
  45 ;                              float *total,
  46 ;                              float *sum,
  47 ;                              const float *const lut,
  48 ;                              int max,
  49 ;                              int startx, int endx);
  50
  51 INIT_YMM avx2
  52 cglobal compute_weights_line, 8, 13, 5, 0, iia, iib, iid, iie, src, total, sum, lut, x, startx, endx, mod, elut
  53     movsxd startxq, dword startxm
  54     movsxd   endxq, dword endxm
  55     VPBROADCASTD      m2, r8m
  56
  57     mov      xq, startxq
  58     mov    modq, mmsize / 4
  59     lea   elutq, [ending_lut]
  60
  61     vpcmpeqd  m4, m4
  62
  63     .loop:
  64         mov    startxq, endxq
  65         sub    startxq, xq
  66         cmp    startxq, modq
  67         cmovge startxq, modq
  68         sal    startxq, 5
  69
  70         movu   m0, [iieq + xq * 4]
  71
  72         psubd  m0, [iidq + xq * 4]
  73         psubd  m0, [iibq + xq * 4]
  74         paddd  m0, [iiaq + xq * 4]
  75         por    m0, [elutq + startxq]
  76         pminud m0, m2
  77         pslld  m0, 2
  78         mova   m3, m4
  79         vgatherdps m1, [lutq + m0], m3
  80
  81         pmovzxbd m0, [srcq + xq]
  82         cvtdq2ps m0, m0
  83
  84         mulps m0, m1
  85
  86         addps m1, [totalq + xq * 4]
  87         addps m0, [sumq + xq * 4]
  88
  89         movups [totalq + xq * 4], m1
  90         movups [sumq + xq * 4], m0
  91
  92         add xq, mmsize / 4
  93         cmp xq, endxq
  94         jl .loop
  95     RET
  96
  97 %endif