libavcodec/vvc/inter_template.c

   1 /*
   2  * VVC inter prediction DSP
   3  *
   4  * Copyright (C) 2022 Nuo Mi
   5  *
   6  * This file is part of FFmpeg.
   7  *
   8  * FFmpeg is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * FFmpeg is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with FFmpeg; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 #include "libavcodec/h26x/h2656_inter_template.c"
  24 #include "libavutil/imgutils.h"
  25
  26 #define TMP_STRIDE EDGE_EMU_BUFFER_STRIDE
  27 static void av_always_inline FUNC(put_scaled)(uint8_t *_dst, const ptrdiff_t _dst_stride,
  28     const uint8_t *const _src, ptrdiff_t _src_stride, const int src_height,
  29     const int _x, const int _y, const int dx, const int dy,
  30     const int height, const int8_t *hf, const int8_t *vf, const int width, const int is_uni, const int is_chroma)
  31 {
  32     int16_t tmp_array[TMP_STRIDE * MAX_PB_SIZE];
  33     int16_t *tmp                 = tmp_array;
  34     pixel *dst                   = (pixel*)_dst;
  35     int16_t *dst16               = (int16_t*)_dst;
  36     const ptrdiff_t dst_stride   = _dst_stride / sizeof(pixel);
  37     const ptrdiff_t src_stride   = _src_stride / sizeof(pixel);
  38     const int shift              = FFMAX(2, 14 - BIT_DEPTH);
  39     const int offset             = 1 << (shift - 1);
  40     const int taps               = is_chroma ? VVC_INTER_CHROMA_TAPS : VVC_INTER_LUMA_TAPS;
  41     const int extra              = is_chroma ? CHROMA_EXTRA : LUMA_EXTRA;
  42     const int extra_before       = is_chroma ? CHROMA_EXTRA_BEFORE : LUMA_EXTRA_BEFORE;
  43     const int shift1             = 6 - is_chroma;
  44     const int shift2             = 4 + is_chroma;
  45     const int x0                 = SCALED_INT(_x);
  46     const int y0                 = SCALED_INT(_y);
  47
  48     for (int i = 0; i < width; i++) {
  49         const int tx         = _x + dx * i;
  50         const int x          = SCALED_INT(tx) - x0;
  51         const int mx         = av_zero_extend(tx >> shift1, shift2);
  52         const int8_t *filter = hf + mx * taps;
  53         const pixel *src     = (pixel*)_src - extra_before * src_stride;
  54
  55         for (int j = 0; j < src_height + extra; j++) {
  56             tmp[j] = (is_chroma ? CHROMA_FILTER(src, 1) : LUMA_FILTER(src, 1)) >> (BIT_DEPTH - 8);
  57             src += src_stride;
  58         }
  59         tmp += TMP_STRIDE;
  60     }
  61
  62     for (int i = 0; i < height; i++) {
  63         const int ty         = _y + dy * i;
  64         const int x          = SCALED_INT(ty) - y0;
  65         const int mx         = av_zero_extend(ty >> shift1, shift2);
  66         const int8_t *filter = vf + mx * taps;
  67
  68         tmp = tmp_array + extra_before;
  69         for (int j = 0; j < width; j++) {
  70             const int val = (is_chroma ? CHROMA_FILTER(tmp, 1) : LUMA_FILTER(tmp, 1)) >> 6;
  71             if (is_uni)
  72                 dst[j] = av_clip_pixel((val  + offset) >> shift);
  73             else
  74                 dst16[j] = val;
  75             tmp += TMP_STRIDE;
  76         }
  77         if (is_uni)
  78             dst += dst_stride;
  79         else
  80             dst16 += dst_stride;
  81     }
  82 }
  83
  84 static void FUNC(put_luma_scaled)(int16_t *_dst,
  85     const uint8_t *_src, ptrdiff_t _src_stride, const int src_height,
  86     const int x, const int y, const int dx, const int dy,
  87     const int height, const int8_t *hf, const int8_t *vf, const int width)
  88 {
  89     FUNC(put_scaled)((uint8_t *)_dst, MAX_PB_SIZE * sizeof(pixel), _src, _src_stride, src_height, x, y, dx, dy, height, hf, vf, width, 0, 0);
  90 }
  91
  92 static void FUNC(put_chroma_scaled)(int16_t *_dst,
  93     const uint8_t *_src, ptrdiff_t _src_stride, const int src_height,
  94     const int x, const int y, const int dx, const int dy,
  95     const int height, const int8_t *hf, const int8_t *vf, const int width)
  96 {
  97     FUNC(put_scaled)((uint8_t *)_dst, MAX_PB_SIZE * sizeof(pixel), _src, _src_stride, src_height, x, y, dx, dy, height, hf, vf, width, 0, 1);
  98 }
  99
 100 static void FUNC(put_uni_luma_scaled)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 101     const uint8_t *_src, ptrdiff_t _src_stride, const int src_height,
 102     const int x, const int y, const int dx, const int dy,
 103     const int height, const int8_t *hf, const int8_t *vf, const int width)
 104 {
 105     FUNC(put_scaled)(_dst, _dst_stride, _src, _src_stride, src_height, x, y, dx, dy, height, hf, vf, width, 1, 0);
 106 }
 107
 108 static void FUNC(put_uni_chroma_scaled)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 109     const uint8_t *_src, ptrdiff_t _src_stride, const int src_height,
 110     const int x, const int y, const int dx, const int dy,
 111     const int height, const int8_t *hf, const int8_t *vf, const int width)
 112 {
 113     FUNC(put_scaled)(_dst, _dst_stride, _src, _src_stride, src_height, x, y, dx, dy, height, hf, vf, width, 1, 1);
 114 }
 115
 116 static void av_always_inline FUNC(put_uni_w_scaled)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 117     const uint8_t *const _src, ptrdiff_t _src_stride, const int src_height,
 118     const int _x, const int _y, const int dx, const int dy, const int denom, const int wx, const int _ox,
 119     const int height, const int8_t *hf, const int8_t *vf, const int width, const int is_chroma)
 120 {
 121     int16_t tmp_array[TMP_STRIDE * MAX_PB_SIZE];
 122     int16_t *tmp                 = tmp_array;
 123     pixel *dst                   = (pixel*)_dst;
 124     const ptrdiff_t dst_stride   = _dst_stride / sizeof(pixel);
 125     const ptrdiff_t src_stride   = _src_stride / sizeof(pixel);
 126     const int shift              = FFMAX(2, 14 - BIT_DEPTH);
 127     const int offset             = 1 << (shift - 1);
 128     const int ox                 = _ox * (1 << (BIT_DEPTH - 8));
 129     const int taps               = is_chroma ? VVC_INTER_CHROMA_TAPS : VVC_INTER_LUMA_TAPS;
 130     const int extra              = is_chroma ? CHROMA_EXTRA : LUMA_EXTRA;
 131     const int extra_before       = is_chroma ? CHROMA_EXTRA_BEFORE : LUMA_EXTRA_BEFORE;
 132     const int shift1             = 6 - is_chroma;
 133     const int shift2             = 4 + is_chroma;
 134     const int x0                 = SCALED_INT(_x);
 135     const int y0                 = SCALED_INT(_y);
 136
 137     for (int i = 0; i < width; i++) {
 138         const int tx         = _x + dx * i;
 139         const int x          = SCALED_INT(tx) - x0;
 140         const int mx         = av_zero_extend(tx >> shift1, shift2);
 141         const int8_t *filter = hf + mx * taps;
 142         const pixel *src     = (pixel*)_src - extra_before * src_stride;
 143
 144         for (int j = 0; j < src_height + extra; j++) {
 145             tmp[j] = (is_chroma ? CHROMA_FILTER(src, 1) : LUMA_FILTER(src, 1)) >> (BIT_DEPTH - 8);
 146             src += src_stride;
 147         }
 148         tmp += TMP_STRIDE;
 149     }
 150
 151     for (int i = 0; i < height; i++) {
 152         const int ty         = _y + dy * i;
 153         const int x          = SCALED_INT(ty) - y0;
 154         const int mx         = av_zero_extend(ty >> shift1, shift2);
 155         const int8_t *filter = vf + mx * taps;
 156
 157         tmp = tmp_array + extra_before;
 158         for (int j = 0; j < width; j++) {
 159             const int val = (is_chroma ? CHROMA_FILTER(tmp, 1) : LUMA_FILTER(tmp, 1)) >> 6;
 160             dst[j] = av_clip_pixel(((wx * val  + offset) >> shift) + ox);
 161             tmp += TMP_STRIDE;
 162         }
 163         dst += dst_stride;
 164     }
 165 }
 166
 167 static void FUNC(put_uni_luma_w_scaled)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 168     const uint8_t *_src, ptrdiff_t _src_stride, const int src_height,
 169     const int x, const int y, const int dx, const int dy, const int denom, const int wx, const int ox,
 170     const int height, const int8_t *hf, const int8_t *vf, const int width)
 171 {
 172     FUNC(put_uni_w_scaled)(_dst, _dst_stride, _src, _src_stride, src_height, x, y, dx, dy, denom, wx, ox, height, hf, vf, width, 0);
 173 }
 174
 175 static void FUNC(put_uni_chroma_w_scaled)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 176     const uint8_t *_src, ptrdiff_t _src_stride, const int src_height,
 177     const int x, const int y, const int dx, const int dy, const int denom, const int wx, const int ox,
 178     const int height, const int8_t *hf, const int8_t *vf, const int width)
 179 {
 180     FUNC(put_uni_w_scaled)(_dst, _dst_stride, _src, _src_stride, src_height, x, y, dx, dy,  denom, wx, ox, height, hf, vf, width, 1);
 181 }
 182
 183 #undef TMP_STRIDE
 184
 185 static void FUNC(avg)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 186     const int16_t *src0, const int16_t *src1, const int width, const int height)
 187 {
 188     pixel *dst                  = (pixel*)_dst;
 189     const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
 190     const int shift             = FFMAX(3, 15 - BIT_DEPTH);
 191     const int offset            = 1 << (shift - 1);
 192
 193     for (int y = 0; y < height; y++) {
 194         for (int x = 0; x < width; x++)
 195             dst[x] = av_clip_pixel((src0[x] + src1[x] + offset) >> shift);
 196         src0 += MAX_PB_SIZE;
 197         src1 += MAX_PB_SIZE;
 198         dst  += dst_stride;
 199     }
 200 }
 201
 202 static void FUNC(w_avg)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 203     const int16_t *src0, const int16_t *src1, const int width, const int height,
 204     const int denom, const int w0, const int w1, const int o0, const int o1)
 205 {
 206     pixel *dst                  = (pixel*)_dst;
 207     const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
 208     const int shift             = denom + FFMAX(3, 15 - BIT_DEPTH);
 209     const int offset            = ((o0 + o1) * (1 << (BIT_DEPTH - 8)) + 1) * (1 << (shift - 1));
 210
 211     for (int y = 0; y < height; y++) {
 212         for (int x = 0; x < width; x++)
 213             dst[x] = av_clip_pixel((src0[x] * w0 + src1[x] * w1 + offset) >> shift);
 214         src0 += MAX_PB_SIZE;
 215         src1 += MAX_PB_SIZE;
 216         dst  += dst_stride;
 217     }
 218 }
 219
 220 static void FUNC(put_ciip)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 221     const int width, const int height,
 222     const uint8_t *_inter, const ptrdiff_t _inter_stride, const int intra_weight)
 223 {
 224     pixel *dst                = (pixel *)_dst;
 225     pixel *inter              = (pixel *)_inter;
 226     const size_t dst_stride   = _dst_stride / sizeof(pixel);
 227     const size_t inter_stride = _inter_stride / sizeof(pixel);
 228     const int inter_weight    = 4 - intra_weight;
 229
 230     for (int y = 0; y < height; y++) {
 231         for (int x = 0; x < width; x++)
 232             dst[x] = (dst[x] * intra_weight + inter[x] * inter_weight + 2) >> 2;
 233         dst   += dst_stride;
 234         inter += inter_stride;
 235     }
 236 }
 237
 238 static void FUNC(put_gpm)(uint8_t *_dst, ptrdiff_t dst_stride,
 239     const int width, const int height,
 240     const int16_t *src0, const int16_t *src1,
 241     const uint8_t *weights, const int step_x, const int step_y)
 242 {
 243     const int shift  = FFMAX(5, 17 - BIT_DEPTH);
 244     const int offset = 1 << (shift - 1);
 245     pixel *dst       = (pixel *)_dst;
 246
 247     dst_stride /= sizeof(pixel);
 248     for (int y = 0; y < height; y++) {
 249         for (int x = 0; x < width; x++) {
 250             const uint8_t w = weights[x * step_x];
 251             dst[x] = av_clip_pixel((src0[x] * w + src1[x] * (8 - w) + offset) >> shift);
 252         }
 253         dst     += dst_stride;
 254         src0    += MAX_PB_SIZE;
 255         src1    += MAX_PB_SIZE;
 256         weights += step_y;
 257     }
 258 }
 259
 260 //8.5.6.3.3 Luma integer sample fetching process, add one extra pad line
 261 static void FUNC(bdof_fetch_samples)(int16_t *_dst, const uint8_t *_src, const ptrdiff_t _src_stride,
 262     const int x_frac, const int y_frac, const int width, const int height)
 263 {
 264     const int x_off             = (x_frac >> 3) - 1;
 265     const int y_off             = (y_frac >> 3) - 1;
 266     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
 267     const pixel *src            = (pixel*)_src + (x_off) + y_off * src_stride;
 268     int16_t *dst                = _dst - 1 - MAX_PB_SIZE;
 269     const int shift             = 14 - BIT_DEPTH;
 270     const int bdof_width        = width + 2 * BDOF_BORDER_EXT;
 271
 272     // top
 273     for (int i = 0; i < bdof_width; i++)
 274         dst[i] = src[i] << shift;
 275
 276     dst += MAX_PB_SIZE;
 277     src += src_stride;
 278
 279     for (int i = 0; i < height; i++) {
 280         dst[0] = src[0] << shift;
 281         dst[1 + width] = src[1 + width] << shift;
 282         dst += MAX_PB_SIZE;
 283         src += src_stride;
 284     }
 285     for (int i = 0; i < bdof_width; i++)
 286         dst[i] = src[i] << shift;
 287 }
 288
 289 //8.5.6.3.3 Luma integer sample fetching process
 290 static void FUNC(fetch_samples)(int16_t *_dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int x_frac, const int y_frac)
 291 {
 292     FUNC(bdof_fetch_samples)(_dst, _src, _src_stride, x_frac, y_frac, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE);
 293 }
 294
 295 static void FUNC(prof_grad_filter)(int16_t *gradient_h, int16_t *gradient_v, const ptrdiff_t gradient_stride,
 296     const int16_t *_src, const ptrdiff_t src_stride, const int width, const int height)
 297 {
 298     const int shift     = 6;
 299     const int16_t *src  = _src;
 300
 301     for (int y = 0; y < height; y++) {
 302         const int16_t *p = src;
 303         for (int x = 0; x < width; x++) {
 304             gradient_h[x] = (p[1] >> shift) - (p[-1] >> shift);
 305             gradient_v[x] = (p[src_stride] >> shift) - (p[-src_stride] >> shift);
 306             p++;
 307         }
 308         gradient_h += gradient_stride;
 309         gradient_v += gradient_stride;
 310         src += src_stride;
 311     }
 312 }
 313
 314 static void FUNC(apply_prof)(int16_t *dst, const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y)
 315 {
 316     const int limit     = (1 << FFMAX(13, BIT_DEPTH + 1));          ///< dILimit
 317
 318     int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE];
 319     int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE];
 320     FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE);
 321
 322     for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) {
 323         for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) {
 324             const int o = y * AFFINE_MIN_BLOCK_SIZE + x;
 325             const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o];
 326             const int val = src[x] + av_clip(di, -limit, limit - 1);
 327             dst[x] = val;
 328
 329         }
 330         src += MAX_PB_SIZE;
 331         dst += MAX_PB_SIZE;
 332     }
 333 }
 334
 335 static void FUNC(apply_prof_uni)(uint8_t *_dst, const ptrdiff_t _dst_stride, const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y)
 336 {
 337     const int limit             = (1 << FFMAX(13, BIT_DEPTH + 1));          ///< dILimit
 338     pixel *dst                  = (pixel*)_dst;
 339     const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
 340     const int shift             = 14 - BIT_DEPTH;
 341 #if BIT_DEPTH < 14
 342     const int offset            = 1 << (shift - 1);
 343 #else
 344     const int offset            = 0;
 345 #endif
 346     int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE];
 347     int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE];
 348
 349     FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE);
 350
 351     for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) {
 352         for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) {
 353             const int o = y * AFFINE_MIN_BLOCK_SIZE + x;
 354             const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o];
 355             const int val = src[x] + av_clip(di, -limit, limit - 1);
 356             dst[x] = av_clip_pixel((val + offset) >> shift);
 357
 358         }
 359         src += MAX_PB_SIZE;
 360         dst += dst_stride;
 361     }
 362 }
 363
 364 static void FUNC(apply_prof_uni_w)(uint8_t *_dst, const ptrdiff_t _dst_stride,
 365     const int16_t *src, const int16_t *diff_mv_x, const int16_t *diff_mv_y,
 366     const int denom, const int wx, const int _ox)
 367 {
 368     const int limit             = (1 << FFMAX(13, BIT_DEPTH + 1));          ///< dILimit
 369     pixel *dst                  = (pixel*)_dst;
 370     const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
 371     const int shift             = denom + FFMAX(2, 14 - BIT_DEPTH);
 372     const int offset            = 1 << (shift - 1);
 373     const int ox                = _ox * (1 << (BIT_DEPTH - 8));
 374     int16_t gradient_h[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE];
 375     int16_t gradient_v[AFFINE_MIN_BLOCK_SIZE * AFFINE_MIN_BLOCK_SIZE];
 376
 377     FUNC(prof_grad_filter)(gradient_h, gradient_v, AFFINE_MIN_BLOCK_SIZE, src, MAX_PB_SIZE, AFFINE_MIN_BLOCK_SIZE, AFFINE_MIN_BLOCK_SIZE);
 378
 379     for (int y = 0; y < AFFINE_MIN_BLOCK_SIZE; y++) {
 380         for (int x = 0; x < AFFINE_MIN_BLOCK_SIZE; x++) {
 381             const int o = y * AFFINE_MIN_BLOCK_SIZE + x;
 382             const int di = gradient_h[o] * diff_mv_x[o] + gradient_v[o] * diff_mv_y[o];
 383             const int val = src[x] + av_clip(di, -limit, limit - 1);
 384             dst[x] = av_clip_pixel(((val * wx + offset) >>  shift)  + ox);
 385         }
 386         src += MAX_PB_SIZE;
 387         dst += dst_stride;
 388     }
 389 }
 390
 391 static void FUNC(derive_bdof_vx_vy)(const int16_t *_src0, const int16_t *_src1,
 392     const int pad_left, const int pad_top, const int pad_right, const int pad_bottom,
 393     const int16_t **gradient_h, const int16_t **gradient_v,
 394     int* vx, int* vy)
 395 {
 396     const int shift2 = 4;
 397     const int shift3 = 1;
 398     const int thres = 1 << 4;
 399     int sgx2 = 0, sgy2 = 0, sgxgy = 0, sgxdi = 0, sgydi = 0;
 400
 401     for (int y = -1; y < BDOF_MIN_BLOCK_SIZE + 1; y++) {
 402         const int dy        = y + (pad_top && y < 0) - (pad_bottom && y == BDOF_MIN_BLOCK_SIZE);         // we pad for the first and last row
 403         const int16_t *src0 = _src0 + dy * MAX_PB_SIZE;
 404         const int16_t *src1 = _src1 + dy * MAX_PB_SIZE;
 405
 406         for (int x = -1; x < BDOF_MIN_BLOCK_SIZE + 1; x++) {
 407             const int dx    = x + (pad_left && x < 0) - (pad_right && x == BDOF_MIN_BLOCK_SIZE);         // we pad for the first and last col
 408             const int diff  = (src0[dx] >> shift2) - (src1[dx] >> shift2);
 409             const int idx   = BDOF_BLOCK_SIZE * dy + dx;
 410             const int temph = (gradient_h[0][idx] + gradient_h[1][idx]) >> shift3;
 411             const int tempv = (gradient_v[0][idx] + gradient_v[1][idx]) >> shift3;
 412
 413             sgx2 += FFABS(temph);
 414             sgy2 += FFABS(tempv);
 415             sgxgy += VVC_SIGN(tempv) * temph;
 416             sgxdi += -VVC_SIGN(temph) * diff;
 417             sgydi += -VVC_SIGN(tempv) * diff;
 418         }
 419     }
 420     *vx = sgx2 > 0 ? av_clip((sgxdi * (1 << 2)) >> av_log2(sgx2) , -thres + 1, thres - 1) : 0;
 421     *vy = sgy2 > 0 ? av_clip(((sgydi * (1 << 2)) - ((*vx * sgxgy) >> 1)) >> av_log2(sgy2), -thres + 1, thres - 1) : 0;
 422 }
 423
 424 static void FUNC(apply_bdof_min_block)(pixel* dst, const ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1,
 425     const int16_t **gh, const int16_t **gv, const int vx, const int vy)
 426 {
 427     const int shift4 = 15 - BIT_DEPTH;
 428     const int offset4 = 1 << (shift4 - 1);
 429
 430     for (int y = 0; y < BDOF_MIN_BLOCK_SIZE; y++) {
 431         for (int x = 0; x < BDOF_MIN_BLOCK_SIZE; x++) {
 432             const int idx = y * BDOF_BLOCK_SIZE + x;
 433             const int bdof_offset = vx * (gh[0][idx] - gh[1][idx]) + vy * (gv[0][idx] - gv[1][idx]);
 434             dst[x] = av_clip_pixel((src0[x] + offset4 + src1[x] + bdof_offset) >> shift4);
 435         }
 436         dst  += dst_stride;
 437         src0 += MAX_PB_SIZE;
 438         src1 += MAX_PB_SIZE;
 439     }
 440 }
 441
 442 static void FUNC(apply_bdof)(uint8_t *_dst, const ptrdiff_t _dst_stride, const int16_t *_src0, const int16_t *_src1,
 443     const int block_w, const int block_h)
 444 {
 445     int16_t gradient_h[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
 446     int16_t gradient_v[2][BDOF_BLOCK_SIZE * BDOF_BLOCK_SIZE];
 447     int vx, vy;
 448     const ptrdiff_t dst_stride  = _dst_stride / sizeof(pixel);
 449     pixel* dst                  = (pixel*)_dst;
 450
 451     FUNC(prof_grad_filter)(gradient_h[0], gradient_v[0], BDOF_BLOCK_SIZE,
 452         _src0, MAX_PB_SIZE, block_w, block_h);
 453     FUNC(prof_grad_filter)(gradient_h[1], gradient_v[1], BDOF_BLOCK_SIZE,
 454         _src1, MAX_PB_SIZE, block_w, block_h);
 455
 456     for (int y = 0; y < block_h; y += BDOF_MIN_BLOCK_SIZE) {
 457         for (int x = 0; x < block_w; x += BDOF_MIN_BLOCK_SIZE) {
 458             const int16_t* src0 = _src0 + y * MAX_PB_SIZE + x;
 459             const int16_t* src1 = _src1 + y * MAX_PB_SIZE + x;
 460             pixel *d            = dst + x;
 461             const int idx       = BDOF_BLOCK_SIZE * y  + x;
 462             const int16_t* gh[] = { gradient_h[0] + idx, gradient_h[1] + idx };
 463             const int16_t* gv[] = { gradient_v[0] + idx, gradient_v[1] + idx };
 464             FUNC(derive_bdof_vx_vy)(src0, src1, !x, !y, x + BDOF_MIN_BLOCK_SIZE == block_w, y + BDOF_MIN_BLOCK_SIZE == block_h, gh, gv, &vx, &vy);
 465             FUNC(apply_bdof_min_block)(d, dst_stride, src0, src1, gh, gv, vx, vy);
 466         }
 467         dst += BDOF_MIN_BLOCK_SIZE * dst_stride;
 468     }
 469 }
 470
 471 #define DMVR_FILTER(src, stride)                                                \
 472     (filter[0] * src[x] +                                                       \
 473      filter[1] * src[x + stride])
 474
 475 #define DMVR_FILTER2(filter, src0, src1)        \
 476     (filter[0] * src0 + filter[1] * src1)
 477
 478 //8.5.3.2.2 Luma sample bilinear interpolation process
 479 static void FUNC(dmvr)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
 480     const int height, const intptr_t mx, const intptr_t my, const int width)
 481 {
 482 #if BIT_DEPTH != 10
 483     const pixel *src            = (const pixel *)_src;
 484     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
 485 #if BIT_DEPTH > 10
 486     const int shift4            = BIT_DEPTH - 10;
 487     const int offset4           = 1 << (shift4 - 1);
 488     #define DMVR_SHIFT(s)       (((s) + offset4) >> shift4)
 489 #else
 490     #define DMVR_SHIFT(s)       ((s) << (10 - BIT_DEPTH))
 491 #endif // BIT_DEPTH > 10
 492
 493     for (int y = 0; y < height; y++) {
 494         for (int x = 0; x < width; x++)
 495             dst[x] = DMVR_SHIFT(src[x]);
 496         src += src_stride;
 497         dst += MAX_PB_SIZE;
 498     }
 499 #undef DMVR_SHIFT
 500 #else
 501     av_image_copy_plane((uint8_t*)dst, sizeof(int16_t) * MAX_PB_SIZE, _src, _src_stride,
 502         width * sizeof(pixel), height);
 503 #endif // BIT_DEPTH != 10
 504 }
 505
 506 //8.5.3.2.2 Luma sample bilinear interpolation process
 507 static void FUNC(dmvr_h)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
 508     const int height, const intptr_t mx, const intptr_t my, const int width)
 509 {
 510     const pixel *src            = (const pixel*)_src;
 511     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
 512     const int8_t *filter        = ff_vvc_inter_luma_dmvr_filters[mx];
 513     const int shift1            = BIT_DEPTH - 6;
 514     const int offset1           = 1 << (shift1 - 1);
 515
 516     for (int y = 0; y < height; y++) {
 517         for (int x = 0; x < width; x++)
 518             dst[x] = (DMVR_FILTER(src, 1) + offset1) >> shift1;
 519         src += src_stride;
 520         dst += MAX_PB_SIZE;
 521     }
 522 }
 523
 524 //8.5.3.2.2 Luma sample bilinear interpolation process
 525 static void FUNC(dmvr_v)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
 526     const int height, const intptr_t mx, const intptr_t my, const int width)
 527 {
 528     const pixel *src            = (pixel*)_src;
 529     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
 530     const int8_t *filter        = ff_vvc_inter_luma_dmvr_filters[my];
 531     const int shift1            = BIT_DEPTH - 6;
 532     const int offset1           = 1 << (shift1 - 1);
 533
 534     for (int y = 0; y < height; y++) {
 535         for (int x = 0; x < width; x++)
 536             dst[x] = (DMVR_FILTER(src, src_stride) + offset1) >> shift1;
 537         src += src_stride;
 538         dst += MAX_PB_SIZE;
 539     }
 540
 541 }
 542
 543 //8.5.3.2.2 Luma sample bilinear interpolation process
 544 static void FUNC(dmvr_hv)(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
 545     const int height, const intptr_t mx, const intptr_t my, const int width)
 546 {
 547     int16_t tmp_array[MAX_PB_SIZE * 2];
 548     int16_t *tmp0               = tmp_array;
 549     int16_t *tmp1               = tmp_array + MAX_PB_SIZE;
 550     const pixel *src            = (const pixel*)_src;
 551     const ptrdiff_t src_stride  = _src_stride / sizeof(pixel);
 552     const int8_t *filter_x      = ff_vvc_inter_luma_dmvr_filters[mx];
 553     const int8_t *filter_y      = ff_vvc_inter_luma_dmvr_filters[my];
 554     const int shift1            = BIT_DEPTH - 6;
 555     const int offset1           = 1 << (shift1 - 1);
 556     const int shift2            = 4;
 557     const int offset2           = 1 << (shift2 - 1);
 558
 559     src   -= BILINEAR_EXTRA_BEFORE * src_stride;
 560     for (int x = 0; x < width; x++)
 561         tmp0[x] = (DMVR_FILTER2(filter_x, src[x], src[x + 1]) + offset1) >> shift1;
 562     src += src_stride;
 563
 564     for (int y = 1; y < height + BILINEAR_EXTRA; y++) {
 565         for (int x = 0; x < width; x++) {
 566             tmp1[x] = (DMVR_FILTER2(filter_x, src[x], src[x + 1]) + offset1) >> shift1;
 567             dst[x] = (DMVR_FILTER2(filter_y, tmp0[x], tmp1[x]) + offset2) >> shift2;
 568         }
 569         src += src_stride;
 570         dst += MAX_PB_SIZE;
 571         FFSWAP(int16_t *, tmp0, tmp1);
 572     }
 573 }
 574
 575 #define PEL_FUNC(dst, C, idx1, idx2, a)                                         \
 576     do {                                                                        \
 577         for (int w = 0; w < 7; w++)                                             \
 578             inter->dst[C][w][idx1][idx2] = FUNC(a);                             \
 579     } while (0)                                                                 \
 580
 581 #define DIR_FUNCS(d, C, c)                                                      \
 582         PEL_FUNC(put_##d, C, 0, 0, put_##d##_pixels);                           \
 583         PEL_FUNC(put_##d, C, 0, 1, put_##d##_##c##_h);                          \
 584         PEL_FUNC(put_##d, C, 1, 0, put_##d##_##c##_v);                          \
 585         PEL_FUNC(put_##d, C, 1, 1, put_##d##_##c##_hv);                         \
 586         PEL_FUNC(put_##d##_w, C, 0, 0, put_##d##_w_pixels);                     \
 587         PEL_FUNC(put_##d##_w, C, 0, 1, put_##d##_##c##_w_h);                    \
 588         PEL_FUNC(put_##d##_w, C, 1, 0, put_##d##_##c##_w_v);                    \
 589         PEL_FUNC(put_##d##_w, C, 1, 1, put_##d##_##c##_w_hv);
 590
 591 #define FUNCS(C, c)                                                             \
 592         PEL_FUNC(put, C, 0, 0, put_pixels);                                     \
 593         PEL_FUNC(put, C, 0, 1, put_##c##_h);                                    \
 594         PEL_FUNC(put, C, 1, 0, put_##c##_v);                                    \
 595         PEL_FUNC(put, C, 1, 1, put_##c##_hv);                                   \
 596         DIR_FUNCS(uni, C, c);                                                   \
 597
 598 static void FUNC(ff_vvc_inter_dsp_init)(VVCInterDSPContext *const inter)
 599 {
 600     FUNCS(LUMA, luma);
 601     FUNCS(CHROMA, chroma);
 602
 603     for (int i = 0; i < FF_ARRAY_ELEMS(inter->put_scaled[LUMA]); i++) {
 604         inter->put_scaled[LUMA][i]         = FUNC(put_luma_scaled);
 605         inter->put_scaled[CHROMA][i]       = FUNC(put_chroma_scaled);
 606         inter->put_uni_scaled[LUMA][i]     = FUNC(put_uni_luma_scaled);
 607         inter->put_uni_scaled[CHROMA][i]   = FUNC(put_uni_chroma_scaled);
 608         inter->put_uni_w_scaled[LUMA][i]   = FUNC(put_uni_luma_w_scaled);
 609         inter->put_uni_w_scaled[CHROMA][i] = FUNC(put_uni_chroma_w_scaled);
 610     }
 611
 612     inter->avg                  = FUNC(avg);
 613     inter->w_avg                = FUNC(w_avg);
 614
 615     inter->dmvr[0][0]           = FUNC(dmvr);
 616     inter->dmvr[0][1]           = FUNC(dmvr_h);
 617     inter->dmvr[1][0]           = FUNC(dmvr_v);
 618     inter->dmvr[1][1]           = FUNC(dmvr_hv);
 619
 620     inter->put_ciip             = FUNC(put_ciip);
 621     inter->put_gpm              = FUNC(put_gpm);
 622
 623     inter->fetch_samples        = FUNC(fetch_samples);
 624     inter->bdof_fetch_samples   = FUNC(bdof_fetch_samples);
 625     inter->apply_prof           = FUNC(apply_prof);
 626     inter->apply_prof_uni       = FUNC(apply_prof_uni);
 627     inter->apply_prof_uni_w     = FUNC(apply_prof_uni_w);
 628     inter->apply_bdof           = FUNC(apply_bdof);
 629     inter->sad                  = vvc_sad;
 630 }
 631
 632 #undef FUNCS
 633 #undef PEL_FUNC
 634 #undef DMVR_FUNCS