/*
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 * Copyright (c) 2019-2021 Sebastian Pop <spop@amazon.com>
 * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
/*
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsInternal *c, int{16,32}_t *dst,
;  int dstW, const uint{8,16}_t *src,
;  const int16_t *filter,
;  const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 16-bit wide
; ($source_width can be 8, 9, 10 or 16; the difference is whether we have to
; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
*/
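/* For orientation, a rough scalar C model of what the kernels below compute
 * (an illustrative sketch, not part of the build; the real reference is the
 * C hscale code in libswscale/swscale.c). For the 8-bit to 15-bit case:
 *
 *     static void hscale8to15_ref(int16_t *dst, int dstW, const uint8_t *src,
 *                                 const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize)
 *     {
 *         for (int i = 0; i < dstW; i++) {
 *             int val    = 0;
 *             int srcPos = filterPos[i];
 *             for (int j = 0; j < filterSize; j++)
 *                 val += (int)src[srcPos + j] * filter[filterSize * i + j];
 *             // 8-bit input * 14-bit filter needs up to 22 bits; >> 7 leaves 15
 *             dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
 *         }
 *     }
 *
 * The NEON versions below compute four (X8/X4) or eight (4-tap) outputs of
 * this loop at a time.
 */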
function ff_hscale8to15_X8_neon, export=1
        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:      ldr w8, [x5], #4                    // filterPos[idx]
        ldr w0, [x5], #4                    // filterPos[idx + 1]
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        mov x16, x4                         // filter0 = filter
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w0, uxtw                // srcp + filterPos[1]
        add x0, x3, w11, uxtw               // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8b}, [x17], #8              // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1 {v6.8b}, [x8], #8               // srcp[filterPos[1] + {0..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl v4.8h, v4.8b                   // unpack part 1 to 16-bit
        smlal v0.4s, v4.4h, v5.4h           // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
        smlal2 v0.4s, v4.8h, v5.8h          // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
        ld1 {v16.8b}, [x0], #8              // srcp[filterPos[2] + {0..7}]
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v6.8h, v6.8b                   // unpack part 2 to 16-bit
        smlal v1.4s, v6.4h, v7.4h           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        uxtl v16.8h, v16.8b                 // unpack part 3 to 16-bit
        smlal v2.4s, v16.4h, v17.4h         // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        smlal2 v2.4s, v16.8h, v17.8h        // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1 {v18.8b}, [x11], #8             // srcp[filterPos[3] + {0..7}]
        smlal2 v1.4s, v6.8h, v7.8h          // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        uxtl v18.8h, v18.8b                 // unpack part 4 to 16-bit
        smlal v3.4s, v18.4h, v19.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        smlal2 v3.4s, v18.8h, v19.8h        // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sqshrn v0.4h, v0.4s, #7             // shift and clip the 4x16-bit final values
        st1 {v0.4h}, [x1], #8               // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale8to15_X4_neon, export=1
        // x0  SwsInternal *c (not used)
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        // This function is for filter sizes that are 4 mod 8, i.e. anything
        // that's 0 mod 4 but not 0 mod 8. It also assumes that dstW is 0 mod 4.
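        // In C terms, the inner loop below is split as follows (an
        // illustrative sketch only):
        //
        //     int j;
        //     for (j = 0; j + 8 <= filterSize; j += 8)  // 8-wide chunks, label 2:
        //         for (int k = 0; k < 8; k++)
        //             val += (int)src[srcPos + j + k] * filter[fs * i + j + k];
        //     for (; j < filterSize; j++)               // the remaining 4 taps
        //         val += (int)src[srcPos + j] * filter[fs * i + j];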
        lsl w7, w6, #1                      // w7 = filterSize * 2
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter pointer for idx + 0
        add x13, x4, x7                     // filter pointer for idx + 1
        add x8, x3, w8, uxtw                // srcp + filterPos[idx + 0]
        add x9, x3, w9, uxtw                // srcp + filterPos[idx + 1]

        add x14, x13, x7                    // filter pointer for idx + 2
        add x10, x3, w10, uxtw              // srcp + filterPos[idx + 2]
        add x11, x3, w11, uxtw              // srcp + filterPos[idx + 3]

        mov w0, w6                          // copy filterSize to a temp register, w0
        add x5, x5, #16                     // advance the filterPos pointer
        add x15, x14, x7                    // filter pointer for idx + 3
        mov x16, xzr                        // temp register for offsetting filter pointers

2:      // This section loops over 8-wide chunks of filter size
        ldr d4, [x8], #8                    // load 8 bytes from srcp for idx + 0
        ldr q0, [x12, x16]                  // load 8 values, 16 bytes from filter for idx + 0

        ldr d5, [x9], #8                    // load 8 bytes from srcp for idx + 1
        ldr q1, [x13, x16]                  // load 8 values, 16 bytes from filter for idx + 1

        uxtl v4.8h, v4.8b                   // unsigned extend long for idx + 0
        uxtl v5.8h, v5.8b                   // unsigned extend long for idx + 1

        ldr d6, [x10], #8                   // load 8 bytes from srcp for idx + 2
        ldr q2, [x14, x16]                  // load 8 values, 16 bytes from filter for idx + 2

        smlal v16.4s, v0.4h, v4.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
        smlal v17.4s, v1.4h, v5.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1

        ldr d7, [x11], #8                   // load 8 bytes from srcp for idx + 3
        ldr q3, [x15, x16]                  // load 8 values, 16 bytes from filter for idx + 3

        sub w0, w0, #8                      // decrement the remaining filterSize counter
        smlal2 v16.4s, v0.8h, v4.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
        smlal2 v17.4s, v1.8h, v5.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
        uxtl v6.8h, v6.8b                   // unsigned extend long for idx + 2
        uxtl v7.8h, v7.8b                   // unsigned extend long for idx + 3
        smlal v18.4s, v2.4h, v6.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
        smlal v19.4s, v3.4h, v7.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3

        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        add x16, x16, #16                   // advance the offsetting register for filter values

        smlal2 v18.4s, v2.8h, v6.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
        smlal2 v19.4s, v3.8h, v7.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3

        b.ge 2b                             // branch back to inner loop

        // complete the remaining 4 filter elements
        sub x17, x7, #8                     // calculate the offset of the filter pointer for the remaining 4 elements

        ldr s4, [x8]                        // load 4 bytes from srcp for idx + 0
        ldr d0, [x12, x17]                  // load 4 values, 8 bytes from filter for idx + 0
        ldr s5, [x9]                        // load 4 bytes from srcp for idx + 1
        ldr d1, [x13, x17]                  // load 4 values, 8 bytes from filter for idx + 1

        uxtl v4.8h, v4.8b                   // unsigned extend long for idx + 0
        uxtl v5.8h, v5.8b                   // unsigned extend long for idx + 1

        ldr s6, [x10]                       // load 4 bytes from srcp for idx + 2
        ldr d2, [x14, x17]                  // load 4 values, 8 bytes from filter for idx + 2
        smlal v16.4s, v0.4h, v4.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
        smlal v17.4s, v1.4h, v5.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
        ldr s7, [x11]                       // load 4 bytes from srcp for idx + 3
        ldr d3, [x15, x17]                  // load 4 values, 8 bytes from filter for idx + 3

        uxtl v6.8h, v6.8b                   // unsigned extend long for idx + 2
        uxtl v7.8h, v7.8b                   // unsigned extend long for idx + 3
        addp v16.4s, v16.4s, v17.4s         // horizontal pair adding for idx 0,1
        smlal v18.4s, v2.4h, v6.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
        smlal v19.4s, v3.4h, v7.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3

        addp v18.4s, v18.4s, v19.4s         // horizontal pair adding for idx 2,3
        addp v16.4s, v16.4s, v18.4s         // final horizontal pair adding producing one vector with results for idx = 0..3

        subs w2, w2, #4                     // dstW -= 4
        sqshrn v0.4h, v16.4s, #7            // shift and clip the 4x16-bit final values
        st1 {v0.4h}, [x1], #8               // write to destination idx 0..3
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale8to15_4_neon, export=1
        // x0  SwsInternal *c (not used)
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)
        // x8-x15 registers for gathering src data

        // v0      madd accumulator 4S
        // v1-v4   filter values (16 bit) 8H
        // v5      madd accumulator 4S
        // v16-v19 src values (8 bit) 8B

        // This implementation has 4 sections:
        //  1. Prefetch src data
        //  2. Interleaved prefetching src data and madd
        //  3. Complete madd
        //  4. Complete remaining iterations when dstW % 8 != 0
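        // The gather step in C terms (sketch): for filterSize == 4, the four
        // src bytes of eight neighbouring output pixels are packed into 32
        // contiguous scratch bytes, so a single ld4 can de-interleave them
        // into per-tap vectors (v16 holds tap 0 of all 8 outputs, v17 tap 1,
        // and so on):
        //
        //     uint8_t scratch[32];
        //     for (int k = 0; k < 8; k++)
        //         memcpy(scratch + 4 * k, src + filterPos[idx + k], 4);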
        sub sp, sp, #32                     // allocate 32 bytes on the stack
        cmp w2, #16                         // if dstW < 16, skip to the last block used for wrapping up
        b.lt 2f

        // load 8 values from filterPos to be used as offsets into src
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]
        ldp w12, w13, [x5, #16]             // filterPos[idx + 4], [idx + 5]
        ldp w14, w15, [x5, #24]             // filterPos[idx + 6], [idx + 7]
        add x5, x5, #32                     // advance filterPos

        // gather random access data from src into contiguous memory
        ldr w8, [x3, w8, uxtw]              // src[filterPos[idx + 0]][0..3]
        ldr w9, [x3, w9, uxtw]              // src[filterPos[idx + 1]][0..3]
        ldr w10, [x3, w10, uxtw]            // src[filterPos[idx + 2]][0..3]
        ldr w11, [x3, w11, uxtw]            // src[filterPos[idx + 3]][0..3]
        ldr w12, [x3, w12, uxtw]            // src[filterPos[idx + 4]][0..3]
        ldr w13, [x3, w13, uxtw]            // src[filterPos[idx + 5]][0..3]
        ldr w14, [x3, w14, uxtw]            // src[filterPos[idx + 6]][0..3]
        ldr w15, [x3, w15, uxtw]            // src[filterPos[idx + 7]][0..3]
        stp w8, w9, [sp]                    // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
        stp w10, w11, [sp, #8]              // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
        stp w12, w13, [sp, #16]             // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
        stp w14, w15, [sp, #24]             // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }

1:
        ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers

        // load 8 values from filterPos to be used as offsets into src
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1], next iteration
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3], next iteration
        ldp w12, w13, [x5, #16]             // filterPos[idx + 4], [idx + 5], next iteration
        ldp w14, w15, [x5, #24]             // filterPos[idx + 6], [idx + 7], next iteration

        movi v0.16b, #0                     // clear madd accumulator for idx 0..3
        movi v5.16b, #0                     // clear madd accumulator for idx 4..7

        ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7

        add x5, x5, #32                     // advance filterPos

        // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
        uxtl v16.8h, v16.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v17.8h, v17.8b                 // unsigned extend long, convert src data to 16-bit
        ldr w8, [x3, w8, uxtw]              // src[filterPos[idx + 0]], next iteration
        ldr w9, [x3, w9, uxtw]              // src[filterPos[idx + 1]], next iteration
        uxtl v18.8h, v18.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v19.8h, v19.8b                 // unsigned extend long, convert src data to 16-bit
        ldr w10, [x3, w10, uxtw]            // src[filterPos[idx + 2]], next iteration
        ldr w11, [x3, w11, uxtw]            // src[filterPos[idx + 3]], next iteration

        smlal v0.4s, v1.4h, v16.4h          // multiply accumulate inner loop j = 0, idx = 0..3
        smlal v0.4s, v2.4h, v17.4h          // multiply accumulate inner loop j = 1, idx = 0..3
        ldr w12, [x3, w12, uxtw]            // src[filterPos[idx + 4]], next iteration
        ldr w13, [x3, w13, uxtw]            // src[filterPos[idx + 5]], next iteration
        smlal v0.4s, v3.4h, v18.4h          // multiply accumulate inner loop j = 2, idx = 0..3
        smlal v0.4s, v4.4h, v19.4h          // multiply accumulate inner loop j = 3, idx = 0..3
        ldr w14, [x3, w14, uxtw]            // src[filterPos[idx + 6]], next iteration
        ldr w15, [x3, w15, uxtw]            // src[filterPos[idx + 7]], next iteration

        smlal2 v5.4s, v1.8h, v16.8h         // multiply accumulate inner loop j = 0, idx = 4..7
        smlal2 v5.4s, v2.8h, v17.8h         // multiply accumulate inner loop j = 1, idx = 4..7
        stp w8, w9, [sp]                    // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
        stp w10, w11, [sp, #8]              // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
        smlal2 v5.4s, v3.8h, v18.8h         // multiply accumulate inner loop j = 2, idx = 4..7
        smlal2 v5.4s, v4.8h, v19.8h         // multiply accumulate inner loop j = 3, idx = 4..7
        stp w12, w13, [sp, #16]             // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
        stp w14, w15, [sp, #24]             // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }

        sub w2, w2, #8                      // dstW -= 8
        sqshrn v0.4h, v0.4s, #7             // shift and clip the 4x16-bit final values
        sqshrn v1.4h, v5.4s, #7             // shift and clip the 4x16-bit final values
        st1 {v0.4h, v1.4h}, [x1], #16       // write to dst[idx + 0..7]
        cmp w2, #16                         // continue on main loop if there are at least 16 iterations left
        b.ge 1b

        // last full iteration
        ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
        ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7

        movi v0.16b, #0                     // clear madd accumulator for idx 0..3
        movi v5.16b, #0                     // clear madd accumulator for idx 4..7

        uxtl v16.8h, v16.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v17.8h, v17.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v18.8h, v18.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v19.8h, v19.8b                 // unsigned extend long, convert src data to 16-bit

        smlal v0.4s, v1.4h, v16.4h          // multiply accumulate inner loop j = 0, idx = 0..3
        smlal v0.4s, v2.4h, v17.4h          // multiply accumulate inner loop j = 1, idx = 0..3
        smlal v0.4s, v3.4h, v18.4h          // multiply accumulate inner loop j = 2, idx = 0..3
        smlal v0.4s, v4.4h, v19.4h          // multiply accumulate inner loop j = 3, idx = 0..3

        smlal2 v5.4s, v1.8h, v16.8h         // multiply accumulate inner loop j = 0, idx = 4..7
        smlal2 v5.4s, v2.8h, v17.8h         // multiply accumulate inner loop j = 1, idx = 4..7
        smlal2 v5.4s, v3.8h, v18.8h         // multiply accumulate inner loop j = 2, idx = 4..7
        smlal2 v5.4s, v4.8h, v19.8h         // multiply accumulate inner loop j = 3, idx = 4..7

        subs w2, w2, #8                     // dstW -= 8
        sqshrn v0.4h, v0.4s, #7             // shift and clip the 4x16-bit final values
        sqshrn v1.4h, v5.4s, #7             // shift and clip the 4x16-bit final values
        st1 {v0.4h, v1.4h}, [x1], #16       // write to dst[idx + 0..7]

        cbnz w2, 2f                         // if > 0 iterations remain, jump to the wrap up section

        add sp, sp, #32                     // clean up stack
        ret

        // finish up when dstW % 8 != 0 or dstW < 16
2:
        // load src
        ldr w8, [x5], #4                    // filterPos[i]
        add x9, x3, w8, uxtw                // calculate the address for src load
        ld1 {v5.s}[0], [x9]                 // src[filterPos[i] + 0..3]
        // load filter
        ld1 {v6.4h}, [x4], #8               // filter[filterSize * i + 0..3]

        uxtl v5.8h, v5.8b                   // unsigned extend long, convert src data to 16-bit
        smull v0.4s, v5.4h, v6.4h           // 4 iterations of src[...] * filter[...]
        addv s0, v0.4s                      // add up products of src and filter values
        sqshrn h0, s0, #7                   // shift and clip the 16-bit final value
        st1 {v0.h}[0], [x1], #2             // dst[i] = ...
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b

        add sp, sp, #32                     // clean up stack
        ret
endfunc
function ff_hscale8to19_4_neon, export=1
        // x0  SwsInternal *c (unused)
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #19
        sub v18.4s, v18.4s, v17.4s          // max allowed value

        cmp w2, #16
        b.lt 2f                             // move to last block

        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        // gather random access data from src into contiguous memory
        ldr w8, [x3, w8, uxtw]
        ldr w9, [x3, w9, uxtw]
        ldr w10, [x3, w10, uxtw]
        ldr w11, [x3, w11, uxtw]
        ldr w12, [x3, w12, uxtw]
        ldr w13, [x3, w13, uxtw]
        ldr w14, [x3, w14, uxtw]
        ldr w15, [x3, w15, uxtw]

        sub sp, sp, #32                     // allocate 32 bytes on the stack
        stp w8, w9, [sp]
        stp w10, w11, [sp, #8]
        stp w12, w13, [sp, #16]
        stp w14, w15, [sp, #24]

1:
        ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // load filterPositions into registers for next iteration
        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos
        uxtl v0.8h, v0.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w8, [x3, w8, uxtw]
        smull v5.4s, v0.4h, v28.4h          // multiply first column of src
        ldr w9, [x3, w9, uxtw]
        smull2 v6.4s, v0.8h, v28.8h
        stp w8, w9, [sp]
        uxtl v1.8h, v1.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w10, [x3, w10, uxtw]
        smlal v5.4s, v1.4h, v29.4h          // multiply second column of src
        ldr w11, [x3, w11, uxtw]
        smlal2 v6.4s, v1.8h, v29.8h
        stp w10, w11, [sp, #8]
        uxtl v2.8h, v2.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w12, [x3, w12, uxtw]
        smlal v5.4s, v2.4h, v30.4h          // multiply third column of src
        ldr w13, [x3, w13, uxtw]
        smlal2 v6.4s, v2.8h, v30.8h
        stp w12, w13, [sp, #16]
        uxtl v3.8h, v3.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w14, [x3, w14, uxtw]
        smlal v5.4s, v3.4h, v31.4h          // multiply fourth column of src
        ldr w15, [x3, w15, uxtw]
        smlal2 v6.4s, v3.8h, v31.8h
        stp w14, w15, [sp, #24]

        sub w2, w2, #8                      // dstW -= 8
        sshr v5.4s, v5.4s, #3               // shift down to 19-bit precision
        sshr v6.4s, v6.4s, #3
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        cmp w2, #16
        b.ge 1b

        // here we make last iteration, without updating the registers
        ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        uxtl v0.8h, v0.8b
        uxtl v1.8h, v1.8b
        smull v5.4s, v0.4h, v28.4h
        smull2 v6.4s, v0.8h, v28.8h
        uxtl v2.8h, v2.8b
        smlal v5.4s, v1.4h, v29.4h
        smlal2 v6.4s, v1.8h, v29.8h
        uxtl v3.8h, v3.8b
        smlal v5.4s, v2.4h, v30.4h
        smlal2 v6.4s, v2.8h, v30.8h
        smlal v5.4s, v3.4h, v31.4h
        smlal2 v6.4s, v3.8h, v31.8h

        sub w2, w2, #8                      // dstW -= 8
        sshr v5.4s, v5.4s, #3
        sshr v6.4s, v6.4s, #3
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        add sp, sp, #32                     // restore stack
        cbnz w2, 2f
        ret

2:
        ldr w8, [x5], #4                    // load filterPos
        add x9, x3, w8, uxtw                // src + filterPos
        ld1 {v0.s}[0], [x9]                 // load 4 * uint8_t into one single
        ld1 {v31.4h}, [x4], #8
        uxtl v0.8h, v0.8b                   // unsigned extend long, convert src data to 16-bit
        smull v5.4s, v0.4h, v31.4h
        addv s0, v5.4s                      // add up products of src and filter values
        sshr v0.4s, v0.4s, #3
        smin v0.4s, v0.4s, v18.4s
        st1 {v0.s}[0], [x1], #4
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b                         // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale8to19_X8_neon, export=1
        movi v20.4s, #1
        movi v17.4s, #1
        shl v20.4s, v20.4s, #19
        sub v20.4s, v20.4s, v17.4s          // max allowed value

        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:
        mov x16, x4                         // filter0 = filter
        ldr w8, [x5], #4                    // filterPos[idx]
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        ldr w0, [x5], #4                    // filterPos[idx + 1]
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w0, uxtw                // srcp + filterPos[1]
        add x0, x3, w11, uxtw               // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8b}, [x17], #8              // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        uxtl v4.8h, v4.8b                   // unpack part 1 to 16-bit
        smlal v0.4s, v4.4h, v5.4h           // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
        ld1 {v6.8b}, [x8], #8               // srcp[filterPos[1] + {0..7}]
        smlal2 v0.4s, v4.8h, v5.8h          // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        ld1 {v16.8b}, [x0], #8              // srcp[filterPos[2] + {0..7}]
        uxtl v6.8h, v6.8b                   // unpack part 2 to 16-bit
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v16.8h, v16.8b                 // unpack part 3 to 16-bit
        smlal v1.4s, v6.4h, v7.4h           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1 {v18.8b}, [x11], #8             // srcp[filterPos[3] + {0..7}]
        smlal v2.4s, v16.4h, v17.4h         // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        smlal2 v2.4s, v16.8h, v17.8h        // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        uxtl v18.8h, v18.8b                 // unpack part 4 to 16-bit
        smlal2 v1.4s, v6.8h, v7.8h          // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        smlal v3.4s, v18.4h, v19.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        smlal2 v3.4s, v18.8h, v19.8h        // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sshr v0.4s, v0.4s, #3               // shift down to 19-bit precision
        smin v0.4s, v0.4s, v20.4s           // clip to the 19-bit max
        st1 {v0.4s}, [x1], #16              // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale8to19_X4_neon, export=1
        // x0  SwsInternal *c (not used)
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi v20.4s, #1
        movi v17.4s, #1
        shl v20.4s, v20.4s, #19
        sub v20.4s, v20.4s, v17.4s          // max allowed value

        lsl w7, w6, #1                      // filterSize * 2 (*2 because int16)
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter + 0
        add x13, x4, x7                     // filter + 1
        add x8, x3, w8, uxtw                // srcp + filterPos 0
        add x14, x13, x7                    // filter + 2
        add x9, x3, w9, uxtw                // srcp + filterPos 1
        add x15, x14, x7                    // filter + 3
        add x10, x3, w10, uxtw              // srcp + filterPos 2
        mov w0, w6                          // save the filterSize to temporary variable
        add x11, x3, w11, uxtw              // srcp + filterPos 3
        add x5, x5, #16                     // advance filter position
        mov x16, xzr                        // clear the register x16 used for offsetting the filter values

2:
        ldr d4, [x8], #8                    // load src values for idx 0
        ldr q31, [x12, x16]                 // load filter values for idx 0
        uxtl v4.8h, v4.8b                   // extend type to match the filter's size
        ldr d5, [x9], #8                    // load src values for idx 1
        smlal v16.4s, v4.4h, v31.4h         // multiplication of lower half for idx 0
        uxtl v5.8h, v5.8b                   // extend type to match the filter's size
        ldr q30, [x13, x16]                 // load filter values for idx 1
        smlal2 v16.4s, v4.8h, v31.8h        // multiplication of upper half for idx 0
        ldr d6, [x10], #8                   // load src values for idx 2
        ldr q29, [x14, x16]                 // load filter values for idx 2
        smlal v17.4s, v5.4h, v30.4h         // multiplication of lower half for idx 1
        ldr d7, [x11], #8                   // load src values for idx 3
        smlal2 v17.4s, v5.8h, v30.8h        // multiplication of upper half for idx 1
        uxtl v6.8h, v6.8b                   // extend type to match the filter's size
        ldr q28, [x15, x16]                 // load filter values for idx 3
        smlal v18.4s, v6.4h, v29.4h         // multiplication of lower half for idx 2
        uxtl v7.8h, v7.8b                   // extend type to match the filter's size
        smlal2 v18.4s, v6.8h, v29.8h        // multiplication of upper half for idx 2
        sub w0, w0, #8                      // decrement the remaining filterSize counter
        smlal v19.4s, v7.4h, v28.4h         // multiplication of lower half for idx 3
        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        smlal2 v19.4s, v7.8h, v28.8h        // multiplication of upper half for idx 3
        add x16, x16, #16                   // advance filter values indexing
        b.ge 2b

        // complete the remaining 4 filter elements
        sub x17, x7, #8                     // step back to wrap up the filter pos for last 4 elements

        ldr s4, [x8]                        // load src values for idx 0
        ldr d31, [x12, x17]                 // load filter values for idx 0
        uxtl v4.8h, v4.8b                   // extend type to match the filter's size
        ldr s5, [x9]                        // load src values for idx 1
        smlal v16.4s, v4.4h, v31.4h
        ldr d30, [x13, x17]                 // load filter values for idx 1
        uxtl v5.8h, v5.8b                   // extend type to match the filter's size
        ldr s6, [x10]                       // load src values for idx 2
        smlal v17.4s, v5.4h, v30.4h
        uxtl v6.8h, v6.8b                   // extend type to match the filter's size
        ldr d29, [x14, x17]                 // load filter values for idx 2
        ldr s7, [x11]                       // load src values for idx 3
        addp v16.4s, v16.4s, v17.4s
        uxtl v7.8h, v7.8b                   // extend type to match the filter's size
        ldr d28, [x15, x17]                 // load filter values for idx 3
        smlal v18.4s, v6.4h, v29.4h
        smlal v19.4s, v7.4h, v28.4h

        addp v18.4s, v18.4s, v19.4s
        addp v16.4s, v16.4s, v18.4s
        sshr v16.4s, v16.4s, #3             // shift down to 19-bit precision
        smin v16.4s, v16.4s, v20.4s         // clip to the 19-bit max

        st1 {v16.4s}, [x1], #16
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        subs w2, w2, #4                     // dstW -= 4
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale16to15_4_neon_asm, export=1
        // w0  int shift
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #15
        sub v18.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v17.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)
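        // What the setup above encodes, per output pixel (illustrative C
        // sketch; the shift value arrives in w0 from the C-side wrapper):
        //
        //     dst[i] = FFMIN(val >> shift, (1 << 15) - 1);
        //
        // sshl by the negated shift performs the variable right shift, and
        // smin against v18 applies the 15-bit clip.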
        cmp w2, #16
        b.lt 2f                             // move to last block

        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        // shift all filterPos left by one, as uint16_t will be read
        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        // load src with given offset
        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        sub sp, sp, #64                     // allocate 64 bytes on the stack

        // push src on stack so it can be loaded into vectors later
        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

1:
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // Each of the following blocks:
        //  - extends src and filter to 32 bits with uxtl and sxtl,
        //  - multiplies or multiplies and accumulates the results.
        // Extending to 32 bits is necessary, as uint16_t values can't
        // be represented as int16_t without type promotion.
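        // Why the promotion is needed, for one lane (illustrative C sketch):
        // a src lane of 0xFFFF must stay 65535, so
        //
        //     int32_t s = (uint16_t)src_lane;  // uxtl: zero-extend
        //     int32_t f = (int16_t)filt_lane;  // sxtl: sign-extend
        //     acc += s * f;                    // mla on 32-bit lanes
        //
        // A 16x16 smlal would reinterpret src lanes >= 0x8000 as negative.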
        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v0.8h
        sxtl2 v28.4s, v28.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v27.4s, v26.4s
        uxtl2 v0.4s, v1.8h
        sxtl2 v28.4s, v29.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v27.4s, v26.4s
        uxtl2 v0.4s, v2.8h
        sxtl2 v28.4s, v30.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v27.4s, v26.4s
        uxtl2 v0.4s, v3.8h
        sxtl2 v28.4s, v31.8h
        mla v6.4s, v28.4s, v0.4s

        sshl v5.4s, v5.4s, v17.4s           // shift left (effectively right, as shift is negative)
        sshl v6.4s, v6.4s, v17.4s
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s
        xtn v5.4h, v5.4s                    // narrow down to 16 bits
        xtn2 v5.8h, v6.4s

        st1 {v5.8h}, [x1], #16
        sub w2, w2, #8                      // dstW -= 8

        // load filterPositions into registers for next iteration
        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

        cmp w2, #16
        b.ge 1b

        // here we make last iteration, without updating the registers
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v0.8h
        sxtl2 v28.4s, v28.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v1.8h
        sxtl2 v28.4s, v29.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v2.8h
        sxtl2 v28.4s, v30.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v3.8h
        sxtl2 v28.4s, v31.8h
        mla v6.4s, v0.4s, v28.4s

        sub w2, w2, #8                      // dstW -= 8
        sshl v5.4s, v5.4s, v17.4s
        sshl v6.4s, v6.4s, v17.4s
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s
        xtn v5.4h, v5.4s
        xtn2 v5.8h, v6.4s

        st1 {v5.8h}, [x1], #16
        add sp, sp, #64                     // restore stack
        cbnz w2, 2f
        ret

2:
        ldr w8, [x5], #4                    // load filterPos
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        add x9, x3, w8, uxtw                // src + filterPos
        ld1 {v0.4h}, [x9]                   // load 4 * uint16_t
        ld1 {v31.4h}, [x4], #8

        uxtl v0.4s, v0.4h
        sxtl v31.4s, v31.4h
        mul v5.4s, v0.4s, v31.4s
        addv s0, v5.4s
        sshl v0.4s, v0.4s, v17.4s
        smin v0.4s, v0.4s, v18.4s
        st1 {v0.h}[0], [x1], #2
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b                         // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale16to15_X8_neon_asm, export=1
        // w0  int shift
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi v20.4s, #1
        movi v21.4s, #1
        shl v20.4s, v20.4s, #15
        sub v20.4s, v20.4s, v21.4s          // max allowed value
        dup v21.4s, w0                      // read shift
        neg v21.4s, v21.4s                  // negate it, so it can be used in sshl (effectively shift right)

        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:      ldr w8, [x5], #4                    // filterPos[idx]
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        ldr w10, [x5], #4                   // filterPos[idx + 1]
        lsl w10, w10, #1
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        lsl w11, w11, #1
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        lsl w9, w9, #1
        mov x16, x4                         // filter0 = filter
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w10, uxtw               // srcp + filterPos[1]
        add x10, x3, w11, uxtw              // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8h}, [x17], #16             // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1 {v6.8h}, [x8], #16              // srcp[filterPos[1] + {0..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl v24.4s, v4.4h                  // extend srcp lower half to 32 bits (uint16_t doesn't fit in int16_t)
        sxtl v25.4s, v5.4h                  // extend filter lower half to 32 bits to match srcp size
        uxtl2 v4.4s, v4.8h                  // extend srcp upper half to 32 bits
        mla v0.4s, v24.4s, v25.4s           // multiply accumulate lower half of v4 * v5
        sxtl2 v5.4s, v5.8h                  // extend filter upper half to 32 bits
        uxtl v26.4s, v6.4h                  // extend srcp lower half to 32 bits
        mla v0.4s, v4.4s, v5.4s             // multiply accumulate upper half of v4 * v5
        sxtl v27.4s, v7.4h                  // extend filter lower half
        uxtl2 v6.4s, v6.8h                  // extend srcp upper half
        sxtl2 v7.4s, v7.8h                  // extend filter upper half
        ld1 {v16.8h}, [x10], #16            // srcp[filterPos[2] + {0..7}]
        mla v1.4s, v26.4s, v27.4s           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v22.4s, v16.4h                 // extend srcp lower half
        sxtl v23.4s, v17.4h                 // extend filter lower half
        uxtl2 v16.4s, v16.8h                // extend srcp upper half
        sxtl2 v17.4s, v17.8h                // extend filter upper half
        mla v2.4s, v22.4s, v23.4s           // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        mla v2.4s, v16.4s, v17.4s           // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1 {v18.8h}, [x11], #16            // srcp[filterPos[3] + {0..7}]
        mla v1.4s, v6.4s, v7.4s             // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        uxtl v28.4s, v18.4h                 // extend srcp lower half
        sxtl v29.4s, v19.4h                 // extend filter lower half
        uxtl2 v18.4s, v18.8h                // extend srcp upper half
        sxtl2 v19.4s, v19.8h                // extend filter upper half
        mla v3.4s, v28.4s, v29.4s           // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        mla v3.4s, v18.4s, v19.4s           // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sshl v0.4s, v0.4s, v21.4s           // shift left (effectively right, as shift is negative); overflow expected
        smin v0.4s, v0.4s, v20.4s           // apply min (do not use sqshl)
        xtn v0.4h, v0.4s                    // narrow down to 16 bits

        st1 {v0.4h}, [x1], #8               // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale16to15_X4_neon_asm, export=1
        // w0  int shift
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        stp d8, d9, [sp, #-0x20]!
        stp d10, d11, [sp, #0x10]

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #15
        sub v21.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v20.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)

        lsl w7, w6, #1                      // filterSize * 2 (*2 because int16)
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter + 0
        add x13, x4, x7                     // filter + 1
        add x8, x3, x8, lsl #1              // srcp + filterPos 0 (uint16_t, so double the offset)
        add x14, x13, x7                    // filter + 2
        add x9, x3, x9, lsl #1              // srcp + filterPos 1
        add x15, x14, x7                    // filter + 3
        add x10, x3, x10, lsl #1            // srcp + filterPos 2
        mov w0, w6                          // save the filterSize to temporary variable
        add x11, x3, x11, lsl #1            // srcp + filterPos 3
        add x5, x5, #16                     // advance filter position
        mov x16, xzr                        // clear the register x16 used for offsetting the filter values

2:
        ldr q4, [x8], #16                   // load src values for idx 0
        ldr q5, [x9], #16                   // load src values for idx 1
        uxtl v26.4s, v4.4h                  // extend src lower half for idx 0
        uxtl2 v4.4s, v4.8h                  // extend src upper half for idx 0
        ldr q31, [x12, x16]                 // load filter values for idx 0
        ldr q6, [x10], #16                  // load src values for idx 2
        sxtl v22.4s, v31.4h                 // extend filter lower half for idx 0
        sxtl2 v31.4s, v31.8h                // extend filter upper half for idx 0
        mla v16.4s, v26.4s, v22.4s          // multiplication of lower half for idx 0
        uxtl v25.4s, v5.4h                  // extend src lower half for idx 1
        uxtl2 v5.4s, v5.8h                  // extend src upper half for idx 1
        ldr q30, [x13, x16]                 // load filter values for idx 1
        ldr q7, [x11], #16                  // load src values for idx 3
        mla v16.4s, v4.4s, v31.4s           // multiplication of upper half for idx 0
        sxtl v8.4s, v30.4h                  // extend filter lower half for idx 1
        sxtl2 v30.4s, v30.8h                // extend filter upper half for idx 1
        mla v17.4s, v25.4s, v8.4s           // multiplication of lower half for idx 1
        ldr q29, [x14, x16]                 // load filter values for idx 2
        uxtl v24.4s, v6.4h                  // extend src lower half for idx 2
        uxtl2 v6.4s, v6.8h                  // extend src upper half for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiplication of upper half for idx 1
        sxtl v9.4s, v29.4h                  // extend filter lower half for idx 2
        sxtl2 v29.4s, v29.8h                // extend filter upper half for idx 2
        mla v18.4s, v24.4s, v9.4s           // multiplication of lower half for idx 2
        ldr q28, [x15, x16]                 // load filter values for idx 3
        uxtl v23.4s, v7.4h                  // extend src lower half for idx 3
        uxtl2 v7.4s, v7.8h                  // extend src upper half for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiplication of upper half for idx 2
        sxtl v10.4s, v28.4h                 // extend filter lower half for idx 3
        sxtl2 v28.4s, v28.8h                // extend filter upper half for idx 3
        mla v19.4s, v23.4s, v10.4s          // multiplication of lower half for idx 3
        mla v19.4s, v7.4s, v28.4s           // multiplication of upper half for idx 3

        add x16, x16, #16                   // advance filter values indexing
        sub w0, w0, #8                      // decrement the remaining filterSize counter
        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        b.ge 2b

        // 4 iterations left

        sub x17, x7, #8                     // step back to wrap up the filter pos for last 4 elements

        ldr d4, [x8]                        // load src values for idx 0
        ldr d31, [x12, x17]                 // load filter values for idx 0
        uxtl v4.4s, v4.4h
        sxtl v31.4s, v31.4h
        ldr d5, [x9]                        // load src values for idx 1
        mla v16.4s, v4.4s, v31.4s           // multiply the remaining 4 elements for idx 0
        ldr d30, [x13, x17]                 // load filter values for idx 1
        uxtl v5.4s, v5.4h
        sxtl v30.4s, v30.4h
        ldr d6, [x10]                       // load src values for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiply the remaining 4 elements for idx 1
        ldr d29, [x14, x17]                 // load filter values for idx 2
        uxtl v6.4s, v6.4h
        sxtl v29.4s, v29.4h
        ldr d7, [x11]                       // load src values for idx 3
        ldr d28, [x15, x17]                 // load filter values for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiply the remaining 4 elements for idx 2
        uxtl v7.4s, v7.4h
        sxtl v28.4s, v28.4h
        addp v16.4s, v16.4s, v17.4s
        mla v19.4s, v7.4s, v28.4s           // multiply the remaining 4 elements for idx 3

        addp v18.4s, v18.4s, v19.4s
        addp v16.4s, v16.4s, v18.4s
        sshl v16.4s, v16.4s, v20.4s         // shift left (effectively right, as shift is negative)
        smin v16.4s, v16.4s, v21.4s         // clip to the 15-bit max
        xtn v16.4h, v16.4s                  // narrow down to 16 bits

        st1 {v16.4h}, [x1], #8
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        subs w2, w2, #4                     // dstW -= 4
        b.gt 1b                             // loop until end of line

        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], #0x20
        ret
endfunc
function ff_hscale16to19_4_neon_asm, export=1
        // w0  int shift
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #19
        sub v18.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v17.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)

        cmp w2, #16
        b.lt 2f                             // move to last block

        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        // shift all filterPos left by one, as uint16_t will be read
        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        // load src with given offset
        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        sub sp, sp, #64                     // allocate 64 bytes on the stack

        // push src on stack so it can be loaded into vectors later
        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

1:
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // Each of the following blocks:
        //  - extends src and filter to 32 bits with uxtl and sxtl,
        //  - multiplies or multiplies and accumulates the results.
        // Extending to 32 bits is necessary, as uint16_t values can't
        // be represented as int16_t without type promotion.
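        // Same promotion scheme as the 16to15 kernels above; only the output
        // differs (sketch): dst[i] = FFMIN(val >> shift, (1 << 19) - 1),
        // kept as int32_t rather than narrowed to 16 bits.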
        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v28.8h
        uxtl2 v0.4s, v0.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v27.4s, v26.4s
        sxtl2 v28.4s, v29.8h
        uxtl2 v0.4s, v1.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v27.4s, v26.4s
        sxtl2 v28.4s, v30.8h
        uxtl2 v0.4s, v2.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v27.4s, v26.4s
        sxtl2 v28.4s, v31.8h
        uxtl2 v0.4s, v3.8h
        mla v6.4s, v28.4s, v0.4s

        sshl v5.4s, v5.4s, v17.4s           // shift left (effectively right, as shift is negative)
        sshl v6.4s, v6.4s, v17.4s
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        sub w2, w2, #8                      // dstW -= 8

        // load filterPositions into registers for next iteration
        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

        cmp w2, #16
        b.ge 1b

        // here we make last iteration, without updating the registers
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v28.8h
        uxtl2 v0.4s, v0.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v29.8h
        uxtl2 v0.4s, v1.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v30.8h
        uxtl2 v0.4s, v2.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v31.8h
        uxtl2 v0.4s, v3.8h
        mla v6.4s, v0.4s, v28.4s

        sub w2, w2, #8                      // dstW -= 8
        sshl v5.4s, v5.4s, v17.4s
        sshl v6.4s, v6.4s, v17.4s

        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        add sp, sp, #64                     // restore stack
        cbnz w2, 2f
        ret

2:
        ldr w8, [x5], #4                    // load filterPos
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        add x9, x3, w8, uxtw                // src + filterPos
        ld1 {v0.4h}, [x9]                   // load 4 * uint16_t
        ld1 {v31.4h}, [x4], #8

        uxtl v0.4s, v0.4h
        sxtl v31.4s, v31.4h
        mul v5.4s, v0.4s, v31.4s
        addv s0, v5.4s
        sshl v0.4s, v0.4s, v17.4s
        smin v0.4s, v0.4s, v18.4s
        st1 {v0.s}[0], [x1], #4
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b                         // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale16to19_X8_neon_asm, export=1
        // w0  int shift
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi v20.4s, #1
        movi v21.4s, #1
        shl v20.4s, v20.4s, #19
        sub v20.4s, v20.4s, v21.4s          // max allowed value
        dup v21.4s, w0                      // read shift
        neg v21.4s, v21.4s                  // negate it, so it can be used in sshl (effectively shift right)

        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:      ldr w8, [x5], #4                    // filterPos[idx]
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        ldr w10, [x5], #4                   // filterPos[idx + 1]
        lsl w10, w10, #1
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        lsl w11, w11, #1
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        lsl w9, w9, #1
        mov x16, x4                         // filter0 = filter
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w10, uxtw               // srcp + filterPos[1]
        add x10, x3, w11, uxtw              // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8h}, [x17], #16             // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1 {v6.8h}, [x8], #16              // srcp[filterPos[1] + {0..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl v24.4s, v4.4h                  // extend srcp lower half to 32 bits (uint16_t doesn't fit in int16_t)
        sxtl v25.4s, v5.4h                  // extend filter lower half to 32 bits to match srcp size
        uxtl2 v4.4s, v4.8h                  // extend srcp upper half to 32 bits
        mla v0.4s, v24.4s, v25.4s           // multiply accumulate lower half of v4 * v5
        sxtl2 v5.4s, v5.8h                  // extend filter upper half to 32 bits
        uxtl v26.4s, v6.4h                  // extend srcp lower half to 32 bits
        mla v0.4s, v4.4s, v5.4s             // multiply accumulate upper half of v4 * v5
        sxtl v27.4s, v7.4h                  // extend filter lower half
        uxtl2 v6.4s, v6.8h                  // extend srcp upper half
        sxtl2 v7.4s, v7.8h                  // extend filter upper half
        ld1 {v16.8h}, [x10], #16            // srcp[filterPos[2] + {0..7}]
        mla v1.4s, v26.4s, v27.4s           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v22.4s, v16.4h                 // extend srcp lower half
        sxtl v23.4s, v17.4h                 // extend filter lower half
        uxtl2 v16.4s, v16.8h                // extend srcp upper half
        sxtl2 v17.4s, v17.8h                // extend filter upper half
        mla v2.4s, v22.4s, v23.4s           // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        mla v2.4s, v16.4s, v17.4s           // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1 {v18.8h}, [x11], #16            // srcp[filterPos[3] + {0..7}]
        mla v1.4s, v6.4s, v7.4s             // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        uxtl v28.4s, v18.4h                 // extend srcp lower half
        sxtl v29.4s, v19.4h                 // extend filter lower half
        uxtl2 v18.4s, v18.8h                // extend srcp upper half
        sxtl2 v19.4s, v19.8h                // extend filter upper half
        mla v3.4s, v28.4s, v29.4s           // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        mla v3.4s, v18.4s, v19.4s           // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sshl v0.4s, v0.4s, v21.4s           // shift left (effectively right, as shift is negative); overflow expected
        smin v0.4s, v0.4s, v20.4s           // apply min (do not use sqshl)
        st1 {v0.4s}, [x1], #16              // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale16to19_X4_neon_asm, export=1
        // w0  int shift
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        stp d8, d9, [sp, #-0x20]!
        stp d10, d11, [sp, #0x10]

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #19
        sub v21.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v20.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)

        lsl w7, w6, #1                      // filterSize * 2 (*2 because int16)
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter + 0
        add x13, x4, x7                     // filter + 1
        add x8, x3, x8, lsl #1              // srcp + filterPos 0 (uint16_t, so double the offset)
        add x14, x13, x7                    // filter + 2
        add x9, x3, x9, lsl #1              // srcp + filterPos 1
        add x15, x14, x7                    // filter + 3
        add x10, x3, x10, lsl #1            // srcp + filterPos 2
        mov w0, w6                          // save the filterSize to temporary variable
        add x11, x3, x11, lsl #1            // srcp + filterPos 3
        add x5, x5, #16                     // advance filter position
        mov x16, xzr                        // clear the register x16 used for offsetting the filter values

2:
        ldr q4, [x8], #16                   // load src values for idx 0
        ldr q5, [x9], #16                   // load src values for idx 1
        uxtl v26.4s, v4.4h                  // extend src lower half for idx 0
        uxtl2 v4.4s, v4.8h                  // extend src upper half for idx 0
        ldr q31, [x12, x16]                 // load filter values for idx 0
        ldr q6, [x10], #16                  // load src values for idx 2
        sxtl v22.4s, v31.4h                 // extend filter lower half for idx 0
        sxtl2 v31.4s, v31.8h                // extend filter upper half for idx 0
        mla v16.4s, v26.4s, v22.4s          // multiplication of lower half for idx 0
        uxtl v25.4s, v5.4h                  // extend src lower half for idx 1
        uxtl2 v5.4s, v5.8h                  // extend src upper half for idx 1
        ldr q30, [x13, x16]                 // load filter values for idx 1
        ldr q7, [x11], #16                  // load src values for idx 3
        mla v16.4s, v4.4s, v31.4s           // multiplication of upper half for idx 0
        sxtl v8.4s, v30.4h                  // extend filter lower half for idx 1
        sxtl2 v30.4s, v30.8h                // extend filter upper half for idx 1
        mla v17.4s, v25.4s, v8.4s           // multiplication of lower half for idx 1
        ldr q29, [x14, x16]                 // load filter values for idx 2
        uxtl v24.4s, v6.4h                  // extend src lower half for idx 2
        uxtl2 v6.4s, v6.8h                  // extend src upper half for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiplication of upper half for idx 1
        sxtl v9.4s, v29.4h                  // extend filter lower half for idx 2
        sxtl2 v29.4s, v29.8h                // extend filter upper half for idx 2
        mla v18.4s, v24.4s, v9.4s           // multiplication of lower half for idx 2
        ldr q28, [x15, x16]                 // load filter values for idx 3
        uxtl v23.4s, v7.4h                  // extend src lower half for idx 3
        uxtl2 v7.4s, v7.8h                  // extend src upper half for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiplication of upper half for idx 2
        sxtl v10.4s, v28.4h                 // extend filter lower half for idx 3
        sxtl2 v28.4s, v28.8h                // extend filter upper half for idx 3
        mla v19.4s, v23.4s, v10.4s          // multiplication of lower half for idx 3
        mla v19.4s, v7.4s, v28.4s           // multiplication of upper half for idx 3

        add x16, x16, #16                   // advance filter values indexing
        sub w0, w0, #8                      // decrement the remaining filterSize counter
        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        b.ge 2b

        // 4 iterations left

        sub x17, x7, #8                     // step back to wrap up the filter pos for last 4 elements

        ldr d4, [x8]                        // load src values for idx 0
        ldr d31, [x12, x17]                 // load filter values for idx 0
        uxtl v4.4s, v4.4h
        sxtl v31.4s, v31.4h
        ldr d5, [x9]                        // load src values for idx 1
        mla v16.4s, v4.4s, v31.4s           // multiply the remaining 4 elements for idx 0
        ldr d30, [x13, x17]                 // load filter values for idx 1
        uxtl v5.4s, v5.4h
        sxtl v30.4s, v30.4h
        ldr d6, [x10]                       // load src values for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiply the remaining 4 elements for idx 1
        ldr d29, [x14, x17]                 // load filter values for idx 2
        uxtl v6.4s, v6.4h
        sxtl v29.4s, v29.4h
        ldr d7, [x11]                       // load src values for idx 3
        ldr d28, [x15, x17]                 // load filter values for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiply the remaining 4 elements for idx 2
        uxtl v7.4s, v7.4h
        sxtl v28.4s, v28.4h
        addp v16.4s, v16.4s, v17.4s
        mla v19.4s, v7.4s, v28.4s           // multiply the remaining 4 elements for idx 3

        addp v18.4s, v18.4s, v19.4s
        addp v16.4s, v16.4s, v18.4s
        sshl v16.4s, v16.4s, v20.4s         // shift left (effectively right, as shift is negative)
        smin v16.4s, v16.4s, v21.4s         // clip to the 19-bit max

        st1 {v16.4s}, [x1], #16
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        subs w2, w2, #4                     // dstW -= 4
        b.gt 1b                             // loop until end of line

        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], #0x20
        ret
endfunc