libavformat/hls: Be more restrictive on mpegts extensions
[FFMpeg-mirror.git] / libswscale / aarch64 / hscale.S
blob435460c1af590cd8bd7b3dbb2f845d292162e89f
1 /*
2  * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
3  * Copyright (c) 2019-2021 Sebastian Pop <spop@amazon.com>
4  * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
23 #include "libavutil/aarch64/asm.S"
25 /*
26 ;-----------------------------------------------------------------------------
27 ; horizontal line scaling
29 ; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
30 ;                               (SwsInternal *c, int{16,32}_t *dst,
31 ;                                int dstW, const uint{8,16}_t *src,
32 ;                                const int16_t *filter,
33 ;                                const int32_t *filterPos, int filterSize);
35 ; Scale one horizontal line. Input is either 8-bit width or 16-bit width
36 ; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
37 ; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
38 ; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
39 ; output pixel is generated from $filterSize input pixels, the position of
40 ; the first pixel is given in filterPos[nOutputPixel].
41 ;----------------------------------------------------------------------------- */
43 function ff_hscale8to15_X8_neon, export=1
// Horizontal scaling of 8-bit source pixels to 15-bit intermediates (int16_t).
// Each outer iteration (label 1) produces 4 output pixels; each inner
// iteration (label 2) consumes 8 filter taps for all 4 of them.
// x0  SwsInternal *c (not used; x0 is overwritten and reused as a src pointer)
// x1  int16_t *dst
// w2  int dstW
// x3  const uint8_t *src
// x4  const int16_t *filter
// x5  const int32_t *filterPos
// w6  int filterSize
// NOTE(review): both loops test their counter after the body has run, so this
// presumably expects filterSize % 8 == 0 and dstW % 4 == 0 - confirm against
// the caller.
44         sbfiz           x7, x6, #1, #32             // x7 = sign-extended filterSize*2: byte stride between filter rows (*2 because int16)
45 1:      ldr             w8, [x5], #4                // filterPos[idx]
46         ldr             w0, [x5], #4                // filterPos[idx + 1]
47         ldr             w11, [x5], #4               // filterPos[idx + 2]
48         ldr             w9, [x5], #4                // filterPos[idx + 3]
49         mov             x16, x4                     // filter0 = filter
50         add             x12, x16, x7                // filter1 = filter0 + filterSize*2
51         add             x13, x12, x7                // filter2 = filter1 + filterSize*2
52         add             x4, x13, x7                 // filter3 = filter2 + filterSize*2; x4 is post-incremented by the inner loop, so it ends up at the next group's filter0
53         movi            v0.16b, #0                  // val sum part 1 (for dst[0])
54         movi            v1.16b, #0                  // val sum part 2 (for dst[1])
55         movi            v2.16b, #0                  // val sum part 3 (for dst[2])
56         movi            v3.16b, #0                  // val sum part 4 (for dst[3])
57         add             x17, x3, w8, uxtw           // srcp + filterPos[0]
58         add             x8,  x3, w0, uxtw           // srcp + filterPos[1]
59         add             x0, x3, w11, uxtw           // srcp + filterPos[2]
60         add             x11, x3, w9, uxtw           // srcp + filterPos[3]
61         mov             w15, w6                     // filterSize counter
62 2:      ld1             {v4.8b}, [x17], #8          // srcp[filterPos[0] + {0..7}]
63         ld1             {v5.8h}, [x16], #16         // load 8x16-bit filter values, part 1
64         ld1             {v6.8b}, [x8], #8           // srcp[filterPos[1] + {0..7}]
65         ld1             {v7.8h}, [x12], #16         // load 8x16-bit at filter+filterSize
66         uxtl            v4.8h, v4.8b                // unpack part 1 to 16-bit
67         smlal           v0.4s, v4.4h, v5.4h         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
68         smlal2          v0.4s, v4.8h, v5.8h         // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
69         ld1             {v16.8b}, [x0], #8          // srcp[filterPos[2] + {0..7}]
70         ld1             {v17.8h}, [x13], #16        // load 8x16-bit at filter+2*filterSize
71         uxtl            v6.8h, v6.8b                // unpack part 2 to 16-bit
72         smlal           v1.4s, v6.4h, v7.4h         // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
73         uxtl            v16.8h, v16.8b              // unpack part 3 to 16-bit
74         smlal           v2.4s, v16.4h, v17.4h       // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
75         smlal2          v2.4s, v16.8h, v17.8h       // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
76         ld1             {v18.8b}, [x11], #8         // srcp[filterPos[3] + {0..7}]
77         smlal2          v1.4s, v6.8h, v7.8h         // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
78         ld1             {v19.8h}, [x4], #16         // load 8x16-bit at filter+3*filterSize
79         subs            w15, w15, #8                // j -= 8: processed 8/filterSize
80         uxtl            v18.8h, v18.8b              // unpack part 4 to 16-bit
81         smlal           v3.4s, v18.4h, v19.4h       // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
82         smlal2          v3.4s, v18.8h, v19.8h       // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
83         b.gt            2b                          // inner loop if filterSize not consumed completely
84         addp            v0.4s, v0.4s, v1.4s         // part01 horizontal pair adding
85         addp            v2.4s, v2.4s, v3.4s         // part23 horizontal pair adding
86         addp            v0.4s, v0.4s, v2.4s         // part0123 horizontal pair adding: v0 = {sum0, sum1, sum2, sum3}
87         subs            w2, w2, #4                  // dstW -= 4
88         sqshrn          v0.4h, v0.4s, #7            // >> 7 with signed saturation, narrowing the 4 sums to 16 bits
89         st1             {v0.4h}, [x1], #8           // write to destination part0123
90         b.gt            1b                          // loop until end of line
91         ret
92 endfunc
94 function ff_hscale8to15_X4_neon, export=1
// Horizontal scaling of 8-bit source pixels to 15-bit intermediates (int16_t).
// Each outer iteration (label 1) produces 4 output pixels: the inner loop
// (label 2) consumes the taps 8 at a time, then a fixed epilogue handles the
// last 4 taps.
95 // x0  SwsInternal *c (not used; w0 is reused as the remaining-taps counter)
96 // x1  int16_t *dst
97 // w2  int dstW
98 // x3  const uint8_t *src
99 // x4  const int16_t *filter
100 // x5  const int32_t *filterPos
101 // w6  int filterSize
103 // This function is for filter sizes that are 4 mod 8. In other words, anything that's 0 mod 4 but not
104 // 0 mod 8. It also assumes that dstW is 0 mod 4.
// The 8-wide inner loop runs once before its counter is tested, so filterSize
// must be at least 12.
106         lsl             w7, w6, #1                  // w7 = filterSize * 2 = byte stride between filter rows (upper half of x7 is zeroed)
107 1:                                                  // outer loop: 4 output pixels per iteration
108         ldp             w8, w9,  [x5]               // filterPos[idx + 0], [idx + 1]
109         ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], [idx + 3]
111         movi            v16.16b, #0                 // initialize accumulator for idx + 0
112         movi            v17.16b, #0                 // initialize accumulator for idx + 1
113         movi            v18.16b, #0                 // initialize accumulator for idx + 2
114         movi            v19.16b, #0                 // initialize accumulator for idx + 3
116         mov             x12, x4                     // filter pointer for idx + 0
117         add             x13, x4, x7                 // filter pointer for idx + 1
118         add             x8, x3, w8, uxtw            // srcp + filterPos[idx + 0]
119         add             x9, x3, w9, uxtw            // srcp + filterPos[idx + 1]
121         add             x14, x13, x7                // filter pointer for idx + 2
122         add             x10, x3, w10, uxtw          // srcp + filterPos[idx + 2]
123         add             x11, x3, w11, uxtw          // srcp + filterPos[idx + 3]
125         mov             w0, w6                      // copy filterSize to a temp register, w0
126         add             x5, x5, #16                 // advance the filterPos pointer
127         add             x15, x14, x7                // filter pointer for idx + 3
128         mov             x16, xzr                    // temp register for offsetting filter pointers
130 2:                                                  // inner loop: 8 taps per iteration for all 4 pixels
131         // This section loops over 8-wide chunks of filter size
132         ldr             d4, [x8], #8                // load 8 bytes from srcp for idx + 0
133         ldr             q0, [x12, x16]              // load 8 values, 16 bytes from filter for idx + 0
135         ldr             d5, [x9], #8                // load 8 bytes from srcp for idx + 1
136         ldr             q1, [x13, x16]              // load 8 values, 16 bytes from filter for idx + 1
138         uxtl            v4.8h, v4.8b                // unsigned extend long for idx + 0
139         uxtl            v5.8h, v5.8b                // unsigned extend long for idx + 1
141         ldr             d6, [x10], #8               // load 8 bytes from srcp for idx + 2
142         ldr             q2, [x14, x16]              // load 8 values, 16 bytes from filter for idx + 2
144         smlal           v16.4s, v0.4h, v4.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
145         smlal           v17.4s, v1.4h, v5.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
147         ldr             d7, [x11], #8               // load 8 bytes from srcp for idx + 3
148         ldr             q3, [x15, x16]              // load 8 values, 16 bytes from filter for idx + 3
150         sub             w0, w0, #8                  // decrement the remaining filterSize counter
151         smlal2          v16.4s, v0.8h, v4.8h        // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
152         smlal2          v17.4s, v1.8h, v5.8h        // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
153         uxtl            v6.8h, v6.8b                // unsigned extend long for idx + 2
154         uxtl            v7.8h, v7.8b                // unsigned extend long for idx + 3
155         smlal           v18.4s, v2.4h, v6.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
156         smlal           v19.4s, v3.4h, v7.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
158         cmp             w0, #8                      // are there at least 8 more elements in filter to consume?
159         add             x16, x16, #16               // advance the offsetting register for filter values
161         smlal2          v18.4s, v2.8h, v6.8h        // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
162         smlal2          v19.4s, v3.8h, v7.8h        // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3
164         b.ge            2b                          // branch back to inner loop
166         // complete the remaining 4 filter elements
167         sub             x17, x7, #8                 // byte offset of the last 4 taps within a filter row: filterSize*2 - 8
169         ldr             s4, [x8]                    // load 4 bytes from srcp for idx + 0 (x8 was already advanced past the 8-wide chunks)
170         ldr             d0, [x12, x17]              // load 4 values, 8 bytes from filter for idx + 0
171         ldr             s5, [x9]                    // load 4 bytes from srcp for idx + 1
172         ldr             d1, [x13, x17]              // load 4 values, 8 bytes from filter for idx + 1
174         uxtl            v4.8h, v4.8b                // unsigned extend long for idx + 0
175         uxtl            v5.8h, v5.8b                // unsigned extend long for idx + 1
177         ldr             s6, [x10]                   // load 4 bytes from srcp for idx + 2
178         ldr             d2, [x14, x17]              // load 4 values, 8 bytes from filter for idx + 2
179         smlal           v16.4s, v0.4h, v4.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
180         smlal           v17.4s, v1.4h, v5.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
181         ldr             s7, [x11]                   // load 4 bytes from srcp for idx + 3
182         ldr             d3, [x15, x17]              // load 4 values, 8 bytes from filter for idx + 3
184         uxtl            v6.8h, v6.8b                // unsigned extend long for idx + 2
185         uxtl            v7.8h, v7.8b                // unsigned extend long for idx + 3
186         addp            v16.4s, v16.4s, v17.4s      // horizontal pair adding for idx 0,1
187         smlal           v18.4s, v2.4h, v6.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
188         smlal           v19.4s, v3.4h, v7.4h        // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3
190         addp            v18.4s, v18.4s, v19.4s      // horizontal pair adding for idx 2,3
191         addp            v16.4s, v16.4s, v18.4s      // final horizontal pair adding producing one vector with results for idx = 0..3
193         subs            w2, w2, #4                  // dstW -= 4
194         sqshrn          v0.4h, v16.4s, #7           // >> 7 with signed saturation, narrowing the 4 sums to 16 bits
195         st1             {v0.4h}, [x1], #8           // write to destination idx 0..3
196         add             x4, x4, x7, lsl #2          // filter += (filterSize*2) * 4
197         b.gt            1b                          // loop until end of line
198         ret
199 endfunc
201 function ff_hscale8to15_4_neon, export=1
// Horizontal scaling of 8-bit source pixels to 15-bit intermediates (int16_t)
// with exactly 4 filter taps per output pixel. The main loop (label 1)
// produces 8 output pixels per iteration; the tail loop (label 2) produces
// one output pixel per iteration.
202 // x0  SwsInternal *c (not used)
203 // x1  int16_t *dst
204 // w2  int dstW
205 // x3  const uint8_t *src
206 // x4  const int16_t *filter
207 // x5  const int32_t *filterPos
208 // w6  int filterSize (not read; this variant always applies 4 taps)
209 // x8-x15 registers for gathering src data
211 // v0      madd accumulator 4S
212 // v1-v4   filter values (16 bit) 8H
213 // v5      madd accumulator 4S
214 // v16-v19 src values (8 bit) 8B
216 // This implementation has 4 sections:
217 //  1. Prefetch src data
218 //  2. Interleaved prefetching src data and madd
219 //  3. Complete madd
220 //  4. Complete remaining iterations when dstW % 8 != 0 or dstW < 16
222         sub             sp, sp, #32                 // allocate 32 bytes on the stack
223         cmp             w2, #16                     // if dstW <16, skip to the last block used for wrapping up
224         b.lt            2f
226         // load 8 values from filterPos to be used as offsets into src
227         ldp             w8, w9,  [x5]               // filterPos[idx + 0], [idx + 1]
228         ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], [idx + 3]
229         ldp             w12, w13, [x5, #16]         // filterPos[idx + 4], [idx + 5]
230         ldp             w14, w15, [x5, #24]         // filterPos[idx + 6], [idx + 7]
231         add             x5, x5, #32                 // advance filterPos
233         // gather random access data from src into contiguous memory
234         ldr             w8, [x3, w8, uxtw]          // src[filterPos[idx + 0]][0..3]
235         ldr             w9, [x3, w9, uxtw]          // src[filterPos[idx + 1]][0..3]
236         ldr             w10, [x3, w10, uxtw]        // src[filterPos[idx + 2]][0..3]
237         ldr             w11, [x3, w11, uxtw]        // src[filterPos[idx + 3]][0..3]
238         ldr             w12, [x3, w12, uxtw]        // src[filterPos[idx + 4]][0..3]
239         ldr             w13, [x3, w13, uxtw]        // src[filterPos[idx + 5]][0..3]
240         ldr             w14, [x3, w14, uxtw]        // src[filterPos[idx + 6]][0..3]
241         ldr             w15, [x3, w15, uxtw]        // src[filterPos[idx + 7]][0..3]
242         stp             w8, w9, [sp]                // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
243         stp             w10, w11, [sp, #8]          // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
244         stp             w12, w13, [sp, #16]         // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
245         stp             w14, w15, [sp, #24]         // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
247 1:                                                  // main loop: w2 >= 16 here, so the 8 entries read ahead below are within dstW
248         ld4             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers
250         // load 8 values from filterPos to be used as offsets into src
251         ldp             w8, w9,  [x5]               // filterPos[idx + 0], [idx + 1], next iteration
252         ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], [idx + 3], next iteration
253         ldp             w12, w13, [x5, #16]         // filterPos[idx + 4], [idx + 5], next iteration
254         ldp             w14, w15, [x5, #24]         // filterPos[idx + 6], [idx + 7], next iteration
256         movi            v0.16b, #0                  // Clear madd accumulator for idx 0..3
257         movi            v5.16b, #0                  // Clear madd accumulator for idx 4..7
259         ld4             {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
261         add             x5, x5, #32                 // advance filterPos
263         // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
264         uxtl            v16.8h, v16.8b              // unsigned extend long, convert src data to 16-bit
265         uxtl            v17.8h, v17.8b              // unsigned extend long, convert src data to 16-bit
266         ldr             w8, [x3, w8, uxtw]          // src[filterPos[idx + 0]], next iteration
267         ldr             w9, [x3, w9, uxtw]          // src[filterPos[idx + 1]], next iteration
268         uxtl            v18.8h, v18.8b              // unsigned extend long, convert src data to 16-bit
269         uxtl            v19.8h, v19.8b              // unsigned extend long, convert src data to 16-bit
270         ldr             w10, [x3, w10, uxtw]        // src[filterPos[idx + 2]], next iteration
271         ldr             w11, [x3, w11, uxtw]        // src[filterPos[idx + 3]], next iteration
273         smlal           v0.4s, v1.4h, v16.4h        // multiply accumulate inner loop j = 0, idx = 0..3
274         smlal           v0.4s, v2.4h, v17.4h        // multiply accumulate inner loop j = 1, idx = 0..3
275         ldr             w12, [x3, w12, uxtw]        // src[filterPos[idx + 4]], next iteration
276         ldr             w13, [x3, w13, uxtw]        // src[filterPos[idx + 5]], next iteration
277         smlal           v0.4s, v3.4h, v18.4h        // multiply accumulate inner loop j = 2, idx = 0..3
278         smlal           v0.4s, v4.4h, v19.4h        // multiply accumulate inner loop j = 3, idx = 0..3
279         ldr             w14, [x3, w14, uxtw]        // src[filterPos[idx + 6]], next iteration
280         ldr             w15, [x3, w15, uxtw]        // src[filterPos[idx + 7]], next iteration
282         smlal2          v5.4s, v1.8h, v16.8h        // multiply accumulate inner loop j = 0, idx = 4..7
283         smlal2          v5.4s, v2.8h, v17.8h        // multiply accumulate inner loop j = 1, idx = 4..7
284         stp             w8, w9, [sp]                // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
285         stp             w10, w11, [sp, #8]          // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
286         smlal2          v5.4s, v3.8h, v18.8h        // multiply accumulate inner loop j = 2, idx = 4..7
287         smlal2          v5.4s, v4.8h, v19.8h        // multiply accumulate inner loop j = 3, idx = 4..7
288         stp             w12, w13, [sp, #16]         // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
289         stp             w14, w15, [sp, #24]         // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }
291         sub             w2, w2, #8                  // dstW -= 8
292         sqshrn          v0.4h, v0.4s, #7            // >> 7 with signed saturation, narrowing sums for idx 0..3 to 16 bits
293         sqshrn          v1.4h, v5.4s, #7            // >> 7 with signed saturation, narrowing sums for idx 4..7 to 16 bits
294         st1             {v0.4h, v1.4h}, [x1], #16   // write to dst[idx + 0..7]
295         cmp             w2, #16                     // continue on main loop if there are at least 16 iterations left
296         b.ge            1b
298         // last full iteration: consume the src data already gathered in scratch memory, with no further read-ahead
299         ld4             {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
300         ld4             {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7
302         movi            v0.16b, #0                  // Clear madd accumulator for idx 0..3
303         movi            v5.16b, #0                  // Clear madd accumulator for idx 4..7
305         uxtl            v16.8h, v16.8b              // unsigned extend long, convert src data to 16-bit
306         uxtl            v17.8h, v17.8b              // unsigned extend long, convert src data to 16-bit
307         uxtl            v18.8h, v18.8b              // unsigned extend long, convert src data to 16-bit
308         uxtl            v19.8h, v19.8b              // unsigned extend long, convert src data to 16-bit
310         smlal           v0.4s, v1.4h, v16.4h        // multiply accumulate inner loop j = 0, idx = 0..3
311         smlal           v0.4s, v2.4h, v17.4h        // multiply accumulate inner loop j = 1, idx = 0..3
312         smlal           v0.4s, v3.4h, v18.4h        // multiply accumulate inner loop j = 2, idx = 0..3
313         smlal           v0.4s, v4.4h, v19.4h        // multiply accumulate inner loop j = 3, idx = 0..3
315         smlal2          v5.4s, v1.8h, v16.8h        // multiply accumulate inner loop j = 0, idx = 4..7
316         smlal2          v5.4s, v2.8h, v17.8h        // multiply accumulate inner loop j = 1, idx = 4..7
317         smlal2          v5.4s, v3.8h, v18.8h        // multiply accumulate inner loop j = 2, idx = 4..7
318         smlal2          v5.4s, v4.8h, v19.8h        // multiply accumulate inner loop j = 3, idx = 4..7
320         subs            w2, w2, #8                  // dstW -= 8
321         sqshrn          v0.4h, v0.4s, #7            // >> 7 with signed saturation, narrowing sums for idx 0..3 to 16 bits
322         sqshrn          v1.4h, v5.4s, #7            // >> 7 with signed saturation, narrowing sums for idx 4..7 to 16 bits
323         st1             {v0.4h, v1.4h}, [x1], #16   // write to dst[idx + 0..7]
325         cbnz            w2, 2f                      // if >0 iterations remain, jump to the wrap up section
327         add             sp, sp, #32                 // clean up stack
328         ret
330         // finish up when dstW % 8 != 0 or dstW < 16
331 2:                                                  // tail loop: one output pixel per iteration
332         // load src
333         ldr             w8, [x5], #4                // filterPos[i]
334         add             x9, x3, w8, uxtw            // calculate the address for src load
335         ld1             {v5.s}[0], [x9]             // src[filterPos[i] + 0..3]
336         // load filter
337         ld1             {v6.4h}, [x4], #8           // filter[filterSize * i + 0..3]
339         uxtl            v5.8h, v5.8b                // unsigned extend long, convert src data to 16-bit
340         smull           v0.4s, v5.4h, v6.4h         // 4 iterations of src[...] * filter[...]
341         addv            s0, v0.4s                   // add up products of src and filter values
342         sqshrn          h0, s0, #7                  // >> 7 with signed saturation, narrowing the single sum to 16 bits
343         st1             {v0.h}[0], [x1], #2         // dst[i] = ...
344         sub             w2, w2, #1                  // dstW--
345         cbnz            w2, 2b
347         add             sp, sp, #32                 // clean up stack
348         ret
349 endfunc
351 function ff_hscale8to19_4_neon, export=1
        // Horizontal scaling of 8-bit source pixels to 19-bit intermediates
        // (int32_t) with exactly 4 filter taps per output pixel. The main
        // loop (label 1) produces 8 output pixels per iteration; the tail
        // loop (label 2) produces one output pixel per iteration.
352         // x0               SwsInternal *c (unused)
353         // x1               int32_t *dst
354         // w2               int dstW
355         // x3               const uint8_t *src
356         // x4               const int16_t *filter
357         // x5               const int32_t *filterPos
358         // w6               int filterSize (not read; this variant always applies 4 taps)
360         movi            v18.4s, #1
361         movi            v17.4s, #1
362         shl             v18.4s, v18.4s, #19
363         sub             v18.4s, v18.4s, v17.4s      // max allowed value: (1 << 19) - 1 in every lane
365         cmp             w2, #16
366         b.lt            2f // move to last block
368         ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
369         ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
370         ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
371         ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
372         add             x5, x5, #32
374         // load 4 src bytes for each of the 8 positions
375         ldr             w8, [x3, w8, uxtw]
376         ldr             w9, [x3, w9, uxtw]
377         ldr             w10, [x3, w10, uxtw]
378         ldr             w11, [x3, w11, uxtw]
379         ldr             w12, [x3, w12, uxtw]
380         ldr             w13, [x3, w13, uxtw]
381         ldr             w14, [x3, w14, uxtw]
382         ldr             w15, [x3, w15, uxtw]
384         sub             sp, sp, #32                 // 32 bytes of scratch; only allocated on this path, the tail loop runs without it
386         stp             w8, w9, [sp]
387         stp             w10, w11, [sp, #8]
388         stp             w12, w13, [sp, #16]
389         stp             w14, w15, [sp, #24]
391 1:                                                  // main loop: w2 >= 16 here, so the 8 entries read ahead below are within dstW
392         ld4             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp] // transpose: v0..v3 hold tap 0..3 of the 8 pixels
393         ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
394         // load filterPositions into registers for next iteration
396         ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
397         ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
398         ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
399         ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
400         add             x5, x5, #32
401         uxtl            v0.8h, v0.8b
402         ldr             w8, [x3, w8, uxtw]
403         smull           v5.4s, v0.4h, v28.4h        // multiply first column of src
404         ldr             w9, [x3, w9, uxtw]
405         smull2          v6.4s, v0.8h, v28.8h
406         stp             w8, w9, [sp]
408         uxtl            v1.8h, v1.8b
409         ldr             w10, [x3, w10, uxtw]
410         smlal           v5.4s, v1.4h, v29.4h        // multiply second column of src
411         ldr             w11, [x3, w11, uxtw]
412         smlal2          v6.4s, v1.8h, v29.8h
413         stp             w10, w11, [sp, #8]
415         uxtl            v2.8h, v2.8b
416         ldr             w12, [x3, w12, uxtw]
417         smlal           v5.4s, v2.4h, v30.4h        // multiply third column of src
418         ldr             w13, [x3, w13, uxtw]
419         smlal2          v6.4s, v2.8h, v30.8h
420         stp             w12, w13, [sp, #16]
422         uxtl            v3.8h, v3.8b
423         ldr             w14, [x3, w14, uxtw]
424         smlal           v5.4s, v3.4h, v31.4h        // multiply fourth column of src
425         ldr             w15, [x3, w15, uxtw]
426         smlal2          v6.4s, v3.8h, v31.8h
427         stp             w14, w15, [sp, #24]
429         sub             w2, w2, #8
430         sshr            v5.4s, v5.4s, #3            // arithmetic >> 3
431         sshr            v6.4s, v6.4s, #3
432         smin            v5.4s, v5.4s, v18.4s        // clip to the max allowed value
433         smin            v6.4s, v6.4s, v18.4s
435         st1             {v5.4s, v6.4s}, [x1], #32   // write 8 int32 results
436         cmp             w2, #16
437         b.ge            1b
439         // here we make last iteration, without updating the registers
440         ld4             {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
441         ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
443         uxtl            v0.8h, v0.8b
444         uxtl            v1.8h, v1.8b
445         smull           v5.4s, v0.4h, v28.4h
446         smull2          v6.4s, v0.8h, v28.8h
447         uxtl            v2.8h, v2.8b
448         smlal           v5.4s, v1.4h, v29.4h
449         smlal2          v6.4s, v1.8h, v29.8h
450         uxtl            v3.8h, v3.8b
451         smlal           v5.4s, v2.4h, v30.4h
452         smlal2          v6.4s, v2.8h, v30.8h
453         smlal           v5.4s, v3.4h, v31.4h
454         smlal2          v6.4s, v3.8h, v31.8h
456         sshr            v5.4s, v5.4s, #3
457         sshr            v6.4s, v6.4s, #3
459         smin            v5.4s, v5.4s, v18.4s
460         smin            v6.4s, v6.4s, v18.4s
462         sub             w2, w2, #8
463         st1             {v5.4s, v6.4s}, [x1], #32
464         add             sp, sp, #32 // restore stack
465         cbnz            w2, 2f
467         ret
469 2:                                                  // tail loop: one output pixel per iteration
470         ldr             w8, [x5], #4 // load filterPos
471         add             x9, x3, w8, uxtw // src + filterPos
472         ld1             {v0.s}[0], [x9] // load 4 uint8_t src values into lane 0
473         ld1             {v31.4h}, [x4], #8 // load the 4 filter taps for this pixel
474         uxtl            v0.8h, v0.8b
475         smull           v5.4s, v0.4h, v31.4h
476         saddlv          d0, v5.4s // sum the 4 products into a 64-bit value
477         sqshrn          s0, d0, #3 // >> 3 with signed saturation, narrowing to 32 bits
478         smin            v0.4s, v0.4s, v18.4s // clip to the max allowed value
479         st1             {v0.s}[0], [x1], #4
480         sub             w2, w2, #1
481         cbnz            w2, 2b // if iterations remain jump to beginning
483         ret
484 endfunc
function ff_hscale8to19_X8_neon, export=1
        // Horizontal scaling of 8-bit input to 19-bit intermediates stored as
        // 32-bit words. Four output pixels are produced per outer pass and
        // eight filter taps are consumed per inner pass, so filterSize is
        // expected to be a multiple of 8.
        //
        // x0  SwsInternal *c (not used; x0 is reused below as a source pointer)
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi            v20.4s, #1
        movi            v17.4s, #1
        shl             v20.4s, v20.4s, #19
        sub             v20.4s, v20.4s, v17.4s      // v20 = (1 << 19) - 1, upper clip bound

        sbfiz           x7, x6, #1, #32             // filterSize*2 (*2 because int16)
1:                                                  // outer loop: one pass per 4 output pixels
        mov             x16, x4                     // filter0 = filter
        ldr             w8, [x5], #4                // filterPos[idx]
        add             x12, x16, x7                // filter1 = filter0 + filterSize*2
        ldr             w0, [x5], #4                // filterPos[idx + 1]
        add             x13, x12, x7                // filter2 = filter1 + filterSize*2
        ldr             w11, [x5], #4               // filterPos[idx + 2]
        add             x4, x13, x7                 // filter3 = filter2 + filterSize*2
        ldr             w9, [x5], #4                // filterPos[idx + 3]
        movi            v0.16b, #0                  // val sum part 1 (for dst[0])
        movi            v1.16b, #0                  // val sum part 2 (for dst[1])
        movi            v2.16b, #0                  // val sum part 3 (for dst[2])
        movi            v3.16b, #0                  // val sum part 4 (for dst[3])
        add             x17, x3, w8, uxtw           // srcp + filterPos[0]
        add             x8,  x3, w0, uxtw           // srcp + filterPos[1]
        add             x0, x3, w11, uxtw           // srcp + filterPos[2]
        add             x11, x3, w9, uxtw           // srcp + filterPos[3]
        mov             w15, w6                     // filterSize counter
2:      ld1             {v4.8b}, [x17], #8          // srcp[filterPos[0] + {0..7}]
        ld1             {v5.8h}, [x16], #16         // load 8x16-bit filter values, part 1
        uxtl            v4.8h, v4.8b                // unpack part 1 to 16-bit
        smlal           v0.4s, v4.4h, v5.4h         // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
        ld1             {v6.8b}, [x8], #8           // srcp[filterPos[1] + {0..7}]
        smlal2          v0.4s, v4.8h, v5.8h         // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
        ld1             {v7.8h}, [x12], #16         // load 8x16-bit at filter+filterSize
        ld1             {v16.8b}, [x0], #8          // srcp[filterPos[2] + {0..7}]
        uxtl            v6.8h, v6.8b                // unpack part 2 to 16-bit
        ld1             {v17.8h}, [x13], #16        // load 8x16-bit at filter+2*filterSize
        uxtl            v16.8h, v16.8b              // unpack part 3 to 16-bit
        smlal           v1.4s, v6.4h, v7.4h         // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1             {v18.8b}, [x11], #8         // srcp[filterPos[3] + {0..7}]
        smlal           v2.4s, v16.4h, v17.4h       // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        ld1             {v19.8h}, [x4], #16         // load 8x16-bit at filter+3*filterSize
        smlal2          v2.4s, v16.8h, v17.8h       // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        uxtl            v18.8h, v18.8b              // unpack part 4 to 16-bit
        smlal2          v1.4s, v6.8h, v7.8h         // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        smlal           v3.4s, v18.4h, v19.4h       // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        subs            w15, w15, #8                // j -= 8: processed 8/filterSize
        smlal2          v3.4s, v18.8h, v19.8h       // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt            2b                          // inner loop if filterSize not consumed completely
        // x4 walked the whole of filter row 3, so it now points at filter
        // row 0 of the next group of four pixels.
        addp            v0.4s, v0.4s, v1.4s         // part01 horizontal pair adding
        addp            v2.4s, v2.4s, v3.4s         // part23 horizontal pair adding
        addp            v0.4s, v0.4s, v2.4s         // part0123 horizontal pair adding
        subs            w2, w2, #4                  // dstW -= 4 (flags consumed by b.gt below)
        sshr            v0.4s, v0.4s, #3            // scale the four 32-bit sums down by 3 bits
        smin            v0.4s, v0.4s, v20.4s        // clamp to (1 << 19) - 1 (upper bound only)
        st1             {v0.4s}, [x1], #16          // write to destination part0123
        b.gt            1b                          // loop until end of line
        ret
endfunc
function ff_hscale8to19_X4_neon, export=1
        // Horizontal scaling of 8-bit input to 19-bit intermediates stored as
        // 32-bit words. Four output pixels are produced per outer pass. The
        // inner loop consumes 8 taps per pass and a fixed tail handles the
        // final 4 taps, so filterSize is expected to be 8*k + 4 with k >= 1.
        //
        // x0  SwsInternal *c (not used; w0 is reused as the tap counter)
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi            v20.4s, #1
        movi            v17.4s, #1
        shl             v20.4s, v20.4s, #19
        sub             v20.4s, v20.4s, v17.4s      // v20 = (1 << 19) - 1, upper clip bound

        lsl             w7, w6, #1                  // filterSize*2: size in bytes of one int16 filter row
1:                                                  // outer loop: one pass per 4 output pixels
        ldp             w8, w9, [x5]                // filterPos[idx + 0], filterPos[idx + 1]
        ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], filterPos[idx + 3]

        movi            v16.16b, #0                 // initialize accumulator for idx + 0
        movi            v17.16b, #0                 // initialize accumulator for idx + 1
        movi            v18.16b, #0                 // initialize accumulator for idx + 2
        movi            v19.16b, #0                 // initialize accumulator for idx + 3

        mov             x12, x4                     // filter + 0
        add             x13, x4, x7                 // filter + 1
        add             x8, x3, w8, uxtw            // srcp + filterPos 0
        add             x14, x13, x7                // filter + 2
        add             x9, x3, w9, uxtw            // srcp + filterPos 1
        add             x15, x14, x7                // filter + 3
        add             x10, x3, w10, uxtw          // srcp + filterPos 2
        mov             w0, w6                      // save the filterSize to temporary variable
        add             x11, x3, w11, uxtw          // srcp + filterPos 3
        add             x5, x5, #16                 // advance filter position
        mov             x16, xzr                    // clear the register x16 used for offsetting the filter values

2:                                                  // inner loop: 8 taps per pass
        ldr             d4, [x8], #8                // load src values for idx 0
        ldr             q31, [x12, x16]             // load filter values for idx 0
        uxtl            v4.8h, v4.8b                // extend type to match the filter's size
        ldr             d5, [x9], #8                // load src values for idx 1
        smlal           v16.4s, v4.4h, v31.4h       // multiplication of lower half for idx 0
        uxtl            v5.8h, v5.8b                // extend type to match the filter's size
        ldr             q30, [x13, x16]             // load filter values for idx 1
        smlal2          v16.4s, v4.8h, v31.8h       // multiplication of upper half for idx 0
        ldr             d6, [x10], #8               // load src values for idx 2
        ldr             q29, [x14, x16]             // load filter values for idx 2
        smlal           v17.4s, v5.4h, v30.4h       // multiplication of lower half for idx 1
        ldr             d7, [x11], #8               // load src values for idx 3
        smlal2          v17.4s, v5.8h, v30.8h       // multiplication of upper half for idx 1
        uxtl            v6.8h, v6.8b                // extend type to match the filter's size
        ldr             q28, [x15, x16]             // load filter values for idx 3
        smlal           v18.4s, v6.4h, v29.4h       // multiplication of lower half for idx 2
        uxtl            v7.8h, v7.8b                // extend type to match the filter's size
        smlal2          v18.4s, v6.8h, v29.8h       // multiplication of upper half for idx 2
        sub             w0, w0, #8                  // 8 taps consumed
        smlal           v19.4s, v7.4h, v28.4h       // multiplication of lower half for idx 3
        cmp             w0, #8                      // at least 8 taps left?
        smlal2          v19.4s, v7.8h, v28.8h       // multiplication of upper half for idx 3
        add             x16, x16, #16               // advance filter values indexing

        b.ge            2b


        // 4 iterations left

        sub             x17, x7, #8                 // step back to wrap up the filter pos for last 4 elements

        ldr             s4, [x8]                    // load src values for idx 0
        ldr             d31, [x12, x17]             // load filter values for idx 0
        uxtl            v4.8h, v4.8b                // extend type to match the filter's size
        ldr             s5, [x9]                    // load src values for idx 1
        smlal           v16.4s, v4.4h, v31.4h       // last 4 taps for idx 0
        ldr             d30, [x13, x17]             // load filter values for idx 1
        uxtl            v5.8h, v5.8b                // extend type to match the filter's size
        ldr             s6, [x10]                   // load src values for idx 2
        smlal           v17.4s, v5.4h, v30.4h       // last 4 taps for idx 1
        uxtl            v6.8h, v6.8b                // extend type to match the filter's size
        ldr             d29, [x14, x17]             // load filter values for idx 2
        ldr             s7, [x11]                   // load src values for idx 3
        addp            v16.4s, v16.4s, v17.4s      // pairwise sums of idx 0 and idx 1
        uxtl            v7.8h, v7.8b                // extend type to match the filter's size
        ldr             d28, [x15, x17]             // load filter values for idx 3
        smlal           v18.4s, v6.4h, v29.4h       // last 4 taps for idx 2
        smlal           v19.4s, v7.4h, v28.4h       // last 4 taps for idx 3
        subs            w2, w2, #4                  // dstW -= 4 (flags consumed by b.gt below)
        addp            v18.4s, v18.4s, v19.4s      // pairwise sums of idx 2 and idx 3
        addp            v16.4s, v16.4s, v18.4s      // one 32-bit sum per output pixel
        sshr            v16.4s, v16.4s, #3          // scale the sums down by 3 bits
        smin            v16.4s, v16.4s, v20.4s      // clamp to (1 << 19) - 1 (upper bound only)

        st1             {v16.4s}, [x1], #16
        add             x4, x4, x7, lsl #2          // skip the four filter rows just used
        b.gt            1b
        ret
endfunc
function ff_hscale16to15_4_neon_asm, export=1
        // Horizontal scaling of 16-bit input with a 4-tap filter to 15-bit
        // output. While at least 16 pixels remain, 8 pixels are produced per
        // pass; any remaining pixels are handled one at a time at label 2.
        //
        // w0               int shift
        // x1               int16_t *dst
        // w2               int dstW
        // x3               const uint8_t *src // treat it as uint16_t *src
        // x4               const int16_t *filter
        // x5               const int32_t *filterPos
        // w6               int filterSize (not read here; 4 taps are hard-coded)

        movi            v18.4s, #1
        movi            v17.4s, #1
        shl             v18.4s, v18.4s, #15
        sub             v18.4s, v18.4s, v17.4s      // max allowed value
        dup             v17.4s, w0                  // read shift
        neg             v17.4s, v17.4s              // negate it, so it can be used in sshl (effectively shift right)

        cmp             w2, #16
        b.lt            2f // move to last block

        ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
        ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
        ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
        ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
        add             x5, x5, #32

        // shift all filterPos left by one, as uint16_t will be read
        lsl             x8, x8, #1
        lsl             x9, x9, #1
        lsl             x10, x10, #1
        lsl             x11, x11, #1
        lsl             x12, x12, #1
        lsl             x13, x13, #1
        lsl             x14, x14, #1
        lsl             x15, x15, #1

        // load src with given offset
        ldr             x8,  [x3, w8,  uxtw]
        ldr             x9,  [x3, w9,  uxtw]
        ldr             x10, [x3, w10, uxtw]
        ldr             x11, [x3, w11, uxtw]
        ldr             x12, [x3, w12, uxtw]
        ldr             x13, [x3, w13, uxtw]
        ldr             x14, [x3, w14, uxtw]
        ldr             x15, [x3, w15, uxtw]

        sub             sp, sp, #64
        // push src on stack so it can be loaded into vectors later
        stp             x8, x9, [sp]
        stp             x10, x11, [sp, #16]
        stp             x12, x13, [sp, #32]
        stp             x14, x15, [sp, #48]

1:                                                  // main loop: 8 output pixels per pass
        // ld4 de-interleaves with a stride of 4: v0/v28 receive tap 0 of the
        // eight pixels, v1/v29 tap 1, v2/v30 tap 2 and v3/v31 tap 3.
        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // Each of blocks does the following:
        // Extend src and filter to 32 bits with uxtl and sxtl
        // multiply or multiply and accumulate results
        // Extending to 32 bits is necessary, as uint16_t values can't
        // be represented as int16_t without type promotion.
        uxtl            v26.4s, v0.4h
        sxtl            v27.4s, v28.4h
        uxtl2           v0.4s, v0.8h
        mul             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v28.8h
        uxtl            v26.4s, v1.4h
        mul             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v29.4h
        uxtl2           v0.4s, v1.8h
        mla             v5.4s, v27.4s, v26.4s
        sxtl2           v28.4s, v29.8h
        uxtl            v26.4s, v2.4h
        mla             v6.4s, v28.4s, v0.4s

        sxtl            v27.4s, v30.4h
        uxtl2           v0.4s, v2.8h
        mla             v5.4s, v27.4s, v26.4s
        sxtl2           v28.4s, v30.8h
        uxtl            v26.4s, v3.4h
        mla             v6.4s, v28.4s, v0.4s

        sxtl            v27.4s, v31.4h
        uxtl2           v0.4s, v3.8h
        mla             v5.4s, v27.4s, v26.4s
        sxtl2           v28.4s, v31.8h
        sub             w2, w2, #8                  // 8 pixels done
        mla             v6.4s, v28.4s, v0.4s

        sshl            v5.4s, v5.4s, v17.4s        // shift right by `shift`
        sshl            v6.4s, v6.4s, v17.4s
        smin            v5.4s, v5.4s, v18.4s        // clamp to (1 << 15) - 1
        smin            v6.4s, v6.4s, v18.4s
        xtn             v5.4h, v5.4s                // narrow to 16 bits
        xtn2            v5.8h, v6.4s

        st1             {v5.8h}, [x1], #16
        cmp             w2, #16                     // flags survive until b.ge below

        // load filterPositions into registers for next iteration
        // (at least 8 pixels remain here, so this group is always consumed,
        // either by another loop pass or by the peeled last iteration)
        ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
        ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
        ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
        ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
        add             x5, x5, #32

        lsl             x8, x8, #1
        lsl             x9, x9, #1
        lsl             x10, x10, #1
        lsl             x11, x11, #1
        lsl             x12, x12, #1
        lsl             x13, x13, #1
        lsl             x14, x14, #1
        lsl             x15, x15, #1

        ldr             x8,  [x3, w8,  uxtw]
        ldr             x9,  [x3, w9,  uxtw]
        ldr             x10, [x3, w10, uxtw]
        ldr             x11, [x3, w11, uxtw]
        ldr             x12, [x3, w12, uxtw]
        ldr             x13, [x3, w13, uxtw]
        ldr             x14, [x3, w14, uxtw]
        ldr             x15, [x3, w15, uxtw]

        stp             x8, x9, [sp]
        stp             x10, x11, [sp, #16]
        stp             x12, x13, [sp, #32]
        stp             x14, x15, [sp, #48]

        b.ge            1b

        // here we make last iteration, without updating the registers
        ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        uxtl            v26.4s, v0.4h
        sxtl            v27.4s, v28.4h
        uxtl2           v0.4s, v0.8h
        mul             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v28.8h
        uxtl            v26.4s, v1.4h
        mul             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v29.4h
        uxtl2           v0.4s, v1.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v29.8h
        uxtl            v26.4s, v2.4h
        mla             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v30.4h
        uxtl2           v0.4s, v2.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v30.8h
        uxtl            v26.4s, v3.4h
        mla             v6.4s, v0.4s, v28.4s

        sxtl            v27.4s, v31.4h
        uxtl2           v0.4s, v3.8h
        mla             v5.4s, v26.4s, v27.4s
        sxtl2           v28.4s, v31.8h
        subs            w2, w2, #8
        mla             v6.4s, v0.4s, v28.4s

        sshl            v5.4s, v5.4s, v17.4s
        sshl            v6.4s, v6.4s, v17.4s
        smin            v5.4s, v5.4s, v18.4s
        smin            v6.4s, v6.4s, v18.4s
        xtn             v5.4h, v5.4s
        xtn2            v5.8h, v6.4s

        st1             {v5.8h}, [x1], #16
        add             sp, sp, #64                 // restore stack
        cbnz            w2, 2f                      // 1..7 pixels left: finish them one by one

        ret

2:                                                  // tail: one output pixel per pass
        ldr             w8, [x5], #4                // load filterPos
        lsl             w8, w8, #1                  // byte offset, as uint16_t will be read
        add             x9, x3, w8, uxtw            // src + filterPos
        ld1             {v0.4h}, [x9]               // load 4 * uint16_t
        ld1             {v31.4h}, [x4], #8          // load the 4 filter coefficients

        uxtl            v0.4s, v0.4h
        sxtl            v31.4s, v31.4h
        mul             v5.4s, v0.4s, v31.4s
        addv            s0, v5.4s                   // sum of the 4 products
        sshl            v0.4s, v0.4s, v17.4s        // shift right by `shift`
        smin            v0.4s, v0.4s, v18.4s        // clamp to (1 << 15) - 1
        st1             {v0.h}[0], [x1], #2
        sub             w2, w2, #1
        cbnz            w2, 2b                      // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale16to15_X8_neon_asm, export=1
        // Horizontal scaling of 16-bit input to 15-bit output. Four output
        // pixels are produced per outer pass and eight filter taps are
        // consumed per inner pass.
        //
        // w0               int shift
        // x1               int16_t *dst
        // w2               int dstW
        // x3               const uint8_t *src // read as uint16_t samples
        // x4               const int16_t *filter
        // x5               const int32_t *filterPos
        // w6               int filterSize

        // v20 = (1 << 15) - 1 is the clip ceiling; v21 = -shift, so that
        // sshl by v21 performs a right shift.
        movi            v20.4s, #1
        movi            v21.4s, #1
        shl             v20.4s, v20.4s, #15
        sub             v20.4s, v20.4s, v21.4s
        dup             v21.4s, w0
        neg             v21.4s, v21.4s

        sbfiz           x7, x6, #1, #32             // x7 = filterSize * 2, byte stride between filter rows

1:
        // Outer loop. Read four filter positions and double each one to get
        // a byte offset into the 16-bit source line.
        ldr             w8, [x5], #4                // filterPos[idx]
        lsl             w8, w8, #1
        ldr             w10, [x5], #4               // filterPos[idx + 1]
        lsl             w10, w10, #1
        ldr             w11, [x5], #4               // filterPos[idx + 2]
        lsl             w11, w11, #1
        ldr             w9, [x5], #4                // filterPos[idx + 3]
        lsl             w9, w9, #1

        // One filter row pointer per output pixel. x4 itself serves as the
        // row-3 pointer, so once the inner loop has consumed that row it
        // already addresses row 0 of the next group.
        mov             x16, x4                     // row 0
        add             x12, x16, x7                // row 1
        add             x13, x12, x7                // row 2
        add             x4, x13, x7                 // row 3

        // Clear the four 32-bit accumulators.
        movi            v0.16b, #0                  // sums for dst[0]
        movi            v1.16b, #0                  // sums for dst[1]
        movi            v2.16b, #0                  // sums for dst[2]
        movi            v3.16b, #0                  // sums for dst[3]

        // One source pointer per output pixel.
        add             x17, x3, w8, uxtw           // src + offset 0
        add             x8,  x3, w10, uxtw          // src + offset 1
        add             x10, x3, w11, uxtw          // src + offset 2
        add             x11, x3, w9, uxtw           // src + offset 3
        mov             w15, w6                     // taps left for this group

2:
        // Inner loop. Source samples are zero-extended and coefficients are
        // sign-extended to 32 bits before each multiply-accumulate.
        ld1             {v4.8h}, [x17], #16         // 8 samples for pixel 0
        ld1             {v5.8h}, [x16], #16         // 8 coefficients of row 0
        ld1             {v6.8h}, [x8], #16          // 8 samples for pixel 1
        ld1             {v7.8h}, [x12], #16         // 8 coefficients of row 1
        uxtl            v24.4s, v4.4h               // pixel 0, samples 0..3
        sxtl            v25.4s, v5.4h               // row 0, coefficients 0..3
        uxtl2           v4.4s, v4.8h                // pixel 0, samples 4..7
        mla             v0.4s, v24.4s, v25.4s       // v0 += samples 0..3 * coefficients 0..3
        sxtl2           v5.4s, v5.8h                // row 0, coefficients 4..7
        uxtl            v26.4s, v6.4h               // pixel 1, samples 0..3
        mla             v0.4s, v4.4s, v5.4s         // v0 += samples 4..7 * coefficients 4..7
        sxtl            v27.4s, v7.4h               // row 1, coefficients 0..3
        uxtl2           v6.4s, v6.8h                // pixel 1, samples 4..7
        sxtl2           v7.4s, v7.8h                // row 1, coefficients 4..7
        ld1             {v16.8h}, [x10], #16        // 8 samples for pixel 2
        mla             v1.4s, v26.4s, v27.4s       // v1 += samples 0..3 * coefficients 0..3
        ld1             {v17.8h}, [x13], #16        // 8 coefficients of row 2
        uxtl            v22.4s, v16.4h              // pixel 2, samples 0..3
        sxtl            v23.4s, v17.4h              // row 2, coefficients 0..3
        uxtl2           v16.4s, v16.8h              // pixel 2, samples 4..7
        sxtl2           v17.4s, v17.8h              // row 2, coefficients 4..7
        mla             v2.4s, v22.4s, v23.4s       // v2 += samples 0..3 * coefficients 0..3
        mla             v2.4s, v16.4s, v17.4s       // v2 += samples 4..7 * coefficients 4..7
        ld1             {v18.8h}, [x11], #16        // 8 samples for pixel 3
        mla             v1.4s, v6.4s, v7.4s         // v1 += samples 4..7 * coefficients 4..7
        ld1             {v19.8h}, [x4], #16         // 8 coefficients of row 3
        subs            w15, w15, #8                // 8 taps consumed; flags feed b.gt below
        uxtl            v28.4s, v18.4h              // pixel 3, samples 0..3
        sxtl            v29.4s, v19.4h              // row 3, coefficients 0..3
        uxtl2           v18.4s, v18.8h              // pixel 3, samples 4..7
        sxtl2           v19.4s, v19.8h              // row 3, coefficients 4..7
        mla             v3.4s, v28.4s, v29.4s       // v3 += samples 0..3 * coefficients 0..3
        mla             v3.4s, v18.4s, v19.4s       // v3 += samples 4..7 * coefficients 4..7
        b.gt            2b                          // more taps to go for this group

        // Fold each accumulator down to a single 32-bit sum per pixel.
        addp            v0.4s, v0.4s, v1.4s         // pairwise sums of pixels 0 and 1
        addp            v2.4s, v2.4s, v3.4s         // pairwise sums of pixels 2 and 3
        addp            v0.4s, v0.4s, v2.4s         // v0 = {sum0, sum1, sum2, sum3}
        subs            w2, w2, #4                  // 4 pixels done; flags feed the final b.gt
        sshl            v0.4s, v0.4s, v21.4s        // right shift (the count is negative); overflow expected
        smin            v0.4s, v0.4s, v20.4s        // clamp with smin (do not use sqshl)
        xtn             v0.4h, v0.4s                // narrow to 16 bits

        st1             {v0.4h}, [x1], #8           // store 4 output pixels
        b.gt            1b                          // next group until the line is done
        ret
endfunc
function ff_hscale16to15_X4_neon_asm, export=1
        // Horizontal scaling of 16-bit input to 15-bit output. Four output
        // pixels are produced per outer pass. The inner loop consumes 8 taps
        // per pass and a fixed tail handles the final 4 taps, so filterSize
        // is expected to be 8*k + 4 with k >= 1.
        //
        // w0  int shift (w0 is reused as the tap counter once shift is copied)
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src (read as uint16_t samples)
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        // v8-v10 are used as temporaries below; their low halves are
        // callee-saved, so d8-d11 are preserved on the stack.
        stp             d8, d9, [sp, #-0x20]!
        stp             d10, d11, [sp, #0x10]

        movi            v18.4s, #1
        movi            v17.4s, #1
        shl             v18.4s, v18.4s, #15
        sub             v21.4s, v18.4s, v17.4s      // max allowed value
        dup             v17.4s, w0                  // read shift
        neg             v20.4s, v17.4s              // negate it, so it can be used in sshl (effectively shift right)

        lsl             w7, w6, #1                  // filterSize*2: size in bytes of one int16 filter row
1:                                                  // outer loop: one pass per 4 output pixels
        ldp             w8, w9, [x5]                // filterPos[idx + 0], filterPos[idx + 1]
        ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], filterPos[idx + 3]

        movi            v16.16b, #0                 // initialize accumulator for idx + 0
        movi            v17.16b, #0                 // initialize accumulator for idx + 1
        movi            v18.16b, #0                 // initialize accumulator for idx + 2
        movi            v19.16b, #0                 // initialize accumulator for idx + 3

        mov             x12, x4                     // filter + 0
        add             x13, x4, x7                 // filter + 1
        add             x8, x3, x8, lsl #1          // srcp + filterPos 0
        add             x14, x13, x7                // filter + 2
        add             x9, x3, x9, lsl #1          // srcp + filterPos 1
        add             x15, x14, x7                // filter + 3
        add             x10, x3, x10, lsl #1        // srcp + filterPos 2
        mov             w0, w6                      // save the filterSize to temporary variable
        add             x11, x3, x11, lsl #1        // srcp + filterPos 3
        add             x5, x5, #16                 // advance filter position
        mov             x16, xzr                    // clear the register x16 used for offsetting the filter values

2:                                                  // inner loop: 8 taps per pass
        ldr             q4, [x8], #16               // load src values for idx 0
        ldr             q5, [x9], #16               // load src values for idx 1
        uxtl            v26.4s, v4.4h
        uxtl2           v4.4s, v4.8h
        ldr             q31, [x12, x16]             // load filter values for idx 0
        ldr             q6, [x10], #16              // load src values for idx 2
        sxtl            v22.4s, v31.4h
        sxtl2           v31.4s, v31.8h
        mla             v16.4s, v26.4s, v22.4s      // multiplication of lower half for idx 0
        uxtl            v25.4s, v5.4h
        uxtl2           v5.4s, v5.8h
        ldr             q30, [x13, x16]             // load filter values for idx 1
        ldr             q7, [x11], #16              // load src values for idx 3
        mla             v16.4s, v4.4s, v31.4s       // multiplication of upper half for idx 0
        uxtl            v24.4s, v6.4h
        sxtl            v8.4s, v30.4h
        sxtl2           v30.4s, v30.8h
        mla             v17.4s, v25.4s, v8.4s       // multiplication of lower half for idx 1
        ldr             q29, [x14, x16]             // load filter values for idx 2
        uxtl2           v6.4s, v6.8h
        sxtl            v9.4s, v29.4h
        sxtl2           v29.4s, v29.8h
        mla             v17.4s, v5.4s, v30.4s       // multiplication of upper half for idx 1
        mla             v18.4s, v24.4s, v9.4s       // multiplication of lower half for idx 2
        ldr             q28, [x15, x16]             // load filter values for idx 3
        uxtl            v23.4s, v7.4h
        sxtl            v10.4s, v28.4h
        mla             v18.4s, v6.4s, v29.4s       // multiplication of upper half for idx 2
        uxtl2           v7.4s, v7.8h
        sxtl2           v28.4s, v28.8h
        mla             v19.4s, v23.4s, v10.4s      // multiplication of lower half for idx 3
        sub             w0, w0, #8                  // 8 taps consumed
        cmp             w0, #8                      // at least 8 taps left?
        mla             v19.4s, v7.4s, v28.4s       // multiplication of upper half for idx 3

        add             x16, x16, #16               // advance filter values indexing

        b.ge            2b

        // 4 iterations left

        sub             x17, x7, #8                 // step back to wrap up the filter pos for last 4 elements

        ldr             d4, [x8]                    // load src values for idx 0
        ldr             d31, [x12, x17]             // load filter values for idx 0
        uxtl            v4.4s, v4.4h
        sxtl            v31.4s, v31.4h
        ldr             d5, [x9]                    // load src values for idx 1
        mla             v16.4s, v4.4s, v31.4s       // last 4 taps for idx 0
        ldr             d30, [x13, x17]             // load filter values for idx 1
        uxtl            v5.4s, v5.4h
        sxtl            v30.4s, v30.4h
        ldr             d6, [x10]                   // load src values for idx 2
        mla             v17.4s, v5.4s, v30.4s       // last 4 taps for idx 1
        ldr             d29, [x14, x17]             // load filter values for idx 2
        uxtl            v6.4s, v6.4h
        sxtl            v29.4s, v29.4h
        ldr             d7, [x11]                   // load src values for idx 3
        ldr             d28, [x15, x17]             // load filter values for idx 3
        mla             v18.4s, v6.4s, v29.4s       // last 4 taps for idx 2
        uxtl            v7.4s, v7.4h
        sxtl            v28.4s, v28.4h
        addp            v16.4s, v16.4s, v17.4s      // pairwise sums of idx 0 and idx 1
        mla             v19.4s, v7.4s, v28.4s       // last 4 taps for idx 3
        subs            w2, w2, #4                  // dstW -= 4 (flags consumed by b.gt below)
        addp            v18.4s, v18.4s, v19.4s      // pairwise sums of idx 2 and idx 3
        addp            v16.4s, v16.4s, v18.4s      // one 32-bit sum per output pixel
        sshl            v16.4s, v16.4s, v20.4s      // shift right by `shift`
        smin            v16.4s, v16.4s, v21.4s      // clamp to (1 << 15) - 1
        xtn             v16.4h, v16.4s              // narrow to 16 bits

        st1             {v16.4h}, [x1], #8
        add             x4, x4, x7, lsl #2          // skip the four filter rows just used
        b.gt            1b

        ldp             d8, d9, [sp]
        ldp             d10, d11, [sp, #0x10]

        add             sp, sp, #0x20

        ret
endfunc
1049 function ff_hscale16to19_4_neon_asm, export=1
1050         // w0               int shift
1051         // x1               int32_t *dst
1052         // w2               int dstW
1053         // x3               const uint8_t *src // treat it as uint16_t *src
1054         // x4               const uint16_t *filter
1055         // x5               const int32_t *filterPos
1056         // w6               int filterSize
1058         movi            v18.4s, #1
1059         movi            v17.4s, #1
1060         shl             v18.4s, v18.4s, #19
1061         sub             v18.4s, v18.4s, v17.4s      // max allowed value
1062         dup             v17.4s, w0                  // read shift
1063         neg             v17.4s, v17.4s              // negate it, so it can be used in sshl (effectively shift right)
1065         cmp             w2, #16
1066         b.lt            2f // move to last block
1068         ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
1069         ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
1070         ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
1071         ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
1072         add             x5, x5, #32
1074         // shift all filterPos left by one, as uint16_t will be read
1075         lsl             x8, x8, #1
1076         lsl             x9, x9, #1
1077         lsl             x10, x10, #1
1078         lsl             x11, x11, #1
1079         lsl             x12, x12, #1
1080         lsl             x13, x13, #1
1081         lsl             x14, x14, #1
1082         lsl             x15, x15, #1
1084         // load src with given offset
1085         ldr             x8,  [x3, w8,  uxtw]
1086         ldr             x9,  [x3, w9,  uxtw]
1087         ldr             x10, [x3, w10, uxtw]
1088         ldr             x11, [x3, w11, uxtw]
1089         ldr             x12, [x3, w12, uxtw]
1090         ldr             x13, [x3, w13, uxtw]
1091         ldr             x14, [x3, w14, uxtw]
1092         ldr             x15, [x3, w15, uxtw]
1094         sub             sp, sp, #64
1095         // push src on stack so it can be loaded into vectors later
1096         stp             x8, x9, [sp]
1097         stp             x10, x11, [sp, #16]
1098         stp             x12, x13, [sp, #32]
1099         stp             x14, x15, [sp, #48]
1102         ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
1103         ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]
1105         // Each of blocks does the following:
1106         // Extend src and filter to 32 bits with uxtl and sxtl
1107         // multiply or multiply and accumulate results
1108         // Extending to 32 bits is necessary, as uint16_t values can't
1109         // be represented as int16_t without type promotion.
1110         uxtl            v26.4s, v0.4h
1111         sxtl            v27.4s, v28.4h
1112         uxtl2           v0.4s, v0.8h
1113         mul             v5.4s, v26.4s, v27.4s
1114         sxtl2           v28.4s, v28.8h
1115         uxtl            v26.4s, v1.4h
1116         mul             v6.4s, v0.4s, v28.4s
1118         sxtl            v27.4s, v29.4h
1119         uxtl2           v0.4s, v1.8h
1120         mla             v5.4s, v27.4s, v26.4s
1121         sxtl2           v28.4s, v29.8h
1122         uxtl            v26.4s, v2.4h
1123         mla             v6.4s, v28.4s, v0.4s
1125         sxtl            v27.4s, v30.4h
1126         uxtl2           v0.4s, v2.8h
1127         mla             v5.4s, v27.4s, v26.4s
1128         sxtl2           v28.4s, v30.8h
1129         uxtl            v26.4s, v3.4h
1130         mla             v6.4s, v28.4s, v0.4s
1132         sxtl            v27.4s, v31.4h
1133         uxtl2           v0.4s, v3.8h
1134         mla             v5.4s, v27.4s, v26.4s
1135         sxtl2           v28.4s, v31.8h
1136         sub             w2, w2, #8
1137         mla             v6.4s, v28.4s, v0.4s
1139         sshl            v5.4s, v5.4s, v17.4s
1140         sshl            v6.4s, v6.4s, v17.4s
1141         smin            v5.4s, v5.4s, v18.4s
1142         smin            v6.4s, v6.4s, v18.4s
1144         st1             {v5.4s, v6.4s}, [x1], #32
1145         cmp             w2, #16
1147         // load filterPositions into registers for next iteration
1148         ldp             w8, w9, [x5]                // filterPos[0], filterPos[1]
1149         ldp             w10, w11, [x5, #8]          // filterPos[2], filterPos[3]
1150         ldp             w12, w13, [x5, #16]         // filterPos[4], filterPos[5]
1151         ldp             w14, w15, [x5, #24]         // filterPos[6], filterPos[7]
1152         add             x5, x5, #32
1154         lsl             x8, x8, #1
1155         lsl             x9, x9, #1
1156         lsl             x10, x10, #1
1157         lsl             x11, x11, #1
1158         lsl             x12, x12, #1
1159         lsl             x13, x13, #1
1160         lsl             x14, x14, #1
1161         lsl             x15, x15, #1
1163         ldr             x8,  [x3, w8,  uxtw]
1164         ldr             x9,  [x3, w9,  uxtw]
1165         ldr             x10, [x3, w10, uxtw]
1166         ldr             x11, [x3, w11, uxtw]
1167         ldr             x12, [x3, w12, uxtw]
1168         ldr             x13, [x3, w13, uxtw]
1169         ldr             x14, [x3, w14, uxtw]
1170         ldr             x15, [x3, w15, uxtw]
1172         stp             x8, x9, [sp]
1173         stp             x10, x11, [sp, #16]
1174         stp             x12, x13, [sp, #32]
1175         stp             x14, x15, [sp, #48]
1177         b.ge            1b
1179         // here we make last iteration, without updating the registers
1180         ld4             {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
1181         ld4             {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64
1183         uxtl            v26.4s, v0.4h
1184         sxtl            v27.4s, v28.4h
1185         uxtl2           v0.4s, v0.8h
1186         mul             v5.4s, v26.4s, v27.4s
1187         sxtl2           v28.4s, v28.8h
1188         uxtl            v26.4s, v1.4h
1189         mul             v6.4s, v0.4s, v28.4s
1191         sxtl            v27.4s, v29.4h
1192         uxtl2           v0.4s, v1.8h
1193         mla             v5.4s, v26.4s, v27.4s
1194         sxtl2           v28.4s, v29.8h
1195         uxtl            v26.4s, v2.4h
1196         mla             v6.4s, v0.4s, v28.4s
1198         sxtl            v27.4s, v30.4h
1199         uxtl2           v0.4s, v2.8h
1200         mla             v5.4s, v26.4s, v27.4s
1201         sxtl2           v28.4s, v30.8h
1202         uxtl            v26.4s, v3.4h
1203         mla             v6.4s, v0.4s, v28.4s
1205         sxtl            v27.4s, v31.4h
1206         uxtl2           v0.4s, v3.8h
1207         mla             v5.4s, v26.4s, v27.4s
1208         sxtl2           v28.4s, v31.8h
1209         subs            w2, w2, #8
1210         mla             v6.4s, v0.4s, v28.4s
1212         sshl            v5.4s, v5.4s, v17.4s
1213         sshl            v6.4s, v6.4s, v17.4s
1215         smin            v5.4s, v5.4s, v18.4s
1216         smin            v6.4s, v6.4s, v18.4s
1218         st1             {v5.4s, v6.4s}, [x1], #32
1219         add             sp, sp, #64                 // restore stack
1220         cbnz            w2, 2f
1222         ret
1225         ldr             w8, [x5], #4                // load filterPos
1226         lsl             w8, w8, #1
1227         add             x9, x3, w8, uxtw            // src + filterPos
1228         ld1             {v0.4h}, [x9]               // load 4 * uint16_t
1229         ld1             {v31.4h}, [x4], #8
1231         uxtl            v0.4s, v0.4h
1232         sxtl            v31.4s, v31.4h
1233         subs            w2, w2, #1
1234         mul             v5.4s, v0.4s, v31.4s
1235         addv            s0, v5.4s
1236         sshl            v0.4s, v0.4s, v17.4s
1237         smin            v0.4s, v0.4s, v18.4s
1238         st1             {v0.s}[0], [x1], #4
1239         cbnz            w2, 2b                      // if iterations remain jump to beginning
1241         ret
1242 endfunc
1244 function ff_hscale16to19_X8_neon_asm, export=1
             // Horizontal scaler: 16-bit source samples -> 32-bit outputs clamped to 19 bits.
             // For each output pixel i:
             //   dst[i] = min((sum_j src[filterPos[i] + j] * filter[i*filterSize + j]) >> shift,
             //                (1 << 19) - 1)
             // The outer loop (1:) produces 4 output pixels per pass, the inner loop (2:)
             // consumes 8 filter taps per pass for each of those 4 pixels.
             // NOTE(review): presumably the caller guarantees filterSize % 8 == 0 and
             // dstW % 4 == 0; the loops below do not handle remainders -- confirm.
1245         // w0               int shift
1246         // x1               int32_t *dst
1247         // w2               int dstW
1248         // x3               const uint8_t *src // treat it as uint16_t *src
1249         // x4               const int16_t *filter // coefficients are sign-extended (sxtl) below
1250         // x5               const int32_t *filterPos
1251         // w6               int filterSize
1253         movi            v20.4s, #1
1254         movi            v21.4s, #1
1255         shl             v20.4s, v20.4s, #19
1256         sub             v20.4s, v20.4s, v21.4s      // v20 = (1 << 19) - 1, max allowed value
1257         dup             v21.4s, w0                  // broadcast shift to all lanes
1258         neg             v21.4s, v21.4s              // negate it, so sshl effectively shifts right
1260         sbfiz           x7, x6, #1, #32             // filterSize*2 (*2 because int16)
1261 1:      ldr             w8, [x5], #4                // filterPos[idx]
1262         ldr             w10, [x5], #4               // filterPos[idx + 1]
1263         lsl             w8, w8, #1                  // filterPos[idx] * 2: byte offset into uint16_t src
1264         ldr             w11, [x5], #4               // filterPos[idx + 2]
1265         ldr             w9, [x5], #4                // filterPos[idx + 3]
1266         mov             x16, x4                     // filter0 = filter
1267         lsl             w11, w11, #1                // filterPos[idx + 2] * 2
1268         add             x12, x16, x7                // filter1 = filter0 + filterSize*2
1269         lsl             w9, w9, #1                  // filterPos[idx + 3] * 2
1270         add             x13, x12, x7                // filter2 = filter1 + filterSize*2
1271         lsl             w10, w10, #1                // filterPos[idx + 1] * 2
1272         add             x4, x13, x7                 // filter3 = filter2 + filterSize*2
1273         movi            v0.16b, #0                  // val sum part 1 (for dst[0])
1274         movi            v1.16b, #0                  // val sum part 2 (for dst[1])
1275         movi            v2.16b, #0                  // val sum part 3 (for dst[2])
1276         movi            v3.16b, #0                  // val sum part 4 (for dst[3])
1277         add             x17, x3, w8, uxtw           // srcp + filterPos[0]
1278         add             x8,  x3, w10, uxtw          // srcp + filterPos[1]
1279         add             x10, x3, w11, uxtw          // srcp + filterPos[2]
1280         add             x11, x3, w9, uxtw           // srcp + filterPos[3]
1281         mov             w15, w6                     // filterSize counter
1282 2:      ld1             {v4.8h}, [x17], #16         // srcp[filterPos[0] + {0..7}]
1283         ld1             {v5.8h}, [x16], #16         // load 8x16-bit filter values, part 1
1284         ld1             {v6.8h}, [x8], #16          // srcp[filterPos[1] + {0..7}]
1285         ld1             {v7.8h}, [x12], #16         // load 8x16-bit at filter+filterSize
1286         uxtl            v24.4s, v4.4h               // zero-extend srcp lower half to 32 bits so uint16_t values are not treated as negative
1287         sxtl            v25.4s, v5.4h               // sign-extend filter lower half to 32 bits to match srcp size
1288         uxtl2           v4.4s, v4.8h                // extend srcp upper half to 32 bits
1289         mla             v0.4s, v24.4s, v25.4s       // multiply accumulate lower half of v4 * v5
1290         sxtl2           v5.4s, v5.8h                // extend filter upper half to 32 bits
1291         uxtl            v26.4s, v6.4h               // extend srcp lower half to 32 bits
1292         mla             v0.4s, v4.4s, v5.4s         // multiply accumulate upper half of v4 * v5
1293         sxtl            v27.4s, v7.4h               // extend filter lower half
1294         uxtl2           v6.4s, v6.8h                // extend srcp upper half
1295         sxtl2           v7.4s, v7.8h                // extend filter upper half
1296         ld1             {v16.8h}, [x10], #16        // srcp[filterPos[2] + {0..7}]
1297         mla             v1.4s, v26.4s, v27.4s       // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
1298         ld1             {v17.8h}, [x13], #16        // load 8x16-bit at filter+2*filterSize
1299         uxtl            v22.4s, v16.4h              // extend srcp lower half
1300         sxtl            v23.4s, v17.4h              // extend filter lower half
1301         uxtl2           v16.4s, v16.8h              // extend srcp upper half
1302         sxtl2           v17.4s, v17.8h              // extend filter upper half
1303         mla             v2.4s, v22.4s, v23.4s       // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
1304         mla             v2.4s, v16.4s, v17.4s       // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
1305         ld1             {v18.8h}, [x11], #16        // srcp[filterPos[3] + {0..7}]
1306         mla             v1.4s, v6.4s, v7.4s         // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
1307         ld1             {v19.8h}, [x4], #16         // load 8x16-bit at filter+3*filterSize
1308         subs            w15, w15, #8                // j -= 8: processed 8/filterSize
1309         uxtl            v28.4s, v18.4h              // extend srcp lower half
1310         sxtl            v29.4s, v19.4h              // extend filter lower half
1311         uxtl2           v18.4s, v18.8h              // extend srcp upper half
1312         sxtl2           v19.4s, v19.8h              // extend filter upper half
1313         mla             v3.4s, v28.4s, v29.4s       // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
1314         mla             v3.4s, v18.4s, v19.4s       // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
1315         b.gt            2b                          // inner loop if filterSize not consumed completely
1316         addp            v0.4s, v0.4s, v1.4s         // part01 horizontal pair adding
1317         addp            v2.4s, v2.4s, v3.4s         // part23 horizontal pair adding
1318         addp            v0.4s, v0.4s, v2.4s         // part0123 horizontal pair adding
1319         subs            w2, w2, #4                  // dstW -= 4
1320         sshl            v0.4s, v0.4s, v21.4s        // shift right (effectively right, as shift is negative); overflow expected
1321         smin            v0.4s, v0.4s, v20.4s        // apply min (do not use sqshl)
1322         st1             {v0.4s}, [x1], #16          // write to destination part0123
1323         b.gt            1b                          // loop until end of line
1324         ret
1325 endfunc
1327 function ff_hscale16to19_X4_neon_asm, export=1
             // Horizontal scaler: 16-bit source samples -> 32-bit outputs clamped to 19 bits,
             // for filter sizes handled as N passes of 8 taps followed by one pass of 4 taps.
             // For each output pixel i:
             //   dst[i] = min((sum_j src[filterPos[i] + j] * filter[i*filterSize + j]) >> shift,
             //                (1 << 19) - 1)
             // The outer loop (1:) produces 4 output pixels per pass; the inner loop (2:)
             // consumes 8 taps per pass and always runs at least once, then a fixed tail
             // handles the final 4 taps of each filter row.
             // NOTE(review): presumably the caller guarantees filterSize % 8 == 4,
             // filterSize >= 12 and dstW % 4 == 0 -- confirm against the dispatcher.
1328         // w0  int shift
1329         // x1  int32_t *dst           (results are stored as 4 x 32-bit lanes)
1330         // w2  int dstW
1331         // x3  const uint8_t *src     (accessed as uint16_t samples)
1332         // x4  const int16_t *filter
1333         // x5  const int32_t *filterPos
1334         // w6  int filterSize
1336         stp             d8, d9, [sp, #-0x20]!       // v8-v10 are written below; AAPCS64 requires the low 64 bits of v8-v15 to be preserved
1337         stp             d10, d11, [sp, #0x10]
1339         movi            v18.4s, #1
1340         movi            v17.4s, #1
1341         shl             v18.4s, v18.4s, #19
1342         sub             v21.4s, v18.4s, v17.4s      // v21 = (1 << 19) - 1, max allowed value
1343         dup             v17.4s, w0                  // read shift
1344         neg             v20.4s, v17.4s              // v20 = -shift, so it can be used in sshl (effectively shift right)
1346         lsl             w7, w6, #1                  // x7 = filterSize * 2 = size of one filter row in bytes
             // Outer loop: one pass per group of 4 output pixels. This label is the
             // target of the "b.gt 1b" at the bottom of the function.
1347 1:
1348         ldp             w8, w9, [x5]                // filterPos[idx + 0], filterPos[idx + 1]
1349         ldp             w10, w11, [x5, #8]          // filterPos[idx + 2], filterPos[idx + 3]
1351         movi            v16.16b, #0                 // initialize accumulator for idx + 0
1352         movi            v17.16b, #0                 // initialize accumulator for idx + 1
1353         movi            v18.16b, #0                 // initialize accumulator for idx + 2
1354         movi            v19.16b, #0                 // initialize accumulator for idx + 3
1356         mov             x12, x4                     // filter row for idx + 0
1357         add             x13, x4, x7                 // filter row for idx + 1
1358         add             x8, x3, x8, lsl #1          // srcp + filterPos 0 (scaled by 2 for uint16_t)
1359         add             x14, x13, x7                // filter row for idx + 2
1360         add             x9, x3, x9, lsl #1          // srcp + filterPos 1
1361         add             x15, x14, x7                // filter row for idx + 3
1362         add             x10, x3, x10, lsl #1        // srcp + filterPos 2
1363         mov             w0, w6                      // taps remaining; w0 is free, shift already lives in v20
1364         add             x11, x3, x11, lsl #1        // srcp + filterPos 3
1365         add             x5, x5, #16                 // advance filterPos by 4 entries
1366         mov             x16, xzr                    // clear the register x16 used for offsetting the filter values
             // Inner loop: 8 taps per pass for each of the 4 pixels. This label is the
             // target of the "b.ge 2b" below.
1368 2:
1369         ldr             q4, [x8], #16               // load src values for idx 0
1370         ldr             q5, [x9], #16               // load src values for idx 1
1371         uxtl            v26.4s, v4.4h               // zero-extend src to 32 bits (uint16_t must not be read as signed)
1372         uxtl2           v4.4s, v4.8h
1373         ldr             q31, [x12, x16]             // load filter values for idx 0
1374         ldr             q6, [x10], #16              // load src values for idx 2
1375         sxtl            v22.4s, v31.4h              // sign-extend filter coefficients to 32 bits
1376         sxtl2           v31.4s, v31.8h
1377         mla             v16.4s, v26.4s, v22.4s      // multiplication of lower half for idx 0
1378         uxtl            v25.4s, v5.4h
1379         uxtl2           v5.4s, v5.8h
1380         ldr             q30, [x13, x16]             // load filter values for idx 1
1381         ldr             q7, [x11], #16              // load src values for idx 3
1382         mla             v16.4s, v4.4s, v31.4s       // multiplication of upper half for idx 0
1383         uxtl            v24.4s, v6.4h
1384         sxtl            v8.4s, v30.4h
1385         sxtl2           v30.4s, v30.8h
1386         mla             v17.4s, v25.4s, v8.4s       // multiplication of lower half for idx 1
1387         ldr             q29, [x14, x16]             // load filter values for idx 2
1388         uxtl2           v6.4s, v6.8h
1389         sxtl            v9.4s, v29.4h
1390         sxtl2           v29.4s, v29.8h
1391         mla             v17.4s, v5.4s, v30.4s       // multiplication of upper half for idx 1
1392         ldr             q28, [x15, x16]             // load filter values for idx 3
1393         mla             v18.4s, v24.4s, v9.4s       // multiplication of lower half for idx 2
1394         uxtl            v23.4s, v7.4h
1395         sxtl            v10.4s, v28.4h
1396         mla             v18.4s, v6.4s, v29.4s       // multiplication of upper half for idx 2
1397         uxtl2           v7.4s, v7.8h
1398         sxtl2           v28.4s, v28.8h
1399         mla             v19.4s, v23.4s, v10.4s      // multiplication of lower half for idx 3
1400         sub             w0, w0, #8                  // 8 taps consumed
1401         cmp             w0, #8                      // another full 8-tap pass available?
1402         mla             v19.4s, v7.4s, v28.4s       // multiplication of upper half for idx 3
1404         add             x16, x16, #16               // advance filter values indexing
1406         b.ge            2b
1408         // 4 filter taps left for each of the 4 output pixels
1410         sub             x17, x7, #8                 // byte offset of the last 4 coefficients in a filter row
1412         ldr             d4, [x8]                    // load last 4 src values for idx 0
1413         ldr             d31, [x12, x17]             // load last 4 filter values for idx 0
1414         uxtl            v4.4s, v4.4h
1415         sxtl            v31.4s, v31.4h
1416         ldr             d5, [x9]                    // load last 4 src values for idx 1
1417         mla             v16.4s, v4.4s, v31.4s       // accumulate last 4 taps for idx 0
1418         ldr             d30, [x13, x17]             // load last 4 filter values for idx 1
1419         uxtl            v5.4s, v5.4h
1420         sxtl            v30.4s, v30.4h
1421         ldr             d6, [x10]                   // load last 4 src values for idx 2
1422         mla             v17.4s, v5.4s, v30.4s       // accumulate last 4 taps for idx 1
1423         ldr             d29, [x14, x17]             // load last 4 filter values for idx 2
1424         uxtl            v6.4s, v6.4h
1425         sxtl            v29.4s, v29.4h
1426         ldr             d7, [x11]                   // load last 4 src values for idx 3
1427         ldr             d28, [x15, x17]             // load last 4 filter values for idx 3
1428         mla             v18.4s, v6.4s, v29.4s       // accumulate last 4 taps for idx 2
1429         uxtl            v7.4s, v7.4h
1430         sxtl            v28.4s, v28.4h
1431         addp            v16.4s, v16.4s, v17.4s      // pairwise sums of the idx 0 and idx 1 accumulators
1432         mla             v19.4s, v7.4s, v28.4s       // accumulate last 4 taps for idx 3
1433         subs            w2, w2, #4                  // dstW -= 4, flags consumed by b.gt below
1434         addp            v18.4s, v18.4s, v19.4s      // pairwise sums of the idx 2 and idx 3 accumulators
1435         addp            v16.4s, v16.4s, v18.4s      // v16 = { sum0, sum1, sum2, sum3 }
1436         sshl            v16.4s, v16.4s, v20.4s      // shift right by shift (v20 holds -shift)
1437         smin            v16.4s, v16.4s, v21.4s      // clamp to (1 << 19) - 1
1439         st1             {v16.4s}, [x1], #16         // store 4 x int32_t results
1440         add             x4, x4, x7, lsl #2          // filter += 4 rows (4 * filterSize * 2 bytes)
1441         b.gt            1b                          // loop until end of line
1443         ldp             d8, d9, [sp]
1444         ldp             d10, d11, [sp, #0x10]
1446         add             sp, sp, #0x20
1448         ret
1449 endfunc