/*
 * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
 * Copyright (c) 2019-2021 Sebastian Pop <spop@amazon.com>
 * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
/*
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsInternal *c, int{16,32}_t *dst,
;  int dstW, const uint{8,16}_t *src,
;  const int16_t *filter,
;  const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 16-bit wide
; ($source_width can be 8, 9, 10 or 16; the difference is whether we have to
; downscale before multiplying). Filter is 14 bits. Output is either 15 bits
; (in int16_t) or 19 bits (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
;-----------------------------------------------------------------------------
*/
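/* For orientation, a rough scalar C model of what the kernels below compute
 * (an illustrative sketch, not part of the build; the real reference is the
 * C hscale code in libswscale/swscale.c). For the 8-bit to 15-bit case:
 *
 *     static void hscale8to15_ref(int16_t *dst, int dstW, const uint8_t *src,
 *                                 const int16_t *filter,
 *                                 const int32_t *filterPos, int filterSize)
 *     {
 *         for (int i = 0; i < dstW; i++) {
 *             int val    = 0;
 *             int srcPos = filterPos[i];
 *             for (int j = 0; j < filterSize; j++)
 *                 val += (int)src[srcPos + j] * filter[filterSize * i + j];
 *             // 8-bit input * 14-bit filter needs up to 22 bits; >> 7 leaves 15
 *             dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
 *         }
 *     }
 *
 * The NEON versions below compute four (X8/X4) or eight (4-tap) outputs of
 * this loop at a time.
 */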
function ff_hscale8to15_X8_neon, export=1
        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:      ldr w8, [x5], #4                    // filterPos[idx]
        ldr w0, [x5], #4                    // filterPos[idx + 1]
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        mov x16, x4                         // filter0 = filter
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w0, uxtw                // srcp + filterPos[1]
        add x0, x3, w11, uxtw               // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8b}, [x17], #8              // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1 {v6.8b}, [x8], #8               // srcp[filterPos[1] + {0..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl v4.8h, v4.8b                   // unpack part 1 to 16-bit
        smlal v0.4s, v4.4h, v5.4h           // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
        smlal2 v0.4s, v4.8h, v5.8h          // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
        ld1 {v16.8b}, [x0], #8              // srcp[filterPos[2] + {0..7}]
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v6.8h, v6.8b                   // unpack part 2 to 16-bit
        smlal v1.4s, v6.4h, v7.4h           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        uxtl v16.8h, v16.8b                 // unpack part 3 to 16-bit
        smlal v2.4s, v16.4h, v17.4h         // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        smlal2 v2.4s, v16.8h, v17.8h        // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1 {v18.8b}, [x11], #8             // srcp[filterPos[3] + {0..7}]
        smlal2 v1.4s, v6.8h, v7.8h          // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        uxtl v18.8h, v18.8b                 // unpack part 4 to 16-bit
        smlal v3.4s, v18.4h, v19.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        smlal2 v3.4s, v18.8h, v19.8h        // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sqshrn v0.4h, v0.4s, #7             // shift and clip the 4x16-bit final values
        st1 {v0.4h}, [x1], #8               // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale8to15_X4_neon, export=1
        // x0  SwsInternal *c (not used)
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        // This function is for filter sizes that are 4 mod 8, i.e. anything
        // that's 0 mod 4 but not 0 mod 8. It also assumes that dstW is 0 mod 4.
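        // In C terms, the inner loop below is split as follows (an
        // illustrative sketch only):
        //
        //     int j;
        //     for (j = 0; j + 8 <= filterSize; j += 8)  // 8-wide chunks, label 2:
        //         for (int k = 0; k < 8; k++)
        //             val += (int)src[srcPos + j + k] * filter[fs * i + j + k];
        //     for (; j < filterSize; j++)               // the remaining 4 taps
        //         val += (int)src[srcPos + j] * filter[fs * i + j];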
        lsl w7, w6, #1                      // w7 = filterSize * 2
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter pointer for idx + 0
        add x13, x4, x7                     // filter pointer for idx + 1
        add x8, x3, w8, uxtw                // srcp + filterPos[idx + 0]
        add x9, x3, w9, uxtw                // srcp + filterPos[idx + 1]

        add x14, x13, x7                    // filter pointer for idx + 2
        add x10, x3, w10, uxtw              // srcp + filterPos[idx + 2]
        add x11, x3, w11, uxtw              // srcp + filterPos[idx + 3]

        mov w0, w6                          // copy filterSize to a temp register, w0
        add x5, x5, #16                     // advance the filterPos pointer
        add x15, x14, x7                    // filter pointer for idx + 3
        mov x16, xzr                        // temp register for offsetting filter pointers

2:      // This section loops over 8-wide chunks of filter size
        ldr d4, [x8], #8                    // load 8 bytes from srcp for idx + 0
        ldr q0, [x12, x16]                  // load 8 values, 16 bytes from filter for idx + 0

        ldr d5, [x9], #8                    // load 8 bytes from srcp for idx + 1
        ldr q1, [x13, x16]                  // load 8 values, 16 bytes from filter for idx + 1

        uxtl v4.8h, v4.8b                   // unsigned extend long for idx + 0
        uxtl v5.8h, v5.8b                   // unsigned extend long for idx + 1

        ldr d6, [x10], #8                   // load 8 bytes from srcp for idx + 2
        ldr q2, [x14, x16]                  // load 8 values, 16 bytes from filter for idx + 2

        smlal v16.4s, v0.4h, v4.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
        smlal v17.4s, v1.4h, v5.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1

        ldr d7, [x11], #8                   // load 8 bytes from srcp for idx + 3
        ldr q3, [x15, x16]                  // load 8 values, 16 bytes from filter for idx + 3

        sub w0, w0, #8                      // decrement the remaining filterSize counter
        smlal2 v16.4s, v0.8h, v4.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 0
        smlal2 v17.4s, v1.8h, v5.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 1
        uxtl v6.8h, v6.8b                   // unsigned extend long for idx + 2
        uxtl v7.8h, v7.8b                   // unsigned extend long for idx + 3
        smlal v18.4s, v2.4h, v6.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
        smlal v19.4s, v3.4h, v7.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3

        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        add x16, x16, #16                   // advance the offsetting register for filter values

        smlal2 v18.4s, v2.8h, v6.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 2
        smlal2 v19.4s, v3.8h, v7.8h         // val += src[srcPos + j + 4..7] * filter[fs * i + j + 4..7], idx + 3

        b.ge 2b                             // branch back to inner loop

        // complete the remaining 4 filter elements
        sub x17, x7, #8                     // calculate the offset of the filter pointer for the remaining 4 elements

        ldr s4, [x8]                        // load 4 bytes from srcp for idx + 0
        ldr d0, [x12, x17]                  // load 4 values, 8 bytes from filter for idx + 0
        ldr s5, [x9]                        // load 4 bytes from srcp for idx + 1
        ldr d1, [x13, x17]                  // load 4 values, 8 bytes from filter for idx + 1

        uxtl v4.8h, v4.8b                   // unsigned extend long for idx + 0
        uxtl v5.8h, v5.8b                   // unsigned extend long for idx + 1

        ldr s6, [x10]                       // load 4 bytes from srcp for idx + 2
        ldr d2, [x14, x17]                  // load 4 values, 8 bytes from filter for idx + 2
        smlal v16.4s, v0.4h, v4.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 0
        smlal v17.4s, v1.4h, v5.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 1
        ldr s7, [x11]                       // load 4 bytes from srcp for idx + 3
        ldr d3, [x15, x17]                  // load 4 values, 8 bytes from filter for idx + 3

        uxtl v6.8h, v6.8b                   // unsigned extend long for idx + 2
        uxtl v7.8h, v7.8b                   // unsigned extend long for idx + 3
        addp v16.4s, v16.4s, v17.4s         // horizontal pair adding for idx 0,1
        smlal v18.4s, v2.4h, v6.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 2
        smlal v19.4s, v3.4h, v7.4h          // val += src[srcPos + j + 0..3] * filter[fs * i + j + 0..3], idx + 3

        addp v18.4s, v18.4s, v19.4s         // horizontal pair adding for idx 2,3
        addp v16.4s, v16.4s, v18.4s         // final horizontal pair adding producing one vector with results for idx = 0..3

        subs w2, w2, #4                     // dstW -= 4
        sqshrn v0.4h, v16.4s, #7            // shift and clip the 4x16-bit final values
        st1 {v0.4h}, [x1], #8               // write to destination idx 0..3
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale8to15_4_neon, export=1
        // x0  SwsInternal *c (not used)
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)
        // x8-x15 registers for gathering src data

        // v0      madd accumulator 4S
        // v1-v4   filter values (16 bit) 8H
        // v5      madd accumulator 4S
        // v16-v19 src values (8 bit) 8B

        // This implementation has 4 sections:
        //  1. Prefetch src data
        //  2. Interleaved prefetching src data and madd
        //  3. Complete madd
        //  4. Complete remaining iterations when dstW % 8 != 0
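        // The gather step in C terms (sketch): for filterSize == 4, the four
        // src bytes of eight neighbouring output pixels are packed into 32
        // contiguous scratch bytes, so a single ld4 can de-interleave them
        // into per-tap vectors (v16 holds tap 0 of all 8 outputs, v17 tap 1,
        // and so on):
        //
        //     uint8_t scratch[32];
        //     for (int k = 0; k < 8; k++)
        //         memcpy(scratch + 4 * k, src + filterPos[idx + k], 4);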
        sub sp, sp, #32                     // allocate 32 bytes on the stack
        cmp w2, #16                         // if dstW < 16, skip to the last block used for wrapping up
        b.lt 2f

        // load 8 values from filterPos to be used as offsets into src
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]
        ldp w12, w13, [x5, #16]             // filterPos[idx + 4], [idx + 5]
        ldp w14, w15, [x5, #24]             // filterPos[idx + 6], [idx + 7]
        add x5, x5, #32                     // advance filterPos

        // gather random access data from src into contiguous memory
        ldr w8, [x3, w8, uxtw]              // src[filterPos[idx + 0]][0..3]
        ldr w9, [x3, w9, uxtw]              // src[filterPos[idx + 1]][0..3]
        ldr w10, [x3, w10, uxtw]            // src[filterPos[idx + 2]][0..3]
        ldr w11, [x3, w11, uxtw]            // src[filterPos[idx + 3]][0..3]
        ldr w12, [x3, w12, uxtw]            // src[filterPos[idx + 4]][0..3]
        ldr w13, [x3, w13, uxtw]            // src[filterPos[idx + 5]][0..3]
        ldr w14, [x3, w14, uxtw]            // src[filterPos[idx + 6]][0..3]
        ldr w15, [x3, w15, uxtw]            // src[filterPos[idx + 7]][0..3]
        stp w8, w9, [sp]                    // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
        stp w10, w11, [sp, #8]              // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
        stp w12, w13, [sp, #16]             // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
        stp w14, w15, [sp, #24]             // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }

1:
        ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp] // transpose 8 bytes each from src into 4 registers

        // load 8 values from filterPos to be used as offsets into src
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1], next iteration
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3], next iteration
        ldp w12, w13, [x5, #16]             // filterPos[idx + 4], [idx + 5], next iteration
        ldp w14, w15, [x5, #24]             // filterPos[idx + 6], [idx + 7], next iteration

        movi v0.16b, #0                     // clear madd accumulator for idx 0..3
        movi v5.16b, #0                     // clear madd accumulator for idx 4..7

        ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7

        add x5, x5, #32                     // advance filterPos

        // interleaved SIMD and prefetching intended to keep ld/st and vector pipelines busy
        uxtl v16.8h, v16.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v17.8h, v17.8b                 // unsigned extend long, convert src data to 16-bit
        ldr w8, [x3, w8, uxtw]              // src[filterPos[idx + 0]], next iteration
        ldr w9, [x3, w9, uxtw]              // src[filterPos[idx + 1]], next iteration
        uxtl v18.8h, v18.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v19.8h, v19.8b                 // unsigned extend long, convert src data to 16-bit
        ldr w10, [x3, w10, uxtw]            // src[filterPos[idx + 2]], next iteration
        ldr w11, [x3, w11, uxtw]            // src[filterPos[idx + 3]], next iteration

        smlal v0.4s, v1.4h, v16.4h          // multiply accumulate inner loop j = 0, idx = 0..3
        smlal v0.4s, v2.4h, v17.4h          // multiply accumulate inner loop j = 1, idx = 0..3
        ldr w12, [x3, w12, uxtw]            // src[filterPos[idx + 4]], next iteration
        ldr w13, [x3, w13, uxtw]            // src[filterPos[idx + 5]], next iteration
        smlal v0.4s, v3.4h, v18.4h          // multiply accumulate inner loop j = 2, idx = 0..3
        smlal v0.4s, v4.4h, v19.4h          // multiply accumulate inner loop j = 3, idx = 0..3
        ldr w14, [x3, w14, uxtw]            // src[filterPos[idx + 6]], next iteration
        ldr w15, [x3, w15, uxtw]            // src[filterPos[idx + 7]], next iteration

        smlal2 v5.4s, v1.8h, v16.8h         // multiply accumulate inner loop j = 0, idx = 4..7
        smlal2 v5.4s, v2.8h, v17.8h         // multiply accumulate inner loop j = 1, idx = 4..7
        stp w8, w9, [sp]                    // *scratch_mem = { src[filterPos[idx + 0]][0..3], src[filterPos[idx + 1]][0..3] }
        stp w10, w11, [sp, #8]              // *scratch_mem = { src[filterPos[idx + 2]][0..3], src[filterPos[idx + 3]][0..3] }
        smlal2 v5.4s, v3.8h, v18.8h         // multiply accumulate inner loop j = 2, idx = 4..7
        smlal2 v5.4s, v4.8h, v19.8h         // multiply accumulate inner loop j = 3, idx = 4..7
        stp w12, w13, [sp, #16]             // *scratch_mem = { src[filterPos[idx + 4]][0..3], src[filterPos[idx + 5]][0..3] }
        stp w14, w15, [sp, #24]             // *scratch_mem = { src[filterPos[idx + 6]][0..3], src[filterPos[idx + 7]][0..3] }

        sub w2, w2, #8                      // dstW -= 8
        sqshrn v0.4h, v0.4s, #7             // shift and clip the 4x16-bit final values
        sqshrn v1.4h, v5.4s, #7             // shift and clip the 4x16-bit final values
        st1 {v0.4h, v1.4h}, [x1], #16       // write to dst[idx + 0..7]
        cmp w2, #16                         // continue on main loop if there are at least 16 iterations left
        b.ge 1b

        // last full iteration
        ld4 {v16.8b, v17.8b, v18.8b, v19.8b}, [sp]
        ld4 {v1.8h, v2.8h, v3.8h, v4.8h}, [x4], #64 // load filter idx + 0..7

        movi v0.16b, #0                     // clear madd accumulator for idx 0..3
        movi v5.16b, #0                     // clear madd accumulator for idx 4..7

        uxtl v16.8h, v16.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v17.8h, v17.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v18.8h, v18.8b                 // unsigned extend long, convert src data to 16-bit
        uxtl v19.8h, v19.8b                 // unsigned extend long, convert src data to 16-bit

        smlal v0.4s, v1.4h, v16.4h          // multiply accumulate inner loop j = 0, idx = 0..3
        smlal v0.4s, v2.4h, v17.4h          // multiply accumulate inner loop j = 1, idx = 0..3
        smlal v0.4s, v3.4h, v18.4h          // multiply accumulate inner loop j = 2, idx = 0..3
        smlal v0.4s, v4.4h, v19.4h          // multiply accumulate inner loop j = 3, idx = 0..3

        smlal2 v5.4s, v1.8h, v16.8h         // multiply accumulate inner loop j = 0, idx = 4..7
        smlal2 v5.4s, v2.8h, v17.8h         // multiply accumulate inner loop j = 1, idx = 4..7
        smlal2 v5.4s, v3.8h, v18.8h         // multiply accumulate inner loop j = 2, idx = 4..7
        smlal2 v5.4s, v4.8h, v19.8h         // multiply accumulate inner loop j = 3, idx = 4..7

        subs w2, w2, #8                     // dstW -= 8
        sqshrn v0.4h, v0.4s, #7             // shift and clip the 4x16-bit final values
        sqshrn v1.4h, v5.4s, #7             // shift and clip the 4x16-bit final values
        st1 {v0.4h, v1.4h}, [x1], #16       // write to dst[idx + 0..7]

        cbnz w2, 2f                         // if > 0 iterations remain, jump to the wrap up section

        add sp, sp, #32                     // clean up stack
        ret

        // finish up when dstW % 8 != 0 or dstW < 16
2:
        // load src
        ldr w8, [x5], #4                    // filterPos[i]
        add x9, x3, w8, uxtw                // calculate the address for src load
        ld1 {v5.s}[0], [x9]                 // src[filterPos[i] + 0..3]
        // load filter
        ld1 {v6.4h}, [x4], #8               // filter[filterSize * i + 0..3]

        uxtl v5.8h, v5.8b                   // unsigned extend long, convert src data to 16-bit
        smull v0.4s, v5.4h, v6.4h           // 4 iterations of src[...] * filter[...]
        addv s0, v0.4s                      // add up products of src and filter values
        sqshrn h0, s0, #7                   // shift and clip the 16-bit final value
        st1 {v0.h}[0], [x1], #2             // dst[i] = ...
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b

        add sp, sp, #32                     // clean up stack
        ret
endfunc
function ff_hscale8to19_4_neon, export=1
        // x0  SwsInternal *c (unused)
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #19
        sub v18.4s, v18.4s, v17.4s          // max allowed value

        cmp w2, #16
        b.lt 2f                             // move to last block

        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        // gather random access data from src into contiguous memory
        ldr w8, [x3, w8, uxtw]
        ldr w9, [x3, w9, uxtw]
        ldr w10, [x3, w10, uxtw]
        ldr w11, [x3, w11, uxtw]
        ldr w12, [x3, w12, uxtw]
        ldr w13, [x3, w13, uxtw]
        ldr w14, [x3, w14, uxtw]
        ldr w15, [x3, w15, uxtw]

        sub sp, sp, #32                     // allocate 32 bytes on the stack
        stp w8, w9, [sp]
        stp w10, w11, [sp, #8]
        stp w12, w13, [sp, #16]
        stp w14, w15, [sp, #24]

1:
        ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // load filterPositions into registers for next iteration
        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos
        uxtl v0.8h, v0.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w8, [x3, w8, uxtw]
        smull v5.4s, v0.4h, v28.4h          // multiply first column of src
        ldr w9, [x3, w9, uxtw]
        smull2 v6.4s, v0.8h, v28.8h
        stp w8, w9, [sp]
        uxtl v1.8h, v1.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w10, [x3, w10, uxtw]
        smlal v5.4s, v1.4h, v29.4h          // multiply second column of src
        ldr w11, [x3, w11, uxtw]
        smlal2 v6.4s, v1.8h, v29.8h
        stp w10, w11, [sp, #8]
        uxtl v2.8h, v2.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w12, [x3, w12, uxtw]
        smlal v5.4s, v2.4h, v30.4h          // multiply third column of src
        ldr w13, [x3, w13, uxtw]
        smlal2 v6.4s, v2.8h, v30.8h
        stp w12, w13, [sp, #16]
        uxtl v3.8h, v3.8b                   // unsigned extend long, convert src data to 16-bit

        ldr w14, [x3, w14, uxtw]
        smlal v5.4s, v3.4h, v31.4h          // multiply fourth column of src
        ldr w15, [x3, w15, uxtw]
        smlal2 v6.4s, v3.8h, v31.8h
        stp w14, w15, [sp, #24]

        sub w2, w2, #8                      // dstW -= 8
        sshr v5.4s, v5.4s, #3               // shift down to 19-bit precision
        sshr v6.4s, v6.4s, #3
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        cmp w2, #16
        b.ge 1b

        // here we make last iteration, without updating the registers
        ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        uxtl v0.8h, v0.8b
        uxtl v1.8h, v1.8b
        smull v5.4s, v0.4h, v28.4h
        smull2 v6.4s, v0.8h, v28.8h
        uxtl v2.8h, v2.8b
        smlal v5.4s, v1.4h, v29.4h
        smlal2 v6.4s, v1.8h, v29.8h
        uxtl v3.8h, v3.8b
        smlal v5.4s, v2.4h, v30.4h
        smlal2 v6.4s, v2.8h, v30.8h
        smlal v5.4s, v3.4h, v31.4h
        smlal2 v6.4s, v3.8h, v31.8h

        sub w2, w2, #8                      // dstW -= 8
        sshr v5.4s, v5.4s, #3
        sshr v6.4s, v6.4s, #3
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        add sp, sp, #32                     // restore stack
        cbnz w2, 2f
        ret

2:
        ldr w8, [x5], #4                    // load filterPos
        add x9, x3, w8, uxtw                // src + filterPos
        ld1 {v0.s}[0], [x9]                 // load 4 * uint8_t into one single
        ld1 {v31.4h}, [x4], #8
        uxtl v0.8h, v0.8b                   // unsigned extend long, convert src data to 16-bit
        smull v5.4s, v0.4h, v31.4h
        addv s0, v5.4s                      // add up products of src and filter values
        sshr v0.4s, v0.4s, #3
        smin v0.4s, v0.4s, v18.4s
        st1 {v0.s}[0], [x1], #4
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b                         // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale8to19_X8_neon, export=1
        movi v20.4s, #1
        movi v17.4s, #1
        shl v20.4s, v20.4s, #19
        sub v20.4s, v20.4s, v17.4s          // max allowed value

        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:
        mov x16, x4                         // filter0 = filter
        ldr w8, [x5], #4                    // filterPos[idx]
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        ldr w0, [x5], #4                    // filterPos[idx + 1]
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w0, uxtw                // srcp + filterPos[1]
        add x0, x3, w11, uxtw               // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8b}, [x17], #8              // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        uxtl v4.8h, v4.8b                   // unpack part 1 to 16-bit
        smlal v0.4s, v4.4h, v5.4h           // v0 accumulates srcp[filterPos[0] + {0..3}] * filter[{0..3}]
        ld1 {v6.8b}, [x8], #8               // srcp[filterPos[1] + {0..7}]
        smlal2 v0.4s, v4.8h, v5.8h          // v0 accumulates srcp[filterPos[0] + {4..7}] * filter[{4..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        ld1 {v16.8b}, [x0], #8              // srcp[filterPos[2] + {0..7}]
        uxtl v6.8h, v6.8b                   // unpack part 2 to 16-bit
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v16.8h, v16.8b                 // unpack part 3 to 16-bit
        smlal v1.4s, v6.4h, v7.4h           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1 {v18.8b}, [x11], #8             // srcp[filterPos[3] + {0..7}]
        smlal v2.4s, v16.4h, v17.4h         // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        smlal2 v2.4s, v16.8h, v17.8h        // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        uxtl v18.8h, v18.8b                 // unpack part 4 to 16-bit
        smlal2 v1.4s, v6.8h, v7.8h          // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        smlal v3.4s, v18.4h, v19.4h         // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        smlal2 v3.4s, v18.8h, v19.8h        // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sshr v0.4s, v0.4s, #3               // shift down to 19-bit precision
        smin v0.4s, v0.4s, v20.4s           // clip to the 19-bit max
        st1 {v0.4s}, [x1], #16              // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale8to19_X4_neon, export=1
        // x0  SwsInternal *c (not used)
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi v20.4s, #1
        movi v17.4s, #1
        shl v20.4s, v20.4s, #19
        sub v20.4s, v20.4s, v17.4s          // max allowed value

        lsl w7, w6, #1                      // filterSize * 2 (*2 because int16)
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter + 0
        add x13, x4, x7                     // filter + 1
        add x8, x3, w8, uxtw                // srcp + filterPos 0
        add x14, x13, x7                    // filter + 2
        add x9, x3, w9, uxtw                // srcp + filterPos 1
        add x15, x14, x7                    // filter + 3
        add x10, x3, w10, uxtw              // srcp + filterPos 2
        mov w0, w6                          // save the filterSize to temporary variable
        add x11, x3, w11, uxtw              // srcp + filterPos 3
        add x5, x5, #16                     // advance filter position
        mov x16, xzr                        // clear the register x16 used for offsetting the filter values

2:
        ldr d4, [x8], #8                    // load src values for idx 0
        ldr q31, [x12, x16]                 // load filter values for idx 0
        uxtl v4.8h, v4.8b                   // extend type to match the filter's size
        ldr d5, [x9], #8                    // load src values for idx 1
        smlal v16.4s, v4.4h, v31.4h         // multiplication of lower half for idx 0
        uxtl v5.8h, v5.8b                   // extend type to match the filter's size
        ldr q30, [x13, x16]                 // load filter values for idx 1
        smlal2 v16.4s, v4.8h, v31.8h        // multiplication of upper half for idx 0
        ldr d6, [x10], #8                   // load src values for idx 2
        ldr q29, [x14, x16]                 // load filter values for idx 2
        smlal v17.4s, v5.4h, v30.4h         // multiplication of lower half for idx 1
        ldr d7, [x11], #8                   // load src values for idx 3
        smlal2 v17.4s, v5.8h, v30.8h        // multiplication of upper half for idx 1
        uxtl v6.8h, v6.8b                   // extend type to match the filter's size
        ldr q28, [x15, x16]                 // load filter values for idx 3
        smlal v18.4s, v6.4h, v29.4h         // multiplication of lower half for idx 2
        uxtl v7.8h, v7.8b                   // extend type to match the filter's size
        smlal2 v18.4s, v6.8h, v29.8h        // multiplication of upper half for idx 2
        sub w0, w0, #8                      // decrement the remaining filterSize counter
        smlal v19.4s, v7.4h, v28.4h         // multiplication of lower half for idx 3
        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        smlal2 v19.4s, v7.8h, v28.8h        // multiplication of upper half for idx 3
        add x16, x16, #16                   // advance filter values indexing
        b.ge 2b

        // complete the remaining 4 filter elements
        sub x17, x7, #8                     // step back to wrap up the filter pos for last 4 elements

        ldr s4, [x8]                        // load src values for idx 0
        ldr d31, [x12, x17]                 // load filter values for idx 0
        uxtl v4.8h, v4.8b                   // extend type to match the filter's size
        ldr s5, [x9]                        // load src values for idx 1
        smlal v16.4s, v4.4h, v31.4h
        ldr d30, [x13, x17]                 // load filter values for idx 1
        uxtl v5.8h, v5.8b                   // extend type to match the filter's size
        ldr s6, [x10]                       // load src values for idx 2
        smlal v17.4s, v5.4h, v30.4h
        uxtl v6.8h, v6.8b                   // extend type to match the filter's size
        ldr d29, [x14, x17]                 // load filter values for idx 2
        ldr s7, [x11]                       // load src values for idx 3
        addp v16.4s, v16.4s, v17.4s
        uxtl v7.8h, v7.8b                   // extend type to match the filter's size
        ldr d28, [x15, x17]                 // load filter values for idx 3
        smlal v18.4s, v6.4h, v29.4h
        smlal v19.4s, v7.4h, v28.4h

        addp v18.4s, v18.4s, v19.4s
        addp v16.4s, v16.4s, v18.4s
        sshr v16.4s, v16.4s, #3             // shift down to 19-bit precision
        smin v16.4s, v16.4s, v20.4s         // clip to the 19-bit max

        st1 {v16.4s}, [x1], #16
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        subs w2, w2, #4                     // dstW -= 4
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale16to15_4_neon_asm, export=1
        // w0  int shift
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #15
        sub v18.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v17.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)
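        // What the setup above encodes, per output pixel (illustrative C
        // sketch; the shift value arrives in w0 from the C-side wrapper):
        //
        //     dst[i] = FFMIN(val >> shift, (1 << 15) - 1);
        //
        // sshl by the negated shift performs the variable right shift, and
        // smin against v18 applies the 15-bit clip.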
        cmp w2, #16
        b.lt 2f                             // move to last block

        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        // shift all filterPos left by one, as uint16_t will be read
        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        // load src with given offset
        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        sub sp, sp, #64                     // allocate 64 bytes on the stack

        // push src on stack so it can be loaded into vectors later
        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

1:
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // Each of the following blocks:
        //  - extends src and filter to 32 bits with uxtl and sxtl,
        //  - multiplies or multiplies and accumulates the results.
        // Extending to 32 bits is necessary, as uint16_t values can't
        // be represented as int16_t without type promotion.
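        // Why the promotion is needed, for one lane (illustrative C sketch):
        // a src lane of 0xFFFF must stay 65535, so
        //
        //     int32_t s = (uint16_t)src_lane;  // uxtl: zero-extend
        //     int32_t f = (int16_t)filt_lane;  // sxtl: sign-extend
        //     acc += s * f;                    // mla on 32-bit lanes
        //
        // A 16x16 smlal would reinterpret src lanes >= 0x8000 as negative.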
        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v0.8h
        sxtl2 v28.4s, v28.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v27.4s, v26.4s
        uxtl2 v0.4s, v1.8h
        sxtl2 v28.4s, v29.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v27.4s, v26.4s
        uxtl2 v0.4s, v2.8h
        sxtl2 v28.4s, v30.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v27.4s, v26.4s
        uxtl2 v0.4s, v3.8h
        sxtl2 v28.4s, v31.8h
        mla v6.4s, v28.4s, v0.4s

        sshl v5.4s, v5.4s, v17.4s           // shift left (effectively right, as shift is negative)
        sshl v6.4s, v6.4s, v17.4s
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s
        xtn v5.4h, v5.4s                    // narrow down to 16 bits
        xtn2 v5.8h, v6.4s

        st1 {v5.8h}, [x1], #16
        sub w2, w2, #8                      // dstW -= 8

        // load filterPositions into registers for next iteration
        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

        cmp w2, #16
        b.ge 1b

        // here we make last iteration, without updating the registers
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v0.8h
        sxtl2 v28.4s, v28.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v1.8h
        sxtl2 v28.4s, v29.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v2.8h
        sxtl2 v28.4s, v30.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v26.4s, v27.4s
        uxtl2 v0.4s, v3.8h
        sxtl2 v28.4s, v31.8h
        mla v6.4s, v0.4s, v28.4s

        sub w2, w2, #8                      // dstW -= 8
        sshl v5.4s, v5.4s, v17.4s
        sshl v6.4s, v6.4s, v17.4s
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s
        xtn v5.4h, v5.4s
        xtn2 v5.8h, v6.4s

        st1 {v5.8h}, [x1], #16
        add sp, sp, #64                     // restore stack
        cbnz w2, 2f
        ret

2:
        ldr w8, [x5], #4                    // load filterPos
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        add x9, x3, w8, uxtw                // src + filterPos
        ld1 {v0.4h}, [x9]                   // load 4 * uint16_t
        ld1 {v31.4h}, [x4], #8

        uxtl v0.4s, v0.4h
        sxtl v31.4s, v31.4h
        mul v5.4s, v0.4s, v31.4s
        addv s0, v5.4s
        sshl v0.4s, v0.4s, v17.4s
        smin v0.4s, v0.4s, v18.4s
        st1 {v0.h}[0], [x1], #2
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b                         // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale16to15_X8_neon_asm, export=1
        // w0  int shift
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi v20.4s, #1
        movi v21.4s, #1
        shl v20.4s, v20.4s, #15
        sub v20.4s, v20.4s, v21.4s          // max allowed value
        dup v21.4s, w0                      // read shift
        neg v21.4s, v21.4s                  // negate it, so it can be used in sshl (effectively shift right)

        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:      ldr w8, [x5], #4                    // filterPos[idx]
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        ldr w10, [x5], #4                   // filterPos[idx + 1]
        lsl w10, w10, #1
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        lsl w11, w11, #1
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        lsl w9, w9, #1
        mov x16, x4                         // filter0 = filter
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w10, uxtw               // srcp + filterPos[1]
        add x10, x3, w11, uxtw              // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8h}, [x17], #16             // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1 {v6.8h}, [x8], #16              // srcp[filterPos[1] + {0..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl v24.4s, v4.4h                  // extend srcp lower half to 32 bits (uint16_t doesn't fit in int16_t)
        sxtl v25.4s, v5.4h                  // extend filter lower half to 32 bits to match srcp size
        uxtl2 v4.4s, v4.8h                  // extend srcp upper half to 32 bits
        mla v0.4s, v24.4s, v25.4s           // multiply accumulate lower half of v4 * v5
        sxtl2 v5.4s, v5.8h                  // extend filter upper half to 32 bits
        uxtl v26.4s, v6.4h                  // extend srcp lower half to 32 bits
        mla v0.4s, v4.4s, v5.4s             // multiply accumulate upper half of v4 * v5
        sxtl v27.4s, v7.4h                  // extend filter lower half
        uxtl2 v6.4s, v6.8h                  // extend srcp upper half
        sxtl2 v7.4s, v7.8h                  // extend filter upper half
        ld1 {v16.8h}, [x10], #16            // srcp[filterPos[2] + {0..7}]
        mla v1.4s, v26.4s, v27.4s           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v22.4s, v16.4h                 // extend srcp lower half
        sxtl v23.4s, v17.4h                 // extend filter lower half
        uxtl2 v16.4s, v16.8h                // extend srcp upper half
        sxtl2 v17.4s, v17.8h                // extend filter upper half
        mla v2.4s, v22.4s, v23.4s           // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        mla v2.4s, v16.4s, v17.4s           // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1 {v18.8h}, [x11], #16            // srcp[filterPos[3] + {0..7}]
        mla v1.4s, v6.4s, v7.4s             // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        uxtl v28.4s, v18.4h                 // extend srcp lower half
        sxtl v29.4s, v19.4h                 // extend filter lower half
        uxtl2 v18.4s, v18.8h                // extend srcp upper half
        sxtl2 v19.4s, v19.8h                // extend filter upper half
        mla v3.4s, v28.4s, v29.4s           // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        mla v3.4s, v18.4s, v19.4s           // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sshl v0.4s, v0.4s, v21.4s           // shift left (effectively right, as shift is negative); overflow expected
        smin v0.4s, v0.4s, v20.4s           // apply min (do not use sqshl)
        xtn v0.4h, v0.4s                    // narrow down to 16 bits

        st1 {v0.4h}, [x1], #8               // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale16to15_X4_neon_asm, export=1
        // w0  int shift
        // x1  int16_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        stp d8, d9, [sp, #-0x20]!
        stp d10, d11, [sp, #0x10]

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #15
        sub v21.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v20.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)

        lsl w7, w6, #1                      // filterSize * 2 (*2 because int16)
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter + 0
        add x13, x4, x7                     // filter + 1
        add x8, x3, x8, lsl #1              // srcp + filterPos 0 (uint16_t, so double the offset)
        add x14, x13, x7                    // filter + 2
        add x9, x3, x9, lsl #1              // srcp + filterPos 1
        add x15, x14, x7                    // filter + 3
        add x10, x3, x10, lsl #1            // srcp + filterPos 2
        mov w0, w6                          // save the filterSize to temporary variable
        add x11, x3, x11, lsl #1            // srcp + filterPos 3
        add x5, x5, #16                     // advance filter position
        mov x16, xzr                        // clear the register x16 used for offsetting the filter values

2:
        ldr q4, [x8], #16                   // load src values for idx 0
        ldr q5, [x9], #16                   // load src values for idx 1
        uxtl v26.4s, v4.4h                  // extend src lower half for idx 0
        uxtl2 v4.4s, v4.8h                  // extend src upper half for idx 0
        ldr q31, [x12, x16]                 // load filter values for idx 0
        ldr q6, [x10], #16                  // load src values for idx 2
        sxtl v22.4s, v31.4h                 // extend filter lower half for idx 0
        sxtl2 v31.4s, v31.8h                // extend filter upper half for idx 0
        mla v16.4s, v26.4s, v22.4s          // multiplication of lower half for idx 0
        uxtl v25.4s, v5.4h                  // extend src lower half for idx 1
        uxtl2 v5.4s, v5.8h                  // extend src upper half for idx 1
        ldr q30, [x13, x16]                 // load filter values for idx 1
        ldr q7, [x11], #16                  // load src values for idx 3
        mla v16.4s, v4.4s, v31.4s           // multiplication of upper half for idx 0
        sxtl v8.4s, v30.4h                  // extend filter lower half for idx 1
        sxtl2 v30.4s, v30.8h                // extend filter upper half for idx 1
        mla v17.4s, v25.4s, v8.4s           // multiplication of lower half for idx 1
        ldr q29, [x14, x16]                 // load filter values for idx 2
        uxtl v24.4s, v6.4h                  // extend src lower half for idx 2
        uxtl2 v6.4s, v6.8h                  // extend src upper half for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiplication of upper half for idx 1
        sxtl v9.4s, v29.4h                  // extend filter lower half for idx 2
        sxtl2 v29.4s, v29.8h                // extend filter upper half for idx 2
        mla v18.4s, v24.4s, v9.4s           // multiplication of lower half for idx 2
        ldr q28, [x15, x16]                 // load filter values for idx 3
        uxtl v23.4s, v7.4h                  // extend src lower half for idx 3
        uxtl2 v7.4s, v7.8h                  // extend src upper half for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiplication of upper half for idx 2
        sxtl v10.4s, v28.4h                 // extend filter lower half for idx 3
        sxtl2 v28.4s, v28.8h                // extend filter upper half for idx 3
        mla v19.4s, v23.4s, v10.4s          // multiplication of lower half for idx 3
        mla v19.4s, v7.4s, v28.4s           // multiplication of upper half for idx 3

        add x16, x16, #16                   // advance filter values indexing
        sub w0, w0, #8                      // decrement the remaining filterSize counter
        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        b.ge 2b

        // 4 iterations left

        sub x17, x7, #8                     // step back to wrap up the filter pos for last 4 elements

        ldr d4, [x8]                        // load src values for idx 0
        ldr d31, [x12, x17]                 // load filter values for idx 0
        uxtl v4.4s, v4.4h
        sxtl v31.4s, v31.4h
        ldr d5, [x9]                        // load src values for idx 1
        mla v16.4s, v4.4s, v31.4s           // multiply the remaining 4 elements for idx 0
        ldr d30, [x13, x17]                 // load filter values for idx 1
        uxtl v5.4s, v5.4h
        sxtl v30.4s, v30.4h
        ldr d6, [x10]                       // load src values for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiply the remaining 4 elements for idx 1
        ldr d29, [x14, x17]                 // load filter values for idx 2
        uxtl v6.4s, v6.4h
        sxtl v29.4s, v29.4h
        ldr d7, [x11]                       // load src values for idx 3
        ldr d28, [x15, x17]                 // load filter values for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiply the remaining 4 elements for idx 2
        uxtl v7.4s, v7.4h
        sxtl v28.4s, v28.4h
        addp v16.4s, v16.4s, v17.4s
        mla v19.4s, v7.4s, v28.4s           // multiply the remaining 4 elements for idx 3

        addp v18.4s, v18.4s, v19.4s
        addp v16.4s, v16.4s, v18.4s
        sshl v16.4s, v16.4s, v20.4s         // shift left (effectively right, as shift is negative)
        smin v16.4s, v16.4s, v21.4s         // clip to the 15-bit max
        xtn v16.4h, v16.4s                  // narrow down to 16 bits

        st1 {v16.4h}, [x1], #8
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        subs w2, w2, #4                     // dstW -= 4
        b.gt 1b                             // loop until end of line

        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], #0x20
        ret
endfunc
function ff_hscale16to19_4_neon_asm, export=1
        // w0  int shift
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize (not used, known to be 4)

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #19
        sub v18.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v17.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)

        cmp w2, #16
        b.lt 2f                             // move to last block

        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        // shift all filterPos left by one, as uint16_t will be read
        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        // load src with given offset
        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        sub sp, sp, #64                     // allocate 64 bytes on the stack

        // push src on stack so it can be loaded into vectors later
        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

1:
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64 // filter[0..7]

        // Each of the following blocks:
        //  - extends src and filter to 32 bits with uxtl and sxtl,
        //  - multiplies or multiplies and accumulates the results.
        // Extending to 32 bits is necessary, as uint16_t values can't
        // be represented as int16_t without type promotion.
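        // Same promotion scheme as the 16to15 kernels above; only the output
        // differs (sketch): dst[i] = FFMIN(val >> shift, (1 << 19) - 1),
        // kept as int32_t rather than narrowed to 16 bits.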
        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v28.8h
        uxtl2 v0.4s, v0.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v27.4s, v26.4s
        sxtl2 v28.4s, v29.8h
        uxtl2 v0.4s, v1.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v27.4s, v26.4s
        sxtl2 v28.4s, v30.8h
        uxtl2 v0.4s, v2.8h
        mla v6.4s, v28.4s, v0.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v27.4s, v26.4s
        sxtl2 v28.4s, v31.8h
        uxtl2 v0.4s, v3.8h
        mla v6.4s, v28.4s, v0.4s

        sshl v5.4s, v5.4s, v17.4s           // shift left (effectively right, as shift is negative)
        sshl v6.4s, v6.4s, v17.4s
        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        sub w2, w2, #8                      // dstW -= 8

        // load filterPositions into registers for next iteration
        ldp w8, w9, [x5]                    // filterPos[0], filterPos[1]
        ldp w10, w11, [x5, #8]              // filterPos[2], filterPos[3]
        ldp w12, w13, [x5, #16]             // filterPos[4], filterPos[5]
        ldp w14, w15, [x5, #24]             // filterPos[6], filterPos[7]
        add x5, x5, #32                     // advance filterPos

        lsl x8, x8, #1
        lsl x9, x9, #1
        lsl x10, x10, #1
        lsl x11, x11, #1
        lsl x12, x12, #1
        lsl x13, x13, #1
        lsl x14, x14, #1
        lsl x15, x15, #1

        ldr x8, [x3, w8, uxtw]
        ldr x9, [x3, w9, uxtw]
        ldr x10, [x3, w10, uxtw]
        ldr x11, [x3, w11, uxtw]
        ldr x12, [x3, w12, uxtw]
        ldr x13, [x3, w13, uxtw]
        ldr x14, [x3, w14, uxtw]
        ldr x15, [x3, w15, uxtw]

        stp x8, x9, [sp]
        stp x10, x11, [sp, #16]
        stp x12, x13, [sp, #32]
        stp x14, x15, [sp, #48]

        cmp w2, #16
        b.ge 1b

        // here we make last iteration, without updating the registers
        ld4 {v0.8h, v1.8h, v2.8h, v3.8h}, [sp]
        ld4 {v28.8h, v29.8h, v30.8h, v31.8h}, [x4], #64

        uxtl v26.4s, v0.4h
        sxtl v27.4s, v28.4h
        mul v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v28.8h
        uxtl2 v0.4s, v0.8h
        mul v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v1.4h
        sxtl v27.4s, v29.4h
        mla v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v29.8h
        uxtl2 v0.4s, v1.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v2.4h
        sxtl v27.4s, v30.4h
        mla v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v30.8h
        uxtl2 v0.4s, v2.8h
        mla v6.4s, v0.4s, v28.4s

        uxtl v26.4s, v3.4h
        sxtl v27.4s, v31.4h
        mla v5.4s, v26.4s, v27.4s
        sxtl2 v28.4s, v31.8h
        uxtl2 v0.4s, v3.8h
        mla v6.4s, v0.4s, v28.4s

        sub w2, w2, #8                      // dstW -= 8
        sshl v5.4s, v5.4s, v17.4s
        sshl v6.4s, v6.4s, v17.4s

        smin v5.4s, v5.4s, v18.4s
        smin v6.4s, v6.4s, v18.4s

        st1 {v5.4s, v6.4s}, [x1], #32
        add sp, sp, #64                     // restore stack
        cbnz w2, 2f
        ret

2:
        ldr w8, [x5], #4                    // load filterPos
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        add x9, x3, w8, uxtw                // src + filterPos
        ld1 {v0.4h}, [x9]                   // load 4 * uint16_t
        ld1 {v31.4h}, [x4], #8

        uxtl v0.4s, v0.4h
        sxtl v31.4s, v31.4h
        mul v5.4s, v0.4s, v31.4s
        addv s0, v5.4s
        sshl v0.4s, v0.4s, v17.4s
        smin v0.4s, v0.4s, v18.4s
        st1 {v0.s}[0], [x1], #4
        sub w2, w2, #1                      // dstW--
        cbnz w2, 2b                         // if iterations remain jump to beginning

        ret
endfunc
function ff_hscale16to19_X8_neon_asm, export=1
        // w0  int shift
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        movi v20.4s, #1
        movi v21.4s, #1
        shl v20.4s, v20.4s, #19
        sub v20.4s, v20.4s, v21.4s          // max allowed value
        dup v21.4s, w0                      // read shift
        neg v21.4s, v21.4s                  // negate it, so it can be used in sshl (effectively shift right)

        sbfiz x7, x6, #1, #32               // filterSize*2 (*2 because int16)
1:      ldr w8, [x5], #4                    // filterPos[idx]
        lsl w8, w8, #1                      // double it, as uint16_t will be read
        ldr w10, [x5], #4                   // filterPos[idx + 1]
        lsl w10, w10, #1
        ldr w11, [x5], #4                   // filterPos[idx + 2]
        lsl w11, w11, #1
        ldr w9, [x5], #4                    // filterPos[idx + 3]
        lsl w9, w9, #1
        mov x16, x4                         // filter0 = filter
        add x12, x16, x7                    // filter1 = filter0 + filterSize*2
        add x13, x12, x7                    // filter2 = filter1 + filterSize*2
        add x4, x13, x7                     // filter3 = filter2 + filterSize*2
        movi v0.16b, #0                     // val sum part 1 (for dst[0])
        movi v1.16b, #0                     // val sum part 2 (for dst[1])
        movi v2.16b, #0                     // val sum part 3 (for dst[2])
        movi v3.16b, #0                     // val sum part 4 (for dst[3])
        add x17, x3, w8, uxtw               // srcp + filterPos[0]
        add x8, x3, w10, uxtw               // srcp + filterPos[1]
        add x10, x3, w11, uxtw              // srcp + filterPos[2]
        add x11, x3, w9, uxtw               // srcp + filterPos[3]
        mov w15, w6                         // filterSize counter
2:      ld1 {v4.8h}, [x17], #16             // srcp[filterPos[0] + {0..7}]
        ld1 {v5.8h}, [x16], #16             // load 8x16-bit filter values, part 1
        ld1 {v6.8h}, [x8], #16              // srcp[filterPos[1] + {0..7}]
        ld1 {v7.8h}, [x12], #16             // load 8x16-bit at filter+filterSize
        uxtl v24.4s, v4.4h                  // extend srcp lower half to 32 bits (uint16_t doesn't fit in int16_t)
        sxtl v25.4s, v5.4h                  // extend filter lower half to 32 bits to match srcp size
        uxtl2 v4.4s, v4.8h                  // extend srcp upper half to 32 bits
        mla v0.4s, v24.4s, v25.4s           // multiply accumulate lower half of v4 * v5
        sxtl2 v5.4s, v5.8h                  // extend filter upper half to 32 bits
        uxtl v26.4s, v6.4h                  // extend srcp lower half to 32 bits
        mla v0.4s, v4.4s, v5.4s             // multiply accumulate upper half of v4 * v5
        sxtl v27.4s, v7.4h                  // extend filter lower half
        uxtl2 v6.4s, v6.8h                  // extend srcp upper half
        sxtl2 v7.4s, v7.8h                  // extend filter upper half
        ld1 {v16.8h}, [x10], #16            // srcp[filterPos[2] + {0..7}]
        mla v1.4s, v26.4s, v27.4s           // v1 accumulates srcp[filterPos[1] + {0..3}] * filter[{0..3}]
        ld1 {v17.8h}, [x13], #16            // load 8x16-bit at filter+2*filterSize
        uxtl v22.4s, v16.4h                 // extend srcp lower half
        sxtl v23.4s, v17.4h                 // extend filter lower half
        uxtl2 v16.4s, v16.8h                // extend srcp upper half
        sxtl2 v17.4s, v17.8h                // extend filter upper half
        mla v2.4s, v22.4s, v23.4s           // v2 accumulates srcp[filterPos[2] + {0..3}] * filter[{0..3}]
        mla v2.4s, v16.4s, v17.4s           // v2 accumulates srcp[filterPos[2] + {4..7}] * filter[{4..7}]
        ld1 {v18.8h}, [x11], #16            // srcp[filterPos[3] + {0..7}]
        mla v1.4s, v6.4s, v7.4s             // v1 accumulates srcp[filterPos[1] + {4..7}] * filter[{4..7}]
        ld1 {v19.8h}, [x4], #16             // load 8x16-bit at filter+3*filterSize
        subs w15, w15, #8                   // j -= 8: processed 8/filterSize
        uxtl v28.4s, v18.4h                 // extend srcp lower half
        sxtl v29.4s, v19.4h                 // extend filter lower half
        uxtl2 v18.4s, v18.8h                // extend srcp upper half
        sxtl2 v19.4s, v19.8h                // extend filter upper half
        mla v3.4s, v28.4s, v29.4s           // v3 accumulates srcp[filterPos[3] + {0..3}] * filter[{0..3}]
        mla v3.4s, v18.4s, v19.4s           // v3 accumulates srcp[filterPos[3] + {4..7}] * filter[{4..7}]
        b.gt 2b                             // inner loop if filterSize not consumed completely
        addp v0.4s, v0.4s, v1.4s            // part01 horizontal pair adding
        addp v2.4s, v2.4s, v3.4s            // part23 horizontal pair adding
        addp v0.4s, v0.4s, v2.4s            // part0123 horizontal pair adding
        subs w2, w2, #4                     // dstW -= 4
        sshl v0.4s, v0.4s, v21.4s           // shift left (effectively right, as shift is negative); overflow expected
        smin v0.4s, v0.4s, v20.4s           // apply min (do not use sqshl)
        st1 {v0.4s}, [x1], #16              // write to destination part0123
        b.gt 1b                             // loop until end of line
        ret
endfunc
function ff_hscale16to19_X4_neon_asm, export=1
        // w0  int shift
        // x1  int32_t *dst
        // w2  int dstW
        // x3  const uint8_t *src // treat it as uint16_t *src
        // x4  const int16_t *filter
        // x5  const int32_t *filterPos
        // w6  int filterSize

        stp d8, d9, [sp, #-0x20]!
        stp d10, d11, [sp, #0x10]

        movi v18.4s, #1
        movi v17.4s, #1
        shl v18.4s, v18.4s, #19
        sub v21.4s, v18.4s, v17.4s          // max allowed value
        dup v17.4s, w0                      // read shift
        neg v20.4s, v17.4s                  // negate it, so it can be used in sshl (effectively shift right)

        lsl w7, w6, #1                      // filterSize * 2 (*2 because int16)
1:
        ldp w8, w9, [x5]                    // filterPos[idx + 0], [idx + 1]
        ldp w10, w11, [x5, #8]              // filterPos[idx + 2], [idx + 3]

        movi v16.16b, #0                    // initialize accumulator for idx + 0
        movi v17.16b, #0                    // initialize accumulator for idx + 1
        movi v18.16b, #0                    // initialize accumulator for idx + 2
        movi v19.16b, #0                    // initialize accumulator for idx + 3

        mov x12, x4                         // filter + 0
        add x13, x4, x7                     // filter + 1
        add x8, x3, x8, lsl #1              // srcp + filterPos 0 (uint16_t, so double the offset)
        add x14, x13, x7                    // filter + 2
        add x9, x3, x9, lsl #1              // srcp + filterPos 1
        add x15, x14, x7                    // filter + 3
        add x10, x3, x10, lsl #1            // srcp + filterPos 2
        mov w0, w6                          // save the filterSize to temporary variable
        add x11, x3, x11, lsl #1            // srcp + filterPos 3
        add x5, x5, #16                     // advance filter position
        mov x16, xzr                        // clear the register x16 used for offsetting the filter values

2:
        ldr q4, [x8], #16                   // load src values for idx 0
        ldr q5, [x9], #16                   // load src values for idx 1
        uxtl v26.4s, v4.4h                  // extend src lower half for idx 0
        uxtl2 v4.4s, v4.8h                  // extend src upper half for idx 0
        ldr q31, [x12, x16]                 // load filter values for idx 0
        ldr q6, [x10], #16                  // load src values for idx 2
        sxtl v22.4s, v31.4h                 // extend filter lower half for idx 0
        sxtl2 v31.4s, v31.8h                // extend filter upper half for idx 0
        mla v16.4s, v26.4s, v22.4s          // multiplication of lower half for idx 0
        uxtl v25.4s, v5.4h                  // extend src lower half for idx 1
        uxtl2 v5.4s, v5.8h                  // extend src upper half for idx 1
        ldr q30, [x13, x16]                 // load filter values for idx 1
        ldr q7, [x11], #16                  // load src values for idx 3
        mla v16.4s, v4.4s, v31.4s           // multiplication of upper half for idx 0
        sxtl v8.4s, v30.4h                  // extend filter lower half for idx 1
        sxtl2 v30.4s, v30.8h                // extend filter upper half for idx 1
        mla v17.4s, v25.4s, v8.4s           // multiplication of lower half for idx 1
        ldr q29, [x14, x16]                 // load filter values for idx 2
        uxtl v24.4s, v6.4h                  // extend src lower half for idx 2
        uxtl2 v6.4s, v6.8h                  // extend src upper half for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiplication of upper half for idx 1
        sxtl v9.4s, v29.4h                  // extend filter lower half for idx 2
        sxtl2 v29.4s, v29.8h                // extend filter upper half for idx 2
        mla v18.4s, v24.4s, v9.4s           // multiplication of lower half for idx 2
        ldr q28, [x15, x16]                 // load filter values for idx 3
        uxtl v23.4s, v7.4h                  // extend src lower half for idx 3
        uxtl2 v7.4s, v7.8h                  // extend src upper half for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiplication of upper half for idx 2
        sxtl v10.4s, v28.4h                 // extend filter lower half for idx 3
        sxtl2 v28.4s, v28.8h                // extend filter upper half for idx 3
        mla v19.4s, v23.4s, v10.4s          // multiplication of lower half for idx 3
        mla v19.4s, v7.4s, v28.4s           // multiplication of upper half for idx 3

        add x16, x16, #16                   // advance filter values indexing
        sub w0, w0, #8                      // decrement the remaining filterSize counter
        cmp w0, #8                          // are there at least 8 more elements in filter to consume?
        b.ge 2b

        // 4 iterations left

        sub x17, x7, #8                     // step back to wrap up the filter pos for last 4 elements

        ldr d4, [x8]                        // load src values for idx 0
        ldr d31, [x12, x17]                 // load filter values for idx 0
        uxtl v4.4s, v4.4h
        sxtl v31.4s, v31.4h
        ldr d5, [x9]                        // load src values for idx 1
        mla v16.4s, v4.4s, v31.4s           // multiply the remaining 4 elements for idx 0
        ldr d30, [x13, x17]                 // load filter values for idx 1
        uxtl v5.4s, v5.4h
        sxtl v30.4s, v30.4h
        ldr d6, [x10]                       // load src values for idx 2
        mla v17.4s, v5.4s, v30.4s           // multiply the remaining 4 elements for idx 1
        ldr d29, [x14, x17]                 // load filter values for idx 2
        uxtl v6.4s, v6.4h
        sxtl v29.4s, v29.4h
        ldr d7, [x11]                       // load src values for idx 3
        ldr d28, [x15, x17]                 // load filter values for idx 3
        mla v18.4s, v6.4s, v29.4s           // multiply the remaining 4 elements for idx 2
        uxtl v7.4s, v7.4h
        sxtl v28.4s, v28.4h
        addp v16.4s, v16.4s, v17.4s
        mla v19.4s, v7.4s, v28.4s           // multiply the remaining 4 elements for idx 3

        addp v18.4s, v18.4s, v19.4s
        addp v16.4s, v16.4s, v18.4s
        sshl v16.4s, v16.4s, v20.4s         // shift left (effectively right, as shift is negative)
        smin v16.4s, v16.4s, v21.4s         // clip to the 19-bit max

        st1 {v16.4s}, [x1], #16
        add x4, x4, x7, lsl #2              // filter += (filterSize*2) * 4
        subs w2, w2, #4                     // dstW -= 4
        b.gt 1b                             // loop until end of line

        ldp d10, d11, [sp, #0x10]
        ldp d8, d9, [sp], #0x20
        ret
endfunc