avformat/mov: check for tts_count before dereferencing tts_data
[FFMpeg-mirror.git] / libswscale / aarch64 / yuv2rgb_neon.S
blob cc7d8b58fc729c8e394d17ea39c3129e7606e639
1 /*
2  * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
3  * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
22 #include "libavutil/aarch64/asm.S"
// Load the 32-bit y_offset and y_coeff stack arguments into w9 and w10.
// On Apple arm64 consecutive 32-bit stack arguments are packed into a
// single 8-byte slot, so one ldp at \yoff fetches both values; on the
// standard AAPCS64 layout each argument occupies its own slot.
.macro load_yoff_ycoeff yoff ycoeff
#if defined(__APPLE__)
        ldp             w9, w10, [sp, #\yoff]
#else
        ldr             w9,  [sp, #\yoff]
        ldr             w10, [sp, #\ycoeff]
#endif
.endm
// Load the two extra destination planes used by the gbrp (planar) output:
//   x10 = dst1, w12 = linesize1 - width (padding1)
//   x15 = dst2, w16 = linesize2 - width (padding2)
// On Apple arm64 the preceding 32-bit y_offset/y_coeff pair shares one
// 8-byte slot (see load_yoff_ycoeff), so every offset computed for the
// standard layout is shifted back by 8 bytes.
.macro load_dst1_dst2 dst1 linesize1 dst2 linesize2
#if defined(__APPLE__)
#define DST_OFFSET 8
#else
#define DST_OFFSET 0
#endif
        ldr             x10, [sp, #\dst1      - DST_OFFSET]
        ldr             w12, [sp, #\linesize1 - DST_OFFSET]
        ldr             x15, [sp, #\dst2      - DST_OFFSET]
        ldr             w16, [sp, #\linesize2 - DST_OFFSET]
#undef DST_OFFSET
        sub             w12, w12, w0                                    // w12 = linesize1 - width     (padding1)
        sub             w16, w16, w0                                    // w16 = linesize2 - width     (padding2)
.endm
// Load the NV12 stack arguments and derive per-row padding values.
//   x8  = table pointer; v1 = 4 x int16 coefficients (v2r, u2g, v2g, u2b,
//         as used via v1.h[0..3] in the conversion loop)
//   v0  = y_coeff broadcast to 8 lanes, v3 = y_offset broadcast
//   w3  = dst padding (linesize - width for gbrp, linesize - width*4 for
//         the packed 4-byte-per-pixel outputs)
//   w5  = luma padding, w7 = chroma padding
//   w11 = -width, used by increment_nv12 to rewind the shared chroma line
.macro load_args_nv12 ofmt
        ldr             x8,  [sp]                                       // table
        load_yoff_ycoeff 8, 16                                           // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
.ifc \ofmt,gbrp
        load_dst1_dst2  24, 32, 40, 48
        sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
.else
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
.endif
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7, w7, w0                                      // w7 = linesizeC - width     (paddingC)
        neg             w11, w0
.endm
// NV21 passes the same arguments as NV12; only the in-memory U/V order
// differs, which is handled by load_chroma_nv21.
.macro load_args_nv21 ofmt
        load_args_nv12  \ofmt
.endm
// Load the yuv420p stack arguments and derive per-row padding values.
//   x13 = srcV, w14 = linesizeV - width/2 (paddingV)
//   x8  = table pointer; v1 = 4 x int16 coefficients (v2r, u2g, v2g, u2b)
//   v0  = y_coeff broadcast, v3 = y_offset broadcast
//   w3  = dst padding, w5 = luma padding, w7 = paddingU
//   w11 = -width/2, used by increment_yuv420p to rewind the shared
//         chroma lines after even rows
.macro load_args_yuv420p ofmt
        ldr             x13, [sp]                                       // srcV
        ldr             w14, [sp, #8]                                   // linesizeV
        ldr             x8,  [sp, #16]                                  // table
        load_yoff_ycoeff 24, 32                                          // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
.ifc \ofmt,gbrp
        load_dst1_dst2  40, 48, 56, 64
        sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
.else
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
.endif
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
        lsr             w11, w0, #1
        neg             w11, w11
.endm
// Load the yuv422p stack arguments; same layout as yuv420p, but no w11
// rewind value is needed because 4:2:2 chroma has full vertical
// resolution and advances one chroma line per row (see increment_yuv422p).
.macro load_args_yuv422p ofmt
        ldr             x13, [sp]                                       // srcV
        ldr             w14, [sp, #8]                                   // linesizeV
        ldr             x8,  [sp, #16]                                  // table
        load_yoff_ycoeff 24, 32                                          // y_offset, y_coeff
        ld1             {v1.1d}, [x8]
        dup             v0.8h, w10
        dup             v3.8h, w9
.ifc \ofmt,gbrp
        load_dst1_dst2  40, 48, 56, 64
        sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
.else
        sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
.endif
        sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
        sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
        sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
.endm
// Load 8 interleaved U/V byte pairs from the chroma plane at x6
// (post-incrementing it by 16) and widen to 16 bits scaled by 1<<3:
// v18 = U*(1<<3), v19 = V*(1<<3).
.macro load_chroma_nv12
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm
// Same as load_chroma_nv12, but NV21 stores V before U in memory, so the
// deinterleaved lanes are assigned the other way round:
// v18 = U*(1<<3), v19 = V*(1<<3).
.macro load_chroma_nv21
        ld2             {v16.8b, v17.8b}, [x6], #16
        ushll           v19.8h, v16.8b, #3
        ushll           v18.8h, v17.8b, #3
.endm
// Load 8 chroma samples from each of the separate U (x6) and V (x13)
// planes, post-incrementing both pointers by 8, and widen to 16 bits
// scaled by 1<<3: v18 = U*(1<<3), v19 = V*(1<<3).
.macro load_chroma_yuv420p
        ld1             {v16.8b}, [ x6], #8
        ld1             {v17.8b}, [x13], #8
        ushll           v18.8h, v16.8b, #3
        ushll           v19.8h, v17.8b, #3
.endm
// 4:2:2 planar chroma is loaded exactly like 4:2:0 planar chroma;
// only the per-row pointer increments differ.
.macro load_chroma_yuv422p
    load_chroma_yuv420p
.endm
// End-of-row chroma pointer update for NV12/NV21: each chroma line is
// shared by two luma rows, so advance to the next chroma line only when
// the remaining height (w1) is odd, and otherwise rewind by width (w11)
// to re-read the same line.
.macro increment_nv12
        ands            w17, w1, #1
        csel            w17, w7, w11, ne                                // incC = (h & 1) ? paddingC : -width
        add             x6,  x6, w17, sxtw                              // srcC += incC
.endm
// NV21 uses the same chroma-row stepping as NV12.
.macro increment_nv21
    increment_nv12
.endm
// End-of-row chroma pointer update for yuv420p: as in NV12, each chroma
// line serves two luma rows, but U and V live in separate planes, so
// both pointers are stepped (or rewound by width/2 via w11) independently.
.macro increment_yuv420p
        ands            w17, w1, #1
        csel            w17,  w7, w11, ne                               // incU = (h & 1) ? paddingU : -width/2
        add             x6,  x6,  w17, sxtw                             // srcU += incU
        csel            w17, w14, w11, ne                               // incV = (h & 1) ? paddingV : -width/2
        add             x13, x13, w17, sxtw                             // srcV += incV
.endm
// 4:2:2 has full vertical chroma resolution: move both chroma pointers
// to the next line after every row (w7/w14 already hold linesize - width/2).
.macro increment_yuv422p
        add             x6,  x6,  w7, sxtw                              // srcU += incU
        add             x13, x13, w14, sxtw                             // srcV += incV
.endm
// Combine the luma terms (v26 = Y contribution for the first 8 pixels,
// v27 for the second 8) with the precomputed R/G/B chroma terms in
// v20-v25, then narrow each 16-bit sum into the given 8-byte destination
// registers with a saturating rounding right shift by 1 (clips to u8).
.macro compute_rgb r1 g1 b1 r2 g2 b2
        add             v20.8h, v26.8h, v20.8h                          // Y1 + R1
        add             v21.8h, v27.8h, v21.8h                          // Y2 + R2
        add             v22.8h, v26.8h, v22.8h                          // Y1 + G1
        add             v23.8h, v27.8h, v23.8h                          // Y2 + G2
        add             v24.8h, v26.8h, v24.8h                          // Y1 + B1
        add             v25.8h, v27.8h, v25.8h                          // Y2 + B2
        sqrshrun        \r1, v20.8h, #1                                 // clip_u8((Y1 + R1) >> 1)
        sqrshrun        \r2, v21.8h, #1                                 // clip_u8((Y2 + R2) >> 1)
        sqrshrun        \g1, v22.8h, #1                                 // clip_u8((Y1 + G1) >> 1)
        sqrshrun        \g2, v23.8h, #1                                 // clip_u8((Y2 + G2) >> 1)
        sqrshrun        \b1, v24.8h, #1                                 // clip_u8((Y1 + B1) >> 1)
        sqrshrun        \b2, v25.8h, #1                                 // clip_u8((Y2 + B2) >> 1)
.endm
// Same as compute_rgb, but additionally fills the two alpha destination
// registers with fully opaque bytes (255).
.macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
        compute_rgb     \r1, \g1, \b1, \r2, \g2, \b2
        movi            \a1, #255
        movi            \a2, #255
.endm
// Emit the conversion function ff_\ifmt\()_to_\ofmt\()_neon().
// Register arguments (AAPCS64):
//   w0 = width, w1 = height,
//   x2 = dst,  w3 = dst linesize,
//   x4 = srcY, w5 = luma linesize,
//   x6 = srcC (srcU for planar input), w7 = chroma/U linesize;
// the remaining per-format arguments (srcV, table, y_offset, y_coeff and
// the extra gbrp planes) are read from the stack by load_args_\ifmt.
// The inner loop converts 16 pixels per iteration; width is assumed to
// be a multiple of 16 (NOTE(review): no tail handling is visible here —
// confirm against the C-side caller).  Returns the original height in w0.
//
// Fix: the local loop labels "1:" (per-row) and "2:" (per-16-pixel) were
// missing, leaving the "b.gt 2b" / "b.gt 1b" branches with no target.
.macro declare_func ifmt ofmt
function ff_\ifmt\()_to_\ofmt\()_neon, export=1
        load_args_\ifmt \ofmt
        mov             w9, w1                                          // save height for the return value
1:                                                                      // row loop
        mov             w8, w0                                          // w8 = pixels left in this row
2:                                                                      // 16-pixel loop
        movi            v5.8h, #4, lsl #8                               // 128 * (1<<3)
        load_chroma_\ifmt
        sub             v18.8h, v18.8h, v5.8h                           // U*(1<<3) - 128*(1<<3)
        sub             v19.8h, v19.8h, v5.8h                           // V*(1<<3) - 128*(1<<3)
        sqdmulh         v20.8h, v19.8h, v1.h[0]                         // V * v2r            (R)
        sqdmulh         v22.8h, v18.8h, v1.h[1]                         // U * u2g
        sqdmulh         v19.8h, v19.8h, v1.h[2]                         //           V * v2g
        add             v22.8h, v22.8h, v19.8h                          // U * u2g + V * v2g  (G)
        sqdmulh         v24.8h, v18.8h, v1.h[3]                         // U * u2b            (B)
        // Duplicate each chroma term so the 8 chroma samples cover all
        // 16 luma samples (2x horizontal chroma subsampling).
        zip2            v21.8h, v20.8h, v20.8h                          // R2
        zip1            v20.8h, v20.8h, v20.8h                          // R1
        zip2            v23.8h, v22.8h, v22.8h                          // G2
        zip1            v22.8h, v22.8h, v22.8h                          // G1
        zip2            v25.8h, v24.8h, v24.8h                          // B2
        zip1            v24.8h, v24.8h, v24.8h                          // B1
        ld1             {v2.16b}, [x4], #16                             // load 16 luma samples
        ushll           v26.8h, v2.8b,  #3                              // Y1*(1<<3)
        ushll2          v27.8h, v2.16b, #3                              // Y2*(1<<3)
        sub             v26.8h, v26.8h, v3.8h                           // Y1*(1<<3) - y_offset
        sub             v27.8h, v27.8h, v3.8h                           // Y2*(1<<3) - y_offset
        sqdmulh         v26.8h, v26.8h, v0.8h                           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
        sqdmulh         v27.8h, v27.8h, v0.8h                           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15

.ifc \ofmt,argb // 1 2 3 0
        compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
.endif

.ifc \ofmt,rgba // 0 1 2 3
        compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
.endif

.ifc \ofmt,abgr // 3 2 1 0
        compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
.endif

.ifc \ofmt,bgra // 2 1 0 3
        compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
.endif

.ifc \ofmt,gbrp
        compute_rgb     v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
        st1             {  v4.8b,  v5.8b }, [x2],  #16                  // G plane (g1, g2)
        st1             {  v6.8b,  v7.8b }, [x10], #16                  // B plane (b1, b2)
        st1             { v18.8b, v19.8b }, [x15], #16                  // R plane (r1, r2)
.else
        st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32        // first 8 interleaved pixels
        st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32        // second 8 interleaved pixels
.endif
        subs            w8, w8, #16                                     // width -= 16
        b.gt            2b
        add             x2, x2, w3, sxtw                                // dst  += padding
.ifc \ofmt,gbrp
        add             x10, x10, w12, sxtw                             // dst1 += padding1
        add             x15, x15, w16, sxtw                             // dst2 += padding2
.endif
        add             x4, x4, w5, sxtw                                // srcY += paddingY
        increment_\ifmt
        subs            w1, w1, #1                                      // height -= 1
        b.gt            1b
        mov             w0, w9                                          // return the original height
        ret
endfunc
.endm
// Instantiate one conversion function per supported output format for
// the given input format \ifmt.
.macro declare_rgb_funcs ifmt
        declare_func    \ifmt, argb
        declare_func    \ifmt, rgba
        declare_func    \ifmt, abgr
        declare_func    \ifmt, bgra
        declare_func    \ifmt, gbrp
.endm
// Emit the full set of converters for every supported input pixel format.
declare_rgb_funcs nv12
declare_rgb_funcs nv21
declare_rgb_funcs yuv420p
declare_rgb_funcs yuv422p