libswscale/aarch64/yuv2rgb_neon.S

   1 /*
   2  * Copyright (c) 2016 Matthieu Bouron <matthieu.bouron stupeflix.com>
   3  * Copyright (c) 2016 Clément Bœsch <clement stupeflix.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/aarch64/asm.S"
  23
  24 .macro load_yoff_ycoeff yoff ycoeff
  25 #if defined(__APPLE__)
  26         ldp             w9, w10, [sp, #\yoff]
  27 #else
  28         ldr             w9,  [sp, #\yoff]
  29         ldr             w10, [sp, #\ycoeff]
  30 #endif
  31 .endm
  32
  33 .macro load_dst1_dst2 dst1 linesize1 dst2 linesize2
  34 #if defined(__APPLE__)
  35 #define DST_OFFSET 8
  36 #else
  37 #define DST_OFFSET 0
  38 #endif
  39         ldr             x10, [sp, #\dst1      - DST_OFFSET]
  40         ldr             w12, [sp, #\linesize1 - DST_OFFSET]
  41         ldr             x15, [sp, #\dst2      - DST_OFFSET]
  42         ldr             w16, [sp, #\linesize2 - DST_OFFSET]
  43 #undef DST_OFFSET
  44         sub             w12, w12, w0                                    // w12 = linesize1 - width     (padding1)
  45         sub             w16, w16, w0                                    // w16 = linesize2 - width     (padding2)
  46 .endm
  47
  48 .macro load_args_nv12 ofmt
  49         ldr             x8,  [sp]                                       // table
  50         load_yoff_ycoeff 8, 16                                           // y_offset, y_coeff
  51         ld1             {v1.1d}, [x8]
  52         dup             v0.8h, w10
  53         dup             v3.8h, w9
  54 .ifc \ofmt,gbrp
  55         load_dst1_dst2  24, 32, 40, 48
  56         sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
  57 .else
  58         sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
  59 .endif
  60         sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
  61         sub             w7, w7, w0                                      // w7 = linesizeC - width     (paddingC)
  62         neg             w11, w0
  63 .endm
  64
  65 .macro load_args_nv21 ofmt
  66         load_args_nv12  \ofmt
  67 .endm
  68
  69 .macro load_args_yuv420p ofmt
  70         ldr             x13, [sp]                                       // srcV
  71         ldr             w14, [sp, #8]                                   // linesizeV
  72         ldr             x8,  [sp, #16]                                  // table
  73         load_yoff_ycoeff 24, 32                                          // y_offset, y_coeff
  74         ld1             {v1.1d}, [x8]
  75         dup             v0.8h, w10
  76         dup             v3.8h, w9
  77 .ifc \ofmt,gbrp
  78         load_dst1_dst2  40, 48, 56, 64
  79         sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
  80 .else
  81         sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
  82 .endif
  83         sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
  84         sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
  85         sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
  86         lsr             w11, w0, #1
  87         neg             w11, w11
  88 .endm
  89
  90 .macro load_args_yuv422p ofmt
  91         ldr             x13, [sp]                                       // srcV
  92         ldr             w14, [sp, #8]                                   // linesizeV
  93         ldr             x8,  [sp, #16]                                  // table
  94         load_yoff_ycoeff 24, 32                                          // y_offset, y_coeff
  95         ld1             {v1.1d}, [x8]
  96         dup             v0.8h, w10
  97         dup             v3.8h, w9
  98 .ifc \ofmt,gbrp
  99         load_dst1_dst2  40, 48, 56, 64
 100         sub             w3, w3, w0                                      // w3 = linesize  - width     (padding)
 101 .else
 102         sub             w3, w3, w0, lsl #2                              // w3 = linesize  - width * 4 (padding)
 103 .endif
 104         sub             w5, w5, w0                                      // w5 = linesizeY - width     (paddingY)
 105         sub             w7,  w7,  w0, lsr #1                            // w7  = linesizeU - width / 2 (paddingU)
 106         sub             w14, w14, w0, lsr #1                            // w14 = linesizeV - width / 2 (paddingV)
 107 .endm
 108
 109 .macro load_chroma_nv12
 110         ld2             {v16.8b, v17.8b}, [x6], #16
 111         ushll           v18.8h, v16.8b, #3
 112         ushll           v19.8h, v17.8b, #3
 113 .endm
 114
 115 .macro load_chroma_nv21
 116         ld2             {v16.8b, v17.8b}, [x6], #16
 117         ushll           v19.8h, v16.8b, #3
 118         ushll           v18.8h, v17.8b, #3
 119 .endm
 120
 121 .macro load_chroma_yuv420p
 122         ld1             {v16.8b}, [ x6], #8
 123         ld1             {v17.8b}, [x13], #8
 124         ushll           v18.8h, v16.8b, #3
 125         ushll           v19.8h, v17.8b, #3
 126 .endm
 127
 128 .macro load_chroma_yuv422p
 129     load_chroma_yuv420p
 130 .endm
 131
 132 .macro increment_nv12
 133         ands            w17, w1, #1
 134         csel            w17, w7, w11, ne                                // incC = (h & 1) ? paddincC : -width
 135         add             x6,  x6, w17, sxtw                              // srcC += incC
 136 .endm
 137
 138 .macro increment_nv21
 139     increment_nv12
 140 .endm
 141
 142 .macro increment_yuv420p
 143         ands            w17, w1, #1
 144         csel            w17,  w7, w11, ne                               // incU = (h & 1) ? paddincU : -width/2
 145         add             x6,  x6,  w17, sxtw                             // srcU += incU
 146         csel            w17, w14, w11, ne                               // incV = (h & 1) ? paddincV : -width/2
 147         add             x13, x13, w17, sxtw                             // srcV += incV
 148 .endm
 149
 150 .macro increment_yuv422p
 151         add             x6,  x6,  w7, sxtw                              // srcU += incU
 152         add             x13, x13, w14, sxtw                             // srcV += incV
 153 .endm
 154
 155 .macro compute_rgb r1 g1 b1 r2 g2 b2
 156         add             v20.8h, v26.8h, v20.8h                          // Y1 + R1
 157         add             v21.8h, v27.8h, v21.8h                          // Y2 + R2
 158         add             v22.8h, v26.8h, v22.8h                          // Y1 + G1
 159         add             v23.8h, v27.8h, v23.8h                          // Y2 + G2
 160         add             v24.8h, v26.8h, v24.8h                          // Y1 + B1
 161         add             v25.8h, v27.8h, v25.8h                          // Y2 + B2
 162         sqrshrun        \r1, v20.8h, #1                                 // clip_u8((Y1 + R1) >> 1)
 163         sqrshrun        \r2, v21.8h, #1                                 // clip_u8((Y2 + R1) >> 1)
 164         sqrshrun        \g1, v22.8h, #1                                 // clip_u8((Y1 + G1) >> 1)
 165         sqrshrun        \g2, v23.8h, #1                                 // clip_u8((Y2 + G1) >> 1)
 166         sqrshrun        \b1, v24.8h, #1                                 // clip_u8((Y1 + B1) >> 1)
 167         sqrshrun        \b2, v25.8h, #1                                 // clip_u8((Y2 + B1) >> 1)
 168 .endm
 169
 170 .macro compute_rgba r1 g1 b1 a1 r2 g2 b2 a2
 171         compute_rgb     \r1, \g1, \b1, \r2, \g2, \b2
 172         movi            \a1, #255
 173         movi            \a2, #255
 174 .endm
 175
 176 .macro declare_func ifmt ofmt
 177 function ff_\ifmt\()_to_\ofmt\()_neon, export=1
 178         load_args_\ifmt \ofmt
 179
 180         mov             w9, w1
 181 1:
 182         mov             w8, w0                                          // w8 = width
 183 2:
 184         movi            v5.8h, #4, lsl #8                               // 128 * (1<<3)
 185     load_chroma_\ifmt
 186         sub             v18.8h, v18.8h, v5.8h                           // U*(1<<3) - 128*(1<<3)
 187         sub             v19.8h, v19.8h, v5.8h                           // V*(1<<3) - 128*(1<<3)
 188         sqdmulh         v20.8h, v19.8h, v1.h[0]                         // V * v2r            (R)
 189         sqdmulh         v22.8h, v18.8h, v1.h[1]                         // U * u2g
 190         sqdmulh         v19.8h, v19.8h, v1.h[2]                         //           V * v2g
 191         add             v22.8h, v22.8h, v19.8h                          // U * u2g + V * v2g  (G)
 192         sqdmulh         v24.8h, v18.8h, v1.h[3]                         // U * u2b            (B)
 193         zip2            v21.8h, v20.8h, v20.8h                          // R2
 194         zip1            v20.8h, v20.8h, v20.8h                          // R1
 195         zip2            v23.8h, v22.8h, v22.8h                          // G2
 196         zip1            v22.8h, v22.8h, v22.8h                          // G1
 197         zip2            v25.8h, v24.8h, v24.8h                          // B2
 198         zip1            v24.8h, v24.8h, v24.8h                          // B1
 199         ld1             {v2.16b}, [x4], #16                             // load luma
 200         ushll           v26.8h, v2.8b,  #3                              // Y1*(1<<3)
 201         ushll2          v27.8h, v2.16b, #3                              // Y2*(1<<3)
 202         sub             v26.8h, v26.8h, v3.8h                           // Y1*(1<<3) - y_offset
 203         sub             v27.8h, v27.8h, v3.8h                           // Y2*(1<<3) - y_offset
 204         sqdmulh         v26.8h, v26.8h, v0.8h                           // ((Y1*(1<<3) - y_offset) * y_coeff) >> 15
 205         sqdmulh         v27.8h, v27.8h, v0.8h                           // ((Y2*(1<<3) - y_offset) * y_coeff) >> 15
 206
 207 .ifc \ofmt,argb // 1 2 3 0
 208         compute_rgba    v5.8b,v6.8b,v7.8b,v4.8b, v17.8b,v18.8b,v19.8b,v16.8b
 209 .endif
 210
 211 .ifc \ofmt,rgba // 0 1 2 3
 212         compute_rgba    v4.8b,v5.8b,v6.8b,v7.8b, v16.8b,v17.8b,v18.8b,v19.8b
 213 .endif
 214
 215 .ifc \ofmt,abgr // 3 2 1 0
 216         compute_rgba    v7.8b,v6.8b,v5.8b,v4.8b, v19.8b,v18.8b,v17.8b,v16.8b
 217 .endif
 218
 219 .ifc \ofmt,bgra // 2 1 0 3
 220         compute_rgba    v6.8b,v5.8b,v4.8b,v7.8b, v18.8b,v17.8b,v16.8b,v19.8b
 221 .endif
 222
 223 .ifc \ofmt,gbrp
 224         compute_rgb     v18.8b,v4.8b,v6.8b, v19.8b,v5.8b,v7.8b
 225         st1             {  v4.8b,  v5.8b }, [x2],  #16
 226         st1             {  v6.8b,  v7.8b }, [x10], #16
 227         st1             { v18.8b, v19.8b }, [x15], #16
 228 .else
 229         st4             { v4.8b, v5.8b, v6.8b, v7.8b}, [x2], #32
 230         st4             {v16.8b,v17.8b,v18.8b,v19.8b}, [x2], #32
 231 .endif
 232         subs            w8, w8, #16                                     // width -= 16
 233         b.gt            2b
 234         add             x2, x2, w3, sxtw                                // dst  += padding
 235 .ifc \ofmt,gbrp
 236         add             x10, x10, w12, sxtw                             // dst1 += padding1
 237         add             x15, x15, w16, sxtw                             // dst2 += padding2
 238 .endif
 239         add             x4, x4, w5, sxtw                                // srcY += paddingY
 240     increment_\ifmt
 241         subs            w1, w1, #1                                      // height -= 1
 242         b.gt            1b
 243         mov             w0, w9
 244     ret
 245 endfunc
 246 .endm
 247
 248 .macro declare_rgb_funcs ifmt
 249         declare_func    \ifmt, argb
 250         declare_func    \ifmt, rgba
 251         declare_func    \ifmt, abgr
 252         declare_func    \ifmt, bgra
 253         declare_func    \ifmt, gbrp
 254 .endm
 255
 256 declare_rgb_funcs nv12
 257 declare_rgb_funcs nv21
 258 declare_rgb_funcs yuv420p
 259 declare_rgb_funcs yuv422p