third_party/libwebp/dsp/dec_neon.c

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 // ARM NEON version of dsp functions and loop filtering.
  11 //
  12 // Authors: Somnath Banerjee (somnath@google.com)
  13 //          Johann Koenig (johannkoenig@google.com)
  14
  15 #include "./dsp.h"
  16
  17 #if defined(WEBP_USE_NEON)
  18
  19 #include "./neon.h"
  20 #include "../dec/vp8i.h"
  21
  22 //------------------------------------------------------------------------------
  23 // NxM Loading functions
  24
  // Load/Store vertical edge
  // LOAD8x4 expands to eight 'vld4.8' single-lane loads, one per lane 0..7 of
  // the four registers c1..c4. Even lanes use the address operand 'b1', odd
  // lanes use 'b2'; every load names 'stride' as its post-index operand.
  // LOAD8x4_LANE emits the instruction text for one lane.
  #define LOAD8x4_LANE(c1, c2, c3, c4, lane, base, stride)                     \
    "vld4.8   {" #c1 "[" #lane "], " #c2 "[" #lane "], "                       \
                 #c3 "[" #lane "], " #c4 "[" #lane "]}," #base "," #stride "\n"
  #define LOAD8x4(c1, c2, c3, c4, b1, b2, stride)                              \
    LOAD8x4_LANE(c1, c2, c3, c4, 0, b1, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 1, b2, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 2, b1, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 3, b2, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 4, b1, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 5, b2, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 6, b1, stride)                                \
    LOAD8x4_LANE(c1, c2, c3, c4, 7, b2, stride)
  35
  // STORE8x2 expands to eight 'vst2.8' single-lane stores, one per lane 0..7
  // of the register pair c1/c2, all through the address operand 'p' with
  // 'stride' as post-index operand. STORE8x2_LANE emits one such store.
  #define STORE8x2_LANE(c1, c2, lane, p, stride)                               \
    "vst2.8   {" #c1 "[" #lane "], " #c2 "[" #lane "]}," #p "," #stride " \n"
  #define STORE8x2(c1, c2, p, stride)                                          \
    STORE8x2_LANE(c1, c2, 0, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 1, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 2, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 3, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 4, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 5, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 6, p, stride)                                        \
    STORE8x2_LANE(c1, c2, 7, p, stride)
  45
  46 #if !defined(WORK_AROUND_GCC)
  47
  48 // This intrinsics version makes gcc-4.6.3 crash during Load4x??() compilation
  49 // (register alloc, probably). The variants somewhat mitigate the problem, but
  50 // not quite. HFilter16i() remains problematic.
  51 static WEBP_INLINE uint8x8x4_t Load4x8(const uint8_t* const src, int stride) {
  52   const uint8x8_t zero = vdup_n_u8(0);
  53   uint8x8x4_t out;
  54   INIT_VECTOR4(out, zero, zero, zero, zero);
  55   out = vld4_lane_u8(src + 0 * stride, out, 0);
  56   out = vld4_lane_u8(src + 1 * stride, out, 1);
  57   out = vld4_lane_u8(src + 2 * stride, out, 2);
  58   out = vld4_lane_u8(src + 3 * stride, out, 3);
  59   out = vld4_lane_u8(src + 4 * stride, out, 4);
  60   out = vld4_lane_u8(src + 5 * stride, out, 5);
  61   out = vld4_lane_u8(src + 6 * stride, out, 6);
  62   out = vld4_lane_u8(src + 7 * stride, out, 7);
  63   return out;
  64 }
  65
  66 static WEBP_INLINE void Load4x16(const uint8_t* const src, int stride,
  67                                  uint8x16_t* const p1, uint8x16_t* const p0,
  68                                  uint8x16_t* const q0, uint8x16_t* const q1) {
  69   // row0 = p1[0..7]|p0[0..7]|q0[0..7]|q1[0..7]
  70   // row8 = p1[8..15]|p0[8..15]|q0[8..15]|q1[8..15]
  71   const uint8x8x4_t row0 = Load4x8(src - 2 + 0 * stride, stride);
  72   const uint8x8x4_t row8 = Load4x8(src - 2 + 8 * stride, stride);
  73   *p1 = vcombine_u8(row0.val[0], row8.val[0]);
  74   *p0 = vcombine_u8(row0.val[1], row8.val[1]);
  75   *q0 = vcombine_u8(row0.val[2], row8.val[2]);
  76   *q1 = vcombine_u8(row0.val[3], row8.val[3]);
  77 }
  78
  79 #else  // WORK_AROUND_GCC
  80
  81 #define LOADQ_LANE_32b(VALUE, LANE) do {                             \
  82   (VALUE) = vld1q_lane_u32((const uint32_t*)src, (VALUE), (LANE));   \
  83   src += stride;                                                     \
  84 } while (0)
  85
  86 static WEBP_INLINE void Load4x16(const uint8_t* src, int stride,
  87                                  uint8x16_t* const p1, uint8x16_t* const p0,
  88                                  uint8x16_t* const q0, uint8x16_t* const q1) {
  89   const uint32x4_t zero = vdupq_n_u32(0);
  90   uint32x4x4_t in;
  91   INIT_VECTOR4(in, zero, zero, zero, zero);
  92   src -= 2;
  93   LOADQ_LANE_32b(in.val[0], 0);
  94   LOADQ_LANE_32b(in.val[1], 0);
  95   LOADQ_LANE_32b(in.val[2], 0);
  96   LOADQ_LANE_32b(in.val[3], 0);
  97   LOADQ_LANE_32b(in.val[0], 1);
  98   LOADQ_LANE_32b(in.val[1], 1);
  99   LOADQ_LANE_32b(in.val[2], 1);
 100   LOADQ_LANE_32b(in.val[3], 1);
 101   LOADQ_LANE_32b(in.val[0], 2);
 102   LOADQ_LANE_32b(in.val[1], 2);
 103   LOADQ_LANE_32b(in.val[2], 2);
 104   LOADQ_LANE_32b(in.val[3], 2);
 105   LOADQ_LANE_32b(in.val[0], 3);
 106   LOADQ_LANE_32b(in.val[1], 3);
 107   LOADQ_LANE_32b(in.val[2], 3);
 108   LOADQ_LANE_32b(in.val[3], 3);
 109   // Transpose four 4x4 parts:
 110   {
 111     const uint8x16x2_t row01 = vtrnq_u8(vreinterpretq_u8_u32(in.val[0]),
 112                                         vreinterpretq_u8_u32(in.val[1]));
 113     const uint8x16x2_t row23 = vtrnq_u8(vreinterpretq_u8_u32(in.val[2]),
 114                                         vreinterpretq_u8_u32(in.val[3]));
 115     const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
 116                                          vreinterpretq_u16_u8(row23.val[0]));
 117     const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
 118                                          vreinterpretq_u16_u8(row23.val[1]));
 119     *p1 = vreinterpretq_u8_u16(row02.val[0]);
 120     *p0 = vreinterpretq_u8_u16(row13.val[0]);
 121     *q0 = vreinterpretq_u8_u16(row02.val[1]);
 122     *q1 = vreinterpretq_u8_u16(row13.val[1]);
 123   }
 124 }
 125 #undef LOADQ_LANE_32b
 126
 127 #endif  // !WORK_AROUND_GCC
 128
 129 static WEBP_INLINE void Load8x16(const uint8_t* const src, int stride,
 130                                  uint8x16_t* const p3, uint8x16_t* const p2,
 131                                  uint8x16_t* const p1, uint8x16_t* const p0,
 132                                  uint8x16_t* const q0, uint8x16_t* const q1,
 133                                  uint8x16_t* const q2, uint8x16_t* const q3) {
 134   Load4x16(src - 2, stride, p3, p2, p1, p0);
 135   Load4x16(src + 2, stride, q0, q1, q2, q3);
 136 }
 137
 138 static WEBP_INLINE void Load16x4(const uint8_t* const src, int stride,
 139                                  uint8x16_t* const p1, uint8x16_t* const p0,
 140                                  uint8x16_t* const q0, uint8x16_t* const q1) {
 141   *p1 = vld1q_u8(src - 2 * stride);
 142   *p0 = vld1q_u8(src - 1 * stride);
 143   *q0 = vld1q_u8(src + 0 * stride);
 144   *q1 = vld1q_u8(src + 1 * stride);
 145 }
 146
 147 static WEBP_INLINE void Load16x8(const uint8_t* const src, int stride,
 148                                  uint8x16_t* const p3, uint8x16_t* const p2,
 149                                  uint8x16_t* const p1, uint8x16_t* const p0,
 150                                  uint8x16_t* const q0, uint8x16_t* const q1,
 151                                  uint8x16_t* const q2, uint8x16_t* const q3) {
 152   Load16x4(src - 2  * stride, stride, p3, p2, p1, p0);
 153   Load16x4(src + 2  * stride, stride, q0, q1, q2, q3);
 154 }
 155
 156 static WEBP_INLINE void Load8x8x2(const uint8_t* const u,
 157                                   const uint8_t* const v,
 158                                   int stride,
 159                                   uint8x16_t* const p3, uint8x16_t* const p2,
 160                                   uint8x16_t* const p1, uint8x16_t* const p0,
 161                                   uint8x16_t* const q0, uint8x16_t* const q1,
 162                                   uint8x16_t* const q2, uint8x16_t* const q3) {
 163   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
 164   // and the v-samples on the higher half.
 165   *p3 = vcombine_u8(vld1_u8(u - 4 * stride), vld1_u8(v - 4 * stride));
 166   *p2 = vcombine_u8(vld1_u8(u - 3 * stride), vld1_u8(v - 3 * stride));
 167   *p1 = vcombine_u8(vld1_u8(u - 2 * stride), vld1_u8(v - 2 * stride));
 168   *p0 = vcombine_u8(vld1_u8(u - 1 * stride), vld1_u8(v - 1 * stride));
 169   *q0 = vcombine_u8(vld1_u8(u + 0 * stride), vld1_u8(v + 0 * stride));
 170   *q1 = vcombine_u8(vld1_u8(u + 1 * stride), vld1_u8(v + 1 * stride));
 171   *q2 = vcombine_u8(vld1_u8(u + 2 * stride), vld1_u8(v + 2 * stride));
 172   *q3 = vcombine_u8(vld1_u8(u + 3 * stride), vld1_u8(v + 3 * stride));
 173 }
 174
 175 #if !defined(WORK_AROUND_GCC)
 176
 177 #define LOAD_UV_8(ROW) \
 178   vcombine_u8(vld1_u8(u - 4 + (ROW) * stride), vld1_u8(v - 4 + (ROW) * stride))
 179
 180 static WEBP_INLINE void Load8x8x2T(const uint8_t* const u,
 181                                    const uint8_t* const v,
 182                                    int stride,
 183                                    uint8x16_t* const p3, uint8x16_t* const p2,
 184                                    uint8x16_t* const p1, uint8x16_t* const p0,
 185                                    uint8x16_t* const q0, uint8x16_t* const q1,
 186                                    uint8x16_t* const q2, uint8x16_t* const q3) {
 187   // We pack the 8x8 u-samples in the lower half of the uint8x16_t destination
 188   // and the v-samples on the higher half.
 189   const uint8x16_t row0 = LOAD_UV_8(0);
 190   const uint8x16_t row1 = LOAD_UV_8(1);
 191   const uint8x16_t row2 = LOAD_UV_8(2);
 192   const uint8x16_t row3 = LOAD_UV_8(3);
 193   const uint8x16_t row4 = LOAD_UV_8(4);
 194   const uint8x16_t row5 = LOAD_UV_8(5);
 195   const uint8x16_t row6 = LOAD_UV_8(6);
 196   const uint8x16_t row7 = LOAD_UV_8(7);
 197   // Perform two side-by-side 8x8 transposes
 198   // u00 u01 u02 u03 u04 u05 u06 u07 | v00 v01 v02 v03 v04 v05 v06 v07
 199   // u10 u11 u12 u13 u14 u15 u16 u17 | v10 v11 v12 ...
 200   // u20 u21 u22 u23 u24 u25 u26 u27 | v20 v21 ...
 201   // u30 u31 u32 u33 u34 u35 u36 u37 | ...
 202   // u40 u41 u42 u43 u44 u45 u46 u47 | ...
 203   // u50 u51 u52 u53 u54 u55 u56 u57 | ...
 204   // u60 u61 u62 u63 u64 u65 u66 u67 | v60 ...
 205   // u70 u71 u72 u73 u74 u75 u76 u77 | v70 v71 v72 ...
 206   const uint8x16x2_t row01 = vtrnq_u8(row0, row1);  // u00 u10 u02 u12 ...
 207                                                     // u01 u11 u03 u13 ...
 208   const uint8x16x2_t row23 = vtrnq_u8(row2, row3);  // u20 u30 u22 u32 ...
 209                                                     // u21 u31 u23 u33 ...
 210   const uint8x16x2_t row45 = vtrnq_u8(row4, row5);  // ...
 211   const uint8x16x2_t row67 = vtrnq_u8(row6, row7);  // ...
 212   const uint16x8x2_t row02 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[0]),
 213                                        vreinterpretq_u16_u8(row23.val[0]));
 214   const uint16x8x2_t row13 = vtrnq_u16(vreinterpretq_u16_u8(row01.val[1]),
 215                                        vreinterpretq_u16_u8(row23.val[1]));
 216   const uint16x8x2_t row46 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[0]),
 217                                        vreinterpretq_u16_u8(row67.val[0]));
 218   const uint16x8x2_t row57 = vtrnq_u16(vreinterpretq_u16_u8(row45.val[1]),
 219                                        vreinterpretq_u16_u8(row67.val[1]));
 220   const uint32x4x2_t row04 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[0]),
 221                                        vreinterpretq_u32_u16(row46.val[0]));
 222   const uint32x4x2_t row26 = vtrnq_u32(vreinterpretq_u32_u16(row02.val[1]),
 223                                        vreinterpretq_u32_u16(row46.val[1]));
 224   const uint32x4x2_t row15 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[0]),
 225                                        vreinterpretq_u32_u16(row57.val[0]));
 226   const uint32x4x2_t row37 = vtrnq_u32(vreinterpretq_u32_u16(row13.val[1]),
 227                                        vreinterpretq_u32_u16(row57.val[1]));
 228   *p3 = vreinterpretq_u8_u32(row04.val[0]);
 229   *p2 = vreinterpretq_u8_u32(row15.val[0]);
 230   *p1 = vreinterpretq_u8_u32(row26.val[0]);
 231   *p0 = vreinterpretq_u8_u32(row37.val[0]);
 232   *q0 = vreinterpretq_u8_u32(row04.val[1]);
 233   *q1 = vreinterpretq_u8_u32(row15.val[1]);
 234   *q2 = vreinterpretq_u8_u32(row26.val[1]);
 235   *q3 = vreinterpretq_u8_u32(row37.val[1]);
 236 }
 237 #undef LOAD_UV_8
 238
 239 #endif  // !WORK_AROUND_GCC
 240
 241 static WEBP_INLINE void Store2x8(const uint8x8x2_t v,
 242                                  uint8_t* const dst, int stride) {
 243   vst2_lane_u8(dst + 0 * stride, v, 0);
 244   vst2_lane_u8(dst + 1 * stride, v, 1);
 245   vst2_lane_u8(dst + 2 * stride, v, 2);
 246   vst2_lane_u8(dst + 3 * stride, v, 3);
 247   vst2_lane_u8(dst + 4 * stride, v, 4);
 248   vst2_lane_u8(dst + 5 * stride, v, 5);
 249   vst2_lane_u8(dst + 6 * stride, v, 6);
 250   vst2_lane_u8(dst + 7 * stride, v, 7);
 251 }
 252
 253 static WEBP_INLINE void Store2x16(const uint8x16_t p0, const uint8x16_t q0,
 254                                   uint8_t* const dst, int stride) {
 255   uint8x8x2_t lo, hi;
 256   lo.val[0] = vget_low_u8(p0);
 257   lo.val[1] = vget_low_u8(q0);
 258   hi.val[0] = vget_high_u8(p0);
 259   hi.val[1] = vget_high_u8(q0);
 260   Store2x8(lo, dst - 1 + 0 * stride, stride);
 261   Store2x8(hi, dst - 1 + 8 * stride, stride);
 262 }
 263
 264 #if !defined(WORK_AROUND_GCC)
 265 static WEBP_INLINE void Store4x8(const uint8x8x4_t v,
 266                                  uint8_t* const dst, int stride) {
 267   vst4_lane_u8(dst + 0 * stride, v, 0);
 268   vst4_lane_u8(dst + 1 * stride, v, 1);
 269   vst4_lane_u8(dst + 2 * stride, v, 2);
 270   vst4_lane_u8(dst + 3 * stride, v, 3);
 271   vst4_lane_u8(dst + 4 * stride, v, 4);
 272   vst4_lane_u8(dst + 5 * stride, v, 5);
 273   vst4_lane_u8(dst + 6 * stride, v, 6);
 274   vst4_lane_u8(dst + 7 * stride, v, 7);
 275 }
 276
 277 static WEBP_INLINE void Store4x16(const uint8x16_t p1, const uint8x16_t p0,
 278                                   const uint8x16_t q0, const uint8x16_t q1,
 279                                   uint8_t* const dst, int stride) {
 280   uint8x8x4_t lo, hi;
 281   INIT_VECTOR4(lo,
 282                vget_low_u8(p1), vget_low_u8(p0),
 283                vget_low_u8(q0), vget_low_u8(q1));
 284   INIT_VECTOR4(hi,
 285                vget_high_u8(p1), vget_high_u8(p0),
 286                vget_high_u8(q0), vget_high_u8(q1));
 287   Store4x8(lo, dst - 2 + 0 * stride, stride);
 288   Store4x8(hi, dst - 2 + 8 * stride, stride);
 289 }
 290 #endif  // !WORK_AROUND_GCC
 291
 292 static WEBP_INLINE void Store16x2(const uint8x16_t p0, const uint8x16_t q0,
 293                                   uint8_t* const dst, int stride) {
 294   vst1q_u8(dst - stride, p0);
 295   vst1q_u8(dst, q0);
 296 }
 297
 298 static WEBP_INLINE void Store16x4(const uint8x16_t p1, const uint8x16_t p0,
 299                                   const uint8x16_t q0, const uint8x16_t q1,
 300                                   uint8_t* const dst, int stride) {
 301   Store16x2(p1, p0, dst - stride, stride);
 302   Store16x2(q0, q1, dst + stride, stride);
 303 }
 304
 305 static WEBP_INLINE void Store8x2x2(const uint8x16_t p0, const uint8x16_t q0,
 306                                    uint8_t* const u, uint8_t* const v,
 307                                    int stride) {
 308   // p0 and q0 contain the u+v samples packed in low/high halves.
 309   vst1_u8(u - stride, vget_low_u8(p0));
 310   vst1_u8(u,          vget_low_u8(q0));
 311   vst1_u8(v - stride, vget_high_u8(p0));
 312   vst1_u8(v,          vget_high_u8(q0));
 313 }
 314
 315 static WEBP_INLINE void Store8x4x2(const uint8x16_t p1, const uint8x16_t p0,
 316                                    const uint8x16_t q0, const uint8x16_t q1,
 317                                    uint8_t* const u, uint8_t* const v,
 318                                    int stride) {
 319   // The p1...q1 registers contain the u+v samples packed in low/high halves.
 320   Store8x2x2(p1, p0, u - stride, v - stride, stride);
 321   Store8x2x2(q0, q1, u + stride, v + stride, stride);
 322 }
 323
 324 #if !defined(WORK_AROUND_GCC)
 325
 326 #define STORE6_LANE(DST, VAL0, VAL1, LANE) do {   \
 327   vst3_lane_u8((DST) - 3, (VAL0), (LANE));        \
 328   vst3_lane_u8((DST) + 0, (VAL1), (LANE));        \
 329   (DST) += stride;                                \
 330 } while (0)
 331
 332 static WEBP_INLINE void Store6x8x2(const uint8x16_t p2, const uint8x16_t p1,
 333                                    const uint8x16_t p0, const uint8x16_t q0,
 334                                    const uint8x16_t q1, const uint8x16_t q2,
 335                                    uint8_t* u, uint8_t* v,
 336                                    int stride) {
 337   uint8x8x3_t u0, u1, v0, v1;
 338   INIT_VECTOR3(u0, vget_low_u8(p2), vget_low_u8(p1), vget_low_u8(p0));
 339   INIT_VECTOR3(u1, vget_low_u8(q0), vget_low_u8(q1), vget_low_u8(q2));
 340   INIT_VECTOR3(v0, vget_high_u8(p2), vget_high_u8(p1), vget_high_u8(p0));
 341   INIT_VECTOR3(v1, vget_high_u8(q0), vget_high_u8(q1), vget_high_u8(q2));
 342   STORE6_LANE(u, u0, u1, 0);
 343   STORE6_LANE(u, u0, u1, 1);
 344   STORE6_LANE(u, u0, u1, 2);
 345   STORE6_LANE(u, u0, u1, 3);
 346   STORE6_LANE(u, u0, u1, 4);
 347   STORE6_LANE(u, u0, u1, 5);
 348   STORE6_LANE(u, u0, u1, 6);
 349   STORE6_LANE(u, u0, u1, 7);
 350   STORE6_LANE(v, v0, v1, 0);
 351   STORE6_LANE(v, v0, v1, 1);
 352   STORE6_LANE(v, v0, v1, 2);
 353   STORE6_LANE(v, v0, v1, 3);
 354   STORE6_LANE(v, v0, v1, 4);
 355   STORE6_LANE(v, v0, v1, 5);
 356   STORE6_LANE(v, v0, v1, 6);
 357   STORE6_LANE(v, v0, v1, 7);
 358 }
 359 #undef STORE6_LANE
 360
 361 static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
 362                                    const uint8x16_t q0, const uint8x16_t q1,
 363                                    uint8_t* const u, uint8_t* const v,
 364                                    int stride) {
 365   uint8x8x4_t u0, v0;
 366   INIT_VECTOR4(u0,
 367                vget_low_u8(p1), vget_low_u8(p0),
 368                vget_low_u8(q0), vget_low_u8(q1));
 369   INIT_VECTOR4(v0,
 370                vget_high_u8(p1), vget_high_u8(p0),
 371                vget_high_u8(q0), vget_high_u8(q1));
 372   vst4_lane_u8(u - 2 + 0 * stride, u0, 0);
 373   vst4_lane_u8(u - 2 + 1 * stride, u0, 1);
 374   vst4_lane_u8(u - 2 + 2 * stride, u0, 2);
 375   vst4_lane_u8(u - 2 + 3 * stride, u0, 3);
 376   vst4_lane_u8(u - 2 + 4 * stride, u0, 4);
 377   vst4_lane_u8(u - 2 + 5 * stride, u0, 5);
 378   vst4_lane_u8(u - 2 + 6 * stride, u0, 6);
 379   vst4_lane_u8(u - 2 + 7 * stride, u0, 7);
 380   vst4_lane_u8(v - 2 + 0 * stride, v0, 0);
 381   vst4_lane_u8(v - 2 + 1 * stride, v0, 1);
 382   vst4_lane_u8(v - 2 + 2 * stride, v0, 2);
 383   vst4_lane_u8(v - 2 + 3 * stride, v0, 3);
 384   vst4_lane_u8(v - 2 + 4 * stride, v0, 4);
 385   vst4_lane_u8(v - 2 + 5 * stride, v0, 5);
 386   vst4_lane_u8(v - 2 + 6 * stride, v0, 6);
 387   vst4_lane_u8(v - 2 + 7 * stride, v0, 7);
 388 }
 389
 390 #endif  // !WORK_AROUND_GCC
 391
 392 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
 393 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
 394   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
 395 }
 396
 397 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
 398 // to the corresponding rows of 'dst'.
 399 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
 400                                             const int16x8_t dst01,
 401                                             const int16x8_t dst23) {
 402   // Unsigned saturate to 8b.
 403   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
 404   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
 405
 406   // Store the results.
 407   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
 408   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
 409   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
 410   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
 411 }
 412
 413 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
 414                                uint8_t* const dst) {
 415   uint32x2_t dst01 = vdup_n_u32(0);
 416   uint32x2_t dst23 = vdup_n_u32(0);
 417
 418   // Load the source pixels.
 419   dst01 = vld1_lane_u32((uint32_t*)(dst + 0 * BPS), dst01, 0);
 420   dst23 = vld1_lane_u32((uint32_t*)(dst + 2 * BPS), dst23, 0);
 421   dst01 = vld1_lane_u32((uint32_t*)(dst + 1 * BPS), dst01, 1);
 422   dst23 = vld1_lane_u32((uint32_t*)(dst + 3 * BPS), dst23, 1);
 423
 424   {
 425     // Convert to 16b.
 426     const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
 427     const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
 428
 429     // Descale with rounding.
 430     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
 431     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
 432     // Add the inverse transform.
 433     SaturateAndStore4x4(dst, out01, out23);
 434   }
 435 }
 436
 437 //-----------------------------------------------------------------------------
 438 // Simple In-loop filtering (Paragraph 15.2)
 439
 440 static uint8x16_t NeedsFilter(const uint8x16_t p1, const uint8x16_t p0,
 441                               const uint8x16_t q0, const uint8x16_t q1,
 442                               int thresh) {
 443   const uint8x16_t thresh_v = vdupq_n_u8((uint8_t)thresh);
 444   const uint8x16_t a_p0_q0 = vabdq_u8(p0, q0);               // abs(p0-q0)
 445   const uint8x16_t a_p1_q1 = vabdq_u8(p1, q1);               // abs(p1-q1)
 446   const uint8x16_t a_p0_q0_2 = vqaddq_u8(a_p0_q0, a_p0_q0);  // 2 * abs(p0-q0)
 447   const uint8x16_t a_p1_q1_2 = vshrq_n_u8(a_p1_q1, 1);       // abs(p1-q1) / 2
 448   const uint8x16_t sum = vqaddq_u8(a_p0_q0_2, a_p1_q1_2);
 449   const uint8x16_t mask = vcgeq_u8(thresh_v, sum);
 450   return mask;
 451 }
 452
 453 static int8x16_t FlipSign(const uint8x16_t v) {
 454   const uint8x16_t sign_bit = vdupq_n_u8(0x80);
 455   return vreinterpretq_s8_u8(veorq_u8(v, sign_bit));
 456 }
 457
 458 static uint8x16_t FlipSignBack(const int8x16_t v) {
 459   const int8x16_t sign_bit = vdupq_n_s8(0x80);
 460   return vreinterpretq_u8_s8(veorq_s8(v, sign_bit));
 461 }
 462
 463 static int8x16_t GetBaseDelta(const int8x16_t p1, const int8x16_t p0,
 464                               const int8x16_t q0, const int8x16_t q1) {
 465   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
 466   const int8x16_t p1_q1 = vqsubq_s8(p1, q1);      // (p1-q1)
 467   const int8x16_t s1 = vqaddq_s8(p1_q1, q0_p0);   // (p1-q1) + 1 * (q0 - p0)
 468   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // (p1-q1) + 2 * (q0 - p0)
 469   const int8x16_t s3 = vqaddq_s8(q0_p0, s2);      // (p1-q1) + 3 * (q0 - p0)
 470   return s3;
 471 }
 472
 473 static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
 474   const int8x16_t q0_p0 = vqsubq_s8(q0, p0);      // (q0-p0)
 475   const int8x16_t s1 = vqaddq_s8(q0_p0, q0_p0);   // 2 * (q0 - p0)
 476   const int8x16_t s2 = vqaddq_s8(q0_p0, s1);      // 3 * (q0 - p0)
 477   return s2;
 478 }
 479
 480 //------------------------------------------------------------------------------
 481
 482 static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
 483                          const int8x16_t delta,
 484                          uint8x16_t* const op0, uint8x16_t* const oq0) {
 485   const int8x16_t kCst3 = vdupq_n_s8(0x03);
 486   const int8x16_t kCst4 = vdupq_n_s8(0x04);
 487   const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
 488   const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
 489   const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
 490   const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
 491   const int8x16_t sp0 = vqaddq_s8(p0s, delta3);
 492   const int8x16_t sq0 = vqsubq_s8(q0s, delta4);
 493   *op0 = FlipSignBack(sp0);
 494   *oq0 = FlipSignBack(sq0);
 495 }
 496
 497 #if defined(USE_INTRINSICS)
 498
 499 static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
 500                       const uint8x16_t q0, const uint8x16_t q1,
 501                       const uint8x16_t mask,
 502                       uint8x16_t* const op0, uint8x16_t* const oq0) {
 503   const int8x16_t p1s = FlipSign(p1);
 504   const int8x16_t p0s = FlipSign(p0);
 505   const int8x16_t q0s = FlipSign(q0);
 506   const int8x16_t q1s = FlipSign(q1);
 507   const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
 508   const int8x16_t delta1 = vandq_s8(delta0, vreinterpretq_s8_u8(mask));
 509   ApplyFilter2(p0s, q0s, delta1, op0, oq0);
 510 }
 511
 512 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
 513   uint8x16_t p1, p0, q0, q1, op0, oq0;
 514   Load16x4(p, stride, &p1, &p0, &q0, &q1);
 515   {
 516     const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
 517     DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
 518   }
 519   Store16x2(op0, oq0, p, stride);
 520 }
 521
 522 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 523   uint8x16_t p1, p0, q0, q1, oq0, op0;
 524   Load4x16(p, stride, &p1, &p0, &q0, &q1);
 525   {
 526     const uint8x16_t mask = NeedsFilter(p1, p0, q0, q1, thresh);
 527     DoFilter2(p1, p0, q0, q1, mask, &op0, &oq0);
 528   }
 529   Store2x16(op0, oq0, p, stride);
 530 }
 531
 532 #else
 533
 // Clobber list shared by the inline-asm filters below.
 #define QRegs "q0", "q1", "q2", "q3",                                         \
               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"

 // XORs registers 'a' and 'b' in place with 's'.
 #define FLIP_SIGN_BIT2(a, b, s)                                               \
   "veor     " #a "," #a "," #s "               \n"                            \
   "veor     " #b "," #b "," #s "               \n"

 // XORs registers 'a', 'b', 'c' and 'd' in place with 's'.
 #define FLIP_SIGN_BIT4(a, b, c, d, s)                                         \
   FLIP_SIGN_BIT2(a, b, s)                                                     \
   FLIP_SIGN_BIT2(c, d, s)

 // Sets 'mask' on the lanes where
 //   2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= thresh
 // Uses q14 and q15 as scratch registers.
 #define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask)                            \
   /* q15 = abs(p0 - q0) */                                                    \
   "vabd.u8    q15," #p0 "," #q0 "         \n"                                 \
   /* q14 = abs(p1 - q1) */                                                    \
   "vabd.u8    q14," #p1 "," #q1 "         \n"                                 \
   /* q15 = abs(p0 - q0) * 2 */                                                \
   "vqadd.u8   q15, q15, q15               \n"                                 \
   /* q14 = abs(p1 - q1) / 2 */                                                \
   "vshr.u8    q14, q14, #1                \n"                                 \
   /* q15 = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                             \
   "vqadd.u8   q15, q15, q14     \n"                                           \
   /* q14 = thresh on every lane */                                            \
   "vdup.8     q14, " #thresh "            \n"                                 \
   /* mask = (thresh >= q15) */                                                \
   "vcge.u8   " #mask ", q14, q15          \n"

 // Sets 'o' to (p1 - q1) + 3 * (q0 - p0), every step saturating.
 // Uses q15 as scratch register.
 #define GET_BASE_DELTA(p1, p0, q0, q1, o)                                     \
   /* q15 = (q0 - p0) */                                                       \
   "vqsub.s8   q15," #q0 "," #p0 "         \n"                                 \
   /* o = (p1 - q1) */                                                         \
   "vqsub.s8  " #o "," #p1 "," #q1 "       \n"                                 \
   /* o = (p1 - q1) + 1 * (q0 - p0) */                                         \
   "vqadd.s8  " #o "," #o ", q15           \n"                                 \
   /* o = (p1 - q1) + 2 * (q0 - p0) */                                         \
   "vqadd.s8  " #o "," #o ", q15           \n"                                 \
   /* o = (p1 - q1) + 3 * (q0 - p0) */                                         \
   "vqadd.s8  " #o "," #o ", q15           \n"

 // p0 += (fl + 3) >> 3 and q0 -= (fl + 4) >> 3, with saturation.
 // Uses q15 as scratch register.
 #define DO_SIMPLE_FILTER(p0, q0, fl)                                          \
   "vmov.i8    q15, #0x03                  \n"                                 \
   /* filter1 = filter + 3 */                                                  \
   "vqadd.s8   q15, q15, " #fl "           \n"                                 \
   /* filter1 >>= 3 */                                                         \
   "vshr.s8    q15, q15, #3                \n"                                 \
   /* p0 += filter1 */                                                         \
   "vqadd.s8  " #p0 "," #p0 ", q15         \n"                                 \
                                                                               \
   "vmov.i8    q15, #0x04                  \n"                                 \
   /* filter2 = filter + 4 */                                                  \
   "vqadd.s8   q15, q15, " #fl "           \n"                                 \
   /* filter2 >>= 3 */                                                         \
   "vshr.s8    q15, q15, #3                \n"                                 \
   /* q0 -= filter2 */                                                         \
   "vqsub.s8  " #q0 "," #q0 ", q15         \n"

 // Applies the filter on 2 pixels (p0 and q0), in place.
 // On top of the scratch registers of the macros above, this uses q9 (filter
 // mask, then masked delta), q10 (sign bit) and q11 (base delta). p1 and q1
 // are left sign-flipped on exit.
 #define DO_FILTER2(p1, p0, q0, q1, thresh)                                    \
   /* filter mask in q9 */                                                     \
   NEEDS_FILTER(p1, p0, q0, q1, thresh, q9)                                    \
   /* sign bit in q10 */                                                       \
   "vmov.i8    q10, #0x80                  \n"                                 \
   /* convert to signed values */                                              \
   FLIP_SIGN_BIT4(p1, p0, q0, q1, q10)                                         \
   /* base delta in q11 */                                                     \
   GET_BASE_DELTA(p1, p0, q0, q1, q11)                                         \
   /* apply the filter mask to the delta */                                    \
   "vand       q9, q9, q11                 \n"                                 \
   /* update p0 and q0 */                                                      \
   DO_SIMPLE_FILTER(p0, q0, q9)                                                \
   /* convert p0 and q0 back to unsigned values */                             \
   FLIP_SIGN_BIT2(p0, q0, q10)
 581
 582 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
 583   __asm__ volatile (
 584     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 585
 586     "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
 587     "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
 588     "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
 589     "vld1.u8    {q12}, [%[p]]                  \n"  // q1
 590
 591     DO_FILTER2(q1, q2, q3, q12, %[thresh])
 592
 593     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 594
 595     "vst1.u8    {q2}, [%[p]], %[stride]        \n"  // store op0
 596     "vst1.u8    {q3}, [%[p]]                   \n"  // store oq0
 597     : [p] "+r"(p)
 598     : [stride] "r"(stride), [thresh] "r"(thresh)
 599     : "memory", QRegs
 600   );
 601 }
 602
 603 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
 604   __asm__ volatile (
 605     "sub        r4, %[p], #2                   \n"  // base1 = p - 2
 606     "lsl        r6, %[stride], #1              \n"  // r6 = 2 * stride
 607     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
 608
 609     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
 610     LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
 611     "vswp       d3, d24                        \n"  // p1:q1 p0:q3
 612     "vswp       d5, d26                        \n"  // q0:q2 q1:q4
 613     "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
 614
 615     DO_FILTER2(q1, q2, q12, q13, %[thresh])
 616
 617     "sub        %[p], %[p], #1                 \n"  // p - 1
 618
 619     "vswp        d5, d24                       \n"
 620     STORE8x2(d4, d5, [%[p]], %[stride])
 621     STORE8x2(d24, d25, [%[p]], %[stride])
 622
 623     : [p] "+r"(p)
 624     : [stride] "r"(stride), [thresh] "r"(thresh)
 625     : "memory", "r4", "r5", "r6", QRegs
 626   );
 627 }
 628
 629 #endif    // USE_INTRINSICS
 630
 // Runs SimpleVFilter16() on the three horizontal edges located 4, 8 and 12
 // rows below 'p'.
 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
   int edge;
   for (edge = 1; edge <= 3; ++edge) {
     SimpleVFilter16(p + edge * 4 * stride, stride, thresh);
   }
 }
 638
 // Runs SimpleHFilter16() on the three vertical edges located 4, 8 and 12
 // columns right of 'p'.
 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
   int edge;
   for (edge = 1; edge <= 3; ++edge) {
     SimpleHFilter16(p + edge * 4, stride, thresh);
   }
 }
 646
 647 //------------------------------------------------------------------------------
 648 // Complex In-loop filtering (Paragraph 15.3)
 649
 650 static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
 651                            const uint8x16_t q0, const uint8x16_t q1,
 652                            int hev_thresh) {
       // Returns a per-lane mask that is all-ones where |p1 - p0| > hev_thresh
       // or |q1 - q0| > hev_thresh, and zero elsewhere. 'hev_thresh' is
       // converted to uint8_t before being broadcast.
 653   const uint8x16_t limit = vdupq_n_u8((uint8_t)hev_thresh);
 654   // Absolute step between the two pixels nearest the edge, on each side.
 655   const uint8x16_t p_step = vabdq_u8(p1, p0);
 656   const uint8x16_t q_step = vabdq_u8(q1, q0);
 657   const uint8x16_t p_exceeds = vcgtq_u8(p_step, limit);
 658   const uint8x16_t q_exceeds = vcgtq_u8(q_step, limit);
 659   return vorrq_u8(p_exceeds, q_exceeds);
 660 }
 661
 662 static uint8x16_t NeedsFilter2(const uint8x16_t p3, const uint8x16_t p2,
 663                                const uint8x16_t p1, const uint8x16_t p0,
 664                                const uint8x16_t q0, const uint8x16_t q1,
 665                                const uint8x16_t q2, const uint8x16_t q3,
 666                                int ithresh, int thresh) {
       // Returns a per-lane mask that is set where both conditions hold:
       //  - each absolute difference between neighbouring pixels on the same
       //    side of the edge (p3..p0 and q0..q3) is <= ithresh, and
       //  - NeedsFilter(p1, p0, q0, q1, thresh) is set for that lane.
 667   const uint8x16_t interior_limit = vdupq_n_u8((uint8_t)ithresh);
 668   const uint8x16_t d_p3_p2 = vabdq_u8(p3, p2);  // |p3 - p2|
 669   const uint8x16_t d_p2_p1 = vabdq_u8(p2, p1);  // |p2 - p1|
 670   const uint8x16_t d_p1_p0 = vabdq_u8(p1, p0);  // |p1 - p0|
 671   const uint8x16_t d_q3_q2 = vabdq_u8(q3, q2);  // |q3 - q2|
 672   const uint8x16_t d_q2_q1 = vabdq_u8(q2, q1);  // |q2 - q1|
 673   const uint8x16_t d_q1_q0 = vabdq_u8(q1, q0);  // |q1 - q0|
 674   // Fold the six differences into a single per-lane maximum.
 675   const uint8x16_t hi_a = vmaxq_u8(d_p3_p2, d_p2_p1);
 676   const uint8x16_t hi_b = vmaxq_u8(d_p1_p0, d_q3_q2);
 677   const uint8x16_t hi_c = vmaxq_u8(d_q2_q1, d_q1_q0);
 678   const uint8x16_t largest = vmaxq_u8(vmaxq_u8(hi_a, hi_b), hi_c);
 679   // Lanes whose largest difference does not exceed the interior limit.
 680   const uint8x16_t is_flat = vcgeq_u8(interior_limit, largest);
 681   const uint8x16_t edge_ok = NeedsFilter(p1, p0, q0, q1, thresh);
 682   return vandq_u8(edge_ok, is_flat);
 683 }
 684
 685 //  4-points filter
 686
 687 static void ApplyFilter4(
 688     const int8x16_t p1, const int8x16_t p0,
 689     const int8x16_t q0, const int8x16_t q1,
 690     const int8x16_t delta0,
 691     uint8x16_t* const op1, uint8x16_t* const op0,
 692     uint8x16_t* const oq0, uint8x16_t* const oq1) {
       // 4-point update of the signed pixels p1, p0 | q0, q1 from 'delta0':
       //   a1 = sat(delta0 + 4) >> 3,  a2 = sat(delta0 + 3) >> 3,
       //   a3 = (a1 + 1) >> 1
       // Each result goes through FlipSignBack() before being stored in the
       // corresponding output.
 693   const int8x16_t a1 = vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(4)), 3);
 694   const int8x16_t a2 = vshrq_n_s8(vqaddq_s8(delta0, vdupq_n_s8(3)), 3);
 695   const int8x16_t a3 = vrshrq_n_s8(a1, 1);     // (a1 + 1) >> 1
 696   const int8x16_t new_p0 = vqaddq_s8(p0, a2);  // clip(p0 + a2)
 697   const int8x16_t new_q0 = vqsubq_s8(q0, a1);  // clip(q0 - a1)
 698   const int8x16_t new_p1 = vqaddq_s8(p1, a3);  // clip(p1 + a3)
 699   const int8x16_t new_q1 = vqsubq_s8(q1, a3);  // clip(q1 - a3)
 700   *op0 = FlipSignBack(new_p0);
 701   *oq0 = FlipSignBack(new_q0);
 702   *op1 = FlipSignBack(new_p1);
 703   *oq1 = FlipSignBack(new_q1);
 704 }
 705
 706 static void DoFilter4(
 707     const uint8x16_t p1, const uint8x16_t p0,
 708     const uint8x16_t q0, const uint8x16_t q1,
 709     const uint8x16_t mask, const uint8x16_t hev_mask,
 710     uint8x16_t* const op1, uint8x16_t* const op0,
 711     uint8x16_t* const oq0, uint8x16_t* const oq1) {
       // Filters the four pixels nearest an edge (p1, p0 | q0, q1) in two stages
       // and writes the results to *op1, *op0, *oq0 and *oq1:
       //  1. lanes selected by (mask & hev_mask) go through ApplyFilter2(), which
       //     only produces new p0/q0 values;
       //  2. lanes selected by (mask & ~hev_mask) go through ApplyFilter4(),
       //     using the p0/q0 values produced by stage 1.
       // In each stage the delta is ANDed with the stage's mask, so all other
       // lanes are processed with a zero delta.
 712   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
 713   const int8x16_t p1s = FlipSign(p1);
 714   int8x16_t p0s = FlipSign(p0);
 715   int8x16_t q0s = FlipSign(q0);
 716   const int8x16_t q1s = FlipSign(q1);
 717   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
 718
 719   // do_filter2 part (simple loopfilter on pixels with hev)
 720   {
 721     const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
 722     const int8x16_t simple_lf_delta =
 723         vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
 724     uint8x16_t tmp_p0, tmp_q0;
 725     ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
 726     // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
         // p0s/q0s are overwritten here: the second stage must see these values.
 727     p0s = FlipSign(tmp_p0);
 728     q0s = FlipSign(tmp_q0);
 729   }
 730
 731   // do_filter4 part (complex loopfilter on pixels without hev)
 732   {
         // The base delta is recomputed from the updated p0s/q0s only.
 733     const int8x16_t delta0 = GetBaseDelta0(p0s, q0s);
 734     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
 735     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
 736     const int8x16_t complex_lf_delta =
 737         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
 738     ApplyFilter4(p1s, p0s, q0s, q1s, complex_lf_delta, op1, op0, oq0, oq1);
 739   }
 740 }
 741
 742 //  6-points filter
 743
 744 static void ApplyFilter6(
 745     const int8x16_t p2, const int8x16_t p1, const int8x16_t p0,
 746     const int8x16_t q0, const int8x16_t q1, const int8x16_t q2,
 747     const int8x16_t delta,
 748     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
 749     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
       // 6-point update of the signed pixels p2, p1, p0 | q0, q1, q2.
       // Three corrections are derived from 'delta' in 16-bit precision and
       // narrowed back to 8 bits with saturation:
       //   a1 = (27 * delta + 63) >> 7   applied to p0 / q0
       //   a2 = (18 * delta + 63) >> 7   applied to p1 / q1
       //   a3 = ( 9 * delta + 63) >> 7   applied to p2 / q2
       // The p side adds the correction, the q side subtracts it, and every
       // result goes through FlipSignBack() before being stored.
 750   const int16x8_t kRounder = vdupq_n_s16(63);
 751   const int8x8_t d_lo = vget_low_s8(delta);
 752   const int8x8_t d_hi = vget_high_s8(delta);
 753   // a1 = (27 * delta + 63) >> 7
 754   const int8x16_t a1 = vcombine_s8(
 755       vqshrn_n_s16(vmlal_s8(kRounder, vdup_n_s8(27), d_lo), 7),
 756       vqshrn_n_s16(vmlal_s8(kRounder, vdup_n_s8(27), d_hi), 7));
 757   // a2 = (18 * delta + 63) >> 7
 758   const int8x16_t a2 = vcombine_s8(
 759       vqshrn_n_s16(vmlal_s8(kRounder, vdup_n_s8(18), d_lo), 7),
 760       vqshrn_n_s16(vmlal_s8(kRounder, vdup_n_s8(18), d_hi), 7));
 761   // a3 = (9 * delta + 63) >> 7
 762   const int8x16_t a3 = vcombine_s8(
 763       vqshrn_n_s16(vmlal_s8(kRounder, vdup_n_s8(9), d_lo), 7),
 764       vqshrn_n_s16(vmlal_s8(kRounder, vdup_n_s8(9), d_hi), 7));
 765   const int8x16_t new_p0 = vqaddq_s8(p0, a1);  // clip(p0 + a1)
 766   const int8x16_t new_q0 = vqsubq_s8(q0, a1);  // clip(q0 - a1)
 767   const int8x16_t new_q1 = vqsubq_s8(q1, a2);  // clip(q1 - a2)
 768   const int8x16_t new_p1 = vqaddq_s8(p1, a2);  // clip(p1 + a2)
 769   const int8x16_t new_q2 = vqsubq_s8(q2, a3);  // clip(q2 - a3)
 770   const int8x16_t new_p2 = vqaddq_s8(p2, a3);  // clip(p2 + a3)
 771
 772   *op0 = FlipSignBack(new_p0);
 773   *oq0 = FlipSignBack(new_q0);
 774   *oq1 = FlipSignBack(new_q1);
 775   *op1 = FlipSignBack(new_p1);
 776   *oq2 = FlipSignBack(new_q2);
 777   *op2 = FlipSignBack(new_p2);
 778 }
 779
 780 static void DoFilter6(
 781     const uint8x16_t p2, const uint8x16_t p1, const uint8x16_t p0,
 782     const uint8x16_t q0, const uint8x16_t q1, const uint8x16_t q2,
 783     const uint8x16_t mask, const uint8x16_t hev_mask,
 784     uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
 785     uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
       // Filters the six pixels nearest an edge (p2, p1, p0 | q0, q1, q2) in two
       // stages and writes the results to the six output pointers:
       //  1. lanes selected by (mask & hev_mask) go through ApplyFilter2(), which
       //     only produces new p0/q0 values;
       //  2. lanes selected by (mask & ~hev_mask) go through ApplyFilter6(),
       //     using the p0/q0 values produced by stage 1.
       // Unlike DoFilter4(), the base delta is computed once, from the original
       // pixels, and re-used (with a different mask) by both stages.
 786   // This is a fused version of DoFilter2() calling ApplyFilter2 directly
 787   const int8x16_t p2s = FlipSign(p2);
 788   const int8x16_t p1s = FlipSign(p1);
 789   int8x16_t p0s = FlipSign(p0);
 790   int8x16_t q0s = FlipSign(q0);
 791   const int8x16_t q1s = FlipSign(q1);
 792   const int8x16_t q2s = FlipSign(q2);
 793   const uint8x16_t simple_lf_mask = vandq_u8(mask, hev_mask);
 794   const int8x16_t delta0 = GetBaseDelta(p1s, p0s, q0s, q1s);
 795
 796   // do_filter2 part (simple loopfilter on pixels with hev)
 797   {
 798     const int8x16_t simple_lf_delta =
 799         vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
 800     uint8x16_t tmp_p0, tmp_q0;
 801     ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
 802     // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
         // p0s/q0s are overwritten here: the second stage must see these values.
 803     p0s = FlipSign(tmp_p0);
 804     q0s = FlipSign(tmp_q0);
 805   }
 806
 807   // do_filter6 part (complex loopfilter on pixels without hev)
 808   {
 809     // we use: (mask & hev_mask) ^ mask = mask & !hev_mask
 810     const uint8x16_t complex_lf_mask = veorq_u8(simple_lf_mask, mask);
 811     const int8x16_t complex_lf_delta =
 812         vandq_s8(delta0, vreinterpretq_s8_u8(complex_lf_mask));
 813     ApplyFilter6(p2s, p1s, p0s, q0s, q1s, q2s, complex_lf_delta,
 814                  op2, op1, op0, oq0, oq1, oq2);
 815   }
 816 }
 817
 818 // on macroblock edges
 819
 820 static void VFilter16(uint8_t* p, int stride,
 821                       int thresh, int ithresh, int hev_thresh) {
       // Complex loop filter for a macroblock edge, 16 lanes at a time.
       // Loads eight vectors around 'p' with Load16x8(), builds the filter and
       // high-edge-variance masks, runs the 6-point filter and writes the six
       // filtered vectors back with three Store16x2() calls based at
       // p - 2 * stride, p and p + 2 * stride.
 822   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 823   uint8x16_t new_p2, new_p1, new_p0, new_q0, new_q1, new_q2;
 824   uint8x16_t filter_mask, hev;
 825
 826   Load16x8(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 827   filter_mask =
 828       NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 829   hev = NeedsHev(p1, p0, q0, q1, hev_thresh);
 830   DoFilter6(p2, p1, p0, q0, q1, q2, filter_mask, hev,
 831             &new_p2, &new_p1, &new_p0, &new_q0, &new_q1, &new_q2);
 832   Store16x2(new_p2, new_p1, p - 2 * stride, stride);
 833   Store16x2(new_p0, new_q0, p, stride);
 834   Store16x2(new_q1, new_q2, p + 2 * stride, stride);
 835 }
 836
 837 static void HFilter16(uint8_t* p, int stride,
 838                       int thresh, int ithresh, int hev_thresh) {
       // Same processing as VFilter16(), but the eight vectors come from
       // Load8x16() and the six filtered vectors are written back with three
       // Store2x16() calls based at p - 2, p and p + 2.
 839   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 840   uint8x16_t new_p2, new_p1, new_p0, new_q0, new_q1, new_q2;
 841   uint8x16_t filter_mask, hev;
 842
 843   Load8x16(p, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 844   filter_mask =
 845       NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 846   hev = NeedsHev(p1, p0, q0, q1, hev_thresh);
 847   DoFilter6(p2, p1, p0, q0, q1, q2, filter_mask, hev,
 848             &new_p2, &new_p1, &new_p0, &new_q0, &new_q1, &new_q2);
 849   Store2x16(new_p2, new_p1, p - 2, stride);
 850   Store2x16(new_p0, new_q0, p, stride);
 851   Store2x16(new_q1, new_q2, p + 2, stride);
 852 }
 853
 854 // on three inner edges
 855 static void VFilter16i(uint8_t* p, int stride,
 856                        int thresh, int ithresh, int hev_thresh) {
       // Complex loop filter on three inner edges, reached by advancing 'p' by
       // 4 rows per iteration (offsets 4, 8 and 12 rows from the incoming p).
       // Only four new vectors (q0..q3) are loaded per edge; the p side is
       // carried over from the previous iteration as described below.
 857   uint32_t k;
 858   uint8x16_t p3, p2, p1, p0;
       // Initial p side, loaded before the first advance of 'p'.
 859   Load16x4(p + 2  * stride, stride, &p3, &p2, &p1, &p0);
 860   for (k = 3; k != 0; --k) {
 861     uint8x16_t q0, q1, q2, q3;
 862     p += 4 * stride;
 863     Load16x4(p + 2  * stride, stride, &q0, &q1, &q2, &q3);
 864     {
 865       const uint8x16_t mask =
 866           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 867       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
 868       // p3 and p2 are not just temporary variables here: they will be
 869       // re-used for next span. And q2/q3 will become p1/p0 accordingly.
           // DoFilter4() takes its inputs by value, so writing the outputs into
           // p1/p0 (filtered p1/p0) and p3/p2 (filtered q0/q1) is safe.
 870       DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
 871       Store16x4(p1, p0, p3, p2, p, stride);
 872       p1 = q2;
 873       p0 = q3;
 874     }
 875   }
 876 }
 877
 878 #if !defined(WORK_AROUND_GCC)
 879 static void HFilter16i(uint8_t* p, int stride,
 880                        int thresh, int ithresh, int hev_thresh) {
       // Complex loop filter on three inner edges, reached by advancing 'p' by
       // 4 bytes per iteration (offsets 4, 8 and 12 from the incoming p).
       // Mirrors VFilter16i(), using Load4x16()/Store4x16() instead of
       // Load16x4()/Store16x4(). Only compiled when WORK_AROUND_GCC is not
       // defined.
 881   uint32_t k;
 882   uint8x16_t p3, p2, p1, p0;
       // Initial p side, loaded before the first advance of 'p'.
 883   Load4x16(p + 2, stride, &p3, &p2, &p1, &p0);
 884   for (k = 3; k != 0; --k) {
 885     uint8x16_t q0, q1, q2, q3;
 886     p += 4;
 887     Load4x16(p + 2, stride, &q0, &q1, &q2, &q3);
 888     {
 889       const uint8x16_t mask =
 890           NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 891       const uint8x16_t hev_mask = NeedsHev(p1, p0, q0, q1, hev_thresh);
           // Outputs land in p1/p0 (filtered p1/p0) and p3/p2 (filtered q0/q1);
           // p3/p2 are then re-used as the far p side of the next edge, while
           // q2/q3 become the next p1/p0.
 892       DoFilter4(p1, p0, q0, q1, mask, hev_mask, &p1, &p0, &p3, &p2);
 893       Store4x16(p1, p0, p3, p2, p, stride);
 894       p1 = q2;
 895       p0 = q3;
 896     }
 897   }
 898 }
 899 #endif  // !WORK_AROUND_GCC
 900
 901 // 8-pixels wide variant, for chroma filtering
 902 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
 903                      int thresh, int ithresh, int hev_thresh) {
       // Complex loop filter on a macroblock edge for the two planes 'u' and
       // 'v' at once: Load8x8x2() fills each 16-lane vector from both planes,
       // the 6-point filter runs once, and Store8x2x2() writes the results
       // back to both planes at row offsets -2, 0 and +2.
 904   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 905   uint8x16_t new_p2, new_p1, new_p0, new_q0, new_q1, new_q2;
 906   uint8x16_t filter_mask, hev;
 907
 908   Load8x8x2(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 909   filter_mask =
 910       NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 911   hev = NeedsHev(p1, p0, q0, q1, hev_thresh);
 912   DoFilter6(p2, p1, p0, q0, q1, q2, filter_mask, hev,
 913             &new_p2, &new_p1, &new_p0, &new_q0, &new_q1, &new_q2);
 914   Store8x2x2(new_p2, new_p1, u - 2 * stride, v - 2 * stride, stride);
 915   Store8x2x2(new_p0, new_q0, u, v, stride);
 916   Store8x2x2(new_q1, new_q2, u + 2 * stride, v + 2 * stride, stride);
 917 }
 918 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
 919                       int thresh, int ithresh, int hev_thresh) {
       // Complex loop filter on the inner edge located 4 rows below 'u' / 'v',
       // for both planes at once, using the 4-point filter.
 920   uint8_t* const u_edge = u + 4 * stride;
 921   uint8_t* const v_edge = v + 4 * stride;
 922   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 923   uint8x16_t new_p1, new_p0, new_q0, new_q1;
 924   uint8x16_t filter_mask, hev;
 925   Load8x8x2(u_edge, v_edge, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 926   filter_mask =
 927       NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 928   hev = NeedsHev(p1, p0, q0, q1, hev_thresh);
 929   DoFilter4(p1, p0, q0, q1, filter_mask, hev,
 930             &new_p1, &new_p0, &new_q0, &new_q1);
 931   Store8x4x2(new_p1, new_p0, new_q0, new_q1, u_edge, v_edge, stride);
 932 }
 933
 934 #if !defined(WORK_AROUND_GCC)
 935 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
 936                      int thresh, int ithresh, int hev_thresh) {
       // Same processing as VFilter8(), with the vectors loaded by Load8x8x2T()
       // and all six filtered vectors written back by a single Store6x8x2()
       // call. Only compiled when WORK_AROUND_GCC is not defined.
 937   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 938   uint8x16_t new_p2, new_p1, new_p0, new_q0, new_q1, new_q2;
 939   uint8x16_t filter_mask, hev;
 940
 941   Load8x8x2T(u, v, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 942   filter_mask =
 943       NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 944   hev = NeedsHev(p1, p0, q0, q1, hev_thresh);
 945   DoFilter6(p2, p1, p0, q0, q1, q2, filter_mask, hev,
 946             &new_p2, &new_p1, &new_p0, &new_q0, &new_q1, &new_q2);
 947   Store6x8x2(new_p2, new_p1, new_p0, new_q0, new_q1, new_q2, u, v, stride);
 948 }
 949
 950 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 951                       int thresh, int ithresh, int hev_thresh) {
       // Complex loop filter on the inner edge located 4 bytes to the right of
       // 'u' / 'v', for both planes at once, using the 4-point filter.
       // Only compiled when WORK_AROUND_GCC is not defined.
 952   uint8_t* const u_edge = u + 4;
 953   uint8_t* const v_edge = v + 4;
 954   uint8x16_t p3, p2, p1, p0, q0, q1, q2, q3;
 955   uint8x16_t new_p1, new_p0, new_q0, new_q1;
 956   uint8x16_t filter_mask, hev;
 957   Load8x8x2T(u_edge, v_edge, stride, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
 958   filter_mask =
 959       NeedsFilter2(p3, p2, p1, p0, q0, q1, q2, q3, ithresh, thresh);
 960   hev = NeedsHev(p1, p0, q0, q1, hev_thresh);
 961   DoFilter4(p1, p0, q0, q1, filter_mask, hev,
 962             &new_p1, &new_p0, &new_q0, &new_q1);
 963   Store4x8x2(new_p1, new_p0, new_q0, new_q1, u_edge, v_edge, stride);
 964 }
 965 #endif  // !WORK_AROUND_GCC
 966
 967 //-----------------------------------------------------------------------------
 968 // Inverse transforms (Paragraph 14.4)
 969
 970 // Technically these are unsigned but vqdmulh is only available in signed.
 971 // vqdmulh returns high half (effectively >> 16) but also doubles the value,
 972 // changing the >> 16 to >> 15 and requiring an additional >> 1.
 973 // We use this to our advantage with kC2. The canonical value is 35468.
 974 // However, the high bit is set so treating it as signed will give incorrect
 975 // results. We avoid this by down shifting by 1 here to clear the highest bit.
 976 // Combined with the doubling effect of vqdmulh we get >> 16.
 977 // This can not be applied to kC1 because the lowest bit is set. Down shifting
 978 // the constant would reduce precision.
 979
 980 // libwebp uses a trick to avoid some extra addition that libvpx does.
 981 // Instead of:
 982 // temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
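 983 // libwebp adds 1 << 16 to cospi8sqrt2minus1 (kC1). However, this causes the
 984 // same issue with kC1 and vqdmulh that we work around by down shifting kC2,
     // so kC1 below omits the 1 << 16 term and the NEON code adds the input
     // value back explicitly after the multiplication.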
 985
 986 static const int16_t kC1 = 20091;  // without the 1 << 16 term; the input is
                                        // added back after the multiplication.
 987 static const int16_t kC2 = 17734;  // 35468 / 2: half of the canonical kC2,
                                        // so that the doubling done by vqdmulh
                                        // yields x * 35468 >> 16.
 988
 989 #if defined(USE_INTRINSICS)
 990 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
 991                                      int16x8x2_t* const out) {
       // Input layout:  in0 = a0 a1 a2 a3 | b0 b1 b2 b3
       //                in1 = c0 c1 c2 c3 | d0 d1 d2 d3
       // Two successive zips turn the four 4-element rows into columns.
 992   // After the first zip:  val[0] = a0 c0 a1 c1 a2 c2 a3 c3
 993   //                       val[1] = b0 d0 b1 d1 b2 d2 b3 d3
 994   const int16x8x2_t zipped = vzipq_s16(in0, in1);
 995   // After the second zip: a0 b0 c0 d0 | a1 b1 c1 d1, a2 b2 c2 d2 | a3 b3 c3 d3
 996   *out = vzipq_s16(zipped.val[0], zipped.val[1]);
 997 }
 998
 999 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
       // One 1-D pass of the inverse transform over a 4x4 block held in two
       // 8-lane vectors, followed by a transpose. '*rows' is read and then
       // overwritten with the transposed result, so calling this twice performs
       // the full 2-D transform. All additions/subtractions are saturating.
1000   // {rows} = in0 | in4
1001   //          in8 | in12
1002   // B1 = in4 | in12
1003   const int16x8_t B1 =
1004       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
1005   // C0 = kC1 * in4 | kC1 * in12
1006   // C1 = kC2 * in4 | kC2 * in12
       // vqdmulh doubles the product, so the kC1 term is shifted right by 1 and
       // accumulated onto B1 (vsra), which also supplies the "+ x" part that the
       // 1 << 16 term of the full constant would have produced.
1007   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
       // kC2 is stored pre-halved, so the doubling of vqdmulh needs no fix-up.
1008   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
1009   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
1010                                 vget_low_s16(rows->val[1]));   // in0 + in8
1011   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
1012                                 vget_low_s16(rows->val[1]));   // in0 - in8
1013   // c = kC2 * in4 - kC1 * in12
1014   // d = kC1 * in4 + kC2 * in12
1015   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
1016   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
1017   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
1018   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
1019   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
1020   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
       // Swap the halves so that E1 = b-c | a-d, i.e. output rows 2 and 3.
1021   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
1022   Transpose8x2(E0, E1, rows);
1023 }
1024
1025 static void TransformOne(const int16_t* in, uint8_t* dst) {
       // Intrinsics version: loads the 16 coefficients in[0..15], applies
       // TransformPass() twice and hands both result vectors to Add4x4() together
       // with 'dst'.
1026   int16x8x2_t coeffs;
1027   INIT_VECTOR2(coeffs, vld1q_s16(in + 0), vld1q_s16(in + 8));
1028   TransformPass(&coeffs);  // first 1-D pass (+ transpose)
1029   TransformPass(&coeffs);  // second 1-D pass (+ transpose)
1030   Add4x4(coeffs.val[0], coeffs.val[1], dst);
1031 }
1032
1033 #else
1034
1035 static void TransformOne(const int16_t* in, uint8_t* dst) {
       // Inline-assembly version of the 4x4 inverse transform.
       // Reads the 16 coefficients in[0..15], runs two 1-D passes with a
       // transpose after each, rounds with (x + 4) >> 3, adds the result to the
       // 4x4 pixel block at 'dst' (rows are kBPS bytes apart) and stores it back
       // with unsigned saturation.
       // NEON registers written: q0, q1, q2, q3 (as d6/d7), q8, q9, q10, q11.
       // All of them must appear in the clobber list below.
1036   const int kBPS = BPS;
1037   // kC1, kC2. Padded because vld1.16 loads 8 bytes
1038   const int16_t constants[4] = { kC1, kC2, 0, 0 };
1039   /* Adapted from libvpx: vp8/common/arm/neon/shortidct4x4llm_neon.asm */
1040   __asm__ volatile (
1041     "vld1.16         {q1, q2}, [%[in]]           \n"
1042     "vld1.16         {d0}, [%[constants]]        \n"
1043
1044     /* d2: in[0]
1045      * d3: in[8]
1046      * d4: in[4]
1047      * d5: in[12]
1048      */
1049     "vswp            d3, d4                      \n"
1050
1051     /* q8 = {in[4], in[12]} * kC1 * 2 >> 16
1052      * q9 = {in[4], in[12]} * kC2 >> 16
1053      */
1054     "vqdmulh.s16     q8, q2, d0[0]               \n"
1055     "vqdmulh.s16     q9, q2, d0[1]               \n"
1056
1057     /* d22 = a = in[0] + in[8]
1058      * d23 = b = in[0] - in[8]
1059      */
1060     "vqadd.s16       d22, d2, d3                 \n"
1061     "vqsub.s16       d23, d2, d3                 \n"
1062
1063     /* The multiplication should be x * kC1 >> 16
1064      * However, with vqdmulh we get x * kC1 * 2 >> 16
1065      * (multiply, double, return high half)
1066      * We avoided this in kC2 by pre-shifting the constant.
1067      * q8 = in[4]/[12] * kC1 >> 16
1068      */
1069     "vshr.s16        q8, q8, #1                  \n"
1070
1071     /* Add {in[4], in[12]} back after the multiplication. This is handled by
1072      * adding 1 << 16 to kC1 in the libwebp C code.
1073      */
1074     "vqadd.s16       q8, q2, q8                  \n"
1075
1076     /* d20 = c = in[4]*kC2 - in[12]*kC1
1077      * d21 = d = in[4]*kC1 + in[12]*kC2
1078      */
1079     "vqsub.s16       d20, d18, d17               \n"
1080     "vqadd.s16       d21, d19, d16               \n"
1081
1082     /* d2 = tmp[0] = a + d
1083      * d3 = tmp[1] = b + c
1084      * d4 = tmp[2] = b - c
1085      * d5 = tmp[3] = a - d
1086      */
1087     "vqadd.s16       d2, d22, d21                \n"
1088     "vqadd.s16       d3, d23, d20                \n"
1089     "vqsub.s16       d4, d23, d20                \n"
1090     "vqsub.s16       d5, d22, d21                \n"
1091
         /* Transpose the 4x4 block of 16-bit values before the second pass. */
1092     "vzip.16         q1, q2                      \n"
1093     "vzip.16         q1, q2                      \n"
1094
1095     "vswp            d3, d4                      \n"
1096
1097     /* q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
1098      * q9 = {tmp[4], tmp[12]} * kC2 >> 16
1099      */
1100     "vqdmulh.s16     q8, q2, d0[0]               \n"
1101     "vqdmulh.s16     q9, q2, d0[1]               \n"
1102
1103     /* d22 = a = tmp[0] + tmp[8]
1104      * d23 = b = tmp[0] - tmp[8]
1105      */
1106     "vqadd.s16       d22, d2, d3                 \n"
1107     "vqsub.s16       d23, d2, d3                 \n"
1108
1109     /* See long winded explanations prior */
1110     "vshr.s16        q8, q8, #1                  \n"
1111     "vqadd.s16       q8, q2, q8                  \n"
1112
1113     /* d20 = c = tmp[4]*kC2 - tmp[12]*kC1
1114      * d21 = d = tmp[4]*kC1 + tmp[12]*kC2
1115      */
1116     "vqsub.s16       d20, d18, d17               \n"
1117     "vqadd.s16       d21, d19, d16               \n"
1118
1119     /* d2 = a + d
1120      * d3 = b + c
1121      * d4 = b - c
1122      * d5 = a - d
1123      */
1124     "vqadd.s16       d2, d22, d21                \n"
1125     "vqadd.s16       d3, d23, d20                \n"
1126     "vqsub.s16       d4, d23, d20                \n"
1127     "vqsub.s16       d5, d22, d21                \n"
1128
         /* Load the four destination rows into d6/d7, i.e. q3. */
1129     "vld1.32         d6[0], [%[dst]], %[kBPS]    \n"
1130     "vld1.32         d6[1], [%[dst]], %[kBPS]    \n"
1131     "vld1.32         d7[0], [%[dst]], %[kBPS]    \n"
1132     "vld1.32         d7[1], [%[dst]], %[kBPS]    \n"
1133
         /* Rewind dst by the four rows that the loads above stepped over. */
1134     "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
1135
1136     /* (val) + 4 >> 3 */
1137     "vrshr.s16       d2, d2, #3                  \n"
1138     "vrshr.s16       d3, d3, #3                  \n"
1139     "vrshr.s16       d4, d4, #3                  \n"
1140     "vrshr.s16       d5, d5, #3                  \n"
1141
1142     "vzip.16         q1, q2                      \n"
1143     "vzip.16         q1, q2                      \n"
1144
1145     /* Must accumulate before saturating */
1146     "vmovl.u8        q8, d6                      \n"
1147     "vmovl.u8        q9, d7                      \n"
1148
1149     "vqadd.s16       q1, q1, q8                  \n"
1150     "vqadd.s16       q2, q2, q9                  \n"
1151
1152     "vqmovun.s16     d0, q1                      \n"
1153     "vqmovun.s16     d1, q2                      \n"
1154
1155     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
1156     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
1157     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
1158     "vst1.32         d1[1], [%[dst]]             \n"
1159
1160     : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
1161     : [kBPS] "r"(kBPS), [constants] "r"(constants)  /* constants */
         /* clobbered; q3 is included because d6/d7 are written by the loads */
1162     : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
1163   );
1164 }
1165
1166 #endif    // USE_INTRINSICS
1167
1168 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
       // Transforms the block at in/dst and, when 'do_two' is non-zero, also the
       // next one: coefficients at in + 16, pixels at dst + 4.
1169   TransformOne(in, dst);
1170   if (!do_two) return;
1171   // Second block: 16 coefficients further, 4 pixels to the right.
1172   TransformOne(in + 16, dst + 4);
1173 }
1174
1175 static void TransformDC(const int16_t* in, uint8_t* dst) {
       // DC-only case: in[0] is broadcast to all lanes and passed to Add4x4() as
       // both row vectors.
1176   const int16x8_t dc_x8 = vdupq_n_s16(*in);
1177   Add4x4(dc_x8, dc_x8, dst);
1178 }
1179
1180 //------------------------------------------------------------------------------
1181
1182 #define STORE_WHT(dst, col, rows) do {                  \
1183   *dst = vgetq_lane_s32(rows.val[0], col); (dst) += 16; \
1184   *dst = vgetq_lane_s32(rows.val[1], col); (dst) += 16; \
1185   *dst = vgetq_lane_s32(rows.val[2], col); (dst) += 16; \
1186   *dst = vgetq_lane_s32(rows.val[3], col); (dst) += 16; \
1187 } while (0)
     // STORE_WHT(dst, col, rows) stores lane 'col' of rows.val[0..3] at dst[0],
     // dst[16], dst[32] and dst[48], and leaves 'dst' advanced by 64 elements.
     // 'dst' is modified by the macro, so it must be an lvalue.
1188
1189 static void TransformWHT(const int16_t* in, int16_t* out) {
       // Inverse Walsh-Hadamard transform of the 16 values in[0..15], computed
       // in 32-bit precision. The 16 results are written to 'out' one every 16
       // elements (out[0], out[16], ..., out[240]).
1190   int32x4x4_t tmp;
1191
       // First pass: butterflies between the four groups of four inputs.
1192   {
1193     // Load the source.
1194     const int16x4_t in00_03 = vld1_s16(in + 0);
1195     const int16x4_t in04_07 = vld1_s16(in + 4);
1196     const int16x4_t in08_11 = vld1_s16(in + 8);
1197     const int16x4_t in12_15 = vld1_s16(in + 12);
1198     const int32x4_t a0 = vaddl_s16(in00_03, in12_15);  // in[0..3] + in[12..15]
1199     const int32x4_t a1 = vaddl_s16(in04_07, in08_11);  // in[4..7] + in[8..11]
1200     const int32x4_t a2 = vsubl_s16(in04_07, in08_11);  // in[4..7] - in[8..11]
1201     const int32x4_t a3 = vsubl_s16(in00_03, in12_15);  // in[0..3] - in[12..15]
1202     tmp.val[0] = vaddq_s32(a0, a1);
1203     tmp.val[1] = vaddq_s32(a3, a2);
1204     tmp.val[2] = vsubq_s32(a0, a1);
1205     tmp.val[3] = vsubq_s32(a3, a2);
1206     // Arrange the temporary results column-wise.
1207     tmp = Transpose4x4(tmp);
1208   }
1209
       // Second pass: same butterflies on the transposed data, with a rounding
       // constant of 3 folded into the first vector before the final >> 3.
1210   {
1211     const int32x4_t kCst3 = vdupq_n_s32(3);
1212     const int32x4_t dc = vaddq_s32(tmp.val[0], kCst3);  // add rounder
1213     const int32x4_t a0 = vaddq_s32(dc, tmp.val[3]);
1214     const int32x4_t a1 = vaddq_s32(tmp.val[1], tmp.val[2]);
1215     const int32x4_t a2 = vsubq_s32(tmp.val[1], tmp.val[2]);
1216     const int32x4_t a3 = vsubq_s32(dc, tmp.val[3]);
1217
1218     tmp.val[0] = vaddq_s32(a0, a1);
1219     tmp.val[1] = vaddq_s32(a3, a2);
1220     tmp.val[2] = vsubq_s32(a0, a1);
1221     tmp.val[3] = vsubq_s32(a3, a2);
1222
1223     // right shift the results by 3.
1224     tmp.val[0] = vshrq_n_s32(tmp.val[0], 3);
1225     tmp.val[1] = vshrq_n_s32(tmp.val[1], 3);
1226     tmp.val[2] = vshrq_n_s32(tmp.val[2], 3);
1227     tmp.val[3] = vshrq_n_s32(tmp.val[3], 3);
1228
         // Each call emits one lane of all four vectors and advances 'out' by
         // 64, so the order of these four calls determines the output layout.
1229     STORE_WHT(out, 0, tmp);
1230     STORE_WHT(out, 1, tmp);
1231     STORE_WHT(out, 2, tmp);
1232     STORE_WHT(out, 3, tmp);
1233   }
1234 }
1235
1236 #undef STORE_WHT
1237
1238 //------------------------------------------------------------------------------
1239
1240 #define MUL(a, b) (((a) * (b)) >> 16)
1241 static void TransformAC3(const int16_t* in, uint8_t* dst) {
       // Inverse transform for blocks where only in[0], in[1] and in[4] are used.
       // in[1] contributes one term per lane ({ +d1, +c1, -c1, -d1 }), in[4]
       // contributes one term per output vector half (+d4, +c4, -c4, -d4), and
       // in[0] is added everywhere. The two resulting vectors are passed to
       // Add4x4() together with 'dst'.
1242   static const int kC1_full = 20091 + (1 << 16);
1243   static const int kC2_full = 35468;
1244   const int c1 = MUL(in[1], kC2_full);
1245   const int d1 = MUL(in[1], kC1_full);
1246   const int16x4_t c4_v = vdup_n_s16(MUL(in[4], kC2_full));
1247   const int16x4_t d4_v = vdup_n_s16(MUL(in[4], kC1_full));
       // Pack the four 16-bit lane terms into one 64-bit value, lane 0 lowest.
1248   const uint64_t col_bits = ((uint64_t)(d1 & 0xffff) << 0) |
1249                             ((uint64_t)(c1 & 0xffff) << 16) |
1250                             ((uint64_t)(-c1 & 0xffff) << 32) |
1251                             ((uint64_t)(-d1 & 0xffff) << 48);
1252   const int16x4_t base = vqadd_s16(vdup_n_s16(in[0]), vcreate_s16(col_bits));
1253   const int16x8_t first_pair =
1254       vcombine_s16(vqadd_s16(base, d4_v), vqadd_s16(base, c4_v));
1255   const int16x8_t second_pair =
1256       vcombine_s16(vqsub_s16(base, c4_v), vqsub_s16(base, d4_v));
1257   Add4x4(first_pair, second_pair, dst);
1258 }
1259 #undef MUL
1260
1261 #endif   // WEBP_USE_NEON
1262
1263 //------------------------------------------------------------------------------
1264 // Entry point
1265
1266 extern void VP8DspInitNEON(void);
1267
1268 void VP8DspInitNEON(void) {
       // Points the VP8 decoder function pointers at the NEON implementations
       // from this file. Does nothing when WEBP_USE_NEON is not defined.
       // VP8HFilter16i, VP8HFilter8 and VP8HFilter8i are left untouched when
       // WORK_AROUND_GCC is defined.
1269 #if defined(WEBP_USE_NEON)
1270   // Inverse transforms.
1271   VP8Transform = TransformTwo;
1272   VP8TransformAC3 = TransformAC3;
1273   VP8TransformDC = TransformDC;
1274   VP8TransformWHT = TransformWHT;
1275   // Simple in-loop filters.
1276   VP8SimpleVFilter16 = SimpleVFilter16;
1277   VP8SimpleHFilter16 = SimpleHFilter16;
1278   VP8SimpleVFilter16i = SimpleVFilter16i;
1279   VP8SimpleHFilter16i = SimpleHFilter16i;
1280   // Complex in-loop filters.
1281   VP8VFilter16 = VFilter16;
1282   VP8VFilter16i = VFilter16i;
1283   VP8HFilter16 = HFilter16;
1284   VP8VFilter8 = VFilter8;
1285   VP8VFilter8i = VFilter8i;
1286 #if !defined(WORK_AROUND_GCC)
1287   VP8HFilter16i = HFilter16i;
1288   VP8HFilter8 = HFilter8;
1289   VP8HFilter8i = HFilter8i;
1290 #endif  // !WORK_AROUND_GCC
1291 #endif   // WEBP_USE_NEON
1292 }