third_party/libwebp/dsp/enc_neon.c

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 // ARM NEON version of speed-critical encoding functions.
  11 //
  12 // adapted from libvpx (http://www.webmproject.org/code/)
  13
  14 #include "./dsp.h"
  15
  16 #if defined(WEBP_USE_NEON)
  17
  18 #include <assert.h>
  19
  20 #include "./neon.h"
  21 #include "../enc/vp8enci.h"
  22
  23 //------------------------------------------------------------------------------
  24 // Transforms (Paragraph 14.4)
  25
  26 // Inverse transform.
  27 // This code is pretty much the same as TransformOne in the dec_neon.c, except
  28 // for subtraction to *ref. See the comments there for algorithmic explanations.
  29
  30 static const int16_t kC1 = 20091;
  31 static const int16_t kC2 = 17734;  // half of kC2, actually. See comment above.
  32
  33 // This code works but is *slower* than the inlined-asm version below
  34 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
  35 // USE_INTRINSICS define.
  36 // With gcc-4.8, it's a little faster speed than inlined-assembly.
  37 #if defined(USE_INTRINSICS)
  38
  39 // Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
  40 static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
  41   return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
  42 }
  43
  44 // Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
  45 // to the corresponding rows of 'dst'.
  46 static WEBP_INLINE void SaturateAndStore4x4(uint8_t* const dst,
  47                                             const int16x8_t dst01,
  48                                             const int16x8_t dst23) {
  49   // Unsigned saturate to 8b.
  50   const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
  51   const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
  52
  53   // Store the results.
  54   vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
  55   vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
  56   vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
  57   vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
  58 }
  59
  60 static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
  61                                const uint8_t* const ref, uint8_t* const dst) {
  62   uint32x2_t dst01 = vdup_n_u32(0);
  63   uint32x2_t dst23 = vdup_n_u32(0);
  64
  65   // Load the source pixels.
  66   dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
  67   dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
  68   dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
  69   dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
  70
  71   {
  72     // Convert to 16b.
  73     const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
  74     const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
  75
  76     // Descale with rounding.
  77     const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
  78     const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
  79     // Add the inverse transform.
  80     SaturateAndStore4x4(dst, out01, out23);
  81   }
  82 }
  83
  84 static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
  85                                      int16x8x2_t* const out) {
  86   // a0 a1 a2 a3 | b0 b1 b2 b3   => a0 b0 c0 d0 | a1 b1 c1 d1
  87   // c0 c1 c2 c3 | d0 d1 d2 d3      a2 b2 c2 d2 | a3 b3 c3 d3
  88   const int16x8x2_t tmp0 = vzipq_s16(in0, in1);   // a0 c0 a1 c1 a2 c2 ...
  89                                                   // b0 d0 b1 d1 b2 d2 ...
  90   *out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
  91 }
  92
  93 static WEBP_INLINE void TransformPass(int16x8x2_t* const rows) {
  94   // {rows} = in0 | in4
  95   //          in8 | in12
  96   // B1 = in4 | in12
  97   const int16x8_t B1 =
  98       vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
  99   // C0 = kC1 * in4 | kC1 * in12
 100   // C1 = kC2 * in4 | kC2 * in12
 101   const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
 102   const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
 103   const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
 104                                 vget_low_s16(rows->val[1]));   // in0 + in8
 105   const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
 106                                 vget_low_s16(rows->val[1]));   // in0 - in8
 107   // c = kC2 * in4 - kC1 * in12
 108   // d = kC1 * in4 + kC2 * in12
 109   const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
 110   const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
 111   const int16x8_t D0 = vcombine_s16(a, b);      // D0 = a | b
 112   const int16x8_t D1 = vcombine_s16(d, c);      // D1 = d | c
 113   const int16x8_t E0 = vqaddq_s16(D0, D1);      // a+d | b+c
 114   const int16x8_t E_tmp = vqsubq_s16(D0, D1);   // a-d | b-c
 115   const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
 116   Transpose8x2(E0, E1, rows);
 117 }
 118
 119 static void ITransformOne(const uint8_t* ref,
 120                           const int16_t* in, uint8_t* dst) {
 121   int16x8x2_t rows;
 122   INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
 123   TransformPass(&rows);
 124   TransformPass(&rows);
 125   Add4x4(rows.val[0], rows.val[1], ref, dst);
 126 }
 127
 128 #else
 129
 130 static void ITransformOne(const uint8_t* ref,
 131                           const int16_t* in, uint8_t* dst) {
 132   const int kBPS = BPS;
 133   const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
 134
 135   __asm__ volatile (
 136     "vld1.16         {q1, q2}, [%[in]]           \n"
 137     "vld1.16         {d0}, [%[kC1C2]]            \n"
 138
 139     // d2: in[0]
 140     // d3: in[8]
 141     // d4: in[4]
 142     // d5: in[12]
 143     "vswp            d3, d4                      \n"
 144
 145     // q8 = {in[4], in[12]} * kC1 * 2 >> 16
 146     // q9 = {in[4], in[12]} * kC2 >> 16
 147     "vqdmulh.s16     q8, q2, d0[0]               \n"
 148     "vqdmulh.s16     q9, q2, d0[1]               \n"
 149
 150     // d22 = a = in[0] + in[8]
 151     // d23 = b = in[0] - in[8]
 152     "vqadd.s16       d22, d2, d3                 \n"
 153     "vqsub.s16       d23, d2, d3                 \n"
 154
 155     //  q8 = in[4]/[12] * kC1 >> 16
 156     "vshr.s16        q8, q8, #1                  \n"
 157
 158     // Add {in[4], in[12]} back after the multiplication.
 159     "vqadd.s16       q8, q2, q8                  \n"
 160
 161     // d20 = c = in[4]*kC2 - in[12]*kC1
 162     // d21 = d = in[4]*kC1 + in[12]*kC2
 163     "vqsub.s16       d20, d18, d17               \n"
 164     "vqadd.s16       d21, d19, d16               \n"
 165
 166     // d2 = tmp[0] = a + d
 167     // d3 = tmp[1] = b + c
 168     // d4 = tmp[2] = b - c
 169     // d5 = tmp[3] = a - d
 170     "vqadd.s16       d2, d22, d21                \n"
 171     "vqadd.s16       d3, d23, d20                \n"
 172     "vqsub.s16       d4, d23, d20                \n"
 173     "vqsub.s16       d5, d22, d21                \n"
 174
 175     "vzip.16         q1, q2                      \n"
 176     "vzip.16         q1, q2                      \n"
 177
 178     "vswp            d3, d4                      \n"
 179
 180     // q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
 181     // q9 = {tmp[4], tmp[12]} * kC2 >> 16
 182     "vqdmulh.s16     q8, q2, d0[0]               \n"
 183     "vqdmulh.s16     q9, q2, d0[1]               \n"
 184
 185     // d22 = a = tmp[0] + tmp[8]
 186     // d23 = b = tmp[0] - tmp[8]
 187     "vqadd.s16       d22, d2, d3                 \n"
 188     "vqsub.s16       d23, d2, d3                 \n"
 189
 190     "vshr.s16        q8, q8, #1                  \n"
 191     "vqadd.s16       q8, q2, q8                  \n"
 192
 193     // d20 = c = in[4]*kC2 - in[12]*kC1
 194     // d21 = d = in[4]*kC1 + in[12]*kC2
 195     "vqsub.s16       d20, d18, d17               \n"
 196     "vqadd.s16       d21, d19, d16               \n"
 197
 198     // d2 = tmp[0] = a + d
 199     // d3 = tmp[1] = b + c
 200     // d4 = tmp[2] = b - c
 201     // d5 = tmp[3] = a - d
 202     "vqadd.s16       d2, d22, d21                \n"
 203     "vqadd.s16       d3, d23, d20                \n"
 204     "vqsub.s16       d4, d23, d20                \n"
 205     "vqsub.s16       d5, d22, d21                \n"
 206
 207     "vld1.32         d6[0], [%[ref]], %[kBPS]    \n"
 208     "vld1.32         d6[1], [%[ref]], %[kBPS]    \n"
 209     "vld1.32         d7[0], [%[ref]], %[kBPS]    \n"
 210     "vld1.32         d7[1], [%[ref]], %[kBPS]    \n"
 211
 212     "sub         %[ref], %[ref], %[kBPS], lsl #2 \n"
 213
 214     // (val) + 4 >> 3
 215     "vrshr.s16       d2, d2, #3                  \n"
 216     "vrshr.s16       d3, d3, #3                  \n"
 217     "vrshr.s16       d4, d4, #3                  \n"
 218     "vrshr.s16       d5, d5, #3                  \n"
 219
 220     "vzip.16         q1, q2                      \n"
 221     "vzip.16         q1, q2                      \n"
 222
 223     // Must accumulate before saturating
 224     "vmovl.u8        q8, d6                      \n"
 225     "vmovl.u8        q9, d7                      \n"
 226
 227     "vqadd.s16       q1, q1, q8                  \n"
 228     "vqadd.s16       q2, q2, q9                  \n"
 229
 230     "vqmovun.s16     d0, q1                      \n"
 231     "vqmovun.s16     d1, q2                      \n"
 232
 233     "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
 234     "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
 235     "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
 236     "vst1.32         d1[1], [%[dst]]             \n"
 237
 238     : [in] "+r"(in), [dst] "+r"(dst)               // modified registers
 239     : [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref)  // constants
 240     : "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11"  // clobbered
 241   );
 242 }
 243
 244 #endif    // USE_INTRINSICS
 245
 246 static void ITransform(const uint8_t* ref,
 247                        const int16_t* in, uint8_t* dst, int do_two) {
 248   ITransformOne(ref, in, dst);
 249   if (do_two) {
 250     ITransformOne(ref + 4, in + 16, dst + 4);
 251   }
 252 }
 253
 254 // Load all 4x4 pixels into a single uint8x16_t variable.
 255 static uint8x16_t Load4x4(const uint8_t* src) {
 256   uint32x4_t out = vdupq_n_u32(0);
 257   out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
 258   out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
 259   out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
 260   out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
 261   return vreinterpretq_u8_u32(out);
 262 }
 263
 264 // Forward transform.
 265
 266 #if defined(USE_INTRINSICS)
 267
 268 static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
 269                                          const int16x4_t C, const int16x4_t D,
 270                                          int16x8_t* const out01,
 271                                          int16x8_t* const out32) {
 272   const int16x4x2_t AB = vtrn_s16(A, B);
 273   const int16x4x2_t CD = vtrn_s16(C, D);
 274   const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
 275                                      vreinterpret_s32_s16(CD.val[0]));
 276   const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
 277                                      vreinterpret_s32_s16(CD.val[1]));
 278   *out01 = vreinterpretq_s16_s64(
 279       vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
 280                    vreinterpret_s64_s32(tmp13.val[0])));
 281   *out32 = vreinterpretq_s16_s64(
 282       vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
 283                    vreinterpret_s64_s32(tmp02.val[1])));
 284 }
 285
 286 static WEBP_INLINE int16x8_t DiffU8ToS16(const uint8x8_t a,
 287                                          const uint8x8_t b) {
 288   return vreinterpretq_s16_u16(vsubl_u8(a, b));
 289 }
 290
 291 static void FTransform(const uint8_t* src, const uint8_t* ref,
 292                        int16_t* out) {
 293   int16x8_t d0d1, d3d2;   // working 4x4 int16 variables
 294   {
 295     const uint8x16_t S0 = Load4x4(src);
 296     const uint8x16_t R0 = Load4x4(ref);
 297     const int16x8_t D0D1 = DiffU8ToS16(vget_low_u8(S0), vget_low_u8(R0));
 298     const int16x8_t D2D3 = DiffU8ToS16(vget_high_u8(S0), vget_high_u8(R0));
 299     const int16x4_t D0 = vget_low_s16(D0D1);
 300     const int16x4_t D1 = vget_high_s16(D0D1);
 301     const int16x4_t D2 = vget_low_s16(D2D3);
 302     const int16x4_t D3 = vget_high_s16(D2D3);
 303     Transpose4x4_S16(D0, D1, D2, D3, &d0d1, &d3d2);
 304   }
 305   {    // 1rst pass
 306     const int32x4_t kCst937 = vdupq_n_s32(937);
 307     const int32x4_t kCst1812 = vdupq_n_s32(1812);
 308     const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
 309     const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
 310     const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
 311     const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
 312                                     vget_high_s16(a0a1_2));
 313     const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
 314                                     vget_high_s16(a0a1_2));
 315     const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
 316     const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
 317     const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
 318     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
 319     const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
 320     const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
 321     Transpose4x4_S16(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
 322   }
 323   {    // 2nd pass
 324     // the (1<<16) addition is for the replacement: a3!=0  <-> 1-(a3==0)
 325     const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
 326     const int32x4_t kCst51000 = vdupq_n_s32(51000);
 327     const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2);   // d0+d3 | d1+d2   (=a0|a1)
 328     const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2);   // d0-d3 | d1-d2   (=a3|a2)
 329     const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
 330     const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
 331     const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
 332     const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
 333     const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
 334     const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
 335     const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
 336     const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
 337     const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
 338     const int16x4_t a3_eq_0 =
 339         vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
 340     const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
 341     vst1_s16(out +  0, out0);
 342     vst1_s16(out +  4, out1);
 343     vst1_s16(out +  8, out2);
 344     vst1_s16(out + 12, out3);
 345   }
 346 }
 347
 348 #else
 349
 350 // adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
 351 static const int16_t kCoeff16[] = {
 352   5352,  5352,  5352, 5352, 2217,  2217,  2217, 2217
 353 };
 354 static const int32_t kCoeff32[] = {
 355    1812,  1812,  1812,  1812,
 356     937,   937,   937,   937,
 357   12000, 12000, 12000, 12000,
 358   51000, 51000, 51000, 51000
 359 };
 360
 361 static void FTransform(const uint8_t* src, const uint8_t* ref,
 362                        int16_t* out) {
 363   const int kBPS = BPS;
 364   const uint8_t* src_ptr = src;
 365   const uint8_t* ref_ptr = ref;
 366   const int16_t* coeff16 = kCoeff16;
 367   const int32_t* coeff32 = kCoeff32;
 368
 369   __asm__ volatile (
 370     // load src into q4, q5 in high half
 371     "vld1.8 {d8},  [%[src_ptr]], %[kBPS]      \n"
 372     "vld1.8 {d10}, [%[src_ptr]], %[kBPS]      \n"
 373     "vld1.8 {d9},  [%[src_ptr]], %[kBPS]      \n"
 374     "vld1.8 {d11}, [%[src_ptr]]               \n"
 375
 376     // load ref into q6, q7 in high half
 377     "vld1.8 {d12}, [%[ref_ptr]], %[kBPS]      \n"
 378     "vld1.8 {d14}, [%[ref_ptr]], %[kBPS]      \n"
 379     "vld1.8 {d13}, [%[ref_ptr]], %[kBPS]      \n"
 380     "vld1.8 {d15}, [%[ref_ptr]]               \n"
 381
 382     // Pack the high values in to q4 and q6
 383     "vtrn.32     q4, q5                       \n"
 384     "vtrn.32     q6, q7                       \n"
 385
 386     // d[0-3] = src - ref
 387     "vsubl.u8    q0, d8, d12                  \n"
 388     "vsubl.u8    q1, d9, d13                  \n"
 389
 390     // load coeff16 into q8(d16=5352, d17=2217)
 391     "vld1.16     {q8}, [%[coeff16]]           \n"
 392
 393     // load coeff32 high half into q9 = 1812, q10 = 937
 394     "vld1.32     {q9, q10}, [%[coeff32]]!     \n"
 395
 396     // load coeff32 low half into q11=12000, q12=51000
 397     "vld1.32     {q11,q12}, [%[coeff32]]      \n"
 398
 399     // part 1
 400     // Transpose. Register dN is the same as dN in C
 401     "vtrn.32         d0, d2                   \n"
 402     "vtrn.32         d1, d3                   \n"
 403     "vtrn.16         d0, d1                   \n"
 404     "vtrn.16         d2, d3                   \n"
 405
 406     "vadd.s16        d4, d0, d3               \n" // a0 = d0 + d3
 407     "vadd.s16        d5, d1, d2               \n" // a1 = d1 + d2
 408     "vsub.s16        d6, d1, d2               \n" // a2 = d1 - d2
 409     "vsub.s16        d7, d0, d3               \n" // a3 = d0 - d3
 410
 411     "vadd.s16        d0, d4, d5               \n" // a0 + a1
 412     "vshl.s16        d0, d0, #3               \n" // temp[0+i*4] = (a0+a1) << 3
 413     "vsub.s16        d2, d4, d5               \n" // a0 - a1
 414     "vshl.s16        d2, d2, #3               \n" // (temp[2+i*4] = (a0-a1) << 3
 415
 416     "vmlal.s16       q9, d7, d16              \n" // a3*5352 + 1812
 417     "vmlal.s16       q10, d7, d17             \n" // a3*2217 + 937
 418     "vmlal.s16       q9, d6, d17              \n" // a2*2217 + a3*5352 + 1812
 419     "vmlsl.s16       q10, d6, d16             \n" // a3*2217 + 937 - a2*5352
 420
 421     // temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
 422     // temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
 423     "vshrn.s32       d1, q9, #9               \n"
 424     "vshrn.s32       d3, q10, #9              \n"
 425
 426     // part 2
 427     // transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
 428     "vtrn.32         d0, d2                   \n"
 429     "vtrn.32         d1, d3                   \n"
 430     "vtrn.16         d0, d1                   \n"
 431     "vtrn.16         d2, d3                   \n"
 432
 433     "vmov.s16        d26, #7                  \n"
 434
 435     "vadd.s16        d4, d0, d3               \n" // a1 = ip[0] + ip[12]
 436     "vadd.s16        d5, d1, d2               \n" // b1 = ip[4] + ip[8]
 437     "vsub.s16        d6, d1, d2               \n" // c1 = ip[4] - ip[8]
 438     "vadd.s16        d4, d4, d26              \n" // a1 + 7
 439     "vsub.s16        d7, d0, d3               \n" // d1 = ip[0] - ip[12]
 440
 441     "vadd.s16        d0, d4, d5               \n" // op[0] = a1 + b1 + 7
 442     "vsub.s16        d2, d4, d5               \n" // op[8] = a1 - b1 + 7
 443
 444     "vmlal.s16       q11, d7, d16             \n" // d1*5352 + 12000
 445     "vmlal.s16       q12, d7, d17             \n" // d1*2217 + 51000
 446
 447     "vceq.s16        d4, d7, #0               \n"
 448
 449     "vshr.s16        d0, d0, #4               \n"
 450     "vshr.s16        d2, d2, #4               \n"
 451
 452     "vmlal.s16       q11, d6, d17             \n" // c1*2217 + d1*5352 + 12000
 453     "vmlsl.s16       q12, d6, d16             \n" // d1*2217 - c1*5352 + 51000
 454
 455     "vmvn            d4, d4                   \n" // !(d1 == 0)
 456     // op[4] = (c1*2217 + d1*5352 + 12000)>>16
 457     "vshrn.s32       d1, q11, #16             \n"
 458     // op[4] += (d1!=0)
 459     "vsub.s16        d1, d1, d4               \n"
 460     // op[12]= (d1*2217 - c1*5352 + 51000)>>16
 461     "vshrn.s32       d3, q12, #16             \n"
 462
 463     // set result to out array
 464     "vst1.16         {q0, q1}, [%[out]]   \n"
 465     : [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
 466       [coeff32] "+r"(coeff32)          // modified registers
 467     : [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
 468       [out] "r"(out)                   // constants
 469     : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
 470       "q10", "q11", "q12", "q13"       // clobbered
 471   );
 472 }
 473
 474 #endif
 475
 476 #define LOAD_LANE_16b(VALUE, LANE) do {             \
 477   (VALUE) = vld1_lane_s16(src, (VALUE), (LANE));    \
 478   src += stride;                                    \
 479 } while (0)
 480
 481 static void FTransformWHT(const int16_t* src, int16_t* out) {
 482   const int stride = 16;
 483   const int16x4_t zero = vdup_n_s16(0);
 484   int32x4x4_t tmp0;
 485   int16x4x4_t in;
 486   INIT_VECTOR4(in, zero, zero, zero, zero);
 487   LOAD_LANE_16b(in.val[0], 0);
 488   LOAD_LANE_16b(in.val[1], 0);
 489   LOAD_LANE_16b(in.val[2], 0);
 490   LOAD_LANE_16b(in.val[3], 0);
 491   LOAD_LANE_16b(in.val[0], 1);
 492   LOAD_LANE_16b(in.val[1], 1);
 493   LOAD_LANE_16b(in.val[2], 1);
 494   LOAD_LANE_16b(in.val[3], 1);
 495   LOAD_LANE_16b(in.val[0], 2);
 496   LOAD_LANE_16b(in.val[1], 2);
 497   LOAD_LANE_16b(in.val[2], 2);
 498   LOAD_LANE_16b(in.val[3], 2);
 499   LOAD_LANE_16b(in.val[0], 3);
 500   LOAD_LANE_16b(in.val[1], 3);
 501   LOAD_LANE_16b(in.val[2], 3);
 502   LOAD_LANE_16b(in.val[3], 3);
 503
 504   {
 505     // a0 = in[0 * 16] + in[2 * 16]
 506     // a1 = in[1 * 16] + in[3 * 16]
 507     // a2 = in[1 * 16] - in[3 * 16]
 508     // a3 = in[0 * 16] - in[2 * 16]
 509     const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
 510     const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
 511     const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
 512     const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
 513     tmp0.val[0] = vaddq_s32(a0, a1);
 514     tmp0.val[1] = vaddq_s32(a3, a2);
 515     tmp0.val[2] = vsubq_s32(a3, a2);
 516     tmp0.val[3] = vsubq_s32(a0, a1);
 517   }
 518   {
 519     const int32x4x4_t tmp1 = Transpose4x4(tmp0);
 520     // a0 = tmp[0 + i] + tmp[ 8 + i]
 521     // a1 = tmp[4 + i] + tmp[12 + i]
 522     // a2 = tmp[4 + i] - tmp[12 + i]
 523     // a3 = tmp[0 + i] - tmp[ 8 + i]
 524     const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
 525     const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
 526     const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
 527     const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
 528     const int32x4_t b0 = vhaddq_s32(a0, a1);  // (a0 + a1) >> 1
 529     const int32x4_t b1 = vhaddq_s32(a3, a2);  // (a3 + a2) >> 1
 530     const int32x4_t b2 = vhsubq_s32(a3, a2);  // (a3 - a2) >> 1
 531     const int32x4_t b3 = vhsubq_s32(a0, a1);  // (a0 - a1) >> 1
 532     const int16x4_t out0 = vmovn_s32(b0);
 533     const int16x4_t out1 = vmovn_s32(b1);
 534     const int16x4_t out2 = vmovn_s32(b2);
 535     const int16x4_t out3 = vmovn_s32(b3);
 536
 537     vst1_s16(out +  0, out0);
 538     vst1_s16(out +  4, out1);
 539     vst1_s16(out +  8, out2);
 540     vst1_s16(out + 12, out3);
 541   }
 542 }
 543 #undef LOAD_LANE_16b
 544
 545 //------------------------------------------------------------------------------
 546 // Texture distortion
 547 //
 548 // We try to match the spectral content (weighted) between source and
 549 // reconstructed samples.
 550
 551 // This code works but is *slower* than the inlined-asm version below
 552 // (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
 553 // USE_INTRINSICS define.
 554 // With gcc-4.8, it's only slightly slower than the inlined.
 555 #if defined(USE_INTRINSICS)
 556
 557 // Zero extend an uint16x4_t 'v' to an int32x4_t.
 558 static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
 559   return vreinterpretq_s32_u32(vmovl_u16(v));
 560 }
 561
 562 // Does a regular 4x4 transpose followed by an adjustment of the upper columns
 563 // in the inner rows to restore the source order of differences,
 564 // i.e., a0 - a1 | a3 - a2.
 565 static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
 566   int32x4x4_t out = Transpose4x4(rows);
 567   // restore source order in the columns containing differences.
 568   const int32x2_t r1h = vget_high_s32(out.val[1]);
 569   const int32x2_t r2h = vget_high_s32(out.val[2]);
 570   out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
 571   out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
 572   return out;
 573 }
 574
 575 static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
 576                                                    const uint8x8_t r2r3) {
 577   // a0 = in[0] + in[2] | a1 = in[1] + in[3]
 578   const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
 579   // a3 = in[0] - in[2] | a2 = in[1] - in[3]
 580   const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
 581   const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1));  // a0 + a1
 582   const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2));  // a3 + a2
 583   // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
 584   // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
 585   // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
 586   const int16x8x2_t transpose =
 587       vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
 588   // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
 589   const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
 590                                       vget_low_s16(transpose.val[1]));
 591   const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
 592                                       vget_high_s16(transpose.val[1]));
 593   // [0]: tmp[3] [1]: tmp[2]
 594   const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
 595   const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
 596   return res;
 597 }
 598
 599 static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
 600   // a0 = tmp[0 + i] + tmp[8 + i];
 601   const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
 602   // a1 = tmp[4 + i] + tmp[12+ i];
 603   const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
 604   // a2 = tmp[4 + i] - tmp[12+ i];
 605   const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
 606   // a3 = tmp[0 + i] - tmp[8 + i];
 607   const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
 608   const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1));  // abs(a0 + a1)
 609   const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2));  // abs(a3 + a2)
 610   const int32x4_t b2 = vabdq_s32(a3, a2);              // abs(a3 - a2)
 611   const int32x4_t b3 = vabdq_s32(a0, a1);              // abs(a0 - a1)
 612   const int32x4x4_t res = { { b0, b1, b2, b3 } };
 613   return res;
 614 }
 615
 616 // Calculate the weighted sum of the rows in 'b'.
 617 static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
 618                                       const int32x4_t w0, const int32x4_t w1,
 619                                       const int32x4_t w2, const int32x4_t w3) {
 620   const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
 621   const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
 622   const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
 623   const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
 624   const int64x2_t sum1 = vpaddlq_s32(s3);
 625   const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
 626   return sum2;
 627 }
 628
 629 #define LOAD_LANE_32b(src, VALUE, LANE) \
 630     (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
 631
 632 // Hadamard transform
 633 // Returns the weighted sum of the absolute value of transformed coefficients.
 634 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 635                     const uint16_t* const w) {
 636   uint32x4_t d0d1 = { 0, 0, 0, 0 };
 637   uint32x4_t d2d3 = { 0, 0, 0, 0 };
 638   LOAD_LANE_32b(a + 0 * BPS, d0d1, 0);  // a00 a01 a02 a03
 639   LOAD_LANE_32b(a + 1 * BPS, d0d1, 1);  // a10 a11 a12 a13
 640   LOAD_LANE_32b(b + 0 * BPS, d0d1, 2);  // b00 b01 b02 b03
 641   LOAD_LANE_32b(b + 1 * BPS, d0d1, 3);  // b10 b11 b12 b13
 642   LOAD_LANE_32b(a + 2 * BPS, d2d3, 0);  // a20 a21 a22 a23
 643   LOAD_LANE_32b(a + 3 * BPS, d2d3, 1);  // a30 a31 a32 a33
 644   LOAD_LANE_32b(b + 2 * BPS, d2d3, 2);  // b20 b21 b22 b23
 645   LOAD_LANE_32b(b + 3 * BPS, d2d3, 3);  // b30 b31 b32 b33
 646
 647   {
 648     // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
 649     // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
 650     const uint16x8x2_t tmp =
 651         vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
 652     const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
 653     const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
 654     const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
 655                                                     vget_low_u8(d2d3u8));
 656     const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
 657                                                     vget_high_u8(d2d3u8));
 658     const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
 659     const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
 660     const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
 661     const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
 662     const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
 663     const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
 664     const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
 665     const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
 666     const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
 667     const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
 668     const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
 669                                     vreinterpret_s32_s64(sum2));
 670     const int32x2_t res = vshr_n_s32(diff, 5);
 671     return vget_lane_s32(res, 0);
 672   }
 673 }
 674
 675 #undef LOAD_LANE_32b
 676
 677 #else
 678
 679 // Hadamard transform
 680 // Returns the weighted sum of the absolute value of transformed coefficients.
 681 static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
 682                     const uint16_t* const w) {
 683   const int kBPS = BPS;
 684   const uint8_t* A = a;
 685   const uint8_t* B = b;
 686   const uint16_t* W = w;
 687   int sum;
 688   __asm__ volatile (
 689     "vld1.32         d0[0], [%[a]], %[kBPS]   \n"
 690     "vld1.32         d0[1], [%[a]], %[kBPS]   \n"
 691     "vld1.32         d2[0], [%[a]], %[kBPS]   \n"
 692     "vld1.32         d2[1], [%[a]]            \n"
 693
 694     "vld1.32         d1[0], [%[b]], %[kBPS]   \n"
 695     "vld1.32         d1[1], [%[b]], %[kBPS]   \n"
 696     "vld1.32         d3[0], [%[b]], %[kBPS]   \n"
 697     "vld1.32         d3[1], [%[b]]            \n"
 698
 699     // a d0/d2, b d1/d3
 700     // d0/d1: 01 01 01 01
 701     // d2/d3: 23 23 23 23
 702     // But: it goes 01 45 23 67
 703     // Notice the middle values are transposed
 704     "vtrn.16         q0, q1                   \n"
 705
 706     // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
 707     "vaddl.u8        q2, d0, d2               \n"
 708     "vaddl.u8        q10, d1, d3              \n"
 709     // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
 710     "vsubl.u8        q3, d0, d2               \n"
 711     "vsubl.u8        q11, d1, d3              \n"
 712
 713     // tmp[0] = a0 + a1
 714     "vpaddl.s16      q0, q2                   \n"
 715     "vpaddl.s16      q8, q10                  \n"
 716
 717     // tmp[1] = a3 + a2
 718     "vpaddl.s16      q1, q3                   \n"
 719     "vpaddl.s16      q9, q11                  \n"
 720
 721     // No pair subtract
 722     // q2 = {a0, a3}
 723     // q3 = {a1, a2}
 724     "vtrn.16         q2, q3                   \n"
 725     "vtrn.16         q10, q11                 \n"
 726
 727     // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
 728     "vsubl.s16       q12, d4, d6              \n"
 729     "vsubl.s16       q13, d5, d7              \n"
 730     "vsubl.s16       q14, d20, d22            \n"
 731     "vsubl.s16       q15, d21, d23            \n"
 732
 733     // separate tmp[3] and tmp[2]
 734     // q12 = tmp[3]
 735     // q13 = tmp[2]
 736     "vtrn.32         q12, q13                 \n"
 737     "vtrn.32         q14, q15                 \n"
 738
 739     // Transpose tmp for a
 740     "vswp            d1, d26                  \n" // vtrn.64
 741     "vswp            d3, d24                  \n" // vtrn.64
 742     "vtrn.32         q0, q1                   \n"
 743     "vtrn.32         q13, q12                 \n"
 744
 745     // Transpose tmp for b
 746     "vswp            d17, d30                 \n" // vtrn.64
 747     "vswp            d19, d28                 \n" // vtrn.64
 748     "vtrn.32         q8, q9                   \n"
 749     "vtrn.32         q15, q14                 \n"
 750
 751     // The first Q register is a, the second b.
 752     // q0/8 tmp[0-3]
 753     // q13/15 tmp[4-7]
 754     // q1/9 tmp[8-11]
 755     // q12/14 tmp[12-15]
 756
 757     // These are still in 01 45 23 67 order. We fix it easily in the addition
 758     // case but the subtraction propagates them.
 759     "vswp            d3, d27                  \n"
 760     "vswp            d19, d31                 \n"
 761
 762     // a0 = tmp[0] + tmp[8]
 763     "vadd.s32        q2, q0, q1               \n"
 764     "vadd.s32        q3, q8, q9               \n"
 765
 766     // a1 = tmp[4] + tmp[12]
 767     "vadd.s32        q10, q13, q12            \n"
 768     "vadd.s32        q11, q15, q14            \n"
 769
 770     // a2 = tmp[4] - tmp[12]
 771     "vsub.s32        q13, q13, q12            \n"
 772     "vsub.s32        q15, q15, q14            \n"
 773
 774     // a3 = tmp[0] - tmp[8]
 775     "vsub.s32        q0, q0, q1               \n"
 776     "vsub.s32        q8, q8, q9               \n"
 777
 778     // b0 = a0 + a1
 779     "vadd.s32        q1, q2, q10              \n"
 780     "vadd.s32        q9, q3, q11              \n"
 781
 782     // b1 = a3 + a2
 783     "vadd.s32        q12, q0, q13             \n"
 784     "vadd.s32        q14, q8, q15             \n"
 785
 786     // b2 = a3 - a2
 787     "vsub.s32        q0, q0, q13              \n"
 788     "vsub.s32        q8, q8, q15              \n"
 789
 790     // b3 = a0 - a1
 791     "vsub.s32        q2, q2, q10              \n"
 792     "vsub.s32        q3, q3, q11              \n"
 793
 794     "vld1.64         {q10, q11}, [%[w]]       \n"
 795
 796     // abs(b0)
 797     "vabs.s32        q1, q1                   \n"
 798     "vabs.s32        q9, q9                   \n"
 799     // abs(b1)
 800     "vabs.s32        q12, q12                 \n"
 801     "vabs.s32        q14, q14                 \n"
 802     // abs(b2)
 803     "vabs.s32        q0, q0                   \n"
 804     "vabs.s32        q8, q8                   \n"
 805     // abs(b3)
 806     "vabs.s32        q2, q2                   \n"
 807     "vabs.s32        q3, q3                   \n"
 808
 809     // expand w before using.
 810     "vmovl.u16       q13, d20                 \n"
 811     "vmovl.u16       q15, d21                 \n"
 812
 813     // w[0] * abs(b0)
 814     "vmul.u32        q1, q1, q13              \n"
 815     "vmul.u32        q9, q9, q13              \n"
 816
 817     // w[4] * abs(b1)
 818     "vmla.u32        q1, q12, q15             \n"
 819     "vmla.u32        q9, q14, q15             \n"
 820
 821     // expand w before using.
 822     "vmovl.u16       q13, d22                 \n"
 823     "vmovl.u16       q15, d23                 \n"
 824
 825     // w[8] * abs(b1)
 826     "vmla.u32        q1, q0, q13              \n"
 827     "vmla.u32        q9, q8, q13              \n"
 828
 829     // w[12] * abs(b1)
 830     "vmla.u32        q1, q2, q15              \n"
 831     "vmla.u32        q9, q3, q15              \n"
 832
 833     // Sum the arrays
 834     "vpaddl.u32      q1, q1                   \n"
 835     "vpaddl.u32      q9, q9                   \n"
 836     "vadd.u64        d2, d3                   \n"
 837     "vadd.u64        d18, d19                 \n"
 838
 839     // Hadamard transform needs 4 bits of extra precision (2 bits in each
 840     // direction) for dynamic raw. Weights w[] are 16bits at max, so the maximum
 841     // precision for coeff is 8bit of input + 4bits of Hadamard transform +
 842     // 16bits for w[] + 2 bits of abs() summation.
 843     //
 844     // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
 845     // A-OK.
 846
 847     // sum2 - sum1
 848     "vsub.u32        d0, d2, d18              \n"
 849     // abs(sum2 - sum1)
 850     "vabs.s32        d0, d0                   \n"
 851     // abs(sum2 - sum1) >> 5
 852     "vshr.u32        d0, #5                   \n"
 853
 854     // It would be better to move the value straight into r0 but I'm not
 855     // entirely sure how this works with inline assembly.
 856     "vmov.32         %[sum], d0[0]            \n"
 857
 858     : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
 859     : [kBPS] "r"(kBPS)
 860     : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
 861       "q10", "q11", "q12", "q13", "q14", "q15"  // clobbered
 862   ) ;
 863
 864   return sum;
 865 }
 866
 867 #endif  // USE_INTRINSICS
 868
 869 static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
 870                       const uint16_t* const w) {
 871   int D = 0;
 872   int x, y;
 873   for (y = 0; y < 16 * BPS; y += 4 * BPS) {
 874     for (x = 0; x < 16; x += 4) {
 875       D += Disto4x4(a + x + y, b + x + y, w);
 876     }
 877   }
 878   return D;
 879 }
 880
 881 //------------------------------------------------------------------------------
 882
 883 static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
 884                              int start_block, int end_block,
 885                              VP8Histogram* const histo) {
 886   const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
 887   int j;
 888   for (j = start_block; j < end_block; ++j) {
 889     int16_t out[16];
 890     FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
 891     {
 892       int k;
 893       const int16x8_t a0 = vld1q_s16(out + 0);
 894       const int16x8_t b0 = vld1q_s16(out + 8);
 895       const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
 896       const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
 897       const uint16x8_t a2 = vshrq_n_u16(a1, 3);
 898       const uint16x8_t b2 = vshrq_n_u16(b1, 3);
 899       const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
 900       const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
 901       vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
 902       vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
 903       // Convert coefficients to bin.
 904       for (k = 0; k < 16; ++k) {
 905         histo->distribution[out[k]]++;
 906       }
 907     }
 908   }
 909 }
 910
 911 //------------------------------------------------------------------------------
 912
 913 static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
 914                                         const uint8_t* const b,
 915                                         uint32x4_t* const sum) {
 916   const uint8x16_t a0 = vld1q_u8(a);
 917   const uint8x16_t b0 = vld1q_u8(b);
 918   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
 919   uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
 920   prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
 921   *sum = vpadalq_u16(*sum, prod);      // pair-wise add and accumulate
 922 }
 923
 924 // Horizontal sum of all four uint32_t values in 'sum'.
 925 static int SumToInt(uint32x4_t sum) {
 926   const uint64x2_t sum2 = vpaddlq_u32(sum);
 927   const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
 928   return (int)sum3;
 929 }
 930
 931 static int SSE16x16(const uint8_t* a, const uint8_t* b) {
 932   uint32x4_t sum = vdupq_n_u32(0);
 933   int y;
 934   for (y = 0; y < 16; ++y) {
 935     AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
 936   }
 937   return SumToInt(sum);
 938 }
 939
 940 static int SSE16x8(const uint8_t* a, const uint8_t* b) {
 941   uint32x4_t sum = vdupq_n_u32(0);
 942   int y;
 943   for (y = 0; y < 8; ++y) {
 944     AccumulateSSE16(a + y * BPS, b + y * BPS, &sum);
 945   }
 946   return SumToInt(sum);
 947 }
 948
 949 static int SSE8x8(const uint8_t* a, const uint8_t* b) {
 950   uint32x4_t sum = vdupq_n_u32(0);
 951   int y;
 952   for (y = 0; y < 8; ++y) {
 953     const uint8x8_t a0 = vld1_u8(a + y * BPS);
 954     const uint8x8_t b0 = vld1_u8(b + y * BPS);
 955     const uint8x8_t abs_diff = vabd_u8(a0, b0);
 956     const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
 957     sum = vpadalq_u16(sum, prod);
 958   }
 959   return SumToInt(sum);
 960 }
 961
 962 static int SSE4x4(const uint8_t* a, const uint8_t* b) {
 963   const uint8x16_t a0 = Load4x4(a);
 964   const uint8x16_t b0 = Load4x4(b);
 965   const uint8x16_t abs_diff = vabdq_u8(a0, b0);
 966   uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
 967   prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
 968   return SumToInt(vpaddlq_u16(prod));
 969 }
 970
 971 //------------------------------------------------------------------------------
 972
 973 // Compilation with gcc-4.6.x is problematic for now.
 974 #if !defined(WORK_AROUND_GCC)
 975
 976 static int16x8_t Quantize(int16_t* const in,
 977                           const VP8Matrix* const mtx, int offset) {
 978   const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
 979   const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
 980   const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
 981   const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
 982   const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
 983
 984   const int16x8_t a = vld1q_s16(in + offset);                // in
 985   const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a));  // coeff = abs(in)
 986   const int16x8_t sign = vshrq_n_s16(a, 15);                 // sign
 987   const uint16x8_t c = vaddq_u16(b, sharp);                  // + sharpen
 988   const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
 989   const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
 990   const uint32x4_t m2 = vhaddq_u32(m0, bias0);
 991   const uint32x4_t m3 = vhaddq_u32(m1, bias1);     // (coeff * iQ + bias) >> 1
 992   const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
 993                                      vshrn_n_u32(m3, 16));   // QFIX=17 = 16+1
 994   const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
 995   const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
 996   const int16x8_t c3 = vsubq_s16(c2, sign);                  // restore sign
 997   const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
 998   vst1q_s16(in + offset, c4);
 999   assert(QFIX == 17);  // this function can't work as is if QFIX != 16+1
1000   return c3;
1001 }
1002
1003 static const uint8_t kShuffles[4][8] = {
1004   { 0,   1,  2,  3,  8,  9, 16, 17 },
1005   { 10, 11,  4,  5,  6,  7, 12, 13 },
1006   { 18, 19, 24, 25, 26, 27, 20, 21 },
1007   { 14, 15, 22, 23, 28, 29, 30, 31 }
1008 };
1009
1010 static int QuantizeBlock(int16_t in[16], int16_t out[16],
1011                          const VP8Matrix* const mtx) {
1012   const int16x8_t out0 = Quantize(in, mtx, 0);
1013   const int16x8_t out1 = Quantize(in, mtx, 8);
1014   uint8x8x4_t shuffles;
1015   // vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
1016   // non-standard versions there.
1017 #if defined(__APPLE__) && defined(__aarch64__) && \
1018     defined(__apple_build_version__) && (__apple_build_version__< 6020037)
1019   uint8x16x2_t all_out;
1020   INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
1021   INIT_VECTOR4(shuffles,
1022                vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
1023                vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
1024                vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
1025                vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
1026 #else
1027   uint8x8x4_t all_out;
1028   INIT_VECTOR4(all_out,
1029                vreinterpret_u8_s16(vget_low_s16(out0)),
1030                vreinterpret_u8_s16(vget_high_s16(out0)),
1031                vreinterpret_u8_s16(vget_low_s16(out1)),
1032                vreinterpret_u8_s16(vget_high_s16(out1)));
1033   INIT_VECTOR4(shuffles,
1034                vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
1035                vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
1036                vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
1037                vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
1038 #endif
1039   // Zigzag reordering
1040   vst1_u8((uint8_t*)(out +  0), shuffles.val[0]);
1041   vst1_u8((uint8_t*)(out +  4), shuffles.val[1]);
1042   vst1_u8((uint8_t*)(out +  8), shuffles.val[2]);
1043   vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
1044   // test zeros
1045   if (*(uint64_t*)(out +  0) != 0) return 1;
1046   if (*(uint64_t*)(out +  4) != 0) return 1;
1047   if (*(uint64_t*)(out +  8) != 0) return 1;
1048   if (*(uint64_t*)(out + 12) != 0) return 1;
1049   return 0;
1050 }
1051
1052 #endif   // !WORK_AROUND_GCC
1053
1054 #endif   // WEBP_USE_NEON
1055
1056 //------------------------------------------------------------------------------
1057 // Entry point
1058
1059 extern void VP8EncDspInitNEON(void);
1060
1061 void VP8EncDspInitNEON(void) {
1062 #if defined(WEBP_USE_NEON)
1063   VP8ITransform = ITransform;
1064   VP8FTransform = FTransform;
1065
1066   VP8FTransformWHT = FTransformWHT;
1067
1068   VP8TDisto4x4 = Disto4x4;
1069   VP8TDisto16x16 = Disto16x16;
1070   VP8CollectHistogram = CollectHistogram;
1071   VP8SSE16x16 = SSE16x16;
1072   VP8SSE16x8 = SSE16x8;
1073   VP8SSE8x8 = SSE8x8;
1074   VP8SSE4x4 = SSE4x4;
1075 #if !defined(WORK_AROUND_GCC)
1076   VP8EncQuantizeBlock = QuantizeBlock;
1077 #endif
1078 #endif   // WEBP_USE_NEON
1079 }