third_party/libwebp/dsp/lossless.c

   1 // Copyright 2012 Google Inc. All Rights Reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style license
   4 // that can be found in the COPYING file in the root of the source
   5 // tree. An additional intellectual property rights grant can be found
   6 // in the file PATENTS. All contributing project authors may
   7 // be found in the AUTHORS file in the root of the source tree.
   8 // -----------------------------------------------------------------------------
   9 //
  10 // Image transforms and color space conversion methods for lossless decoder.
  11 //
  12 // Authors: Vikas Arora (vikaas.arora@gmail.com)
  13 //          Jyrki Alakuijala (jyrki@google.com)
  14 //          Urvang Joshi (urvang@google.com)
  15
  16 #include "./dsp.h"
  17
  18 // Define the following if target arch is sure to have SSE2
  19 // #define WEBP_TARGET_HAS_SSE2
  20
  21 #if defined(__cplusplus) || defined(c_plusplus)
  22 extern "C" {
  23 #endif
  24
  25 #if defined(WEBP_TARGET_HAS_SSE2)
  26 #include <emmintrin.h>
  27 #endif
  28
  29 #include <math.h>
  30 #include <stdlib.h>
  31 #include "./lossless.h"
  32 #include "../dec/vp8li.h"
  33 #include "./yuv.h"
  34
  35 #define MAX_DIFF_COST (1e30f)
  36
  37 // lookup table for small values of log2(int)
  38 #define APPROX_LOG_MAX  4096
  39 #define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
  40 const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
  41   0.0000000000000000f, 0.0000000000000000f,
  42   1.0000000000000000f, 1.5849625007211560f,
  43   2.0000000000000000f, 2.3219280948873621f,
  44   2.5849625007211560f, 2.8073549220576041f,
  45   3.0000000000000000f, 3.1699250014423121f,
  46   3.3219280948873621f, 3.4594316186372973f,
  47   3.5849625007211560f, 3.7004397181410921f,
  48   3.8073549220576041f, 3.9068905956085187f,
  49   4.0000000000000000f, 4.0874628412503390f,
  50   4.1699250014423121f, 4.2479275134435852f,
  51   4.3219280948873626f, 4.3923174227787606f,
  52   4.4594316186372973f, 4.5235619560570130f,
  53   4.5849625007211560f, 4.6438561897747243f,
  54   4.7004397181410917f, 4.7548875021634682f,
  55   4.8073549220576037f, 4.8579809951275718f,
  56   4.9068905956085187f, 4.9541963103868749f,
  57   5.0000000000000000f, 5.0443941193584533f,
  58   5.0874628412503390f, 5.1292830169449663f,
  59   5.1699250014423121f, 5.2094533656289501f,
  60   5.2479275134435852f, 5.2854022188622487f,
  61   5.3219280948873626f, 5.3575520046180837f,
  62   5.3923174227787606f, 5.4262647547020979f,
  63   5.4594316186372973f, 5.4918530963296747f,
  64   5.5235619560570130f, 5.5545888516776376f,
  65   5.5849625007211560f, 5.6147098441152083f,
  66   5.6438561897747243f, 5.6724253419714951f,
  67   5.7004397181410917f, 5.7279204545631987f,
  68   5.7548875021634682f, 5.7813597135246599f,
  69   5.8073549220576037f, 5.8328900141647412f,
  70   5.8579809951275718f, 5.8826430493618415f,
  71   5.9068905956085187f, 5.9307373375628866f,
  72   5.9541963103868749f, 5.9772799234999167f,
  73   6.0000000000000000f, 6.0223678130284543f,
  74   6.0443941193584533f, 6.0660891904577720f,
  75   6.0874628412503390f, 6.1085244567781691f,
  76   6.1292830169449663f, 6.1497471195046822f,
  77   6.1699250014423121f, 6.1898245588800175f,
  78   6.2094533656289501f, 6.2288186904958804f,
  79   6.2479275134435852f, 6.2667865406949010f,
  80   6.2854022188622487f, 6.3037807481771030f,
  81   6.3219280948873626f, 6.3398500028846243f,
  82   6.3575520046180837f, 6.3750394313469245f,
  83   6.3923174227787606f, 6.4093909361377017f,
  84   6.4262647547020979f, 6.4429434958487279f,
  85   6.4594316186372973f, 6.4757334309663976f,
  86   6.4918530963296747f, 6.5077946401986963f,
  87   6.5235619560570130f, 6.5391588111080309f,
  88   6.5545888516776376f, 6.5698556083309478f,
  89   6.5849625007211560f, 6.5999128421871278f,
  90   6.6147098441152083f, 6.6293566200796094f,
  91   6.6438561897747243f, 6.6582114827517946f,
  92   6.6724253419714951f, 6.6865005271832185f,
  93   6.7004397181410917f, 6.7142455176661224f,
  94   6.7279204545631987f, 6.7414669864011464f,
  95   6.7548875021634682f, 6.7681843247769259f,
  96   6.7813597135246599f, 6.7944158663501061f,
  97   6.8073549220576037f, 6.8201789624151878f,
  98   6.8328900141647412f, 6.8454900509443747f,
  99   6.8579809951275718f, 6.8703647195834047f,
 100   6.8826430493618415f, 6.8948177633079437f,
 101   6.9068905956085187f, 6.9188632372745946f,
 102   6.9307373375628866f, 6.9425145053392398f,
 103   6.9541963103868749f, 6.9657842846620869f,
 104   6.9772799234999167f, 6.9886846867721654f,
 105   7.0000000000000000f, 7.0112272554232539f,
 106   7.0223678130284543f, 7.0334230015374501f,
 107   7.0443941193584533f, 7.0552824355011898f,
 108   7.0660891904577720f, 7.0768155970508308f,
 109   7.0874628412503390f, 7.0980320829605263f,
 110   7.1085244567781691f, 7.1189410727235076f,
 111   7.1292830169449663f, 7.1395513523987936f,
 112   7.1497471195046822f, 7.1598713367783890f,
 113   7.1699250014423121f, 7.1799090900149344f,
 114   7.1898245588800175f, 7.1996723448363644f,
 115   7.2094533656289501f, 7.2191685204621611f,
 116   7.2288186904958804f, 7.2384047393250785f,
 117   7.2479275134435852f, 7.2573878426926521f,
 118   7.2667865406949010f, 7.2761244052742375f,
 119   7.2854022188622487f, 7.2946207488916270f,
 120   7.3037807481771030f, 7.3128829552843557f,
 121   7.3219280948873626f, 7.3309168781146167f,
 122   7.3398500028846243f, 7.3487281542310771f,
 123   7.3575520046180837f, 7.3663222142458160f,
 124   7.3750394313469245f, 7.3837042924740519f,
 125   7.3923174227787606f, 7.4008794362821843f,
 126   7.4093909361377017f, 7.4178525148858982f,
 127   7.4262647547020979f, 7.4346282276367245f,
 128   7.4429434958487279f, 7.4512111118323289f,
 129   7.4594316186372973f, 7.4676055500829976f,
 130   7.4757334309663976f, 7.4838157772642563f,
 131   7.4918530963296747f, 7.4998458870832056f,
 132   7.5077946401986963f, 7.5156998382840427f,
 133   7.5235619560570130f, 7.5313814605163118f,
 134   7.5391588111080309f, 7.5468944598876364f,
 135   7.5545888516776376f, 7.5622424242210728f,
 136   7.5698556083309478f, 7.5774288280357486f,
 137   7.5849625007211560f, 7.5924570372680806f,
 138   7.5999128421871278f, 7.6073303137496104f,
 139   7.6147098441152083f, 7.6220518194563764f,
 140   7.6293566200796094f, 7.6366246205436487f,
 141   7.6438561897747243f, 7.6510516911789281f,
 142   7.6582114827517946f, 7.6653359171851764f,
 143   7.6724253419714951f, 7.6794800995054464f,
 144   7.6865005271832185f, 7.6934869574993252f,
 145   7.7004397181410917f, 7.7073591320808825f,
 146   7.7142455176661224f, 7.7210991887071855f,
 147   7.7279204545631987f, 7.7347096202258383f,
 148   7.7414669864011464f, 7.7481928495894605f,
 149   7.7548875021634682f, 7.7615512324444795f,
 150   7.7681843247769259f, 7.7747870596011736f,
 151   7.7813597135246599f, 7.7879025593914317f,
 152   7.7944158663501061f, 7.8008998999203047f,
 153   7.8073549220576037f, 7.8137811912170374f,
 154   7.8201789624151878f, 7.8265484872909150f,
 155   7.8328900141647412f, 7.8392037880969436f,
 156   7.8454900509443747f, 7.8517490414160571f,
 157   7.8579809951275718f, 7.8641861446542797f,
 158   7.8703647195834047f, 7.8765169465649993f,
 159   7.8826430493618415f, 7.8887432488982591f,
 160   7.8948177633079437f, 7.9008668079807486f,
 161   7.9068905956085187f, 7.9128893362299619f,
 162   7.9188632372745946f, 7.9248125036057812f,
 163   7.9307373375628866f, 7.9366379390025709f,
 164   7.9425145053392398f, 7.9483672315846778f,
 165   7.9541963103868749f, 7.9600019320680805f,
 166   7.9657842846620869f, 7.9715435539507719f,
 167   7.9772799234999167f, 7.9829935746943103f,
 168   7.9886846867721654f, 7.9943534368588577f
 169 };
 170
 171 const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
 172   0.00000000f,    0.00000000f,  2.00000000f,   4.75488750f,
 173   8.00000000f,   11.60964047f,  15.50977500f,  19.65148445f,
 174   24.00000000f,  28.52932501f,  33.21928095f,  38.05374781f,
 175   43.01955001f,  48.10571634f,  53.30296891f,  58.60335893f,
 176   64.00000000f,  69.48686830f,  75.05865003f,  80.71062276f,
 177   86.43856190f,  92.23866588f,  98.10749561f,  104.04192499f,
 178   110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
 179   134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
 180   160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
 181   186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
 182   212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
 183   240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
 184   268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
 185   296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
 186   325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
 187   354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
 188   384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
 189   413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
 190   444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
 191   474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
 192   505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
 193   536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
 194   568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
 195   600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
 196   632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
 197   664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
 198   696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
 199   729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
 200   762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
 201   795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
 202   828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
 203   862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
 204   896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
 205   929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
 206   963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
 207   998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
 208   1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
 209   1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
 210   1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
 211   1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
 212   1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
 213   1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
 214   1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
 215   1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
 216   1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
 217   1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
 218   1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
 219   1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
 220   1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
 221   1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
 222   1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
 223   1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
 224   1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
 225   1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
 226   1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
 227   1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
 228   1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
 229   1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
 230   1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
 231   1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
 232   1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
 233   1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
 234   1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
 235   2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
 236 };
 237
 238 float VP8LFastSLog2Slow(int v) {
 239   assert(v >= LOG_LOOKUP_IDX_MAX);
 240   if (v < APPROX_LOG_MAX) {
 241     int log_cnt = 0;
 242     const float v_f = (float)v;
 243     while (v >= LOG_LOOKUP_IDX_MAX) {
 244       ++log_cnt;
 245       v = v >> 1;
 246     }
 247     return v_f * (kLog2Table[v] + log_cnt);
 248   } else {
 249     return (float)(LOG_2_RECIPROCAL * v * log((double)v));
 250   }
 251 }
 252
 253 float VP8LFastLog2Slow(int v) {
 254   assert(v >= LOG_LOOKUP_IDX_MAX);
 255   if (v < APPROX_LOG_MAX) {
 256     int log_cnt = 0;
 257     while (v >= LOG_LOOKUP_IDX_MAX) {
 258       ++log_cnt;
 259       v = v >> 1;
 260     }
 261     return kLog2Table[v] + log_cnt;
 262   } else {
 263     return (float)(LOG_2_RECIPROCAL * log((double)v));
 264   }
 265 }
 266
 267 //------------------------------------------------------------------------------
 268 // Image transforms.
 269
 270 // In-place sum of each component with mod 256.
 271 static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
 272   const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
 273   const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
 274   *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
 275 }
 276
 277 static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
 278   return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
 279 }
 280
 281 static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
 282   return Average2(Average2(a0, a2), a1);
 283 }
 284
 285 static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
 286                                      uint32_t a2, uint32_t a3) {
 287   return Average2(Average2(a0, a1), Average2(a2, a3));
 288 }
 289
 290 #if defined(WEBP_TARGET_HAS_SSE2)
 291 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
 292                                                    uint32_t c2) {
 293   const __m128i zero = _mm_setzero_si128();
 294   const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
 295   const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
 296   const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
 297   const __m128i V1 = _mm_add_epi16(C0, C1);
 298   const __m128i V2 = _mm_sub_epi16(V1, C2);
 299   const __m128i b = _mm_packus_epi16(V2, V2);
 300   const uint32_t output = _mm_cvtsi128_si32(b);
 301   return output;
 302 }
 303
 304 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
 305                                                    uint32_t c2) {
 306   const uint32_t ave = Average2(c0, c1);
 307   const __m128i zero = _mm_setzero_si128();
 308   const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
 309   const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
 310   const __m128i A1 = _mm_sub_epi16(A0, B0);
 311   const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
 312   const __m128i A2 = _mm_sub_epi16(A1, BgtA);
 313   const __m128i A3 = _mm_srai_epi16(A2, 1);
 314   const __m128i A4 = _mm_add_epi16(A0, A3);
 315   const __m128i A5 = _mm_packus_epi16(A4, A4);
 316   const uint32_t output = _mm_cvtsi128_si32(A5);
 317   return output;
 318 }
 319
 320 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 321   int pa_minus_pb;
 322   const __m128i zero = _mm_setzero_si128();
 323   const __m128i A0 = _mm_cvtsi32_si128(a);
 324   const __m128i B0 = _mm_cvtsi32_si128(b);
 325   const __m128i C0 = _mm_cvtsi32_si128(c);
 326   const __m128i AC0 = _mm_subs_epu8(A0, C0);
 327   const __m128i CA0 = _mm_subs_epu8(C0, A0);
 328   const __m128i BC0 = _mm_subs_epu8(B0, C0);
 329   const __m128i CB0 = _mm_subs_epu8(C0, B0);
 330   const __m128i AC = _mm_or_si128(AC0, CA0);
 331   const __m128i BC = _mm_or_si128(BC0, CB0);
 332   const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
 333   const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
 334   const __m128i diff = _mm_sub_epi16(pb, pa);
 335   {
 336     int16_t out[8];
 337     _mm_storeu_si128((__m128i*)out, diff);
 338     pa_minus_pb = out[0] + out[1] + out[2] + out[3];
 339   }
 340   return (pa_minus_pb <= 0) ? a : b;
 341 }
 342
 343 #else
 344
 345 static WEBP_INLINE uint32_t Clip255(uint32_t a) {
 346   if (a < 256) {
 347     return a;
 348   }
 349   // return 0, when a is a negative integer.
 350   // return 255, when a is positive.
 351   return ~a >> 24;
 352 }
 353
 354 static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
 355   return Clip255(a + b - c);
 356 }
 357
 358 static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
 359                                                    uint32_t c2) {
 360   const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
 361   const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
 362                                          (c1 >> 16) & 0xff,
 363                                          (c2 >> 16) & 0xff);
 364   const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
 365                                          (c1 >> 8) & 0xff,
 366                                          (c2 >> 8) & 0xff);
 367   const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
 368   return (a << 24) | (r << 16) | (g << 8) | b;
 369 }
 370
 371 static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
 372   return Clip255(a + (a - b) / 2);
 373 }
 374
 375 static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
 376                                                    uint32_t c2) {
 377   const uint32_t ave = Average2(c0, c1);
 378   const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
 379   const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
 380   const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
 381   const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
 382   return (a << 24) | (r << 16) | (g << 8) | b;
 383 }
 384
 385 static WEBP_INLINE int Sub3(int a, int b, int c) {
 386   const int pb = b - c;
 387   const int pa = a - c;
 388   return abs(pb) - abs(pa);
 389 }
 390
 391 static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
 392   const int pa_minus_pb =
 393       Sub3((a >> 24)       , (b >> 24)       , (c >> 24)       ) +
 394       Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
 395       Sub3((a >>  8) & 0xff, (b >>  8) & 0xff, (c >>  8) & 0xff) +
 396       Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
 397   return (pa_minus_pb <= 0) ? a : b;
 398 }
 399 #endif
 400
 401 //------------------------------------------------------------------------------
 402 // Predictors
 403
 404 static uint32_t Predictor0(uint32_t left, const uint32_t* const top) {
 405   (void)top;
 406   (void)left;
 407   return ARGB_BLACK;
 408 }
 409 static uint32_t Predictor1(uint32_t left, const uint32_t* const top) {
 410   (void)top;
 411   return left;
 412 }
 413 static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
 414   (void)left;
 415   return top[0];
 416 }
 417 static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
 418   (void)left;
 419   return top[1];
 420 }
 421 static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
 422   (void)left;
 423   return top[-1];
 424 }
 425 static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
 426   const uint32_t pred = Average3(left, top[0], top[1]);
 427   return pred;
 428 }
 429 static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
 430   const uint32_t pred = Average2(left, top[-1]);
 431   return pred;
 432 }
 433 static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
 434   const uint32_t pred = Average2(left, top[0]);
 435   return pred;
 436 }
 437 static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
 438   const uint32_t pred = Average2(top[-1], top[0]);
 439   (void)left;
 440   return pred;
 441 }
 442 static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
 443   const uint32_t pred = Average2(top[0], top[1]);
 444   (void)left;
 445   return pred;
 446 }
 447 static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
 448   const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
 449   return pred;
 450 }
 451 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
 452   const uint32_t pred = Select(top[0], left, top[-1]);
 453   return pred;
 454 }
 455 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
 456   const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
 457   return pred;
 458 }
 459 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
 460   const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
 461   return pred;
 462 }
 463
 464 typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
 465 static const PredictorFunc kPredictors[16] = {
 466   Predictor0, Predictor1, Predictor2, Predictor3,
 467   Predictor4, Predictor5, Predictor6, Predictor7,
 468   Predictor8, Predictor9, Predictor10, Predictor11,
 469   Predictor12, Predictor13,
 470   Predictor0, Predictor0    // <- padding security sentinels
 471 };
 472
 473 // TODO(vikasa): Replace 256 etc with defines.
 474 static float PredictionCostSpatial(const int* counts,
 475                                    int weight_0, double exp_val) {
 476   const int significant_symbols = 16;
 477   const double exp_decay_factor = 0.6;
 478   double bits = weight_0 * counts[0];
 479   int i;
 480   for (i = 1; i < significant_symbols; ++i) {
 481     bits += exp_val * (counts[i] + counts[256 - i]);
 482     exp_val *= exp_decay_factor;
 483   }
 484   return (float)(-0.1 * bits);
 485 }
 486
 487 // Compute the combined Shanon's entropy for distribution {X} and {X+Y}
 488 static float CombinedShannonEntropy(const int* const X,
 489                                     const int* const Y, int n) {
 490   int i;
 491   double retval = 0.;
 492   int sumX = 0, sumXY = 0;
 493   for (i = 0; i < n; ++i) {
 494     const int x = X[i];
 495     const int xy = X[i] + Y[i];
 496     if (x != 0) {
 497       sumX += x;
 498       retval -= VP8LFastSLog2(x);
 499     }
 500     if (xy != 0) {
 501       sumXY += xy;
 502       retval -= VP8LFastSLog2(xy);
 503     }
 504   }
 505   retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
 506   return (float)retval;
 507 }
 508
 509 static float PredictionCostSpatialHistogram(int accumulated[4][256],
 510                                             int tile[4][256]) {
 511   int i;
 512   double retval = 0;
 513   for (i = 0; i < 4; ++i) {
 514     const double kExpValue = 0.94;
 515     retval += PredictionCostSpatial(tile[i], 1, kExpValue);
 516     retval += CombinedShannonEntropy(tile[i], accumulated[i], 256);
 517   }
 518   return (float)retval;
 519 }
 520
 521 static int GetBestPredictorForTile(int width, int height,
 522                                    int tile_x, int tile_y, int bits,
 523                                    int accumulated[4][256],
 524                                    const uint32_t* const argb_scratch) {
 525   const int kNumPredModes = 14;
 526   const int col_start = tile_x << bits;
 527   const int row_start = tile_y << bits;
 528   const int tile_size = 1 << bits;
 529   const int ymax = (tile_size <= height - row_start) ?
 530       tile_size : height - row_start;
 531   const int xmax = (tile_size <= width - col_start) ?
 532       tile_size : width - col_start;
 533   int histo[4][256];
 534   float best_diff = MAX_DIFF_COST;
 535   int best_mode = 0;
 536
 537   int mode;
 538   for (mode = 0; mode < kNumPredModes; ++mode) {
 539     const uint32_t* current_row = argb_scratch;
 540     const PredictorFunc pred_func = kPredictors[mode];
 541     float cur_diff;
 542     int y;
 543     memset(&histo[0][0], 0, sizeof(histo));
 544     for (y = 0; y < ymax; ++y) {
 545       int x;
 546       const int row = row_start + y;
 547       const uint32_t* const upper_row = current_row;
 548       current_row = upper_row + width;
 549       for (x = 0; x < xmax; ++x) {
 550         const int col = col_start + x;
 551         uint32_t predict;
 552         uint32_t predict_diff;
 553         if (row == 0) {
 554           predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
 555         } else if (col == 0) {
 556           predict = upper_row[col];  // Top.
 557         } else {
 558           predict = pred_func(current_row[col - 1], upper_row + col);
 559         }
 560         predict_diff = VP8LSubPixels(current_row[col], predict);
 561         ++histo[0][predict_diff >> 24];
 562         ++histo[1][((predict_diff >> 16) & 0xff)];
 563         ++histo[2][((predict_diff >> 8) & 0xff)];
 564         ++histo[3][(predict_diff & 0xff)];
 565       }
 566     }
 567     cur_diff = PredictionCostSpatialHistogram(accumulated, histo);
 568     if (cur_diff < best_diff) {
 569       best_diff = cur_diff;
 570       best_mode = mode;
 571     }
 572   }
 573
 574   return best_mode;
 575 }
 576
 577 static void CopyTileWithPrediction(int width, int height,
 578                                    int tile_x, int tile_y, int bits, int mode,
 579                                    const uint32_t* const argb_scratch,
 580                                    uint32_t* const argb) {
 581   const int col_start = tile_x << bits;
 582   const int row_start = tile_y << bits;
 583   const int tile_size = 1 << bits;
 584   const int ymax = (tile_size <= height - row_start) ?
 585       tile_size : height - row_start;
 586   const int xmax = (tile_size <= width - col_start) ?
 587       tile_size : width - col_start;
 588   const PredictorFunc pred_func = kPredictors[mode];
 589   const uint32_t* current_row = argb_scratch;
 590
 591   int y;
 592   for (y = 0; y < ymax; ++y) {
 593     int x;
 594     const int row = row_start + y;
 595     const uint32_t* const upper_row = current_row;
 596     current_row = upper_row + width;
 597     for (x = 0; x < xmax; ++x) {
 598       const int col = col_start + x;
 599       const int pix = row * width + col;
 600       uint32_t predict;
 601       if (row == 0) {
 602         predict = (col == 0) ? ARGB_BLACK : current_row[col - 1];  // Left.
 603       } else if (col == 0) {
 604         predict = upper_row[col];  // Top.
 605       } else {
 606         predict = pred_func(current_row[col - 1], upper_row + col);
 607       }
 608       argb[pix] = VP8LSubPixels(current_row[col], predict);
 609     }
 610   }
 611 }
 612
 613 void VP8LResidualImage(int width, int height, int bits,
 614                        uint32_t* const argb, uint32_t* const argb_scratch,
 615                        uint32_t* const image) {
 616   const int max_tile_size = 1 << bits;
 617   const int tiles_per_row = VP8LSubSampleSize(width, bits);
 618   const int tiles_per_col = VP8LSubSampleSize(height, bits);
 619   uint32_t* const upper_row = argb_scratch;
 620   uint32_t* const current_tile_rows = argb_scratch + width;
 621   int tile_y;
 622   int histo[4][256];
 623   memset(histo, 0, sizeof(histo));
 624   for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
 625     const int tile_y_offset = tile_y * max_tile_size;
 626     const int this_tile_height =
 627         (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
 628     int tile_x;
 629     if (tile_y > 0) {
 630       memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
 631              width * sizeof(*upper_row));
 632     }
 633     memcpy(current_tile_rows, &argb[tile_y_offset * width],
 634            this_tile_height * width * sizeof(*current_tile_rows));
 635     for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
 636       int pred;
 637       int y;
 638       const int tile_x_offset = tile_x * max_tile_size;
 639       int all_x_max = tile_x_offset + max_tile_size;
 640       if (all_x_max > width) {
 641         all_x_max = width;
 642       }
 643       pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits, histo,
 644                                      argb_scratch);
 645       image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
 646       CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
 647                              argb_scratch, argb);
 648       for (y = 0; y < max_tile_size; ++y) {
 649         int ix;
 650         int all_x;
 651         int all_y = tile_y_offset + y;
 652         if (all_y >= height) {
 653           break;
 654         }
 655         ix = all_y * width + tile_x_offset;
 656         for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
 657           const uint32_t a = argb[ix];
 658           ++histo[0][a >> 24];
 659           ++histo[1][((a >> 16) & 0xff)];
 660           ++histo[2][((a >> 8) & 0xff)];
 661           ++histo[3][(a & 0xff)];
 662         }
 663       }
 664     }
 665   }
 666 }
 667
 668 // Inverse prediction.
 669 static void PredictorInverseTransform(const VP8LTransform* const transform,
 670                                       int y_start, int y_end, uint32_t* data) {
 671   const int width = transform->xsize_;
 672   if (y_start == 0) {  // First Row follows the L (mode=1) mode.
 673     int x;
 674     const uint32_t pred0 = Predictor0(data[-1], NULL);
 675     AddPixelsEq(data, pred0);
 676     for (x = 1; x < width; ++x) {
 677       const uint32_t pred1 = Predictor1(data[x - 1], NULL);
 678       AddPixelsEq(data + x, pred1);
 679     }
 680     data += width;
 681     ++y_start;
 682   }
 683
 684   {
 685     int y = y_start;
 686     const int mask = (1 << transform->bits_) - 1;
 687     const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
 688     const uint32_t* pred_mode_base =
 689         transform->data_ + (y >> transform->bits_) * tiles_per_row;
 690
 691     while (y < y_end) {
 692       int x;
 693       const uint32_t pred2 = Predictor2(data[-1], data - width);
 694       const uint32_t* pred_mode_src = pred_mode_base;
 695       PredictorFunc pred_func;
 696
 697       // First pixel follows the T (mode=2) mode.
 698       AddPixelsEq(data, pred2);
 699
 700       // .. the rest:
 701       pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
 702       for (x = 1; x < width; ++x) {
 703         uint32_t pred;
 704         if ((x & mask) == 0) {    // start of tile. Read predictor function.
 705           pred_func = kPredictors[((*pred_mode_src++) >> 8) & 0xf];
 706         }
 707         pred = pred_func(data[x - 1], data + x - width);
 708         AddPixelsEq(data + x, pred);
 709       }
 710       data += width;
 711       ++y;
 712       if ((y & mask) == 0) {   // Use the same mask, since tiles are squares.
 713         pred_mode_base += tiles_per_row;
 714       }
 715     }
 716   }
 717 }
 718
 719 void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
 720   int i = 0;
 721 #if defined(WEBP_TARGET_HAS_SSE2)
 722   const __m128i mask = _mm_set1_epi32(0x0000ff00);
 723   for (; i + 4 < num_pixs; i += 4) {
 724     const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
 725     const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
 726     const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
 727     const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
 728     const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
 729     const __m128i out = _mm_sub_epi8(in, in_0g0g);
 730     _mm_storeu_si128((__m128i*)&argb_data[i], out);
 731   }
 732   // fallthrough and finish off with plain-C
 733 #endif
 734   for (; i < num_pixs; ++i) {
 735     const uint32_t argb = argb_data[i];
 736     const uint32_t green = (argb >> 8) & 0xff;
 737     const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
 738     const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
 739     argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
 740   }
 741 }
 742
 743 // Add green to blue and red channels (i.e. perform the inverse transform of
 744 // 'subtract green').
 745 static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
 746                                  int y_start, int y_end, uint32_t* data) {
 747   const int width = transform->xsize_;
 748   const uint32_t* const data_end = data + (y_end - y_start) * width;
 749 #if defined(WEBP_TARGET_HAS_SSE2)
 750   const __m128i mask = _mm_set1_epi32(0x0000ff00);
 751   for (; data + 4 < data_end; data += 4) {
 752     const __m128i in = _mm_loadu_si128((__m128i*)data);
 753     const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
 754     const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
 755     const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
 756     const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
 757     const __m128i out = _mm_add_epi8(in, in_0g0g);
 758     _mm_storeu_si128((__m128i*)data, out);
 759   }
 760   // fallthrough and finish off with plain-C
 761 #endif
 762   while (data < data_end) {
 763     const uint32_t argb = *data;
 764     const uint32_t green = ((argb >> 8) & 0xff);
 765     uint32_t red_blue = (argb & 0x00ff00ffu);
 766     red_blue += (green << 16) | green;
 767     red_blue &= 0x00ff00ffu;
 768     *data++ = (argb & 0xff00ff00u) | red_blue;
 769   }
 770 }
 771
 772 typedef struct {
 773   // Note: the members are uint8_t, so that any negative values are
 774   // automatically converted to "mod 256" values.
 775   uint8_t green_to_red_;
 776   uint8_t green_to_blue_;
 777   uint8_t red_to_blue_;
 778 } Multipliers;
 779
 780 static WEBP_INLINE void MultipliersClear(Multipliers* m) {
 781   m->green_to_red_ = 0;
 782   m->green_to_blue_ = 0;
 783   m->red_to_blue_ = 0;
 784 }
 785
 786 static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
 787                                                 int8_t color) {
 788   return (uint32_t)((int)(color_pred) * color) >> 5;
 789 }
 790
 791 static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
 792                                                Multipliers* const m) {
 793   m->green_to_red_  = (color_code >>  0) & 0xff;
 794   m->green_to_blue_ = (color_code >>  8) & 0xff;
 795   m->red_to_blue_   = (color_code >> 16) & 0xff;
 796 }
 797
 798 static WEBP_INLINE uint32_t MultipliersToColorCode(Multipliers* const m) {
 799   return 0xff000000u |
 800          ((uint32_t)(m->red_to_blue_) << 16) |
 801          ((uint32_t)(m->green_to_blue_) << 8) |
 802          m->green_to_red_;
 803 }
 804
 805 static WEBP_INLINE uint32_t TransformColor(const Multipliers* const m,
 806                                            uint32_t argb, int inverse) {
 807   const uint32_t green = argb >> 8;
 808   const uint32_t red = argb >> 16;
 809   uint32_t new_red = red;
 810   uint32_t new_blue = argb;
 811
 812   if (inverse) {
 813     new_red += ColorTransformDelta(m->green_to_red_, green);
 814     new_red &= 0xff;
 815     new_blue += ColorTransformDelta(m->green_to_blue_, green);
 816     new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
 817     new_blue &= 0xff;
 818   } else {
 819     new_red -= ColorTransformDelta(m->green_to_red_, green);
 820     new_red &= 0xff;
 821     new_blue -= ColorTransformDelta(m->green_to_blue_, green);
 822     new_blue -= ColorTransformDelta(m->red_to_blue_, red);
 823     new_blue &= 0xff;
 824   }
 825   return (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
 826 }
 827
 828 static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
 829                                              uint32_t argb) {
 830   const uint32_t green = argb >> 8;
 831   uint32_t new_red = argb >> 16;
 832   new_red -= ColorTransformDelta(green_to_red, green);
 833   return (new_red & 0xff);
 834 }
 835
 836 static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
 837                                               uint8_t red_to_blue,
 838                                               uint32_t argb) {
 839   const uint32_t green = argb >> 8;
 840   const uint32_t red = argb >> 16;
 841   uint8_t new_blue = argb;
 842   new_blue -= ColorTransformDelta(green_to_blue, green);
 843   new_blue -= ColorTransformDelta(red_to_blue, red);
 844   return (new_blue & 0xff);
 845 }
 846
 847 static WEBP_INLINE int SkipRepeatedPixels(const uint32_t* const argb,
 848                                           int ix, int xsize) {
 849   const uint32_t v = argb[ix];
 850   if (ix >= xsize + 3) {
 851     if (v == argb[ix - xsize] &&
 852         argb[ix - 1] == argb[ix - xsize - 1] &&
 853         argb[ix - 2] == argb[ix - xsize - 2] &&
 854         argb[ix - 3] == argb[ix - xsize - 3]) {
 855       return 1;
 856     }
 857     return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
 858   } else if (ix >= 3) {
 859     return v == argb[ix - 3] && v == argb[ix - 2] && v == argb[ix - 1];
 860   }
 861   return 0;
 862 }
 863
 864 static float PredictionCostCrossColor(const int accumulated[256],
 865                                       const int counts[256]) {
 866   // Favor low entropy, locally and globally.
 867   // Favor small absolute values for PredictionCostSpatial
 868   static const double kExpValue = 2.4;
 869   return CombinedShannonEntropy(counts, accumulated, 256) +
 870          PredictionCostSpatial(counts, 3, kExpValue);
 871 }
 872
 873 static Multipliers GetBestColorTransformForTile(
 874     int tile_x, int tile_y, int bits,
 875     Multipliers prevX,
 876     Multipliers prevY,
 877     int step, int xsize, int ysize,
 878     int* accumulated_red_histo,
 879     int* accumulated_blue_histo,
 880     const uint32_t* const argb) {
 881   float best_diff = MAX_DIFF_COST;
 882   float cur_diff;
 883   const int halfstep = step / 2;
 884   const int max_tile_size = 1 << bits;
 885   const int tile_y_offset = tile_y * max_tile_size;
 886   const int tile_x_offset = tile_x * max_tile_size;
 887   int green_to_red;
 888   int green_to_blue;
 889   int red_to_blue;
 890   int all_x_max = tile_x_offset + max_tile_size;
 891   int all_y_max = tile_y_offset + max_tile_size;
 892   Multipliers best_tx;
 893   MultipliersClear(&best_tx);
 894   if (all_x_max > xsize) {
 895     all_x_max = xsize;
 896   }
 897   if (all_y_max > ysize) {
 898     all_y_max = ysize;
 899   }
 900
 901   for (green_to_red = -64; green_to_red <= 64; green_to_red += halfstep) {
 902     int histo[256] = { 0 };
 903     int all_y;
 904
 905     for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
 906       int ix = all_y * xsize + tile_x_offset;
 907       int all_x;
 908       for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
 909         if (SkipRepeatedPixels(argb, ix, xsize)) {
 910           continue;
 911         }
 912         ++histo[TransformColorRed(green_to_red, argb[ix])];  // red.
 913       }
 914     }
 915     cur_diff = PredictionCostCrossColor(&accumulated_red_histo[0], &histo[0]);
 916     if ((uint8_t)green_to_red == prevX.green_to_red_) {
 917       cur_diff -= 3;  // favor keeping the areas locally similar
 918     }
 919     if ((uint8_t)green_to_red == prevY.green_to_red_) {
 920       cur_diff -= 3;  // favor keeping the areas locally similar
 921     }
 922     if (green_to_red == 0) {
 923       cur_diff -= 3;
 924     }
 925     if (cur_diff < best_diff) {
 926       best_diff = cur_diff;
 927       best_tx.green_to_red_ = green_to_red;
 928     }
 929   }
 930   best_diff = MAX_DIFF_COST;
 931   for (green_to_blue = -32; green_to_blue <= 32; green_to_blue += step) {
 932     for (red_to_blue = -32; red_to_blue <= 32; red_to_blue += step) {
 933       int all_y;
 934       int histo[256] = { 0 };
 935       for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
 936         int all_x;
 937         int ix = all_y * xsize + tile_x_offset;
 938         for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
 939           if (SkipRepeatedPixels(argb, ix, xsize)) {
 940             continue;
 941           }
 942           ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
 943         }
 944       }
 945       cur_diff =
 946           PredictionCostCrossColor(&accumulated_blue_histo[0], &histo[0]);
 947       if ((uint8_t)green_to_blue == prevX.green_to_blue_) {
 948         cur_diff -= 3;  // favor keeping the areas locally similar
 949       }
 950       if ((uint8_t)green_to_blue == prevY.green_to_blue_) {
 951         cur_diff -= 3;  // favor keeping the areas locally similar
 952       }
 953       if ((uint8_t)red_to_blue == prevX.red_to_blue_) {
 954         cur_diff -= 3;  // favor keeping the areas locally similar
 955       }
 956       if ((uint8_t)red_to_blue == prevY.red_to_blue_) {
 957         cur_diff -= 3;  // favor keeping the areas locally similar
 958       }
 959       if (green_to_blue == 0) {
 960         cur_diff -= 3;
 961       }
 962       if (red_to_blue == 0) {
 963         cur_diff -= 3;
 964       }
 965       if (cur_diff < best_diff) {
 966         best_diff = cur_diff;
 967         best_tx.green_to_blue_ = green_to_blue;
 968         best_tx.red_to_blue_ = red_to_blue;
 969       }
 970     }
 971   }
 972   return best_tx;
 973 }
 974
 975 static void CopyTileWithColorTransform(int xsize, int ysize,
 976                                        int tile_x, int tile_y, int bits,
 977                                        Multipliers color_transform,
 978                                        uint32_t* const argb) {
 979   int y;
 980   int xscan = 1 << bits;
 981   int yscan = 1 << bits;
 982   tile_x <<= bits;
 983   tile_y <<= bits;
 984   if (xscan > xsize - tile_x) {
 985     xscan = xsize - tile_x;
 986   }
 987   if (yscan > ysize - tile_y) {
 988     yscan = ysize - tile_y;
 989   }
 990   yscan += tile_y;
 991   for (y = tile_y; y < yscan; ++y) {
 992     int ix = y * xsize + tile_x;
 993     const int end_ix = ix + xscan;
 994     for (; ix < end_ix; ++ix) {
 995       argb[ix] = TransformColor(&color_transform, argb[ix], 0);
 996     }
 997   }
 998 }
 999
1000 void VP8LColorSpaceTransform(int width, int height, int bits, int step,
1001                              uint32_t* const argb, uint32_t* image) {
1002   const int max_tile_size = 1 << bits;
1003   int tile_xsize = VP8LSubSampleSize(width, bits);
1004   int tile_ysize = VP8LSubSampleSize(height, bits);
1005   int accumulated_red_histo[256] = { 0 };
1006   int accumulated_blue_histo[256] = { 0 };
1007   int tile_y;
1008   int tile_x;
1009   Multipliers prevX;
1010   Multipliers prevY;
1011   MultipliersClear(&prevY);
1012   MultipliersClear(&prevX);
1013   for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
1014     for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
1015       Multipliers color_transform;
1016       int all_x_max;
1017       int y;
1018       const int tile_y_offset = tile_y * max_tile_size;
1019       const int tile_x_offset = tile_x * max_tile_size;
1020       if (tile_y != 0) {
1021         ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
1022         ColorCodeToMultipliers(image[(tile_y - 1) * tile_xsize + tile_x],
1023                                &prevY);
1024       } else if (tile_x != 0) {
1025         ColorCodeToMultipliers(image[tile_y * tile_xsize + tile_x - 1], &prevX);
1026       }
1027       color_transform =
1028           GetBestColorTransformForTile(tile_x, tile_y, bits,
1029                                        prevX, prevY,
1030                                        step, width, height,
1031                                        &accumulated_red_histo[0],
1032                                        &accumulated_blue_histo[0],
1033                                        argb);
1034       image[tile_y * tile_xsize + tile_x] =
1035           MultipliersToColorCode(&color_transform);
1036       CopyTileWithColorTransform(width, height, tile_x, tile_y, bits,
1037                                  color_transform, argb);
1038
1039       // Gather accumulated histogram data.
1040       all_x_max = tile_x_offset + max_tile_size;
1041       if (all_x_max > width) {
1042         all_x_max = width;
1043       }
1044       for (y = 0; y < max_tile_size; ++y) {
1045         int ix;
1046         int all_x;
1047         int all_y = tile_y_offset + y;
1048         if (all_y >= height) {
1049           break;
1050         }
1051         ix = all_y * width + tile_x_offset;
1052         for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
1053           if (ix >= 2 &&
1054               argb[ix] == argb[ix - 2] &&
1055               argb[ix] == argb[ix - 1]) {
1056             continue;  // repeated pixels are handled by backward references
1057           }
1058           if (ix >= width + 2 &&
1059               argb[ix - 2] == argb[ix - width - 2] &&
1060               argb[ix - 1] == argb[ix - width - 1] &&
1061               argb[ix] == argb[ix - width]) {
1062             continue;  // repeated pixels are handled by backward references
1063           }
1064           ++accumulated_red_histo[(argb[ix] >> 16) & 0xff];
1065           ++accumulated_blue_histo[argb[ix] & 0xff];
1066         }
1067       }
1068     }
1069   }
1070 }
1071
1072 // Color space inverse transform.
1073 static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
1074                                        int y_start, int y_end, uint32_t* data) {
1075   const int width = transform->xsize_;
1076   const int mask = (1 << transform->bits_) - 1;
1077   const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
1078   int y = y_start;
1079   const uint32_t* pred_row =
1080       transform->data_ + (y >> transform->bits_) * tiles_per_row;
1081
1082   while (y < y_end) {
1083     const uint32_t* pred = pred_row;
1084     Multipliers m = { 0, 0, 0 };
1085     int x;
1086
1087     for (x = 0; x < width; ++x) {
1088       if ((x & mask) == 0) ColorCodeToMultipliers(*pred++, &m);
1089       data[x] = TransformColor(&m, data[x], 1);
1090     }
1091     data += width;
1092     ++y;
1093     if ((y & mask) == 0) pred_row += tiles_per_row;;
1094   }
1095 }
1096
1097 // Separate out pixels packed together using pixel-bundling.
1098 // We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
1099 #define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE)             \
1100 void FUNC_NAME(const VP8LTransform* const transform,                           \
1101                int y_start, int y_end, const TYPE* src, TYPE* dst) {           \
1102   int y;                                                                       \
1103   const int bits_per_pixel = 8 >> transform->bits_;                            \
1104   const int width = transform->xsize_;                                         \
1105   const uint32_t* const color_map = transform->data_;                          \
1106   if (bits_per_pixel < 8) {                                                    \
1107     const int pixels_per_byte = 1 << transform->bits_;                         \
1108     const int count_mask = pixels_per_byte - 1;                                \
1109     const uint32_t bit_mask = (1 << bits_per_pixel) - 1;                       \
1110     for (y = y_start; y < y_end; ++y) {                                        \
1111       uint32_t packed_pixels = 0;                                              \
1112       int x;                                                                   \
1113       for (x = 0; x < width; ++x) {                                            \
1114         /* We need to load fresh 'packed_pixels' once every                */  \
1115         /* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */  \
1116         /* is a power of 2, so can just use a mask for that, instead of    */  \
1117         /* decrementing a counter.                                         */  \
1118         if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++);          \
1119         *dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]);               \
1120         packed_pixels >>= bits_per_pixel;                                      \
1121       }                                                                        \
1122     }                                                                          \
1123   } else {                                                                     \
1124     for (y = y_start; y < y_end; ++y) {                                        \
1125       int x;                                                                   \
1126       for (x = 0; x < width; ++x) {                                            \
1127         *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]);                      \
1128       }                                                                        \
1129     }                                                                          \
1130   }                                                                            \
1131 }
1132
1133 static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
1134   return (idx >> 8) & 0xff;
1135 }
1136
1137 static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
1138   return idx;
1139 }
1140
1141 static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
1142   return val;
1143 }
1144
1145 static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
1146   return (val >> 8) & 0xff;
1147 }
1148
1149 static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
1150                            GetARGBValue)
1151 COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
1152                     GetAlphaValue)
1153
1154 #undef COLOR_INDEX_INVERSE
1155
1156 void VP8LInverseTransform(const VP8LTransform* const transform,
1157                           int row_start, int row_end,
1158                           const uint32_t* const in, uint32_t* const out) {
1159   assert(row_start < row_end);
1160   assert(row_end <= transform->ysize_);
1161   switch (transform->type_) {
1162     case SUBTRACT_GREEN:
1163       AddGreenToBlueAndRed(transform, row_start, row_end, out);
1164       break;
1165     case PREDICTOR_TRANSFORM:
1166       PredictorInverseTransform(transform, row_start, row_end, out);
1167       if (row_end != transform->ysize_) {
1168         // The last predicted row in this iteration will be the top-pred row
1169         // for the first row in next iteration.
1170         const int width = transform->xsize_;
1171         memcpy(out - width, out + (row_end - row_start - 1) * width,
1172                width * sizeof(*out));
1173       }
1174       break;
1175     case CROSS_COLOR_TRANSFORM:
1176       ColorSpaceInverseTransform(transform, row_start, row_end, out);
1177       break;
1178     case COLOR_INDEXING_TRANSFORM:
1179       if (in == out && transform->bits_ > 0) {
1180         // Move packed pixels to the end of unpacked region, so that unpacking
1181         // can occur seamlessly.
1182         // Also, note that this is the only transform that applies on
1183         // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
1184         // transforms work on effective width of xsize_.
1185         const int out_stride = (row_end - row_start) * transform->xsize_;
1186         const int in_stride = (row_end - row_start) *
1187             VP8LSubSampleSize(transform->xsize_, transform->bits_);
1188         uint32_t* const src = out + out_stride - in_stride;
1189         memmove(src, out, in_stride * sizeof(*src));
1190         ColorIndexInverseTransform(transform, row_start, row_end, src, out);
1191       } else {
1192         ColorIndexInverseTransform(transform, row_start, row_end, in, out);
1193       }
1194       break;
1195   }
1196 }
1197
1198 //------------------------------------------------------------------------------
1199 // Color space conversion.
1200
1201 static int is_big_endian(void) {
1202   static const union {
1203     uint16_t w;
1204     uint8_t b[2];
1205   } tmp = { 1 };
1206   return (tmp.b[0] != 1);
1207 }
1208
1209 static void ConvertBGRAToRGB(const uint32_t* src,
1210                              int num_pixels, uint8_t* dst) {
1211   const uint32_t* const src_end = src + num_pixels;
1212   while (src < src_end) {
1213     const uint32_t argb = *src++;
1214     *dst++ = (argb >> 16) & 0xff;
1215     *dst++ = (argb >>  8) & 0xff;
1216     *dst++ = (argb >>  0) & 0xff;
1217   }
1218 }
1219
1220 static void ConvertBGRAToRGBA(const uint32_t* src,
1221                               int num_pixels, uint8_t* dst) {
1222   const uint32_t* const src_end = src + num_pixels;
1223   while (src < src_end) {
1224     const uint32_t argb = *src++;
1225     *dst++ = (argb >> 16) & 0xff;
1226     *dst++ = (argb >>  8) & 0xff;
1227     *dst++ = (argb >>  0) & 0xff;
1228     *dst++ = (argb >> 24) & 0xff;
1229   }
1230 }
1231
1232 static void ConvertBGRAToRGBA4444(const uint32_t* src,
1233                                   int num_pixels, uint8_t* dst) {
1234   const uint32_t* const src_end = src + num_pixels;
1235   while (src < src_end) {
1236     const uint32_t argb = *src++;
1237     const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
1238     const uint8_t ba = ((argb >>  0) & 0xf0) | ((argb >> 28) & 0xf);
1239 #ifdef WEBP_SWAP_16BIT_CSP
1240     *dst++ = ba;
1241     *dst++ = rg;
1242 #else
1243     *dst++ = rg;
1244     *dst++ = ba;
1245 #endif
1246   }
1247 }
1248
1249 static void ConvertBGRAToRGB565(const uint32_t* src,
1250                                 int num_pixels, uint8_t* dst) {
1251   const uint32_t* const src_end = src + num_pixels;
1252   while (src < src_end) {
1253     const uint32_t argb = *src++;
1254     const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
1255     const uint8_t gb = ((argb >>  5) & 0xe0) | ((argb >>  3) & 0x1f);
1256 #ifdef WEBP_SWAP_16BIT_CSP
1257     *dst++ = gb;
1258     *dst++ = rg;
1259 #else
1260     *dst++ = rg;
1261     *dst++ = gb;
1262 #endif
1263   }
1264 }
1265
1266 static void ConvertBGRAToBGR(const uint32_t* src,
1267                              int num_pixels, uint8_t* dst) {
1268   const uint32_t* const src_end = src + num_pixels;
1269   while (src < src_end) {
1270     const uint32_t argb = *src++;
1271     *dst++ = (argb >>  0) & 0xff;
1272     *dst++ = (argb >>  8) & 0xff;
1273     *dst++ = (argb >> 16) & 0xff;
1274   }
1275 }
1276
1277 static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
1278                        int swap_on_big_endian) {
1279   if (is_big_endian() == swap_on_big_endian) {
1280     const uint32_t* const src_end = src + num_pixels;
1281     while (src < src_end) {
1282       uint32_t argb = *src++;
1283
1284 #if !defined(__BIG_ENDIAN__)
1285 #if !defined(WEBP_REFERENCE_IMPLEMENTATION)
1286 #if defined(__i386__) || defined(__x86_64__)
1287       __asm__ volatile("bswap %0" : "=r"(argb) : "0"(argb));
1288       *(uint32_t*)dst = argb;
1289 #elif defined(_MSC_VER)
1290       argb = _byteswap_ulong(argb);
1291       *(uint32_t*)dst = argb;
1292 #else
1293       dst[0] = (argb >> 24) & 0xff;
1294       dst[1] = (argb >> 16) & 0xff;
1295       dst[2] = (argb >>  8) & 0xff;
1296       dst[3] = (argb >>  0) & 0xff;
1297 #endif
1298 #else  // WEBP_REFERENCE_IMPLEMENTATION
1299       dst[0] = (argb >> 24) & 0xff;
1300       dst[1] = (argb >> 16) & 0xff;
1301       dst[2] = (argb >>  8) & 0xff;
1302       dst[3] = (argb >>  0) & 0xff;
1303 #endif
1304 #else  // __BIG_ENDIAN__
1305       dst[0] = (argb >>  0) & 0xff;
1306       dst[1] = (argb >>  8) & 0xff;
1307       dst[2] = (argb >> 16) & 0xff;
1308       dst[3] = (argb >> 24) & 0xff;
1309 #endif
1310       dst += sizeof(argb);
1311     }
1312   } else {
1313     memcpy(dst, src, num_pixels * sizeof(*src));
1314   }
1315 }
1316
1317 void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
1318                          WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
1319   switch (out_colorspace) {
1320     case MODE_RGB:
1321       ConvertBGRAToRGB(in_data, num_pixels, rgba);
1322       break;
1323     case MODE_RGBA:
1324       ConvertBGRAToRGBA(in_data, num_pixels, rgba);
1325       break;
1326     case MODE_rgbA:
1327       ConvertBGRAToRGBA(in_data, num_pixels, rgba);
1328       WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1329       break;
1330     case MODE_BGR:
1331       ConvertBGRAToBGR(in_data, num_pixels, rgba);
1332       break;
1333     case MODE_BGRA:
1334       CopyOrSwap(in_data, num_pixels, rgba, 1);
1335       break;
1336     case MODE_bgrA:
1337       CopyOrSwap(in_data, num_pixels, rgba, 1);
1338       WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
1339       break;
1340     case MODE_ARGB:
1341       CopyOrSwap(in_data, num_pixels, rgba, 0);
1342       break;
1343     case MODE_Argb:
1344       CopyOrSwap(in_data, num_pixels, rgba, 0);
1345       WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
1346       break;
1347     case MODE_RGBA_4444:
1348       ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1349       break;
1350     case MODE_rgbA_4444:
1351       ConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
1352       WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
1353       break;
1354     case MODE_RGB_565:
1355       ConvertBGRAToRGB565(in_data, num_pixels, rgba);
1356       break;
1357     default:
1358       assert(0);          // Code flow should not reach here.
1359   }
1360 }
1361
1362 // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
1363 void VP8LBundleColorMap(const uint8_t* const row, int width,
1364                         int xbits, uint32_t* const dst) {
1365   int x;
1366   if (xbits > 0) {
1367     const int bit_depth = 1 << (3 - xbits);
1368     const int mask = (1 << xbits) - 1;
1369     uint32_t code = 0xff000000;
1370     for (x = 0; x < width; ++x) {
1371       const int xsub = x & mask;
1372       if (xsub == 0) {
1373         code = 0xff000000;
1374       }
1375       code |= row[x] << (8 + bit_depth * xsub);
1376       dst[x >> xbits] = code;
1377     }
1378   } else {
1379     for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
1380   }
1381 }
1382
1383 //------------------------------------------------------------------------------
1384
1385 #if defined(__cplusplus) || defined(c_plusplus)
1386 }    // extern "C"
1387 #endif