media/base/simd/convert_rgb_to_yuv_sse2.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
#include <string.h>

#include "build/build_config.h"
#include "media/base/simd/convert_rgb_to_yuv.h"

#if defined(COMPILER_MSVC)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif
  14
// SIMD_ALIGNED(var) declares |var| with 16-byte alignment, using whichever
// syntax the compiler understands. The constant tables in this file are
// declared with it because they are read with the aligned load
// _mm_load_si128.
#if defined(COMPILER_MSVC)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#else
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#endif
  20
  21 namespace media {
  22
// Coefficients are stored as signed fixed-point numbers with FIX_SHIFT
// fractional bits. FIX(x) scales a real-valued coefficient by 2^FIX_SHIFT.
#define FIX_SHIFT 12
#define FIX(x) ((x) * (1 << FIX_SHIFT))

// Define a convenient macro to do static cast. The cast to int16 discards
// the fractional part of the scaled coefficient.
#define INT16_FIX(x) static_cast<int16>(FIX(x))

// Conversion coefficients, laid out as three rows of eight int16 values:
// entries [0, 8) are the Y weights, [8, 16) the U weights and [16, 24) the V
// weights. Within a row the same four weights appear twice, one weight per
// byte of a 32-bit pixel (the fourth byte gets weight 0), so a single
// 128-bit load supplies the weights for two pixels.
//
// Android's pixel layout is RGBA, while other platforms
// are BGRA, so the weights are listed in the matching byte order.
#if defined(OS_ANDROID)
SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
  INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
  INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
  -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
  -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
  INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
  INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
};
#else
SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
  INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
  INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
  INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
  INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
  -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
  -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
};
#endif

// INT16_FIX is only needed to build the table above.
#undef INT16_FIX
  52
// This is the final offset for the conversion from signed Y values to
// unsigned values: 16 in each of the four 32-bit lanes, one lane per pixel.
// The offset of 128 for the U and V components is not stored separately; the
// SSE2 row converter derives it from this constant by shifting every lane
// left by 3 (16 << 3 == 128).
SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
  57
  58 static inline uint8 Clamp(int value) {
  59   if (value < 0)
  60     return 0;
  61   if (value > 255)
  62     return 255;
  63   return static_cast<uint8>(value);
  64 }
  65
  66 static inline uint8 RGBToY(int r, int g, int b) {
  67   int y = ConvertRGBAToYUV_kTable[0] * b +
  68       ConvertRGBAToYUV_kTable[1] * g +
  69       ConvertRGBAToYUV_kTable[2] * r;
  70   y >>= FIX_SHIFT;
  71   return Clamp(y + 16);
  72 }
  73
  74 static inline uint8 RGBToU(int r, int g, int b, int shift) {
  75   int u = ConvertRGBAToYUV_kTable[8] * b +
  76       ConvertRGBAToYUV_kTable[9] * g +
  77       ConvertRGBAToYUV_kTable[10] * r;
  78   u >>= FIX_SHIFT + shift;
  79   return Clamp(u + 128);
  80 }
  81
  82 static inline uint8 RGBToV(int r, int g, int b, int shift) {
  83   int v = ConvertRGBAToYUV_kTable[16] * b +
  84       ConvertRGBAToYUV_kTable[17] * g +
  85       ConvertRGBAToYUV_kTable[18] * r;
  86   v >>= FIX_SHIFT + shift;
  87   return Clamp(v + 128);
  88 }
  89
  90 #define CONVERT_Y(rgb_buf, y_buf) \
  91   b = *rgb_buf++; \
  92   g = *rgb_buf++; \
  93   r = *rgb_buf++; \
  94   ++rgb_buf;      \
  95   sum_b += b;     \
  96   sum_g += g;     \
  97   sum_r += r;     \
  98   *y_buf++ = RGBToY(r, g, b);
  99
 100 static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
 101                                         const uint8* rgb_buf_2,
 102                                         uint8* y_buf_1,
 103                                         uint8* y_buf_2,
 104                                         uint8* u_buf,
 105                                         uint8* v_buf) {
 106   int sum_b = 0;
 107   int sum_g = 0;
 108   int sum_r = 0;
 109   int r, g, b;
 110
 111
 112
 113   CONVERT_Y(rgb_buf_1, y_buf_1);
 114   CONVERT_Y(rgb_buf_1, y_buf_1);
 115   CONVERT_Y(rgb_buf_2, y_buf_2);
 116   CONVERT_Y(rgb_buf_2, y_buf_2);
 117   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
 118   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
 119 }
 120
 121 static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
 122                                         const uint8* rgb_buf_2,
 123                                         uint8* y_buf_1,
 124                                         uint8* y_buf_2,
 125                                         uint8* u_buf,
 126                                         uint8* v_buf) {
 127   int sum_b = 0;
 128   int sum_g = 0;
 129   int sum_r = 0;
 130   int r, g, b;
 131
 132   CONVERT_Y(rgb_buf_1, y_buf_1);
 133   CONVERT_Y(rgb_buf_2, y_buf_2);
 134   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
 135   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
 136 }
 137
 138 static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
 139                                        uint8* y_buf,
 140                                        uint8* u_buf,
 141                                        uint8* v_buf) {
 142   int sum_b = 0;
 143   int sum_g = 0;
 144   int sum_r = 0;
 145   int r, g, b;
 146
 147   CONVERT_Y(rgb_buf, y_buf);
 148   CONVERT_Y(rgb_buf, y_buf);
 149   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
 150   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
 151 }
 152
 153 static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
 154                                        uint8* y_buf,
 155                                        uint8* u_buf,
 156                                        uint8* v_buf) {
 157   int sum_b = 0;
 158   int sum_g = 0;
 159   int sum_r = 0;
 160   int r, g, b;
 161
 162   CONVERT_Y(rgb_buf, y_buf);
 163   *u_buf++ = RGBToU(r, g, b, 0);
 164   *v_buf++ = RGBToV(r, g, b, 0);
 165 }
 166
 167 static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
 168                                       const uint8* rgb_buf_2,
 169                                       uint8* y_buf_1,
 170                                       uint8* y_buf_2,
 171                                       uint8* u_buf,
 172                                       uint8* v_buf,
 173                                       int width) {
 174   while (width >= 4) {
 175     // Name for the Y pixels:
 176     // Row 1: a b c d
 177     // Row 2: e f g h
 178     //
 179     // First row 4 pixels.
 180     __m128i rgb_row_1 = _mm_loadu_si128(
 181         reinterpret_cast<const __m128i*>(rgb_buf_1));
 182     __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
 183
 184     __m128i y_table = _mm_load_si128(
 185         reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
 186
 187     __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
 188     rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
 189
 190     __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
 191     rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
 192
 193     // Do a crazh shuffle so that we get:
 194     //  v------------ Multiply Add
 195     // BG: a b c d
 196     // A0: a b c d
 197     __m128i bg_abcd = _mm_castps_si128(
 198         _mm_shuffle_ps(
 199             _mm_castsi128_ps(rgb_c_d),
 200             _mm_castsi128_ps(rgb_a_b),
 201             (3 << 6) | (1 << 4) | (3 << 2) | 1));
 202     __m128i r_abcd = _mm_castps_si128(
 203         _mm_shuffle_ps(
 204             _mm_castsi128_ps(rgb_c_d),
 205             _mm_castsi128_ps(rgb_a_b),
 206             (2 << 6) | (2 << 2)));
 207     __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
 208
 209     // Down shift back to 8bits range.
 210     __m128i y_offset = _mm_load_si128(
 211         reinterpret_cast<const __m128i*>(kYOffset));
 212     y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
 213     y_abcd = _mm_add_epi32(y_abcd, y_offset);
 214     y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
 215     y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
 216     *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
 217     y_buf_1 += 4;
 218
 219     // Second row 4 pixels.
 220     __m128i rgb_row_2 = _mm_loadu_si128(
 221         reinterpret_cast<const __m128i*>(rgb_buf_2));
 222     __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
 223     __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
 224     __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
 225
 226     // Add two rows together.
 227     __m128i rgb_ae_bf =
 228         _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
 229     __m128i rgb_cg_dh =
 230         _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
 231
 232     // Multiply add like the previous row.
 233     rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
 234     rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
 235
 236     __m128i bg_efgh = _mm_castps_si128(
 237         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
 238                        _mm_castsi128_ps(rgb_e_f),
 239                        (3 << 6) | (1 << 4) | (3 << 2) | 1));
 240     __m128i r_efgh = _mm_castps_si128(
 241         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
 242                        _mm_castsi128_ps(rgb_e_f),
 243                        (2 << 6) | (2 << 2)));
 244     __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
 245     y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
 246     y_efgh = _mm_add_epi32(y_efgh, y_offset);
 247     y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
 248     y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
 249     *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
 250     y_buf_2 += 4;
 251
 252     __m128i rgb_ae_cg = _mm_castps_si128(
 253         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
 254                        _mm_castsi128_ps(rgb_ae_bf),
 255                        (3 << 6) | (2 << 4) | (3 << 2) | 2));
 256     __m128i rgb_bf_dh = _mm_castps_si128(
 257         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
 258                        _mm_castsi128_ps(rgb_ae_bf),
 259                        (1 << 6) | (1 << 2)));
 260
 261     // This is a 2x2 subsampling for 2 pixels.
 262     __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
 263
 264     // Do a multiply add with U table.
 265     __m128i u_a_b = _mm_madd_epi16(
 266         rgb_abef_cdgh,
 267         _mm_load_si128(
 268             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
 269     u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
 270                           _mm_shuffle_epi32(u_a_b, (2 << 2)));
 271     // Right shift 14 because of 12 from fixed point and 2 from subsampling.
 272     u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
 273     __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
 274     u_a_b = _mm_add_epi32(u_a_b, uv_offset);
 275     u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
 276     u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
 277     *reinterpret_cast<uint16*>(u_buf) =
 278         static_cast<uint16>(_mm_extract_epi16(u_a_b, 0));
 279     u_buf += 2;
 280
 281     __m128i v_a_b = _mm_madd_epi16(
 282         rgb_abef_cdgh,
 283         _mm_load_si128(
 284             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
 285     v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
 286                           _mm_shuffle_epi32(v_a_b, (2 << 2)));
 287     v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
 288     v_a_b = _mm_add_epi32(v_a_b, uv_offset);
 289     v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
 290     v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
 291     *reinterpret_cast<uint16*>(v_buf) =
 292         static_cast<uint16>(_mm_extract_epi16(v_a_b, 0));
 293     v_buf += 2;
 294
 295     rgb_buf_1 += 16;
 296     rgb_buf_2 += 16;
 297
 298     // Move forward by 4 pixels.
 299     width -= 4;
 300   }
 301
 302   // Just use C code to convert the remaining pixels.
 303   if (width >= 2) {
 304     ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
 305     rgb_buf_1 += 8;
 306     rgb_buf_2 += 8;
 307     y_buf_1 += 2;
 308     y_buf_2 += 2;
 309     ++u_buf;
 310     ++v_buf;
 311     width -= 2;
 312   }
 313
 314   if (width)
 315     ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
 316 }
 317
 318 extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
 319                                    uint8* yplane,
 320                                    uint8* uplane,
 321                                    uint8* vplane,
 322                                    int width,
 323                                    int height,
 324                                    int rgbstride,
 325                                    int ystride,
 326                                    int uvstride) {
 327   while (height >= 2) {
 328     ConvertRGB32ToYUVRow_SSE2(rgbframe,
 329                               rgbframe + rgbstride,
 330                               yplane,
 331                               yplane + ystride,
 332                               uplane,
 333                               vplane,
 334                               width);
 335     rgbframe += 2 * rgbstride;
 336     yplane += 2 * ystride;
 337     uplane += uvstride;
 338     vplane += uvstride;
 339     height -= 2;
 340   }
 341
 342   if (!height)
 343     return;
 344
 345   // Handle the last row.
 346   while (width >= 2) {
 347     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
 348     rgbframe += 8;
 349     yplane += 2;
 350     ++uplane;
 351     ++vplane;
 352     width -= 2;
 353   }
 354
 355   if (width)
 356     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
 357 }
 358
 359 void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
 360                                       uint8* yplane,
 361                                       uint8* uplane,
 362                                       uint8* vplane,
 363                                       int width,
 364                                       int height,
 365                                       int rgbstride,
 366                                       int ystride,
 367                                       int uvstride) {
 368   while (height >= 2) {
 369     int i = 0;
 370
 371     // Convert a 2x2 block.
 372     while (i + 2 <= width) {
 373       ConvertRGBToYUV_V2H2(rgbframe + i * 4,
 374                            rgbframe + rgbstride + i * 4,
 375                            yplane + i,
 376                            yplane + ystride + i,
 377                            uplane + i / 2,
 378                            vplane + i / 2);
 379       i += 2;
 380     }
 381
 382     // Convert the last pixel of two rows.
 383     if (i < width) {
 384       ConvertRGBToYUV_V2H1(rgbframe + i * 4,
 385                            rgbframe + rgbstride + i * 4,
 386                            yplane + i,
 387                            yplane + ystride + i,
 388                            uplane + i / 2,
 389                            vplane + i / 2);
 390     }
 391
 392     rgbframe += 2 * rgbstride;
 393     yplane += 2 * ystride;
 394     uplane += uvstride;
 395     vplane += uvstride;
 396     height -= 2;
 397   }
 398
 399   if (!height)
 400     return;
 401
 402   // Handle the last row.
 403   while (width >= 2) {
 404     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
 405     rgbframe += 8;
 406     yplane += 2;
 407     ++uplane;
 408     ++vplane;
 409     width -= 2;
 410   }
 411
 412   // Handle the last pixel in the last row.
 413   if (width)
 414     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
 415 }
 416
 417 }  // namespace media