media/base/simd/convert_rgb_to_yuv_sse2.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
#include <string.h>

#include "build/build_config.h"
#include "media/base/simd/convert_rgb_to_yuv.h"
#include "media/base/simd/yuv_to_rgb_table.h"

#if defined(COMPILER_MSVC)
#include <intrin.h>
#else
#include <mmintrin.h>
#include <emmintrin.h>
#endif
  15
  16 namespace media {
  17
  18 #define FIX_SHIFT 12
  19 #define FIX(x) ((x) * (1 << FIX_SHIFT))
  20
  21 // Define a convenient macro to do static cast.
  22 #define INT16_FIX(x) static_cast<int16>(FIX(x))
  23
  24 // Android's pixel layout is RGBA, while other platforms
  25 // are BGRA.
  26 #if defined(OS_ANDROID)
  27 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
  28   INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
  29   INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0,
  30   -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
  31   -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0,
  32   INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
  33   INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0,
  34 };
  35 #else
  36 SIMD_ALIGNED(const int16 ConvertRGBAToYUV_kTable[8 * 3]) = {
  37   INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
  38   INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0,
  39   INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
  40   INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0,
  41   -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
  42   -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0,
  43 };
  44 #endif
  45
  46 #undef INT16_FIX
  47
  48 // This is the final offset for the conversion from signed yuv values to
  49 // unsigned values. It is arranged so that offset of 16 is applied to Y
  50 // components and 128 is added to UV components for 2 pixels.
  51 SIMD_ALIGNED(const int32 kYOffset[4]) = {16, 16, 16, 16};
  52
  53 static inline int Clamp(int value) {
  54   if (value < 0)
  55     return 0;
  56   if (value > 255)
  57     return 255;
  58   return value;
  59 }
  60
  61 static inline int RGBToY(int r, int g, int b) {
  62   int y = ConvertRGBAToYUV_kTable[0] * b +
  63       ConvertRGBAToYUV_kTable[1] * g +
  64       ConvertRGBAToYUV_kTable[2] * r;
  65   y >>= FIX_SHIFT;
  66   return Clamp(y + 16);
  67 }
  68
  69 static inline int RGBToU(int r, int g, int b, int shift) {
  70   int u = ConvertRGBAToYUV_kTable[8] * b +
  71       ConvertRGBAToYUV_kTable[9] * g +
  72       ConvertRGBAToYUV_kTable[10] * r;
  73   u >>= FIX_SHIFT + shift;
  74   return Clamp(u + 128);
  75 }
  76
  77 static inline int RGBToV(int r, int g, int b, int shift) {
  78   int v = ConvertRGBAToYUV_kTable[16] * b +
  79       ConvertRGBAToYUV_kTable[17] * g +
  80       ConvertRGBAToYUV_kTable[18] * r;
  81   v >>= FIX_SHIFT + shift;
  82   return Clamp(v + 128);
  83 }
  84
  85 #define CONVERT_Y(rgb_buf, y_buf) \
  86   b = *rgb_buf++; \
  87   g = *rgb_buf++; \
  88   r = *rgb_buf++; \
  89   ++rgb_buf;      \
  90   sum_b += b;     \
  91   sum_g += g;     \
  92   sum_r += r;     \
  93   *y_buf++ = RGBToY(r, g, b);
  94
  95 static inline void ConvertRGBToYUV_V2H2(const uint8* rgb_buf_1,
  96                                         const uint8* rgb_buf_2,
  97                                         uint8* y_buf_1,
  98                                         uint8* y_buf_2,
  99                                         uint8* u_buf,
 100                                         uint8* v_buf) {
 101   int sum_b = 0;
 102   int sum_g = 0;
 103   int sum_r = 0;
 104   int r, g, b;
 105
 106
 107
 108   CONVERT_Y(rgb_buf_1, y_buf_1);
 109   CONVERT_Y(rgb_buf_1, y_buf_1);
 110   CONVERT_Y(rgb_buf_2, y_buf_2);
 111   CONVERT_Y(rgb_buf_2, y_buf_2);
 112   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2);
 113   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2);
 114 }
 115
 116 static inline void ConvertRGBToYUV_V2H1(const uint8* rgb_buf_1,
 117                                         const uint8* rgb_buf_2,
 118                                         uint8* y_buf_1,
 119                                         uint8* y_buf_2,
 120                                         uint8* u_buf,
 121                                         uint8* v_buf) {
 122   int sum_b = 0;
 123   int sum_g = 0;
 124   int sum_r = 0;
 125   int r, g, b;
 126
 127   CONVERT_Y(rgb_buf_1, y_buf_1);
 128   CONVERT_Y(rgb_buf_2, y_buf_2);
 129   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
 130   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
 131 }
 132
 133 static inline void ConvertRGBToYUV_V1H2(const uint8* rgb_buf,
 134                                        uint8* y_buf,
 135                                        uint8* u_buf,
 136                                        uint8* v_buf) {
 137   int sum_b = 0;
 138   int sum_g = 0;
 139   int sum_r = 0;
 140   int r, g, b;
 141
 142   CONVERT_Y(rgb_buf, y_buf);
 143   CONVERT_Y(rgb_buf, y_buf);
 144   *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1);
 145   *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1);
 146 }
 147
 148 static inline void ConvertRGBToYUV_V1H1(const uint8* rgb_buf,
 149                                        uint8* y_buf,
 150                                        uint8* u_buf,
 151                                        uint8* v_buf) {
 152   int sum_b = 0;
 153   int sum_g = 0;
 154   int sum_r = 0;
 155   int r, g, b;
 156
 157   CONVERT_Y(rgb_buf, y_buf);
 158   *u_buf++ = RGBToU(r, g, b, 0);
 159   *v_buf++ = RGBToV(r, g, b, 0);
 160 }
 161
 162 static void ConvertRGB32ToYUVRow_SSE2(const uint8* rgb_buf_1,
 163                                       const uint8* rgb_buf_2,
 164                                       uint8* y_buf_1,
 165                                       uint8* y_buf_2,
 166                                       uint8* u_buf,
 167                                       uint8* v_buf,
 168                                       int width) {
 169   while (width >= 4) {
 170     // Name for the Y pixels:
 171     // Row 1: a b c d
 172     // Row 2: e f g h
 173     //
 174     // First row 4 pixels.
 175     __m128i rgb_row_1 = _mm_loadu_si128(
 176         reinterpret_cast<const __m128i*>(rgb_buf_1));
 177     __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1);
 178
 179     __m128i y_table = _mm_load_si128(
 180         reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable));
 181
 182     __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1);
 183     rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table);
 184
 185     __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1);
 186     rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table);
 187
 188     // Do a crazh shuffle so that we get:
 189     //  v------------ Multiply Add
 190     // BG: a b c d
 191     // A0: a b c d
 192     __m128i bg_abcd = _mm_castps_si128(
 193         _mm_shuffle_ps(
 194             _mm_castsi128_ps(rgb_c_d),
 195             _mm_castsi128_ps(rgb_a_b),
 196             (3 << 6) | (1 << 4) | (3 << 2) | 1));
 197     __m128i r_abcd = _mm_castps_si128(
 198         _mm_shuffle_ps(
 199             _mm_castsi128_ps(rgb_c_d),
 200             _mm_castsi128_ps(rgb_a_b),
 201             (2 << 6) | (2 << 2)));
 202     __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd);
 203
 204     // Down shift back to 8bits range.
 205     __m128i y_offset = _mm_load_si128(
 206         reinterpret_cast<const __m128i*>(kYOffset));
 207     y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT);
 208     y_abcd = _mm_add_epi32(y_abcd, y_offset);
 209     y_abcd = _mm_packs_epi32(y_abcd, y_abcd);
 210     y_abcd = _mm_packus_epi16(y_abcd, y_abcd);
 211     *reinterpret_cast<uint32*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd);
 212     y_buf_1 += 4;
 213
 214     // Second row 4 pixels.
 215     __m128i rgb_row_2 = _mm_loadu_si128(
 216         reinterpret_cast<const __m128i*>(rgb_buf_2));
 217     __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2);
 218     __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2);
 219     __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2);
 220
 221     // Add two rows together.
 222     __m128i rgb_ae_bf =
 223         _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f);
 224     __m128i rgb_cg_dh =
 225         _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h);
 226
 227     // Multiply add like the previous row.
 228     rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table);
 229     rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table);
 230
 231     __m128i bg_efgh = _mm_castps_si128(
 232         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
 233                        _mm_castsi128_ps(rgb_e_f),
 234                        (3 << 6) | (1 << 4) | (3 << 2) | 1));
 235     __m128i r_efgh = _mm_castps_si128(
 236         _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h),
 237                        _mm_castsi128_ps(rgb_e_f),
 238                        (2 << 6) | (2 << 2)));
 239     __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh);
 240     y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT);
 241     y_efgh = _mm_add_epi32(y_efgh, y_offset);
 242     y_efgh = _mm_packs_epi32(y_efgh, y_efgh);
 243     y_efgh = _mm_packus_epi16(y_efgh, y_efgh);
 244     *reinterpret_cast<uint32*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh);
 245     y_buf_2 += 4;
 246
 247     __m128i rgb_ae_cg = _mm_castps_si128(
 248         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
 249                        _mm_castsi128_ps(rgb_ae_bf),
 250                        (3 << 6) | (2 << 4) | (3 << 2) | 2));
 251     __m128i rgb_bf_dh = _mm_castps_si128(
 252         _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh),
 253                        _mm_castsi128_ps(rgb_ae_bf),
 254                        (1 << 6) | (1 << 2)));
 255
 256     // This is a 2x2 subsampling for 2 pixels.
 257     __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh);
 258
 259     // Do a multiply add with U table.
 260     __m128i u_a_b = _mm_madd_epi16(
 261         rgb_abef_cdgh,
 262         _mm_load_si128(
 263             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8)));
 264     u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)),
 265                           _mm_shuffle_epi32(u_a_b, (2 << 2)));
 266     // Right shift 14 because of 12 from fixed point and 2 from subsampling.
 267     u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2);
 268     __m128i uv_offset = _mm_slli_epi32(y_offset, 3);
 269     u_a_b = _mm_add_epi32(u_a_b, uv_offset);
 270     u_a_b = _mm_packs_epi32(u_a_b, u_a_b);
 271     u_a_b = _mm_packus_epi16(u_a_b, u_a_b);
 272     *reinterpret_cast<uint16*>(u_buf) = _mm_extract_epi16(u_a_b, 0);
 273     u_buf += 2;
 274
 275     __m128i v_a_b = _mm_madd_epi16(
 276         rgb_abef_cdgh,
 277         _mm_load_si128(
 278             reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16)));
 279     v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)),
 280                           _mm_shuffle_epi32(v_a_b, (2 << 2)));
 281     v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2);
 282     v_a_b = _mm_add_epi32(v_a_b, uv_offset);
 283     v_a_b = _mm_packs_epi32(v_a_b, v_a_b);
 284     v_a_b = _mm_packus_epi16(v_a_b, v_a_b);
 285     *reinterpret_cast<uint16*>(v_buf) = _mm_extract_epi16(v_a_b, 0);
 286     v_buf += 2;
 287
 288     rgb_buf_1 += 16;
 289     rgb_buf_2 += 16;
 290
 291     // Move forward by 4 pixels.
 292     width -= 4;
 293   }
 294
 295   // Just use C code to convert the remaining pixels.
 296   if (width >= 2) {
 297     ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
 298     rgb_buf_1 += 8;
 299     rgb_buf_2 += 8;
 300     y_buf_1 += 2;
 301     y_buf_2 += 2;
 302     ++u_buf;
 303     ++v_buf;
 304     width -= 2;
 305   }
 306
 307   if (width)
 308     ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf);
 309 }
 310
 311 extern void ConvertRGB32ToYUV_SSE2(const uint8* rgbframe,
 312                                    uint8* yplane,
 313                                    uint8* uplane,
 314                                    uint8* vplane,
 315                                    int width,
 316                                    int height,
 317                                    int rgbstride,
 318                                    int ystride,
 319                                    int uvstride) {
 320   while (height >= 2) {
 321     ConvertRGB32ToYUVRow_SSE2(rgbframe,
 322                               rgbframe + rgbstride,
 323                               yplane,
 324                               yplane + ystride,
 325                               uplane,
 326                               vplane,
 327                               width);
 328     rgbframe += 2 * rgbstride;
 329     yplane += 2 * ystride;
 330     uplane += uvstride;
 331     vplane += uvstride;
 332     height -= 2;
 333   }
 334
 335   if (!height)
 336     return;
 337
 338   // Handle the last row.
 339   while (width >= 2) {
 340     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
 341     rgbframe += 8;
 342     yplane += 2;
 343     ++uplane;
 344     ++vplane;
 345     width -= 2;
 346   }
 347
 348   if (width)
 349     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
 350 }
 351
 352 void ConvertRGB32ToYUV_SSE2_Reference(const uint8* rgbframe,
 353                                       uint8* yplane,
 354                                       uint8* uplane,
 355                                       uint8* vplane,
 356                                       int width,
 357                                       int height,
 358                                       int rgbstride,
 359                                       int ystride,
 360                                       int uvstride) {
 361   while (height >= 2) {
 362     int i = 0;
 363
 364     // Convert a 2x2 block.
 365     while (i + 2 <= width) {
 366       ConvertRGBToYUV_V2H2(rgbframe + i * 4,
 367                            rgbframe + rgbstride + i * 4,
 368                            yplane + i,
 369                            yplane + ystride + i,
 370                            uplane + i / 2,
 371                            vplane + i / 2);
 372       i += 2;
 373     }
 374
 375     // Convert the last pixel of two rows.
 376     if (i < width) {
 377       ConvertRGBToYUV_V2H1(rgbframe + i * 4,
 378                            rgbframe + rgbstride + i * 4,
 379                            yplane + i,
 380                            yplane + ystride + i,
 381                            uplane + i / 2,
 382                            vplane + i / 2);
 383     }
 384
 385     rgbframe += 2 * rgbstride;
 386     yplane += 2 * ystride;
 387     uplane += uvstride;
 388     vplane += uvstride;
 389     height -= 2;
 390   }
 391
 392   if (!height)
 393     return;
 394
 395   // Handle the last row.
 396   while (width >= 2) {
 397     ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane);
 398     rgbframe += 8;
 399     yplane += 2;
 400     ++uplane;
 401     ++vplane;
 402     width -= 2;
 403   }
 404
 405   // Handle the last pixel in the last row.
 406   if (width)
 407     ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane);
 408 }
 409
 410 }  // namespace media