skia/ext/convolver.cc

   1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include <algorithm>
   6
   7 #include "skia/ext/convolver.h"
   8 #include "skia/ext/convolver_SSE2.h"
   9 #include "third_party/skia/include/core/SkTypes.h"
  10
  11 namespace skia {
  12
  13 namespace {
  14
  15 // Converts the argument to an 8-bit unsigned value by clamping to the range
  16 // 0-255.
  17 inline unsigned char ClampTo8(int a) {
  18   if (static_cast<unsigned>(a) < 256)
  19     return a;  // Avoid the extra check in the common case.
  20   if (a < 0)
  21     return 0;
  22   return 255;
  23 }
  24
  25 // Stores a list of rows in a circular buffer. The usage is you write into it
  26 // by calling AdvanceRow. It will keep track of which row in the buffer it
  27 // should use next, and the total number of rows added.
  28 class CircularRowBuffer {
  29  public:
  30   // The number of pixels in each row is given in |source_row_pixel_width|.
  31   // The maximum number of rows needed in the buffer is |max_y_filter_size|
  32   // (we only need to store enough rows for the biggest filter).
  33   //
  34   // We use the |first_input_row| to compute the coordinates of all of the
  35   // following rows returned by Advance().
  36   CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
  37                     int first_input_row)
  38       : row_byte_width_(dest_row_pixel_width * 4),
  39         num_rows_(max_y_filter_size),
  40         next_row_(0),
  41         next_row_coordinate_(first_input_row) {
  42     buffer_.resize(row_byte_width_ * max_y_filter_size);
  43     row_addresses_.resize(num_rows_);
  44   }
  45
  46   // Moves to the next row in the buffer, returning a pointer to the beginning
  47   // of it.
  48   unsigned char* AdvanceRow() {
  49     unsigned char* row = &buffer_[next_row_ * row_byte_width_];
  50     next_row_coordinate_++;
  51
  52     // Set the pointer to the next row to use, wrapping around if necessary.
  53     next_row_++;
  54     if (next_row_ == num_rows_)
  55       next_row_ = 0;
  56     return row;
  57   }
  58
  59   // Returns a pointer to an "unrolled" array of rows. These rows will start
  60   // at the y coordinate placed into |*first_row_index| and will continue in
  61   // order for the maximum number of rows in this circular buffer.
  62   //
  63   // The |first_row_index_| may be negative. This means the circular buffer
  64   // starts before the top of the image (it hasn't been filled yet).
  65   unsigned char* const* GetRowAddresses(int* first_row_index) {
  66     // Example for a 4-element circular buffer holding coords 6-9.
  67     //   Row 0   Coord 8
  68     //   Row 1   Coord 9
  69     //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
  70     //   Row 3   Coord 7
  71     //
  72     // The "next" row is also the first (lowest) coordinate. This computation
  73     // may yield a negative value, but that's OK, the math will work out
  74     // since the user of this buffer will compute the offset relative
  75     // to the first_row_index and the negative rows will never be used.
  76     *first_row_index = next_row_coordinate_ - num_rows_;
  77
  78     int cur_row = next_row_;
  79     for (int i = 0; i < num_rows_; i++) {
  80       row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
  81
  82       // Advance to the next row, wrapping if necessary.
  83       cur_row++;
  84       if (cur_row == num_rows_)
  85         cur_row = 0;
  86     }
  87     return &row_addresses_[0];
  88   }
  89
  90  private:
  91   // The buffer storing the rows. They are packed, each one row_byte_width_.
  92   std::vector<unsigned char> buffer_;
  93
  94   // Number of bytes per row in the |buffer_|.
  95   int row_byte_width_;
  96
  97   // The number of rows available in the buffer.
  98   int num_rows_;
  99
 100   // The next row index we should write into. This wraps around as the
 101   // circular buffer is used.
 102   int next_row_;
 103
 104   // The y coordinate of the |next_row_|. This is incremented each time a
 105   // new row is appended and does not wrap.
 106   int next_row_coordinate_;
 107
 108   // Buffer used by GetRowAddresses().
 109   std::vector<unsigned char*> row_addresses_;
 110 };
 111
 112 // Convolves horizontally along a single row. The row data is given in
 113 // |src_data| and continues for the num_values() of the filter.
 114 template<bool has_alpha>
 115 void ConvolveHorizontally(const unsigned char* src_data,
 116                           const ConvolutionFilter1D& filter,
 117                           unsigned char* out_row) {
 118   // Loop over each pixel on this row in the output image.
 119   int num_values = filter.num_values();
 120   for (int out_x = 0; out_x < num_values; out_x++) {
 121     // Get the filter that determines the current output pixel.
 122     int filter_offset, filter_length;
 123     const ConvolutionFilter1D::Fixed* filter_values =
 124         filter.FilterForValue(out_x, &filter_offset, &filter_length);
 125
 126     // Compute the first pixel in this row that the filter affects. It will
 127     // touch |filter_length| pixels (4 bytes each) after this.
 128     const unsigned char* row_to_filter = &src_data[filter_offset * 4];
 129
 130     // Apply the filter to the row to get the destination pixel in |accum|.
 131     int accum[4] = {0};
 132     for (int filter_x = 0; filter_x < filter_length; filter_x++) {
 133       ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
 134       accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];
 135       accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];
 136       accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];
 137       if (has_alpha)
 138         accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];
 139     }
 140
 141     // Bring this value back in range. All of the filter scaling factors
 142     // are in fixed point with kShiftBits bits of fractional part.
 143     accum[0] >>= ConvolutionFilter1D::kShiftBits;
 144     accum[1] >>= ConvolutionFilter1D::kShiftBits;
 145     accum[2] >>= ConvolutionFilter1D::kShiftBits;
 146     if (has_alpha)
 147       accum[3] >>= ConvolutionFilter1D::kShiftBits;
 148
 149     // Store the new pixel.
 150     out_row[out_x * 4 + 0] = ClampTo8(accum[0]);
 151     out_row[out_x * 4 + 1] = ClampTo8(accum[1]);
 152     out_row[out_x * 4 + 2] = ClampTo8(accum[2]);
 153     if (has_alpha)
 154       out_row[out_x * 4 + 3] = ClampTo8(accum[3]);
 155   }
 156 }
 157
 158 // Does vertical convolution to produce one output row. The filter values and
 159 // length are given in the first two parameters. These are applied to each
 160 // of the rows pointed to in the |source_data_rows| array, with each row
 161 // being |pixel_width| wide.
 162 //
 163 // The output must have room for |pixel_width * 4| bytes.
 164 template<bool has_alpha>
 165 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
 166                         int filter_length,
 167                         unsigned char* const* source_data_rows,
 168                         int pixel_width,
 169                         unsigned char* out_row) {
 170   // We go through each column in the output and do a vertical convolution,
 171   // generating one output pixel each time.
 172   for (int out_x = 0; out_x < pixel_width; out_x++) {
 173     // Compute the number of bytes over in each row that the current column
 174     // we're convolving starts at. The pixel will cover the next 4 bytes.
 175     int byte_offset = out_x * 4;
 176
 177     // Apply the filter to one column of pixels.
 178     int accum[4] = {0};
 179     for (int filter_y = 0; filter_y < filter_length; filter_y++) {
 180       ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
 181       accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];
 182       accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];
 183       accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];
 184       if (has_alpha)
 185         accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];
 186     }
 187
 188     // Bring this value back in range. All of the filter scaling factors
 189     // are in fixed point with kShiftBits bits of precision.
 190     accum[0] >>= ConvolutionFilter1D::kShiftBits;
 191     accum[1] >>= ConvolutionFilter1D::kShiftBits;
 192     accum[2] >>= ConvolutionFilter1D::kShiftBits;
 193     if (has_alpha)
 194       accum[3] >>= ConvolutionFilter1D::kShiftBits;
 195
 196     // Store the new pixel.
 197     out_row[byte_offset + 0] = ClampTo8(accum[0]);
 198     out_row[byte_offset + 1] = ClampTo8(accum[1]);
 199     out_row[byte_offset + 2] = ClampTo8(accum[2]);
 200     if (has_alpha) {
 201       unsigned char alpha = ClampTo8(accum[3]);
 202
 203       // Make sure the alpha channel doesn't come out smaller than any of the
 204       // color channels. We use premultipled alpha channels, so this should
 205       // never happen, but rounding errors will cause this from time to time.
 206       // These "impossible" colors will cause overflows (and hence random pixel
 207       // values) when the resulting bitmap is drawn to the screen.
 208       //
 209       // We only need to do this when generating the final output row (here).
 210       int max_color_channel = std::max(out_row[byte_offset + 0],
 211           std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));
 212       if (alpha < max_color_channel)
 213         out_row[byte_offset + 3] = max_color_channel;
 214       else
 215         out_row[byte_offset + 3] = alpha;
 216     } else {
 217       // No alpha channel, the image is opaque.
 218       out_row[byte_offset + 3] = 0xff;
 219     }
 220   }
 221 }
 222
 223 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
 224                         int filter_length,
 225                         unsigned char* const* source_data_rows,
 226                         int pixel_width,
 227                         unsigned char* out_row,
 228                         bool source_has_alpha) {
 229   if (source_has_alpha) {
 230     ConvolveVertically<true>(filter_values, filter_length,
 231                              source_data_rows,
 232                              pixel_width,
 233                              out_row);
 234   } else {
 235     ConvolveVertically<false>(filter_values, filter_length,
 236                               source_data_rows,
 237                               pixel_width,
 238                               out_row);
 239   }
 240 }
 241
 242 }  // namespace
 243
 244 // ConvolutionFilter1D ---------------------------------------------------------
 245
 246 ConvolutionFilter1D::ConvolutionFilter1D()
 247     : max_filter_(0) {
 248 }
 249
 250 ConvolutionFilter1D::~ConvolutionFilter1D() {
 251 }
 252
 253 void ConvolutionFilter1D::AddFilter(int filter_offset,
 254                                     const float* filter_values,
 255                                     int filter_length) {
 256   SkASSERT(filter_length > 0);
 257
 258   std::vector<Fixed> fixed_values;
 259   fixed_values.reserve(filter_length);
 260
 261   for (int i = 0; i < filter_length; ++i)
 262     fixed_values.push_back(FloatToFixed(filter_values[i]));
 263
 264   AddFilter(filter_offset, &fixed_values[0], filter_length);
 265 }
 266
 267 void ConvolutionFilter1D::AddFilter(int filter_offset,
 268                                     const Fixed* filter_values,
 269                                     int filter_length) {
 270   // It is common for leading/trailing filter values to be zeros. In such
 271   // cases it is beneficial to only store the central factors.
 272   // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
 273   // a 1080p image this optimization gives a ~10% speed improvement.
 274   int first_non_zero = 0;
 275   while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
 276     first_non_zero++;
 277
 278   if (first_non_zero < filter_length) {
 279     // Here we have at least one non-zero factor.
 280     int last_non_zero = filter_length - 1;
 281     while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
 282       last_non_zero--;
 283
 284     filter_offset += first_non_zero;
 285     filter_length = last_non_zero + 1 - first_non_zero;
 286     SkASSERT(filter_length > 0);
 287
 288     for (int i = first_non_zero; i <= last_non_zero; i++)
 289       filter_values_.push_back(filter_values[i]);
 290   } else {
 291     // Here all the factors were zeroes.
 292     filter_length = 0;
 293   }
 294
 295   FilterInstance instance;
 296
 297   // We pushed filter_length elements onto filter_values_
 298   instance.data_location = (static_cast<int>(filter_values_.size()) -
 299                             filter_length);
 300   instance.offset = filter_offset;
 301   instance.length = filter_length;
 302   filters_.push_back(instance);
 303
 304   max_filter_ = std::max(max_filter_, filter_length);
 305 }
 306
 307 typedef void (*ConvolveVertically_pointer)(
 308     const ConvolutionFilter1D::Fixed* filter_values,
 309     int filter_length,
 310     unsigned char* const* source_data_rows,
 311     int pixel_width,
 312     unsigned char* out_row,
 313     bool has_alpha);
 314 typedef void (*Convolve4RowsHorizontally_pointer)(
 315     const unsigned char* src_data[4],
 316     const ConvolutionFilter1D& filter,
 317     unsigned char* out_row[4]);
 318 typedef void (*ConvolveHorizontally_pointer)(
 319     const unsigned char* src_data,
 320     const ConvolutionFilter1D& filter,
 321     unsigned char* out_row);
 322
 323 struct ConvolveProcs {
 324   // This is how many extra pixels may be read by the
 325   // conolve*horizontally functions.
 326   int extra_horizontal_reads;
 327   ConvolveVertically_pointer convolve_vertically;
 328   Convolve4RowsHorizontally_pointer convolve_4rows_horizontally;
 329   ConvolveHorizontally_pointer convolve_horizontally;
 330 };
 331
 332 void SetupSIMD(ConvolveProcs *procs) {
 333 #ifdef SIMD_SSE2
 334   base::CPU cpu;
 335   if (cpu.has_sse2()) {
 336     procs->extra_horizontal_reads = 3;
 337     procs->convolve_vertically = &ConvolveVertically_SSE2;
 338     procs->convolve_4rows_horizontally = &Convolve4RowsHorizontally_SSE2;
 339     procs->convolve_horizontally = &ConvolveHorizontally_SSE2;
 340   }
 341 #endif
 342 }
 343
 344 void BGRAConvolve2D(const unsigned char* source_data,
 345                     int source_byte_row_stride,
 346                     bool source_has_alpha,
 347                     const ConvolutionFilter1D& filter_x,
 348                     const ConvolutionFilter1D& filter_y,
 349                     int output_byte_row_stride,
 350                     unsigned char* output,
 351                     bool use_simd_if_possible) {
 352   ConvolveProcs simd;
 353   simd.extra_horizontal_reads = 0;
 354   simd.convolve_vertically = NULL;
 355   simd.convolve_4rows_horizontally = NULL;
 356   simd.convolve_horizontally = NULL;
 357   if (use_simd_if_possible) {
 358     SetupSIMD(&simd);
 359   }
 360
 361   int max_y_filter_size = filter_y.max_filter();
 362
 363   // The next row in the input that we will generate a horizontally
 364   // convolved row for. If the filter doesn't start at the beginning of the
 365   // image (this is the case when we are only resizing a subset), then we
 366   // don't want to generate any output rows before that. Compute the starting
 367   // row for convolution as the first pixel for the first vertical filter.
 368   int filter_offset, filter_length;
 369   const ConvolutionFilter1D::Fixed* filter_values =
 370       filter_y.FilterForValue(0, &filter_offset, &filter_length);
 371   int next_x_row = filter_offset;
 372
 373   // We loop over each row in the input doing a horizontal convolution. This
 374   // will result in a horizontally convolved image. We write the results into
 375   // a circular buffer of convolved rows and do vertical convolution as rows
 376   // are available. This prevents us from having to store the entire
 377   // intermediate image and helps cache coherency.
 378   // We will need four extra rows to allow horizontal convolution could be done
 379   // simultaneously. We also padding each row in row buffer to be aligned-up to
 380   // 16 bytes.
 381   // TODO(jiesun): We do not use aligned load from row buffer in vertical
 382   // convolution pass yet. Somehow Windows does not like it.
 383   int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
 384   int row_buffer_height = max_y_filter_size +
 385       (simd.convolve_4rows_horizontally ? 4 : 0);
 386   CircularRowBuffer row_buffer(row_buffer_width,
 387                                row_buffer_height,
 388                                filter_offset);
 389
 390   // Loop over every possible output row, processing just enough horizontal
 391   // convolutions to run each subsequent vertical convolution.
 392   SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
 393   int num_output_rows = filter_y.num_values();
 394
 395   // We need to check which is the last line to convolve before we advance 4
 396   // lines in one iteration.
 397   int last_filter_offset, last_filter_length;
 398
 399   // SSE2 can access up to 3 extra pixels past the end of the
 400   // buffer. At the bottom of the image, we have to be careful
 401   // not to access data past the end of the buffer. Normally
 402   // we fall back to the C++ implementation for the last row.
 403   // If the last row is less than 3 pixels wide, we may have to fall
 404   // back to the C++ version for more rows. Compute how many
 405   // rows we need to avoid the SSE implementation for here.
 406   filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
 407                           &last_filter_length);
 408   int avoid_simd_rows = 1 + simd.extra_horizontal_reads /
 409       (last_filter_offset + last_filter_length);
 410
 411   filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
 412                           &last_filter_length);
 413
 414   for (int out_y = 0; out_y < num_output_rows; out_y++) {
 415     filter_values = filter_y.FilterForValue(out_y,
 416                                             &filter_offset, &filter_length);
 417
 418     // Generate output rows until we have enough to run the current filter.
 419     while (next_x_row < filter_offset + filter_length) {
 420       if (simd.convolve_4rows_horizontally &&
 421           next_x_row + 3 < last_filter_offset + last_filter_length -
 422           avoid_simd_rows) {
 423         const unsigned char* src[4];
 424         unsigned char* out_row[4];
 425         for (int i = 0; i < 4; ++i) {
 426           src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
 427           out_row[i] = row_buffer.AdvanceRow();
 428         }
 429         simd.convolve_4rows_horizontally(src, filter_x, out_row);
 430         next_x_row += 4;
 431       } else {
 432         // Check if we need to avoid SSE2 for this row.
 433         if (simd.convolve_horizontally &&
 434             next_x_row < last_filter_offset + last_filter_length -
 435             avoid_simd_rows) {
 436           simd.convolve_horizontally(
 437               &source_data[next_x_row * source_byte_row_stride],
 438               filter_x, row_buffer.AdvanceRow());
 439         } else {
 440           if (source_has_alpha) {
 441             ConvolveHorizontally<true>(
 442                 &source_data[next_x_row * source_byte_row_stride],
 443                 filter_x, row_buffer.AdvanceRow());
 444           } else {
 445             ConvolveHorizontally<false>(
 446                 &source_data[next_x_row * source_byte_row_stride],
 447                 filter_x, row_buffer.AdvanceRow());
 448           }
 449         }
 450         next_x_row++;
 451       }
 452     }
 453
 454     // Compute where in the output image this row of final data will go.
 455     unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
 456
 457     // Get the list of rows that the circular buffer has, in order.
 458     int first_row_in_circular_buffer;
 459     unsigned char* const* rows_to_convolve =
 460         row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
 461
 462     // Now compute the start of the subset of those rows that the filter
 463     // needs.
 464     unsigned char* const* first_row_for_filter =
 465         &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
 466
 467     if (simd.convolve_vertically) {
 468       simd.convolve_vertically(filter_values, filter_length,
 469                                first_row_for_filter,
 470                                filter_x.num_values(), cur_output_row,
 471                                source_has_alpha);
 472     } else {
 473       ConvolveVertically(filter_values, filter_length,
 474                          first_row_for_filter,
 475                          filter_x.num_values(), cur_output_row,
 476                          source_has_alpha);
 477     }
 478   }
 479 }
 480
 481 }  // namespace skia