Separate Simple Backend creation from initialization.
[chromium-blink-merge.git] / skia / ext / convolver.cc
blobcbfa9315930edcc9664165a48dc667c2d7fba3b5
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include <algorithm>
7 #include "skia/ext/convolver.h"
8 #include "skia/ext/convolver_SSE2.h"
9 #include "third_party/skia/include/core/SkTypes.h"
11 namespace skia {
13 namespace {
15 // Converts the argument to an 8-bit unsigned value by clamping to the range
16 // 0-255.
17 inline unsigned char ClampTo8(int a) {
18 if (static_cast<unsigned>(a) < 256)
19 return a; // Avoid the extra check in the common case.
20 if (a < 0)
21 return 0;
22 return 255;
25 // Stores a list of rows in a circular buffer. The usage is you write into it
26 // by calling AdvanceRow. It will keep track of which row in the buffer it
27 // should use next, and the total number of rows added.
28 class CircularRowBuffer {
29 public:
30 // The number of pixels in each row is given in |source_row_pixel_width|.
31 // The maximum number of rows needed in the buffer is |max_y_filter_size|
32 // (we only need to store enough rows for the biggest filter).
34 // We use the |first_input_row| to compute the coordinates of all of the
35 // following rows returned by Advance().
36 CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
37 int first_input_row)
38 : row_byte_width_(dest_row_pixel_width * 4),
39 num_rows_(max_y_filter_size),
40 next_row_(0),
41 next_row_coordinate_(first_input_row) {
42 buffer_.resize(row_byte_width_ * max_y_filter_size);
43 row_addresses_.resize(num_rows_);
46 // Moves to the next row in the buffer, returning a pointer to the beginning
47 // of it.
48 unsigned char* AdvanceRow() {
49 unsigned char* row = &buffer_[next_row_ * row_byte_width_];
50 next_row_coordinate_++;
52 // Set the pointer to the next row to use, wrapping around if necessary.
53 next_row_++;
54 if (next_row_ == num_rows_)
55 next_row_ = 0;
56 return row;
59 // Returns a pointer to an "unrolled" array of rows. These rows will start
60 // at the y coordinate placed into |*first_row_index| and will continue in
61 // order for the maximum number of rows in this circular buffer.
63 // The |first_row_index_| may be negative. This means the circular buffer
64 // starts before the top of the image (it hasn't been filled yet).
65 unsigned char* const* GetRowAddresses(int* first_row_index) {
66 // Example for a 4-element circular buffer holding coords 6-9.
67 // Row 0 Coord 8
68 // Row 1 Coord 9
69 // Row 2 Coord 6 <- next_row_ = 2, next_row_coordinate_ = 10.
70 // Row 3 Coord 7
72 // The "next" row is also the first (lowest) coordinate. This computation
73 // may yield a negative value, but that's OK, the math will work out
74 // since the user of this buffer will compute the offset relative
75 // to the first_row_index and the negative rows will never be used.
76 *first_row_index = next_row_coordinate_ - num_rows_;
78 int cur_row = next_row_;
79 for (int i = 0; i < num_rows_; i++) {
80 row_addresses_[i] = &buffer_[cur_row * row_byte_width_];
82 // Advance to the next row, wrapping if necessary.
83 cur_row++;
84 if (cur_row == num_rows_)
85 cur_row = 0;
87 return &row_addresses_[0];
90 private:
91 // The buffer storing the rows. They are packed, each one row_byte_width_.
92 std::vector<unsigned char> buffer_;
94 // Number of bytes per row in the |buffer_|.
95 int row_byte_width_;
97 // The number of rows available in the buffer.
98 int num_rows_;
100 // The next row index we should write into. This wraps around as the
101 // circular buffer is used.
102 int next_row_;
104 // The y coordinate of the |next_row_|. This is incremented each time a
105 // new row is appended and does not wrap.
106 int next_row_coordinate_;
108 // Buffer used by GetRowAddresses().
109 std::vector<unsigned char*> row_addresses_;
112 // Convolves horizontally along a single row. The row data is given in
113 // |src_data| and continues for the num_values() of the filter.
114 template<bool has_alpha>
115 void ConvolveHorizontally(const unsigned char* src_data,
116 const ConvolutionFilter1D& filter,
117 unsigned char* out_row) {
118 // Loop over each pixel on this row in the output image.
119 int num_values = filter.num_values();
120 for (int out_x = 0; out_x < num_values; out_x++) {
121 // Get the filter that determines the current output pixel.
122 int filter_offset, filter_length;
123 const ConvolutionFilter1D::Fixed* filter_values =
124 filter.FilterForValue(out_x, &filter_offset, &filter_length);
126 // Compute the first pixel in this row that the filter affects. It will
127 // touch |filter_length| pixels (4 bytes each) after this.
128 const unsigned char* row_to_filter = &src_data[filter_offset * 4];
130 // Apply the filter to the row to get the destination pixel in |accum|.
131 int accum[4] = {0};
132 for (int filter_x = 0; filter_x < filter_length; filter_x++) {
133 ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
134 accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];
135 accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];
136 accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];
137 if (has_alpha)
138 accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];
141 // Bring this value back in range. All of the filter scaling factors
142 // are in fixed point with kShiftBits bits of fractional part.
143 accum[0] >>= ConvolutionFilter1D::kShiftBits;
144 accum[1] >>= ConvolutionFilter1D::kShiftBits;
145 accum[2] >>= ConvolutionFilter1D::kShiftBits;
146 if (has_alpha)
147 accum[3] >>= ConvolutionFilter1D::kShiftBits;
149 // Store the new pixel.
150 out_row[out_x * 4 + 0] = ClampTo8(accum[0]);
151 out_row[out_x * 4 + 1] = ClampTo8(accum[1]);
152 out_row[out_x * 4 + 2] = ClampTo8(accum[2]);
153 if (has_alpha)
154 out_row[out_x * 4 + 3] = ClampTo8(accum[3]);
158 // Does vertical convolution to produce one output row. The filter values and
159 // length are given in the first two parameters. These are applied to each
160 // of the rows pointed to in the |source_data_rows| array, with each row
161 // being |pixel_width| wide.
163 // The output must have room for |pixel_width * 4| bytes.
164 template<bool has_alpha>
165 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
166 int filter_length,
167 unsigned char* const* source_data_rows,
168 int pixel_width,
169 unsigned char* out_row) {
170 // We go through each column in the output and do a vertical convolution,
171 // generating one output pixel each time.
172 for (int out_x = 0; out_x < pixel_width; out_x++) {
173 // Compute the number of bytes over in each row that the current column
174 // we're convolving starts at. The pixel will cover the next 4 bytes.
175 int byte_offset = out_x * 4;
177 // Apply the filter to one column of pixels.
178 int accum[4] = {0};
179 for (int filter_y = 0; filter_y < filter_length; filter_y++) {
180 ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
181 accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];
182 accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];
183 accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];
184 if (has_alpha)
185 accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];
188 // Bring this value back in range. All of the filter scaling factors
189 // are in fixed point with kShiftBits bits of precision.
190 accum[0] >>= ConvolutionFilter1D::kShiftBits;
191 accum[1] >>= ConvolutionFilter1D::kShiftBits;
192 accum[2] >>= ConvolutionFilter1D::kShiftBits;
193 if (has_alpha)
194 accum[3] >>= ConvolutionFilter1D::kShiftBits;
196 // Store the new pixel.
197 out_row[byte_offset + 0] = ClampTo8(accum[0]);
198 out_row[byte_offset + 1] = ClampTo8(accum[1]);
199 out_row[byte_offset + 2] = ClampTo8(accum[2]);
200 if (has_alpha) {
201 unsigned char alpha = ClampTo8(accum[3]);
203 // Make sure the alpha channel doesn't come out smaller than any of the
204 // color channels. We use premultipled alpha channels, so this should
205 // never happen, but rounding errors will cause this from time to time.
206 // These "impossible" colors will cause overflows (and hence random pixel
207 // values) when the resulting bitmap is drawn to the screen.
209 // We only need to do this when generating the final output row (here).
210 int max_color_channel = std::max(out_row[byte_offset + 0],
211 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));
212 if (alpha < max_color_channel)
213 out_row[byte_offset + 3] = max_color_channel;
214 else
215 out_row[byte_offset + 3] = alpha;
216 } else {
217 // No alpha channel, the image is opaque.
218 out_row[byte_offset + 3] = 0xff;
223 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
224 int filter_length,
225 unsigned char* const* source_data_rows,
226 int pixel_width,
227 unsigned char* out_row,
228 bool source_has_alpha) {
229 if (source_has_alpha) {
230 ConvolveVertically<true>(filter_values, filter_length,
231 source_data_rows,
232 pixel_width,
233 out_row);
234 } else {
235 ConvolveVertically<false>(filter_values, filter_length,
236 source_data_rows,
237 pixel_width,
238 out_row);
242 } // namespace
244 // ConvolutionFilter1D ---------------------------------------------------------
246 ConvolutionFilter1D::ConvolutionFilter1D()
247 : max_filter_(0) {
250 ConvolutionFilter1D::~ConvolutionFilter1D() {
253 void ConvolutionFilter1D::AddFilter(int filter_offset,
254 const float* filter_values,
255 int filter_length) {
256 SkASSERT(filter_length > 0);
258 std::vector<Fixed> fixed_values;
259 fixed_values.reserve(filter_length);
261 for (int i = 0; i < filter_length; ++i)
262 fixed_values.push_back(FloatToFixed(filter_values[i]));
264 AddFilter(filter_offset, &fixed_values[0], filter_length);
267 void ConvolutionFilter1D::AddFilter(int filter_offset,
268 const Fixed* filter_values,
269 int filter_length) {
270 // It is common for leading/trailing filter values to be zeros. In such
271 // cases it is beneficial to only store the central factors.
272 // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
273 // a 1080p image this optimization gives a ~10% speed improvement.
274 int first_non_zero = 0;
275 while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
276 first_non_zero++;
278 if (first_non_zero < filter_length) {
279 // Here we have at least one non-zero factor.
280 int last_non_zero = filter_length - 1;
281 while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
282 last_non_zero--;
284 filter_offset += first_non_zero;
285 filter_length = last_non_zero + 1 - first_non_zero;
286 SkASSERT(filter_length > 0);
288 for (int i = first_non_zero; i <= last_non_zero; i++)
289 filter_values_.push_back(filter_values[i]);
290 } else {
291 // Here all the factors were zeroes.
292 filter_length = 0;
295 FilterInstance instance;
297 // We pushed filter_length elements onto filter_values_
298 instance.data_location = (static_cast<int>(filter_values_.size()) -
299 filter_length);
300 instance.offset = filter_offset;
301 instance.length = filter_length;
302 filters_.push_back(instance);
304 max_filter_ = std::max(max_filter_, filter_length);
307 typedef void (*ConvolveVertically_pointer)(
308 const ConvolutionFilter1D::Fixed* filter_values,
309 int filter_length,
310 unsigned char* const* source_data_rows,
311 int pixel_width,
312 unsigned char* out_row,
313 bool has_alpha);
314 typedef void (*Convolve4RowsHorizontally_pointer)(
315 const unsigned char* src_data[4],
316 const ConvolutionFilter1D& filter,
317 unsigned char* out_row[4]);
318 typedef void (*ConvolveHorizontally_pointer)(
319 const unsigned char* src_data,
320 const ConvolutionFilter1D& filter,
321 unsigned char* out_row);
323 struct ConvolveProcs {
324 // This is how many extra pixels may be read by the
325 // conolve*horizontally functions.
326 int extra_horizontal_reads;
327 ConvolveVertically_pointer convolve_vertically;
328 Convolve4RowsHorizontally_pointer convolve_4rows_horizontally;
329 ConvolveHorizontally_pointer convolve_horizontally;
332 void SetupSIMD(ConvolveProcs *procs) {
333 #ifdef SIMD_SSE2
334 base::CPU cpu;
335 if (cpu.has_sse2()) {
336 procs->extra_horizontal_reads = 3;
337 procs->convolve_vertically = &ConvolveVertically_SSE2;
338 procs->convolve_4rows_horizontally = &Convolve4RowsHorizontally_SSE2;
339 procs->convolve_horizontally = &ConvolveHorizontally_SSE2;
341 #endif
344 void BGRAConvolve2D(const unsigned char* source_data,
345 int source_byte_row_stride,
346 bool source_has_alpha,
347 const ConvolutionFilter1D& filter_x,
348 const ConvolutionFilter1D& filter_y,
349 int output_byte_row_stride,
350 unsigned char* output,
351 bool use_simd_if_possible) {
352 ConvolveProcs simd;
353 simd.extra_horizontal_reads = 0;
354 simd.convolve_vertically = NULL;
355 simd.convolve_4rows_horizontally = NULL;
356 simd.convolve_horizontally = NULL;
357 if (use_simd_if_possible) {
358 SetupSIMD(&simd);
361 int max_y_filter_size = filter_y.max_filter();
363 // The next row in the input that we will generate a horizontally
364 // convolved row for. If the filter doesn't start at the beginning of the
365 // image (this is the case when we are only resizing a subset), then we
366 // don't want to generate any output rows before that. Compute the starting
367 // row for convolution as the first pixel for the first vertical filter.
368 int filter_offset, filter_length;
369 const ConvolutionFilter1D::Fixed* filter_values =
370 filter_y.FilterForValue(0, &filter_offset, &filter_length);
371 int next_x_row = filter_offset;
373 // We loop over each row in the input doing a horizontal convolution. This
374 // will result in a horizontally convolved image. We write the results into
375 // a circular buffer of convolved rows and do vertical convolution as rows
376 // are available. This prevents us from having to store the entire
377 // intermediate image and helps cache coherency.
378 // We will need four extra rows to allow horizontal convolution could be done
379 // simultaneously. We also padding each row in row buffer to be aligned-up to
380 // 16 bytes.
381 // TODO(jiesun): We do not use aligned load from row buffer in vertical
382 // convolution pass yet. Somehow Windows does not like it.
383 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
384 int row_buffer_height = max_y_filter_size +
385 (simd.convolve_4rows_horizontally ? 4 : 0);
386 CircularRowBuffer row_buffer(row_buffer_width,
387 row_buffer_height,
388 filter_offset);
390 // Loop over every possible output row, processing just enough horizontal
391 // convolutions to run each subsequent vertical convolution.
392 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
393 int num_output_rows = filter_y.num_values();
395 // We need to check which is the last line to convolve before we advance 4
396 // lines in one iteration.
397 int last_filter_offset, last_filter_length;
399 // SSE2 can access up to 3 extra pixels past the end of the
400 // buffer. At the bottom of the image, we have to be careful
401 // not to access data past the end of the buffer. Normally
402 // we fall back to the C++ implementation for the last row.
403 // If the last row is less than 3 pixels wide, we may have to fall
404 // back to the C++ version for more rows. Compute how many
405 // rows we need to avoid the SSE implementation for here.
406 filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,
407 &last_filter_length);
408 int avoid_simd_rows = 1 + simd.extra_horizontal_reads /
409 (last_filter_offset + last_filter_length);
411 filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
412 &last_filter_length);
414 for (int out_y = 0; out_y < num_output_rows; out_y++) {
415 filter_values = filter_y.FilterForValue(out_y,
416 &filter_offset, &filter_length);
418 // Generate output rows until we have enough to run the current filter.
419 while (next_x_row < filter_offset + filter_length) {
420 if (simd.convolve_4rows_horizontally &&
421 next_x_row + 3 < last_filter_offset + last_filter_length -
422 avoid_simd_rows) {
423 const unsigned char* src[4];
424 unsigned char* out_row[4];
425 for (int i = 0; i < 4; ++i) {
426 src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
427 out_row[i] = row_buffer.AdvanceRow();
429 simd.convolve_4rows_horizontally(src, filter_x, out_row);
430 next_x_row += 4;
431 } else {
432 // Check if we need to avoid SSE2 for this row.
433 if (simd.convolve_horizontally &&
434 next_x_row < last_filter_offset + last_filter_length -
435 avoid_simd_rows) {
436 simd.convolve_horizontally(
437 &source_data[next_x_row * source_byte_row_stride],
438 filter_x, row_buffer.AdvanceRow());
439 } else {
440 if (source_has_alpha) {
441 ConvolveHorizontally<true>(
442 &source_data[next_x_row * source_byte_row_stride],
443 filter_x, row_buffer.AdvanceRow());
444 } else {
445 ConvolveHorizontally<false>(
446 &source_data[next_x_row * source_byte_row_stride],
447 filter_x, row_buffer.AdvanceRow());
450 next_x_row++;
454 // Compute where in the output image this row of final data will go.
455 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];
457 // Get the list of rows that the circular buffer has, in order.
458 int first_row_in_circular_buffer;
459 unsigned char* const* rows_to_convolve =
460 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);
462 // Now compute the start of the subset of those rows that the filter
463 // needs.
464 unsigned char* const* first_row_for_filter =
465 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];
467 if (simd.convolve_vertically) {
468 simd.convolve_vertically(filter_values, filter_length,
469 first_row_for_filter,
470 filter_x.num_values(), cur_output_row,
471 source_has_alpha);
472 } else {
473 ConvolveVertically(filter_values, filter_length,
474 first_row_for_filter,
475 filter_x.num_values(), cur_output_row,
476 source_has_alpha);
481 } // namespace skia