Bug 1924993 - [devtools] Debugger tests wait before typing in conditional panel r...
[gecko.git] / gfx / 2d / Blur.cpp
blob63bfaa769fdea1af35068652521ccbc014883bc2
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "Blur.h"
9 #include <algorithm>
10 #include <math.h>
11 #include <string.h>
13 #include "mozilla/CheckedInt.h"
14 #include "NumericTools.h"
16 #include "2D.h"
17 #include "DataSurfaceHelpers.h"
18 #include "Tools.h"
20 #ifdef USE_NEON
21 # include "mozilla/arm.h"
22 #endif
24 namespace mozilla {
25 namespace gfx {
27 /**
28 * Helper function to process each row of the box blur.
29 * It takes care of transposing the data on input or output depending
30 * on whether we intend a horizontal or vertical blur, and whether we're
31 * reading from the initial source or writing to the final destination.
32 * It allows starting or ending anywhere within the row to accomodate
33 * a skip rect.
35 template <bool aTransposeInput, bool aTransposeOutput>
36 static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput,
37 int32_t aLeftLobe, int32_t aRightLobe,
38 int32_t aWidth, int32_t aStride, int32_t aStart,
39 int32_t aEnd) {
40 // If the input or output is transposed, then we will move down a row
41 // for each step, instead of moving over a column. Since these values
42 // only depend on a template parameter, they will more easily get
43 // copy-propagated in the non-transposed case, which is why they
44 // are not passed as parameters.
45 const int32_t inputStep = aTransposeInput ? aStride : 1;
46 const int32_t outputStep = aTransposeOutput ? aStride : 1;
48 // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
49 // to the right of the current position, then average them. So this is
50 // the size of the total width of this filter.
51 const int32_t boxSize = aLeftLobe + aRightLobe + 1;
53 // Instead of dividing the pixel sum by boxSize to average, we can just
54 // compute a scale that will normalize the result so that it can be quickly
55 // shifted into the desired range.
56 const uint32_t reciprocal = (1 << 24) / boxSize;
58 // The shift would normally truncate the result, whereas we would rather
59 // prefer to round the result to the closest increment. By adding 0.5 units
60 // to the initial sum, we bias the sum so that it will be rounded by the
61 // truncation instead.
62 uint32_t alphaSum = (boxSize + 1) / 2;
64 // We process the row with a moving filter, keeping a sum (alphaSum) of
65 // boxSize pixels. As we move over a pixel, we need to add on a pixel
66 // from the right extreme of the window that moved into range, and subtract
67 // off a pixel from the left extreme of window that moved out of range.
68 // But first, we need to initialization alphaSum to the contents of
69 // the window before we can get going. If the window moves out of bounds
70 // of the row, we clamp each sample to be the closest pixel from within
71 // row bounds, so the 0th and aWidth-1th pixel.
72 int32_t initLeft = aStart - aLeftLobe;
73 if (initLeft < 0) {
74 // If the left lobe samples before the row, add in clamped samples.
75 alphaSum += -initLeft * aInput[0];
76 initLeft = 0;
78 int32_t initRight = aStart + boxSize - aLeftLobe;
79 if (initRight > aWidth) {
80 // If the right lobe samples after the row, add in clamped samples.
81 alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep];
82 initRight = aWidth;
84 // Finally, add in all the valid, non-clamped samples to fill up the
85 // rest of the window.
86 const uint8_t* src = &aInput[initLeft * inputStep];
87 const uint8_t* iterEnd = &aInput[initRight * inputStep];
89 #define INIT_ITER \
90 alphaSum += *src; \
91 src += inputStep;
93 // We unroll the per-pixel loop here substantially. The amount of work
94 // done per sample is so small that the cost of a loop condition check
95 // and a branch can substantially add to or even dominate the performance
96 // of the loop.
97 while (src + 16 * inputStep <= iterEnd) {
98 INIT_ITER;
99 INIT_ITER;
100 INIT_ITER;
101 INIT_ITER;
102 INIT_ITER;
103 INIT_ITER;
104 INIT_ITER;
105 INIT_ITER;
106 INIT_ITER;
107 INIT_ITER;
108 INIT_ITER;
109 INIT_ITER;
110 INIT_ITER;
111 INIT_ITER;
112 INIT_ITER;
113 INIT_ITER;
115 while (src < iterEnd) {
116 INIT_ITER;
119 // Now we start moving the window over the row. We will be accessing
120 // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
121 // out of bounds of the row. To avoid having to check within the inner
122 // loops if we are in bound, we instead compute the points at which
123 // we will move out of bounds of the row on the left side (splitLeft)
124 // and right side (splitRight).
125 int32_t splitLeft = std::clamp(aLeftLobe, aStart, aEnd);
126 int32_t splitRight = std::clamp(aWidth - (boxSize - aLeftLobe), aStart, aEnd);
127 // If the filter window is actually large than the size of the row,
128 // there will be a middle area of overlap where the leftmost and rightmost
129 // pixel of the filter will both be outside the row. In this case, we need
130 // to invert the splits so that splitLeft <= splitRight.
131 if (boxSize > aWidth) {
132 std::swap(splitLeft, splitRight);
135 // Process all pixels up to splitLeft that would sample before the start of
136 // the row. Note that because inputStep and outputStep may not be a const 1
137 // value, it is more performant to increment pointers here for the source and
138 // destination rather than use a loop counter, since doing so would entail an
139 // expensive multiplication that significantly slows down the loop.
140 uint8_t* dst = &aOutput[aStart * outputStep];
141 iterEnd = &aOutput[splitLeft * outputStep];
142 src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep];
143 uint8_t firstVal = aInput[0];
145 #define LEFT_ITER \
146 *dst = (alphaSum * reciprocal) >> 24; \
147 alphaSum += *src - firstVal; \
148 dst += outputStep; \
149 src += inputStep;
151 while (dst + 16 * outputStep <= iterEnd) {
152 LEFT_ITER;
153 LEFT_ITER;
154 LEFT_ITER;
155 LEFT_ITER;
156 LEFT_ITER;
157 LEFT_ITER;
158 LEFT_ITER;
159 LEFT_ITER;
160 LEFT_ITER;
161 LEFT_ITER;
162 LEFT_ITER;
163 LEFT_ITER;
164 LEFT_ITER;
165 LEFT_ITER;
166 LEFT_ITER;
167 LEFT_ITER;
169 while (dst < iterEnd) {
170 LEFT_ITER;
173 // Process all pixels between splitLeft and splitRight.
174 iterEnd = &aOutput[splitRight * outputStep];
175 if (boxSize <= aWidth) {
176 // The filter window is smaller than the row size, so the leftmost and
177 // rightmost samples are both within row bounds.
178 src = &aInput[(splitLeft - aLeftLobe) * inputStep];
179 int32_t boxStep = boxSize * inputStep;
181 #define CENTER_ITER \
182 *dst = (alphaSum * reciprocal) >> 24; \
183 alphaSum += src[boxStep] - *src; \
184 dst += outputStep; \
185 src += inputStep;
187 while (dst + 16 * outputStep <= iterEnd) {
188 CENTER_ITER;
189 CENTER_ITER;
190 CENTER_ITER;
191 CENTER_ITER;
192 CENTER_ITER;
193 CENTER_ITER;
194 CENTER_ITER;
195 CENTER_ITER;
196 CENTER_ITER;
197 CENTER_ITER;
198 CENTER_ITER;
199 CENTER_ITER;
200 CENTER_ITER;
201 CENTER_ITER;
202 CENTER_ITER;
203 CENTER_ITER;
205 while (dst < iterEnd) {
206 CENTER_ITER;
208 } else {
209 // The filter window is larger than the row size, and we're in the area of
210 // split overlap. So the leftmost and rightmost samples are both out of
211 // bounds and need to be clamped. We can just precompute the difference here
212 // consequently.
213 int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0];
214 while (dst < iterEnd) {
215 *dst = (alphaSum * reciprocal) >> 24;
216 alphaSum += firstLastDiff;
217 dst += outputStep;
221 // Process all remaining pixels after splitRight that would sample after the
222 // row end.
223 iterEnd = &aOutput[aEnd * outputStep];
224 src = &aInput[(splitRight - aLeftLobe) * inputStep];
225 uint8_t lastVal = aInput[(aWidth - 1) * inputStep];
227 #define RIGHT_ITER \
228 *dst = (alphaSum * reciprocal) >> 24; \
229 alphaSum += lastVal - *src; \
230 dst += outputStep; \
231 src += inputStep;
233 while (dst + 16 * outputStep <= iterEnd) {
234 RIGHT_ITER;
235 RIGHT_ITER;
236 RIGHT_ITER;
237 RIGHT_ITER;
238 RIGHT_ITER;
239 RIGHT_ITER;
240 RIGHT_ITER;
241 RIGHT_ITER;
242 RIGHT_ITER;
243 RIGHT_ITER;
244 RIGHT_ITER;
245 RIGHT_ITER;
246 RIGHT_ITER;
247 RIGHT_ITER;
248 RIGHT_ITER;
249 RIGHT_ITER;
251 while (dst < iterEnd) {
252 RIGHT_ITER;
257 * Box blur involves looking at one pixel, and setting its value to the average
258 * of its neighbouring pixels. This is meant to provide a 3-pass approximation
259 * of a Gaussian blur.
260 * @param aTranspose Whether to transpose the buffer when reading and writing
261 * to it.
262 * @param aData The buffer to be blurred.
263 * @param aLobes The number of pixels to blend on the left and right for each of
264 * 3 passes.
265 * @param aWidth The number of columns in the buffers.
266 * @param aRows The number of rows in the buffers.
267 * @param aStride The stride of the buffer.
269 template <bool aTranspose>
270 static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth,
271 int32_t aRows, int32_t aStride, IntRect aSkipRect) {
272 if (aTranspose) {
273 std::swap(aWidth, aRows);
274 aSkipRect.Swap();
277 MOZ_ASSERT(aWidth > 0);
279 // All three passes of the box blur that approximate the Gaussian are done
280 // on each row in turn, so we only need two temporary row buffers to process
281 // each row, instead of a full-sized buffer. Data moves from the source to the
282 // first temporary, from the first temporary to the second, then from the
283 // second back to the destination. This way is more cache-friendly than
284 // processing whe whole buffer in each pass and thus yields a nice speedup.
285 uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth];
286 if (!tmpRow) {
287 return;
289 uint8_t* tmpRow2 = tmpRow + aWidth;
291 const int32_t stride = aTranspose ? 1 : aStride;
292 bool skipRectCoversWholeRow =
293 0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
295 for (int32_t y = 0; y < aRows; y++) {
296 // Check whether the skip rect intersects this row. If the skip
297 // rect covers the whole surface in this row, we can avoid
298 // this row entirely (and any others along the skip rect).
299 bool inSkipRectY = aSkipRect.ContainsY(y);
300 if (inSkipRectY && skipRectCoversWholeRow) {
301 aData += stride * (aSkipRect.YMost() - y);
302 y = aSkipRect.YMost() - 1;
303 continue;
306 // Read in data from the source transposed if necessary.
307 BoxBlurRow<aTranspose, false>(aData, tmpRow, aLobes[0][0], aLobes[0][1],
308 aWidth, aStride, 0, aWidth);
310 // For the middle pass, the data is already pre-transposed and does not need
311 // to be post-transposed yet.
312 BoxBlurRow<false, false>(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1],
313 aWidth, aStride, 0, aWidth);
315 // Write back data to the destination transposed if necessary too.
316 // Make sure not to overwrite the skip rect by only outputting to the
317 // destination before and after the skip rect, if requested.
318 int32_t skipStart =
319 inSkipRectY ? std::clamp(aSkipRect.X(), 0, aWidth) : aWidth;
320 int32_t skipEnd = std::max(skipStart, aSkipRect.XMost());
321 if (skipStart > 0) {
322 BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
323 aWidth, aStride, 0, skipStart);
325 if (skipEnd < aWidth) {
326 BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
327 aWidth, aStride, skipEnd, aWidth);
330 aData += stride;
333 delete[] tmpRow;
336 static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) {
337 int32_t major, minor, final;
339 /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
340 * some notes about approximating the Gaussian blur with box-blurs.
341 * The comments below are in the terminology of that page.
343 int32_t z = aRadius / 3;
344 switch (aRadius % 3) {
345 case 0:
346 // aRadius = z*3; choose d = 2*z + 1
347 major = minor = final = z;
348 break;
349 case 1:
350 // aRadius = z*3 + 1
351 // This is a tricky case since there is no value of d which will
352 // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
353 // for some integer k, then the radius will be 3*k. If d is even,
354 // i.e. d=2*k, then the radius will be 3*k - 1.
355 // So we have to choose values that don't match the standard
356 // algorithm.
357 major = z + 1;
358 minor = final = z;
359 break;
360 case 2:
361 // aRadius = z*3 + 2; choose d = 2*z + 2
362 major = final = z + 1;
363 minor = z;
364 break;
365 default:
366 // Mathematical impossibility!
367 MOZ_ASSERT(false);
368 major = minor = final = 0;
370 MOZ_ASSERT(major + minor + final == aRadius);
372 aLobes[0][0] = major;
373 aLobes[0][1] = minor;
374 aLobes[1][0] = minor;
375 aLobes[1][1] = major;
376 aLobes[2][0] = final;
377 aLobes[2][1] = final;
380 static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
381 int32_t aWidth, int32_t aRows, int32_t aStride,
382 const IntRect& aSkipRect) {
383 if (aRadius == 0) {
384 memcpy(aOutput, aInput, aStride * aRows);
385 return;
388 bool skipRectCoversWholeRow =
389 0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
390 for (int32_t y = 0; y < aRows; y++) {
391 // Check whether the skip rect intersects this row. If the skip
392 // rect covers the whole surface in this row, we can avoid
393 // this row entirely (and any others along the skip rect).
394 bool inSkipRectY = aSkipRect.ContainsY(y);
395 if (inSkipRectY && skipRectCoversWholeRow) {
396 y = aSkipRect.YMost() - 1;
397 continue;
400 for (int32_t x = 0; x < aWidth; x++) {
401 // Check whether we are within the skip rect. If so, go
402 // to the next point outside the skip rect.
403 if (inSkipRectY && aSkipRect.ContainsX(x)) {
404 x = aSkipRect.XMost();
405 if (x >= aWidth) break;
408 int32_t sMin = std::max(x - aRadius, 0);
409 int32_t sMax = std::min(x + aRadius, aWidth - 1);
410 int32_t v = 0;
411 for (int32_t s = sMin; s <= sMax; ++s) {
412 v = std::max<int32_t>(v, aInput[aStride * y + s]);
414 aOutput[aStride * y + x] = v;
419 static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
420 int32_t aWidth, int32_t aRows, int32_t aStride,
421 const IntRect& aSkipRect) {
422 if (aRadius == 0) {
423 memcpy(aOutput, aInput, aStride * aRows);
424 return;
427 bool skipRectCoversWholeColumn =
428 0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost();
429 for (int32_t x = 0; x < aWidth; x++) {
430 bool inSkipRectX = aSkipRect.ContainsX(x);
431 if (inSkipRectX && skipRectCoversWholeColumn) {
432 x = aSkipRect.XMost() - 1;
433 continue;
436 for (int32_t y = 0; y < aRows; y++) {
437 // Check whether we are within the skip rect. If so, go
438 // to the next point outside the skip rect.
439 if (inSkipRectX && aSkipRect.ContainsY(y)) {
440 y = aSkipRect.YMost();
441 if (y >= aRows) break;
444 int32_t sMin = std::max(y - aRadius, 0);
445 int32_t sMax = std::min(y + aRadius, aRows - 1);
446 int32_t v = 0;
447 for (int32_t s = sMin; s <= sMax; ++s) {
448 v = std::max<int32_t>(v, aInput[aStride * s + x]);
450 aOutput[aStride * y + x] = v;
455 CheckedInt<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) {
456 CheckedInt<int32_t> val(aVal);
458 val += 3;
459 val /= 4;
460 val *= 4;
462 return val;
465 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius,
466 const IntSize& aBlurRadius, const Rect* aDirtyRect,
467 const Rect* aSkipRect)
468 : mStride(0), mSurfaceAllocationSize(0) {
469 Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect);
472 AlphaBoxBlur::AlphaBoxBlur()
473 : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
475 void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius,
476 const IntSize& aBlurRadius, const Rect* aDirtyRect,
477 const Rect* aSkipRect) {
478 mSpreadRadius = aSpreadRadius;
479 mBlurRadius = aBlurRadius;
481 Rect rect(aRect);
482 rect.Inflate(Size(aBlurRadius + aSpreadRadius));
483 rect.RoundOut();
485 if (aDirtyRect) {
486 // If we get passed a dirty rect from layout, we can minimize the
487 // shadow size and make painting faster.
488 mHasDirtyRect = true;
489 mDirtyRect = *aDirtyRect;
490 Rect requiredBlurArea = mDirtyRect.Intersect(rect);
491 requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
492 rect = requiredBlurArea.Intersect(rect);
493 } else {
494 mHasDirtyRect = false;
497 mRect = TruncatedToInt(rect);
498 if (mRect.IsEmpty()) {
499 return;
502 if (aSkipRect) {
503 // If we get passed a skip rect, we can lower the amount of
504 // blurring/spreading we need to do. We convert it to IntRect to avoid
505 // expensive int<->float conversions if we were to use Rect instead.
506 Rect skipRect = *aSkipRect;
507 skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
508 mSkipRect = RoundedIn(skipRect);
509 mSkipRect = mSkipRect.Intersect(mRect);
510 if (mSkipRect.IsEqualInterior(mRect)) {
511 return;
514 mSkipRect -= mRect.TopLeft();
515 // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
516 // accesses later are aligned as well, see bug 1622113.
517 mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4));
518 mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4));
519 if (mSkipRect.IsEmpty()) {
520 mSkipRect = IntRect();
522 } else {
523 mSkipRect = IntRect();
526 CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.Width());
527 if (stride.isValid()) {
528 mStride = stride.value();
530 // We need to leave room for an additional 3 bytes for a potential overrun
531 // in our blurring code.
532 size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3);
533 if (size != 0) {
534 mSurfaceAllocationSize = size;
539 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX,
540 float aSigmaY)
541 : mRect(TruncatedToInt(aRect)),
543 mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
544 mStride(aStride),
545 mSurfaceAllocationSize(0),
546 mHasDirtyRect(false) {
547 IntRect intRect;
548 if (aRect.ToIntRect(&intRect)) {
549 size_t minDataSize =
550 BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height());
551 if (minDataSize != 0) {
552 mSurfaceAllocationSize = minDataSize;
557 AlphaBoxBlur::~AlphaBoxBlur() = default;
559 IntSize AlphaBoxBlur::GetSize() const {
560 IntSize size(mRect.Width(), mRect.Height());
561 return size;
564 int32_t AlphaBoxBlur::GetStride() const { return mStride; }
566 IntRect AlphaBoxBlur::GetRect() const { return mRect; }
568 Rect* AlphaBoxBlur::GetDirtyRect() {
569 if (mHasDirtyRect) {
570 return &mDirtyRect;
573 return nullptr;
576 size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
577 return mSurfaceAllocationSize;
580 void AlphaBoxBlur::Blur(uint8_t* aData) const {
581 if (!aData) {
582 return;
585 // no need to do all this if not blurring or spreading
586 if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) {
587 int32_t stride = GetStride();
589 IntSize size = GetSize();
591 if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
592 // No need to use CheckedInt here - we have validated it in the
593 // constructor.
594 size_t szB = stride * size.height;
595 uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
597 if (!tmpData) {
598 return;
601 memset(tmpData, 0, szB);
603 SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width,
604 size.height, stride, mSkipRect);
605 SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width,
606 size.height, stride, mSkipRect);
608 delete[] tmpData;
611 int32_t horizontalLobes[3][2];
612 ComputeLobes(mBlurRadius.width, horizontalLobes);
613 int32_t verticalLobes[3][2];
614 ComputeLobes(mBlurRadius.height, verticalLobes);
616 // We want to allow for some extra space on the left for alignment reasons.
617 int32_t maxLeftLobe =
618 RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
620 IntSize integralImageSize(
621 size.width + maxLeftLobe + horizontalLobes[1][1],
622 size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
624 if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
625 // Fallback to old blurring code when the surface is so large it may
626 // overflow our integral image!
627 if (mBlurRadius.width > 0) {
628 BoxBlur<false>(aData, horizontalLobes, size.width, size.height, stride,
629 mSkipRect);
631 if (mBlurRadius.height > 0) {
632 BoxBlur<true>(aData, verticalLobes, size.width, size.height, stride,
633 mSkipRect);
635 } else {
636 size_t integralImageStride =
637 GetAlignedStride<16>(integralImageSize.width, 4);
638 if (integralImageStride == 0) {
639 return;
642 // We need to leave room for an additional 12 bytes for a maximum overrun
643 // of 3 pixels in the blurring code.
644 size_t bufLen = BufferSizeFromStrideAndHeight(
645 integralImageStride, integralImageSize.height, 12);
646 if (bufLen == 0) {
647 return;
649 // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
650 // we divide by 4.
651 AlignedArray<uint32_t> integralImage((bufLen / 4) +
652 ((bufLen % 4) ? 1 : 0));
654 if (!integralImage) {
655 return;
658 #ifdef USE_SSE2
659 if (Factory::HasSSE2()) {
660 BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1],
661 verticalLobes[0][0], verticalLobes[0][1], integralImage,
662 integralImageStride);
663 BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1],
664 verticalLobes[1][0], verticalLobes[1][1], integralImage,
665 integralImageStride);
666 BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1],
667 verticalLobes[2][0], verticalLobes[2][1], integralImage,
668 integralImageStride);
669 } else
670 #endif
671 #ifdef USE_NEON
672 if (mozilla::supports_neon()) {
673 BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1],
674 verticalLobes[0][0], verticalLobes[0][1], integralImage,
675 integralImageStride);
676 BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1],
677 verticalLobes[1][0], verticalLobes[1][1], integralImage,
678 integralImageStride);
679 BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1],
680 verticalLobes[2][0], verticalLobes[2][1], integralImage,
681 integralImageStride);
682 } else
683 #endif
685 #ifdef _MIPS_ARCH_LOONGSON3A
686 BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1],
687 verticalLobes[0][0], verticalLobes[0][1], integralImage,
688 integralImageStride);
689 BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1],
690 verticalLobes[1][0], verticalLobes[1][1], integralImage,
691 integralImageStride);
692 BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1],
693 verticalLobes[2][0], verticalLobes[2][1], integralImage,
694 integralImageStride);
695 #else
696 BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1],
697 verticalLobes[0][0], verticalLobes[0][1], integralImage,
698 integralImageStride);
699 BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1],
700 verticalLobes[1][0], verticalLobes[1][1], integralImage,
701 integralImageStride);
702 BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1],
703 verticalLobes[2][0], verticalLobes[2][1], integralImage,
704 integralImageStride);
705 #endif
711 MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest,
712 const uint8_t* aSource,
713 uint32_t* aPreviousRow,
714 const uint32_t& aSourceWidth,
715 const uint32_t& aLeftInflation,
716 const uint32_t& aRightInflation) {
717 uint32_t currentRowSum = 0;
718 uint32_t pixel = aSource[0];
719 for (uint32_t x = 0; x < aLeftInflation; x++) {
720 currentRowSum += pixel;
721 *aDest++ = currentRowSum + *aPreviousRow++;
723 for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation);
724 x += 4) {
725 uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
726 #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
727 currentRowSum += (alphaValues >> 24) & 0xff;
728 *aDest++ = *aPreviousRow++ + currentRowSum;
729 currentRowSum += (alphaValues >> 16) & 0xff;
730 *aDest++ = *aPreviousRow++ + currentRowSum;
731 currentRowSum += (alphaValues >> 8) & 0xff;
732 *aDest++ = *aPreviousRow++ + currentRowSum;
733 currentRowSum += alphaValues & 0xff;
734 *aDest++ = *aPreviousRow++ + currentRowSum;
735 #else
736 currentRowSum += alphaValues & 0xff;
737 *aDest++ = *aPreviousRow++ + currentRowSum;
738 alphaValues >>= 8;
739 currentRowSum += alphaValues & 0xff;
740 *aDest++ = *aPreviousRow++ + currentRowSum;
741 alphaValues >>= 8;
742 currentRowSum += alphaValues & 0xff;
743 *aDest++ = *aPreviousRow++ + currentRowSum;
744 alphaValues >>= 8;
745 currentRowSum += alphaValues & 0xff;
746 *aDest++ = *aPreviousRow++ + currentRowSum;
747 #endif
749 pixel = aSource[aSourceWidth - 1];
750 for (uint32_t x = (aSourceWidth + aLeftInflation);
751 x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
752 currentRowSum += pixel;
753 *aDest++ = currentRowSum + *aPreviousRow++;
757 MOZ_ALWAYS_INLINE void GenerateIntegralImage_C(
758 int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
759 int32_t aBottomInflation, uint32_t* aIntegralImage,
760 size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
761 const IntSize& aSize) {
762 uint32_t stride32bit = aIntegralImageStride / 4;
764 IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
765 aSize.height + aTopInflation + aBottomInflation);
767 memset(aIntegralImage, 0, aIntegralImageStride);
769 GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width,
770 aLeftInflation, aRightInflation);
771 for (int y = 1; y < aTopInflation + 1; y++) {
772 GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
773 aIntegralImage + (y - 1) * stride32bit, aSize.width,
774 aLeftInflation, aRightInflation);
777 for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
778 GenerateIntegralRow(aIntegralImage + (y * stride32bit),
779 aSource + aSourceStride * (y - aTopInflation),
780 aIntegralImage + (y - 1) * stride32bit, aSize.width,
781 aLeftInflation, aRightInflation);
784 if (aBottomInflation) {
785 for (int y = (aSize.height + aTopInflation); y < integralImageSize.height;
786 y++) {
787 GenerateIntegralRow(aIntegralImage + (y * stride32bit),
788 aSource + ((aSize.height - 1) * aSourceStride),
789 aIntegralImage + (y - 1) * stride32bit, aSize.width,
790 aLeftInflation, aRightInflation);
796 * Attempt to do an in-place box blur using an integral image.
798 void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe,
799 int32_t aRightLobe, int32_t aTopLobe,
800 int32_t aBottomLobe, uint32_t* aIntegralImage,
801 size_t aIntegralImageStride) const {
802 IntSize size = GetSize();
804 MOZ_ASSERT(size.width > 0);
806 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
807 // looking at an integral image the value of a pixel at 'x,y' is calculated
808 // using the value of the integral image values above/below that.
809 aLeftLobe++;
810 aTopLobe++;
811 int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
813 MOZ_ASSERT(boxSize > 0);
815 if (boxSize == 1) {
816 return;
819 int32_t stride32bit = aIntegralImageStride / 4;
821 int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
823 GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
824 aIntegralImage, aIntegralImageStride, aData, mStride,
825 size);
827 uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
829 uint32_t* innerIntegral =
830 aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
832 // Storing these locally makes this about 30% faster! Presumably the compiler
833 // can't be sure we're not altering the member variables in this loop.
834 IntRect skipRect = mSkipRect;
835 uint8_t* data = aData;
836 int32_t stride = mStride;
837 for (int32_t y = 0; y < size.height; y++) {
838 // Not using ContainsY(y) because we do not skip y == skipRect.Y()
839 // although that may not be done on purpose
840 bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost();
842 uint32_t* topLeftBase =
843 innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
844 uint32_t* topRightBase =
845 innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
846 uint32_t* bottomRightBase =
847 innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
848 uint32_t* bottomLeftBase =
849 innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
851 for (int32_t x = 0; x < size.width; x++) {
852 // Not using ContainsX(x) because we do not skip x == skipRect.X()
853 // although that may not be done on purpose
854 if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) {
855 x = skipRect.XMost() - 1;
856 // Trigger early jump on coming loop iterations, this will be reset
857 // next line anyway.
858 inSkipRectY = false;
859 continue;
861 int32_t topLeft = topLeftBase[x];
862 int32_t topRight = topRightBase[x];
863 int32_t bottomRight = bottomRightBase[x];
864 int32_t bottomLeft = bottomLeftBase[x];
866 uint32_t value = bottomRight - topRight - bottomLeft;
867 value += topLeft;
869 data[stride * y + x] =
870 (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
876 * Compute the box blur size (which we're calling the blur radius) from
877 * the standard deviation.
879 * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
880 * approximating a Gaussian using box blurs. This yields quite a good
881 * approximation for a Gaussian. Then we multiply this by 1.5 since our
882 * code wants the radius of the entire triple-box-blur kernel instead of
883 * the diameter of an individual box blur. For more details, see:
884 * http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
885 * https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
887 constexpr double sqrt_2_PI = 0x1.40d931ff62705p+1; // sqrt is not constexpr
888 static constexpr Float GAUSSIAN_SCALE_FACTOR = Float((3 * sqrt_2_PI / 4) * 1.5);
890 IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) {
891 IntSize size(
892 static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
893 static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));
895 return size;
898 Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) {
899 return aBlurRadius / GAUSSIAN_SCALE_FACTOR;
902 } // namespace gfx
903 } // namespace mozilla