1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
13 #include "mozilla/CheckedInt.h"
14 #include "NumericTools.h"
17 #include "DataSurfaceHelpers.h"
21 # include "mozilla/arm.h"
28 * Helper function to process each row of the box blur.
29 * It takes care of transposing the data on input or output depending
30 * on whether we intend a horizontal or vertical blur, and whether we're
31 * reading from the initial source or writing to the final destination.
32 * It allows starting or ending anywhere within the row to accomodate
35 template <bool aTransposeInput
, bool aTransposeOutput
>
36 static inline void BoxBlurRow(const uint8_t* aInput
, uint8_t* aOutput
,
37 int32_t aLeftLobe
, int32_t aRightLobe
,
38 int32_t aWidth
, int32_t aStride
, int32_t aStart
,
40 // If the input or output is transposed, then we will move down a row
41 // for each step, instead of moving over a column. Since these values
42 // only depend on a template parameter, they will more easily get
43 // copy-propagated in the non-transposed case, which is why they
44 // are not passed as parameters.
45 const int32_t inputStep
= aTransposeInput
? aStride
: 1;
46 const int32_t outputStep
= aTransposeOutput
? aStride
: 1;
48 // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
49 // to the right of the current position, then average them. So this is
50 // the size of the total width of this filter.
51 const int32_t boxSize
= aLeftLobe
+ aRightLobe
+ 1;
53 // Instead of dividing the pixel sum by boxSize to average, we can just
54 // compute a scale that will normalize the result so that it can be quickly
55 // shifted into the desired range.
56 const uint32_t reciprocal
= (1 << 24) / boxSize
;
58 // The shift would normally truncate the result, whereas we would rather
59 // prefer to round the result to the closest increment. By adding 0.5 units
60 // to the initial sum, we bias the sum so that it will be rounded by the
61 // truncation instead.
62 uint32_t alphaSum
= (boxSize
+ 1) / 2;
64 // We process the row with a moving filter, keeping a sum (alphaSum) of
65 // boxSize pixels. As we move over a pixel, we need to add on a pixel
66 // from the right extreme of the window that moved into range, and subtract
67 // off a pixel from the left extreme of window that moved out of range.
68 // But first, we need to initialization alphaSum to the contents of
69 // the window before we can get going. If the window moves out of bounds
70 // of the row, we clamp each sample to be the closest pixel from within
71 // row bounds, so the 0th and aWidth-1th pixel.
72 int32_t initLeft
= aStart
- aLeftLobe
;
74 // If the left lobe samples before the row, add in clamped samples.
75 alphaSum
+= -initLeft
* aInput
[0];
78 int32_t initRight
= aStart
+ boxSize
- aLeftLobe
;
79 if (initRight
> aWidth
) {
80 // If the right lobe samples after the row, add in clamped samples.
81 alphaSum
+= (initRight
- aWidth
) * aInput
[(aWidth
- 1) * inputStep
];
84 // Finally, add in all the valid, non-clamped samples to fill up the
85 // rest of the window.
86 const uint8_t* src
= &aInput
[initLeft
* inputStep
];
87 const uint8_t* iterEnd
= &aInput
[initRight
* inputStep
];
93 // We unroll the per-pixel loop here substantially. The amount of work
94 // done per sample is so small that the cost of a loop condition check
95 // and a branch can substantially add to or even dominate the performance
97 while (src
+ 16 * inputStep
<= iterEnd
) {
115 while (src
< iterEnd
) {
119 // Now we start moving the window over the row. We will be accessing
120 // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
121 // out of bounds of the row. To avoid having to check within the inner
122 // loops if we are in bound, we instead compute the points at which
123 // we will move out of bounds of the row on the left side (splitLeft)
124 // and right side (splitRight).
125 int32_t splitLeft
= std::clamp(aLeftLobe
, aStart
, aEnd
);
126 int32_t splitRight
= std::clamp(aWidth
- (boxSize
- aLeftLobe
), aStart
, aEnd
);
127 // If the filter window is actually large than the size of the row,
128 // there will be a middle area of overlap where the leftmost and rightmost
129 // pixel of the filter will both be outside the row. In this case, we need
130 // to invert the splits so that splitLeft <= splitRight.
131 if (boxSize
> aWidth
) {
132 std::swap(splitLeft
, splitRight
);
135 // Process all pixels up to splitLeft that would sample before the start of
136 // the row. Note that because inputStep and outputStep may not be a const 1
137 // value, it is more performant to increment pointers here for the source and
138 // destination rather than use a loop counter, since doing so would entail an
139 // expensive multiplication that significantly slows down the loop.
140 uint8_t* dst
= &aOutput
[aStart
* outputStep
];
141 iterEnd
= &aOutput
[splitLeft
* outputStep
];
142 src
= &aInput
[(aStart
+ boxSize
- aLeftLobe
) * inputStep
];
143 uint8_t firstVal
= aInput
[0];
146 *dst = (alphaSum * reciprocal) >> 24; \
147 alphaSum += *src - firstVal; \
151 while (dst
+ 16 * outputStep
<= iterEnd
) {
169 while (dst
< iterEnd
) {
173 // Process all pixels between splitLeft and splitRight.
174 iterEnd
= &aOutput
[splitRight
* outputStep
];
175 if (boxSize
<= aWidth
) {
176 // The filter window is smaller than the row size, so the leftmost and
177 // rightmost samples are both within row bounds.
178 src
= &aInput
[(splitLeft
- aLeftLobe
) * inputStep
];
179 int32_t boxStep
= boxSize
* inputStep
;
181 #define CENTER_ITER \
182 *dst = (alphaSum * reciprocal) >> 24; \
183 alphaSum += src[boxStep] - *src; \
187 while (dst
+ 16 * outputStep
<= iterEnd
) {
205 while (dst
< iterEnd
) {
209 // The filter window is larger than the row size, and we're in the area of
210 // split overlap. So the leftmost and rightmost samples are both out of
211 // bounds and need to be clamped. We can just precompute the difference here
213 int32_t firstLastDiff
= aInput
[(aWidth
- 1) * inputStep
] - aInput
[0];
214 while (dst
< iterEnd
) {
215 *dst
= (alphaSum
* reciprocal
) >> 24;
216 alphaSum
+= firstLastDiff
;
221 // Process all remaining pixels after splitRight that would sample after the
223 iterEnd
= &aOutput
[aEnd
* outputStep
];
224 src
= &aInput
[(splitRight
- aLeftLobe
) * inputStep
];
225 uint8_t lastVal
= aInput
[(aWidth
- 1) * inputStep
];
228 *dst = (alphaSum * reciprocal) >> 24; \
229 alphaSum += lastVal - *src; \
233 while (dst
+ 16 * outputStep
<= iterEnd
) {
251 while (dst
< iterEnd
) {
257 * Box blur involves looking at one pixel, and setting its value to the average
258 * of its neighbouring pixels. This is meant to provide a 3-pass approximation
259 * of a Gaussian blur.
260 * @param aTranspose Whether to transpose the buffer when reading and writing
262 * @param aData The buffer to be blurred.
263 * @param aLobes The number of pixels to blend on the left and right for each of
265 * @param aWidth The number of columns in the buffers.
266 * @param aRows The number of rows in the buffers.
267 * @param aStride The stride of the buffer.
269 template <bool aTranspose
>
270 static void BoxBlur(uint8_t* aData
, const int32_t aLobes
[3][2], int32_t aWidth
,
271 int32_t aRows
, int32_t aStride
, IntRect aSkipRect
) {
273 std::swap(aWidth
, aRows
);
277 MOZ_ASSERT(aWidth
> 0);
279 // All three passes of the box blur that approximate the Gaussian are done
280 // on each row in turn, so we only need two temporary row buffers to process
281 // each row, instead of a full-sized buffer. Data moves from the source to the
282 // first temporary, from the first temporary to the second, then from the
283 // second back to the destination. This way is more cache-friendly than
284 // processing whe whole buffer in each pass and thus yields a nice speedup.
285 uint8_t* tmpRow
= new (std::nothrow
) uint8_t[2 * aWidth
];
289 uint8_t* tmpRow2
= tmpRow
+ aWidth
;
291 const int32_t stride
= aTranspose
? 1 : aStride
;
292 bool skipRectCoversWholeRow
=
293 0 >= aSkipRect
.X() && aWidth
<= aSkipRect
.XMost();
295 for (int32_t y
= 0; y
< aRows
; y
++) {
296 // Check whether the skip rect intersects this row. If the skip
297 // rect covers the whole surface in this row, we can avoid
298 // this row entirely (and any others along the skip rect).
299 bool inSkipRectY
= aSkipRect
.ContainsY(y
);
300 if (inSkipRectY
&& skipRectCoversWholeRow
) {
301 aData
+= stride
* (aSkipRect
.YMost() - y
);
302 y
= aSkipRect
.YMost() - 1;
306 // Read in data from the source transposed if necessary.
307 BoxBlurRow
<aTranspose
, false>(aData
, tmpRow
, aLobes
[0][0], aLobes
[0][1],
308 aWidth
, aStride
, 0, aWidth
);
310 // For the middle pass, the data is already pre-transposed and does not need
311 // to be post-transposed yet.
312 BoxBlurRow
<false, false>(tmpRow
, tmpRow2
, aLobes
[1][0], aLobes
[1][1],
313 aWidth
, aStride
, 0, aWidth
);
315 // Write back data to the destination transposed if necessary too.
316 // Make sure not to overwrite the skip rect by only outputting to the
317 // destination before and after the skip rect, if requested.
319 inSkipRectY
? std::clamp(aSkipRect
.X(), 0, aWidth
) : aWidth
;
320 int32_t skipEnd
= std::max(skipStart
, aSkipRect
.XMost());
322 BoxBlurRow
<false, aTranspose
>(tmpRow2
, aData
, aLobes
[2][0], aLobes
[2][1],
323 aWidth
, aStride
, 0, skipStart
);
325 if (skipEnd
< aWidth
) {
326 BoxBlurRow
<false, aTranspose
>(tmpRow2
, aData
, aLobes
[2][0], aLobes
[2][1],
327 aWidth
, aStride
, skipEnd
, aWidth
);
336 static void ComputeLobes(int32_t aRadius
, int32_t aLobes
[3][2]) {
337 int32_t major
, minor
, final
;
339 /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
340 * some notes about approximating the Gaussian blur with box-blurs.
341 * The comments below are in the terminology of that page.
343 int32_t z
= aRadius
/ 3;
344 switch (aRadius
% 3) {
346 // aRadius = z*3; choose d = 2*z + 1
347 major
= minor
= final
= z
;
351 // This is a tricky case since there is no value of d which will
352 // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
353 // for some integer k, then the radius will be 3*k. If d is even,
354 // i.e. d=2*k, then the radius will be 3*k - 1.
355 // So we have to choose values that don't match the standard
361 // aRadius = z*3 + 2; choose d = 2*z + 2
362 major
= final
= z
+ 1;
366 // Mathematical impossibility!
368 major
= minor
= final
= 0;
370 MOZ_ASSERT(major
+ minor
+ final
== aRadius
);
372 aLobes
[0][0] = major
;
373 aLobes
[0][1] = minor
;
374 aLobes
[1][0] = minor
;
375 aLobes
[1][1] = major
;
376 aLobes
[2][0] = final
;
377 aLobes
[2][1] = final
;
380 static void SpreadHorizontal(uint8_t* aInput
, uint8_t* aOutput
, int32_t aRadius
,
381 int32_t aWidth
, int32_t aRows
, int32_t aStride
,
382 const IntRect
& aSkipRect
) {
384 memcpy(aOutput
, aInput
, aStride
* aRows
);
388 bool skipRectCoversWholeRow
=
389 0 >= aSkipRect
.X() && aWidth
<= aSkipRect
.XMost();
390 for (int32_t y
= 0; y
< aRows
; y
++) {
391 // Check whether the skip rect intersects this row. If the skip
392 // rect covers the whole surface in this row, we can avoid
393 // this row entirely (and any others along the skip rect).
394 bool inSkipRectY
= aSkipRect
.ContainsY(y
);
395 if (inSkipRectY
&& skipRectCoversWholeRow
) {
396 y
= aSkipRect
.YMost() - 1;
400 for (int32_t x
= 0; x
< aWidth
; x
++) {
401 // Check whether we are within the skip rect. If so, go
402 // to the next point outside the skip rect.
403 if (inSkipRectY
&& aSkipRect
.ContainsX(x
)) {
404 x
= aSkipRect
.XMost();
405 if (x
>= aWidth
) break;
408 int32_t sMin
= std::max(x
- aRadius
, 0);
409 int32_t sMax
= std::min(x
+ aRadius
, aWidth
- 1);
411 for (int32_t s
= sMin
; s
<= sMax
; ++s
) {
412 v
= std::max
<int32_t>(v
, aInput
[aStride
* y
+ s
]);
414 aOutput
[aStride
* y
+ x
] = v
;
419 static void SpreadVertical(uint8_t* aInput
, uint8_t* aOutput
, int32_t aRadius
,
420 int32_t aWidth
, int32_t aRows
, int32_t aStride
,
421 const IntRect
& aSkipRect
) {
423 memcpy(aOutput
, aInput
, aStride
* aRows
);
427 bool skipRectCoversWholeColumn
=
428 0 >= aSkipRect
.Y() && aRows
<= aSkipRect
.YMost();
429 for (int32_t x
= 0; x
< aWidth
; x
++) {
430 bool inSkipRectX
= aSkipRect
.ContainsX(x
);
431 if (inSkipRectX
&& skipRectCoversWholeColumn
) {
432 x
= aSkipRect
.XMost() - 1;
436 for (int32_t y
= 0; y
< aRows
; y
++) {
437 // Check whether we are within the skip rect. If so, go
438 // to the next point outside the skip rect.
439 if (inSkipRectX
&& aSkipRect
.ContainsY(y
)) {
440 y
= aSkipRect
.YMost();
441 if (y
>= aRows
) break;
444 int32_t sMin
= std::max(y
- aRadius
, 0);
445 int32_t sMax
= std::min(y
+ aRadius
, aRows
- 1);
447 for (int32_t s
= sMin
; s
<= sMax
; ++s
) {
448 v
= std::max
<int32_t>(v
, aInput
[aStride
* s
+ x
]);
450 aOutput
[aStride
* y
+ x
] = v
;
455 CheckedInt
<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal
) {
456 CheckedInt
<int32_t> val(aVal
);
465 AlphaBoxBlur::AlphaBoxBlur(const Rect
& aRect
, const IntSize
& aSpreadRadius
,
466 const IntSize
& aBlurRadius
, const Rect
* aDirtyRect
,
467 const Rect
* aSkipRect
)
468 : mStride(0), mSurfaceAllocationSize(0) {
469 Init(aRect
, aSpreadRadius
, aBlurRadius
, aDirtyRect
, aSkipRect
);
472 AlphaBoxBlur::AlphaBoxBlur()
473 : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
475 void AlphaBoxBlur::Init(const Rect
& aRect
, const IntSize
& aSpreadRadius
,
476 const IntSize
& aBlurRadius
, const Rect
* aDirtyRect
,
477 const Rect
* aSkipRect
) {
478 mSpreadRadius
= aSpreadRadius
;
479 mBlurRadius
= aBlurRadius
;
482 rect
.Inflate(Size(aBlurRadius
+ aSpreadRadius
));
486 // If we get passed a dirty rect from layout, we can minimize the
487 // shadow size and make painting faster.
488 mHasDirtyRect
= true;
489 mDirtyRect
= *aDirtyRect
;
490 Rect requiredBlurArea
= mDirtyRect
.Intersect(rect
);
491 requiredBlurArea
.Inflate(Size(aBlurRadius
+ aSpreadRadius
));
492 rect
= requiredBlurArea
.Intersect(rect
);
494 mHasDirtyRect
= false;
497 mRect
= TruncatedToInt(rect
);
498 if (mRect
.IsEmpty()) {
503 // If we get passed a skip rect, we can lower the amount of
504 // blurring/spreading we need to do. We convert it to IntRect to avoid
505 // expensive int<->float conversions if we were to use Rect instead.
506 Rect skipRect
= *aSkipRect
;
507 skipRect
.Deflate(Size(aBlurRadius
+ aSpreadRadius
));
508 mSkipRect
= RoundedIn(skipRect
);
509 mSkipRect
= mSkipRect
.Intersect(mRect
);
510 if (mSkipRect
.IsEqualInterior(mRect
)) {
514 mSkipRect
-= mRect
.TopLeft();
515 // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
516 // accesses later are aligned as well, see bug 1622113.
517 mSkipRect
.SetLeftEdge(RoundUpToMultiple(mSkipRect
.X(), 4));
518 mSkipRect
.SetRightEdge(RoundDownToMultiple(mSkipRect
.XMost(), 4));
519 if (mSkipRect
.IsEmpty()) {
520 mSkipRect
= IntRect();
523 mSkipRect
= IntRect();
526 CheckedInt
<int32_t> stride
= RoundUpToMultipleOf4(mRect
.Width());
527 if (stride
.isValid()) {
528 mStride
= stride
.value();
530 // We need to leave room for an additional 3 bytes for a potential overrun
531 // in our blurring code.
532 size_t size
= BufferSizeFromStrideAndHeight(mStride
, mRect
.Height(), 3);
534 mSurfaceAllocationSize
= size
;
539 AlphaBoxBlur::AlphaBoxBlur(const Rect
& aRect
, int32_t aStride
, float aSigmaX
,
541 : mRect(TruncatedToInt(aRect
)),
543 mBlurRadius(CalculateBlurRadius(Point(aSigmaX
, aSigmaY
))),
545 mSurfaceAllocationSize(0),
546 mHasDirtyRect(false) {
548 if (aRect
.ToIntRect(&intRect
)) {
550 BufferSizeFromStrideAndHeight(intRect
.Width(), intRect
.Height());
551 if (minDataSize
!= 0) {
552 mSurfaceAllocationSize
= minDataSize
;
557 AlphaBoxBlur::~AlphaBoxBlur() = default;
559 IntSize
AlphaBoxBlur::GetSize() const {
560 IntSize
size(mRect
.Width(), mRect
.Height());
564 int32_t AlphaBoxBlur::GetStride() const { return mStride
; }
566 IntRect
AlphaBoxBlur::GetRect() const { return mRect
; }
568 Rect
* AlphaBoxBlur::GetDirtyRect() {
576 size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
577 return mSurfaceAllocationSize
;
580 void AlphaBoxBlur::Blur(uint8_t* aData
) const {
585 // no need to do all this if not blurring or spreading
586 if (mBlurRadius
!= IntSize(0, 0) || mSpreadRadius
!= IntSize(0, 0)) {
587 int32_t stride
= GetStride();
589 IntSize size
= GetSize();
591 if (mSpreadRadius
.width
> 0 || mSpreadRadius
.height
> 0) {
592 // No need to use CheckedInt here - we have validated it in the
594 size_t szB
= stride
* size
.height
;
595 uint8_t* tmpData
= new (std::nothrow
) uint8_t[szB
];
601 memset(tmpData
, 0, szB
);
603 SpreadHorizontal(aData
, tmpData
, mSpreadRadius
.width
, size
.width
,
604 size
.height
, stride
, mSkipRect
);
605 SpreadVertical(tmpData
, aData
, mSpreadRadius
.height
, size
.width
,
606 size
.height
, stride
, mSkipRect
);
611 int32_t horizontalLobes
[3][2];
612 ComputeLobes(mBlurRadius
.width
, horizontalLobes
);
613 int32_t verticalLobes
[3][2];
614 ComputeLobes(mBlurRadius
.height
, verticalLobes
);
616 // We want to allow for some extra space on the left for alignment reasons.
617 int32_t maxLeftLobe
=
618 RoundUpToMultipleOf4(horizontalLobes
[0][0] + 1).value();
620 IntSize
integralImageSize(
621 size
.width
+ maxLeftLobe
+ horizontalLobes
[1][1],
622 size
.height
+ verticalLobes
[0][0] + verticalLobes
[1][1] + 1);
624 if ((integralImageSize
.width
* integralImageSize
.height
) > (1 << 24)) {
625 // Fallback to old blurring code when the surface is so large it may
626 // overflow our integral image!
627 if (mBlurRadius
.width
> 0) {
628 BoxBlur
<false>(aData
, horizontalLobes
, size
.width
, size
.height
, stride
,
631 if (mBlurRadius
.height
> 0) {
632 BoxBlur
<true>(aData
, verticalLobes
, size
.width
, size
.height
, stride
,
636 size_t integralImageStride
=
637 GetAlignedStride
<16>(integralImageSize
.width
, 4);
638 if (integralImageStride
== 0) {
642 // We need to leave room for an additional 12 bytes for a maximum overrun
643 // of 3 pixels in the blurring code.
644 size_t bufLen
= BufferSizeFromStrideAndHeight(
645 integralImageStride
, integralImageSize
.height
, 12);
649 // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
651 AlignedArray
<uint32_t> integralImage((bufLen
/ 4) +
652 ((bufLen
% 4) ? 1 : 0));
654 if (!integralImage
) {
659 if (Factory::HasSSE2()) {
660 BoxBlur_SSE2(aData
, horizontalLobes
[0][0], horizontalLobes
[0][1],
661 verticalLobes
[0][0], verticalLobes
[0][1], integralImage
,
662 integralImageStride
);
663 BoxBlur_SSE2(aData
, horizontalLobes
[1][0], horizontalLobes
[1][1],
664 verticalLobes
[1][0], verticalLobes
[1][1], integralImage
,
665 integralImageStride
);
666 BoxBlur_SSE2(aData
, horizontalLobes
[2][0], horizontalLobes
[2][1],
667 verticalLobes
[2][0], verticalLobes
[2][1], integralImage
,
668 integralImageStride
);
672 if (mozilla::supports_neon()) {
673 BoxBlur_NEON(aData
, horizontalLobes
[0][0], horizontalLobes
[0][1],
674 verticalLobes
[0][0], verticalLobes
[0][1], integralImage
,
675 integralImageStride
);
676 BoxBlur_NEON(aData
, horizontalLobes
[1][0], horizontalLobes
[1][1],
677 verticalLobes
[1][0], verticalLobes
[1][1], integralImage
,
678 integralImageStride
);
679 BoxBlur_NEON(aData
, horizontalLobes
[2][0], horizontalLobes
[2][1],
680 verticalLobes
[2][0], verticalLobes
[2][1], integralImage
,
681 integralImageStride
);
685 #ifdef _MIPS_ARCH_LOONGSON3A
686 BoxBlur_LS3(aData
, horizontalLobes
[0][0], horizontalLobes
[0][1],
687 verticalLobes
[0][0], verticalLobes
[0][1], integralImage
,
688 integralImageStride
);
689 BoxBlur_LS3(aData
, horizontalLobes
[1][0], horizontalLobes
[1][1],
690 verticalLobes
[1][0], verticalLobes
[1][1], integralImage
,
691 integralImageStride
);
692 BoxBlur_LS3(aData
, horizontalLobes
[2][0], horizontalLobes
[2][1],
693 verticalLobes
[2][0], verticalLobes
[2][1], integralImage
,
694 integralImageStride
);
696 BoxBlur_C(aData
, horizontalLobes
[0][0], horizontalLobes
[0][1],
697 verticalLobes
[0][0], verticalLobes
[0][1], integralImage
,
698 integralImageStride
);
699 BoxBlur_C(aData
, horizontalLobes
[1][0], horizontalLobes
[1][1],
700 verticalLobes
[1][0], verticalLobes
[1][1], integralImage
,
701 integralImageStride
);
702 BoxBlur_C(aData
, horizontalLobes
[2][0], horizontalLobes
[2][1],
703 verticalLobes
[2][0], verticalLobes
[2][1], integralImage
,
704 integralImageStride
);
711 MOZ_ALWAYS_INLINE
void GenerateIntegralRow(uint32_t* aDest
,
712 const uint8_t* aSource
,
713 uint32_t* aPreviousRow
,
714 const uint32_t& aSourceWidth
,
715 const uint32_t& aLeftInflation
,
716 const uint32_t& aRightInflation
) {
717 uint32_t currentRowSum
= 0;
718 uint32_t pixel
= aSource
[0];
719 for (uint32_t x
= 0; x
< aLeftInflation
; x
++) {
720 currentRowSum
+= pixel
;
721 *aDest
++ = currentRowSum
+ *aPreviousRow
++;
723 for (uint32_t x
= aLeftInflation
; x
< (aSourceWidth
+ aLeftInflation
);
725 uint32_t alphaValues
= *(uint32_t*)(aSource
+ (x
- aLeftInflation
));
726 #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
727 currentRowSum
+= (alphaValues
>> 24) & 0xff;
728 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
729 currentRowSum
+= (alphaValues
>> 16) & 0xff;
730 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
731 currentRowSum
+= (alphaValues
>> 8) & 0xff;
732 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
733 currentRowSum
+= alphaValues
& 0xff;
734 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
736 currentRowSum
+= alphaValues
& 0xff;
737 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
739 currentRowSum
+= alphaValues
& 0xff;
740 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
742 currentRowSum
+= alphaValues
& 0xff;
743 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
745 currentRowSum
+= alphaValues
& 0xff;
746 *aDest
++ = *aPreviousRow
++ + currentRowSum
;
749 pixel
= aSource
[aSourceWidth
- 1];
750 for (uint32_t x
= (aSourceWidth
+ aLeftInflation
);
751 x
< (aSourceWidth
+ aLeftInflation
+ aRightInflation
); x
++) {
752 currentRowSum
+= pixel
;
753 *aDest
++ = currentRowSum
+ *aPreviousRow
++;
757 MOZ_ALWAYS_INLINE
void GenerateIntegralImage_C(
758 int32_t aLeftInflation
, int32_t aRightInflation
, int32_t aTopInflation
,
759 int32_t aBottomInflation
, uint32_t* aIntegralImage
,
760 size_t aIntegralImageStride
, uint8_t* aSource
, int32_t aSourceStride
,
761 const IntSize
& aSize
) {
762 uint32_t stride32bit
= aIntegralImageStride
/ 4;
764 IntSize
integralImageSize(aSize
.width
+ aLeftInflation
+ aRightInflation
,
765 aSize
.height
+ aTopInflation
+ aBottomInflation
);
767 memset(aIntegralImage
, 0, aIntegralImageStride
);
769 GenerateIntegralRow(aIntegralImage
, aSource
, aIntegralImage
, aSize
.width
,
770 aLeftInflation
, aRightInflation
);
771 for (int y
= 1; y
< aTopInflation
+ 1; y
++) {
772 GenerateIntegralRow(aIntegralImage
+ (y
* stride32bit
), aSource
,
773 aIntegralImage
+ (y
- 1) * stride32bit
, aSize
.width
,
774 aLeftInflation
, aRightInflation
);
777 for (int y
= aTopInflation
+ 1; y
< (aSize
.height
+ aTopInflation
); y
++) {
778 GenerateIntegralRow(aIntegralImage
+ (y
* stride32bit
),
779 aSource
+ aSourceStride
* (y
- aTopInflation
),
780 aIntegralImage
+ (y
- 1) * stride32bit
, aSize
.width
,
781 aLeftInflation
, aRightInflation
);
784 if (aBottomInflation
) {
785 for (int y
= (aSize
.height
+ aTopInflation
); y
< integralImageSize
.height
;
787 GenerateIntegralRow(aIntegralImage
+ (y
* stride32bit
),
788 aSource
+ ((aSize
.height
- 1) * aSourceStride
),
789 aIntegralImage
+ (y
- 1) * stride32bit
, aSize
.width
,
790 aLeftInflation
, aRightInflation
);
796 * Attempt to do an in-place box blur using an integral image.
798 void AlphaBoxBlur::BoxBlur_C(uint8_t* aData
, int32_t aLeftLobe
,
799 int32_t aRightLobe
, int32_t aTopLobe
,
800 int32_t aBottomLobe
, uint32_t* aIntegralImage
,
801 size_t aIntegralImageStride
) const {
802 IntSize size
= GetSize();
804 MOZ_ASSERT(size
.width
> 0);
806 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
807 // looking at an integral image the value of a pixel at 'x,y' is calculated
808 // using the value of the integral image values above/below that.
811 int32_t boxSize
= (aLeftLobe
+ aRightLobe
) * (aTopLobe
+ aBottomLobe
);
813 MOZ_ASSERT(boxSize
> 0);
819 int32_t stride32bit
= aIntegralImageStride
/ 4;
821 int32_t leftInflation
= RoundUpToMultipleOf4(aLeftLobe
).value();
823 GenerateIntegralImage_C(leftInflation
, aRightLobe
, aTopLobe
, aBottomLobe
,
824 aIntegralImage
, aIntegralImageStride
, aData
, mStride
,
827 uint32_t reciprocal
= uint32_t((uint64_t(1) << 32) / boxSize
);
829 uint32_t* innerIntegral
=
830 aIntegralImage
+ (aTopLobe
* stride32bit
) + leftInflation
;
832 // Storing these locally makes this about 30% faster! Presumably the compiler
833 // can't be sure we're not altering the member variables in this loop.
834 IntRect skipRect
= mSkipRect
;
835 uint8_t* data
= aData
;
836 int32_t stride
= mStride
;
837 for (int32_t y
= 0; y
< size
.height
; y
++) {
838 // Not using ContainsY(y) because we do not skip y == skipRect.Y()
839 // although that may not be done on purpose
840 bool inSkipRectY
= y
> skipRect
.Y() && y
< skipRect
.YMost();
842 uint32_t* topLeftBase
=
843 innerIntegral
+ ((y
- aTopLobe
) * stride32bit
- aLeftLobe
);
844 uint32_t* topRightBase
=
845 innerIntegral
+ ((y
- aTopLobe
) * stride32bit
+ aRightLobe
);
846 uint32_t* bottomRightBase
=
847 innerIntegral
+ ((y
+ aBottomLobe
) * stride32bit
+ aRightLobe
);
848 uint32_t* bottomLeftBase
=
849 innerIntegral
+ ((y
+ aBottomLobe
) * stride32bit
- aLeftLobe
);
851 for (int32_t x
= 0; x
< size
.width
; x
++) {
852 // Not using ContainsX(x) because we do not skip x == skipRect.X()
853 // although that may not be done on purpose
854 if (inSkipRectY
&& x
> skipRect
.X() && x
< skipRect
.XMost()) {
855 x
= skipRect
.XMost() - 1;
856 // Trigger early jump on coming loop iterations, this will be reset
861 int32_t topLeft
= topLeftBase
[x
];
862 int32_t topRight
= topRightBase
[x
];
863 int32_t bottomRight
= bottomRightBase
[x
];
864 int32_t bottomLeft
= bottomLeftBase
[x
];
866 uint32_t value
= bottomRight
- topRight
- bottomLeft
;
869 data
[stride
* y
+ x
] =
870 (uint64_t(reciprocal
) * value
+ (uint64_t(1) << 31)) >> 32;
876 * Compute the box blur size (which we're calling the blur radius) from
877 * the standard deviation.
879 * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
880 * approximating a Gaussian using box blurs. This yields quite a good
881 * approximation for a Gaussian. Then we multiply this by 1.5 since our
882 * code wants the radius of the entire triple-box-blur kernel instead of
883 * the diameter of an individual box blur. For more details, see:
884 * http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
885 * https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
887 constexpr double sqrt_2_PI
= 0x1.40d931ff62705p
+1; // sqrt is not constexpr
888 static constexpr Float GAUSSIAN_SCALE_FACTOR
= Float((3 * sqrt_2_PI
/ 4) * 1.5);
890 IntSize
AlphaBoxBlur::CalculateBlurRadius(const Point
& aStd
) {
892 static_cast<int32_t>(floor(aStd
.x
* GAUSSIAN_SCALE_FACTOR
+ 0.5f
)),
893 static_cast<int32_t>(floor(aStd
.y
* GAUSSIAN_SCALE_FACTOR
+ 0.5f
)));
898 Float
AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius
) {
899 return aBlurRadius
/ GAUSSIAN_SCALE_FACTOR
;
903 } // namespace mozilla