gfx/2d/Blur.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "Blur.h"
   8
   9 #include <algorithm>
  10 #include <math.h>
  11 #include <string.h>
  12
  13 #include "mozilla/CheckedInt.h"
  14 #include "NumericTools.h"
  15
  16 #include "2D.h"
  17 #include "DataSurfaceHelpers.h"
  18 #include "Tools.h"
  19
  20 #ifdef USE_NEON
  21 #  include "mozilla/arm.h"
  22 #endif
  23
  24 namespace mozilla {
  25 namespace gfx {
  26
  27 /**
  28  * Helper function to process each row of the box blur.
  29  * It takes care of transposing the data on input or output depending
  30  * on whether we intend a horizontal or vertical blur, and whether we're
  31  * reading from the initial source or writing to the final destination.
  32  * It allows starting or ending anywhere within the row to accomodate
  33  * a skip rect.
  34  */
  35 template <bool aTransposeInput, bool aTransposeOutput>
  36 static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput,
  37                               int32_t aLeftLobe, int32_t aRightLobe,
  38                               int32_t aWidth, int32_t aStride, int32_t aStart,
  39                               int32_t aEnd) {
  40   // If the input or output is transposed, then we will move down a row
  41   // for each step, instead of moving over a column. Since these values
  42   // only depend on a template parameter, they will more easily get
  43   // copy-propagated in the non-transposed case, which is why they
  44   // are not passed as parameters.
  45   const int32_t inputStep = aTransposeInput ? aStride : 1;
  46   const int32_t outputStep = aTransposeOutput ? aStride : 1;
  47
  48   // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
  49   // to the right of the current position, then average them. So this is
  50   // the size of the total width of this filter.
  51   const int32_t boxSize = aLeftLobe + aRightLobe + 1;
  52
  53   // Instead of dividing the pixel sum by boxSize to average, we can just
  54   // compute a scale that will normalize the result so that it can be quickly
  55   // shifted into the desired range.
  56   const uint32_t reciprocal = (1 << 24) / boxSize;
  57
  58   // The shift would normally truncate the result, whereas we would rather
  59   // prefer to round the result to the closest increment. By adding 0.5 units
  60   // to the initial sum, we bias the sum so that it will be rounded by the
  61   // truncation instead.
  62   uint32_t alphaSum = (boxSize + 1) / 2;
  63
  64   // We process the row with a moving filter, keeping a sum (alphaSum) of
  65   // boxSize pixels. As we move over a pixel, we need to add on a pixel
  66   // from the right extreme of the window that moved into range, and subtract
  67   // off a pixel from the left extreme of window that moved out of range.
  68   // But first, we need to initialization alphaSum to the contents of
  69   // the window before we can get going. If the window moves out of bounds
  70   // of the row, we clamp each sample to be the closest pixel from within
  71   // row bounds, so the 0th and aWidth-1th pixel.
  72   int32_t initLeft = aStart - aLeftLobe;
  73   if (initLeft < 0) {
  74     // If the left lobe samples before the row, add in clamped samples.
  75     alphaSum += -initLeft * aInput[0];
  76     initLeft = 0;
  77   }
  78   int32_t initRight = aStart + boxSize - aLeftLobe;
  79   if (initRight > aWidth) {
  80     // If the right lobe samples after the row, add in clamped samples.
  81     alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep];
  82     initRight = aWidth;
  83   }
  84   // Finally, add in all the valid, non-clamped samples to fill up the
  85   // rest of the window.
  86   const uint8_t* src = &aInput[initLeft * inputStep];
  87   const uint8_t* iterEnd = &aInput[initRight * inputStep];
  88
  89 #define INIT_ITER   \
  90   alphaSum += *src; \
  91   src += inputStep;
  92
  93   // We unroll the per-pixel loop here substantially. The amount of work
  94   // done per sample is so small that the cost of a loop condition check
  95   // and a branch can substantially add to or even dominate the performance
  96   // of the loop.
  97   while (src + 16 * inputStep <= iterEnd) {
  98     INIT_ITER;
  99     INIT_ITER;
 100     INIT_ITER;
 101     INIT_ITER;
 102     INIT_ITER;
 103     INIT_ITER;
 104     INIT_ITER;
 105     INIT_ITER;
 106     INIT_ITER;
 107     INIT_ITER;
 108     INIT_ITER;
 109     INIT_ITER;
 110     INIT_ITER;
 111     INIT_ITER;
 112     INIT_ITER;
 113     INIT_ITER;
 114   }
 115   while (src < iterEnd) {
 116     INIT_ITER;
 117   }
 118
 119   // Now we start moving the window over the row. We will be accessing
 120   // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
 121   // out of bounds of the row. To avoid having to check within the inner
 122   // loops if we are in bound, we instead compute the points at which
 123   // we will move out of bounds of the row on the left side (splitLeft)
 124   // and right side (splitRight).
 125   int32_t splitLeft = std::clamp(aLeftLobe, aStart, aEnd);
 126   int32_t splitRight = std::clamp(aWidth - (boxSize - aLeftLobe), aStart, aEnd);
 127   // If the filter window is actually large than the size of the row,
 128   // there will be a middle area of overlap where the leftmost and rightmost
 129   // pixel of the filter will both be outside the row. In this case, we need
 130   // to invert the splits so that splitLeft <= splitRight.
 131   if (boxSize > aWidth) {
 132     std::swap(splitLeft, splitRight);
 133   }
 134
 135   // Process all pixels up to splitLeft that would sample before the start of
 136   // the row. Note that because inputStep and outputStep may not be a const 1
 137   // value, it is more performant to increment pointers here for the source and
 138   // destination rather than use a loop counter, since doing so would entail an
 139   // expensive multiplication that significantly slows down the loop.
 140   uint8_t* dst = &aOutput[aStart * outputStep];
 141   iterEnd = &aOutput[splitLeft * outputStep];
 142   src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep];
 143   uint8_t firstVal = aInput[0];
 144
 145 #define LEFT_ITER                       \
 146   *dst = (alphaSum * reciprocal) >> 24; \
 147   alphaSum += *src - firstVal;          \
 148   dst += outputStep;                    \
 149   src += inputStep;
 150
 151   while (dst + 16 * outputStep <= iterEnd) {
 152     LEFT_ITER;
 153     LEFT_ITER;
 154     LEFT_ITER;
 155     LEFT_ITER;
 156     LEFT_ITER;
 157     LEFT_ITER;
 158     LEFT_ITER;
 159     LEFT_ITER;
 160     LEFT_ITER;
 161     LEFT_ITER;
 162     LEFT_ITER;
 163     LEFT_ITER;
 164     LEFT_ITER;
 165     LEFT_ITER;
 166     LEFT_ITER;
 167     LEFT_ITER;
 168   }
 169   while (dst < iterEnd) {
 170     LEFT_ITER;
 171   }
 172
 173   // Process all pixels between splitLeft and splitRight.
 174   iterEnd = &aOutput[splitRight * outputStep];
 175   if (boxSize <= aWidth) {
 176     // The filter window is smaller than the row size, so the leftmost and
 177     // rightmost samples are both within row bounds.
 178     src = &aInput[(splitLeft - aLeftLobe) * inputStep];
 179     int32_t boxStep = boxSize * inputStep;
 180
 181 #define CENTER_ITER                     \
 182   *dst = (alphaSum * reciprocal) >> 24; \
 183   alphaSum += src[boxStep] - *src;      \
 184   dst += outputStep;                    \
 185   src += inputStep;
 186
 187     while (dst + 16 * outputStep <= iterEnd) {
 188       CENTER_ITER;
 189       CENTER_ITER;
 190       CENTER_ITER;
 191       CENTER_ITER;
 192       CENTER_ITER;
 193       CENTER_ITER;
 194       CENTER_ITER;
 195       CENTER_ITER;
 196       CENTER_ITER;
 197       CENTER_ITER;
 198       CENTER_ITER;
 199       CENTER_ITER;
 200       CENTER_ITER;
 201       CENTER_ITER;
 202       CENTER_ITER;
 203       CENTER_ITER;
 204     }
 205     while (dst < iterEnd) {
 206       CENTER_ITER;
 207     }
 208   } else {
 209     // The filter window is larger than the row size, and we're in the area of
 210     // split overlap. So the leftmost and rightmost samples are both out of
 211     // bounds and need to be clamped. We can just precompute the difference here
 212     // consequently.
 213     int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0];
 214     while (dst < iterEnd) {
 215       *dst = (alphaSum * reciprocal) >> 24;
 216       alphaSum += firstLastDiff;
 217       dst += outputStep;
 218     }
 219   }
 220
 221   // Process all remaining pixels after splitRight that would sample after the
 222   // row end.
 223   iterEnd = &aOutput[aEnd * outputStep];
 224   src = &aInput[(splitRight - aLeftLobe) * inputStep];
 225   uint8_t lastVal = aInput[(aWidth - 1) * inputStep];
 226
 227 #define RIGHT_ITER                      \
 228   *dst = (alphaSum * reciprocal) >> 24; \
 229   alphaSum += lastVal - *src;           \
 230   dst += outputStep;                    \
 231   src += inputStep;
 232
 233   while (dst + 16 * outputStep <= iterEnd) {
 234     RIGHT_ITER;
 235     RIGHT_ITER;
 236     RIGHT_ITER;
 237     RIGHT_ITER;
 238     RIGHT_ITER;
 239     RIGHT_ITER;
 240     RIGHT_ITER;
 241     RIGHT_ITER;
 242     RIGHT_ITER;
 243     RIGHT_ITER;
 244     RIGHT_ITER;
 245     RIGHT_ITER;
 246     RIGHT_ITER;
 247     RIGHT_ITER;
 248     RIGHT_ITER;
 249     RIGHT_ITER;
 250   }
 251   while (dst < iterEnd) {
 252     RIGHT_ITER;
 253   }
 254 }
 255
 256 /**
 257  * Box blur involves looking at one pixel, and setting its value to the average
 258  * of its neighbouring pixels. This is meant to provide a 3-pass approximation
 259  * of a Gaussian blur.
 260  * @param aTranspose Whether to transpose the buffer when reading and writing
 261  *                   to it.
 262  * @param aData The buffer to be blurred.
 263  * @param aLobes The number of pixels to blend on the left and right for each of
 264  *               3 passes.
 265  * @param aWidth The number of columns in the buffers.
 266  * @param aRows The number of rows in the buffers.
 267  * @param aStride The stride of the buffer.
 268  */
 269 template <bool aTranspose>
 270 static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth,
 271                     int32_t aRows, int32_t aStride, IntRect aSkipRect) {
 272   if (aTranspose) {
 273     std::swap(aWidth, aRows);
 274     aSkipRect.Swap();
 275   }
 276
 277   MOZ_ASSERT(aWidth > 0);
 278
 279   // All three passes of the box blur that approximate the Gaussian are done
 280   // on each row in turn, so we only need two temporary row buffers to process
 281   // each row, instead of a full-sized buffer. Data moves from the source to the
 282   // first temporary, from the first temporary to the second, then from the
 283   // second back to the destination. This way is more cache-friendly than
 284   // processing whe whole buffer in each pass and thus yields a nice speedup.
 285   uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth];
 286   if (!tmpRow) {
 287     return;
 288   }
 289   uint8_t* tmpRow2 = tmpRow + aWidth;
 290
 291   const int32_t stride = aTranspose ? 1 : aStride;
 292   bool skipRectCoversWholeRow =
 293       0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
 294
 295   for (int32_t y = 0; y < aRows; y++) {
 296     // Check whether the skip rect intersects this row. If the skip
 297     // rect covers the whole surface in this row, we can avoid
 298     // this row entirely (and any others along the skip rect).
 299     bool inSkipRectY = aSkipRect.ContainsY(y);
 300     if (inSkipRectY && skipRectCoversWholeRow) {
 301       aData += stride * (aSkipRect.YMost() - y);
 302       y = aSkipRect.YMost() - 1;
 303       continue;
 304     }
 305
 306     // Read in data from the source transposed if necessary.
 307     BoxBlurRow<aTranspose, false>(aData, tmpRow, aLobes[0][0], aLobes[0][1],
 308                                   aWidth, aStride, 0, aWidth);
 309
 310     // For the middle pass, the data is already pre-transposed and does not need
 311     // to be post-transposed yet.
 312     BoxBlurRow<false, false>(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1],
 313                              aWidth, aStride, 0, aWidth);
 314
 315     // Write back data to the destination transposed if necessary too.
 316     // Make sure not to overwrite the skip rect by only outputting to the
 317     // destination before and after the skip rect, if requested.
 318     int32_t skipStart =
 319         inSkipRectY ? std::clamp(aSkipRect.X(), 0, aWidth) : aWidth;
 320     int32_t skipEnd = std::max(skipStart, aSkipRect.XMost());
 321     if (skipStart > 0) {
 322       BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
 323                                     aWidth, aStride, 0, skipStart);
 324     }
 325     if (skipEnd < aWidth) {
 326       BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
 327                                     aWidth, aStride, skipEnd, aWidth);
 328     }
 329
 330     aData += stride;
 331   }
 332
 333   delete[] tmpRow;
 334 }
 335
 336 static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) {
 337   int32_t major, minor, final;
 338
 339   /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
 340    * some notes about approximating the Gaussian blur with box-blurs.
 341    * The comments below are in the terminology of that page.
 342    */
 343   int32_t z = aRadius / 3;
 344   switch (aRadius % 3) {
 345     case 0:
 346       // aRadius = z*3; choose d = 2*z + 1
 347       major = minor = final = z;
 348       break;
 349     case 1:
 350       // aRadius = z*3 + 1
 351       // This is a tricky case since there is no value of d which will
 352       // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
 353       // for some integer k, then the radius will be 3*k. If d is even,
 354       // i.e. d=2*k, then the radius will be 3*k - 1.
 355       // So we have to choose values that don't match the standard
 356       // algorithm.
 357       major = z + 1;
 358       minor = final = z;
 359       break;
 360     case 2:
 361       // aRadius = z*3 + 2; choose d = 2*z + 2
 362       major = final = z + 1;
 363       minor = z;
 364       break;
 365     default:
 366       // Mathematical impossibility!
 367       MOZ_ASSERT(false);
 368       major = minor = final = 0;
 369   }
 370   MOZ_ASSERT(major + minor + final == aRadius);
 371
 372   aLobes[0][0] = major;
 373   aLobes[0][1] = minor;
 374   aLobes[1][0] = minor;
 375   aLobes[1][1] = major;
 376   aLobes[2][0] = final;
 377   aLobes[2][1] = final;
 378 }
 379
 380 static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
 381                              int32_t aWidth, int32_t aRows, int32_t aStride,
 382                              const IntRect& aSkipRect) {
 383   if (aRadius == 0) {
 384     memcpy(aOutput, aInput, aStride * aRows);
 385     return;
 386   }
 387
 388   bool skipRectCoversWholeRow =
 389       0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
 390   for (int32_t y = 0; y < aRows; y++) {
 391     // Check whether the skip rect intersects this row. If the skip
 392     // rect covers the whole surface in this row, we can avoid
 393     // this row entirely (and any others along the skip rect).
 394     bool inSkipRectY = aSkipRect.ContainsY(y);
 395     if (inSkipRectY && skipRectCoversWholeRow) {
 396       y = aSkipRect.YMost() - 1;
 397       continue;
 398     }
 399
 400     for (int32_t x = 0; x < aWidth; x++) {
 401       // Check whether we are within the skip rect. If so, go
 402       // to the next point outside the skip rect.
 403       if (inSkipRectY && aSkipRect.ContainsX(x)) {
 404         x = aSkipRect.XMost();
 405         if (x >= aWidth) break;
 406       }
 407
 408       int32_t sMin = std::max(x - aRadius, 0);
 409       int32_t sMax = std::min(x + aRadius, aWidth - 1);
 410       int32_t v = 0;
 411       for (int32_t s = sMin; s <= sMax; ++s) {
 412         v = std::max<int32_t>(v, aInput[aStride * y + s]);
 413       }
 414       aOutput[aStride * y + x] = v;
 415     }
 416   }
 417 }
 418
 419 static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
 420                            int32_t aWidth, int32_t aRows, int32_t aStride,
 421                            const IntRect& aSkipRect) {
 422   if (aRadius == 0) {
 423     memcpy(aOutput, aInput, aStride * aRows);
 424     return;
 425   }
 426
 427   bool skipRectCoversWholeColumn =
 428       0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost();
 429   for (int32_t x = 0; x < aWidth; x++) {
 430     bool inSkipRectX = aSkipRect.ContainsX(x);
 431     if (inSkipRectX && skipRectCoversWholeColumn) {
 432       x = aSkipRect.XMost() - 1;
 433       continue;
 434     }
 435
 436     for (int32_t y = 0; y < aRows; y++) {
 437       // Check whether we are within the skip rect. If so, go
 438       // to the next point outside the skip rect.
 439       if (inSkipRectX && aSkipRect.ContainsY(y)) {
 440         y = aSkipRect.YMost();
 441         if (y >= aRows) break;
 442       }
 443
 444       int32_t sMin = std::max(y - aRadius, 0);
 445       int32_t sMax = std::min(y + aRadius, aRows - 1);
 446       int32_t v = 0;
 447       for (int32_t s = sMin; s <= sMax; ++s) {
 448         v = std::max<int32_t>(v, aInput[aStride * s + x]);
 449       }
 450       aOutput[aStride * y + x] = v;
 451     }
 452   }
 453 }
 454
 455 CheckedInt<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) {
 456   CheckedInt<int32_t> val(aVal);
 457
 458   val += 3;
 459   val /= 4;
 460   val *= 4;
 461
 462   return val;
 463 }
 464
 465 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius,
 466                            const IntSize& aBlurRadius, const Rect* aDirtyRect,
 467                            const Rect* aSkipRect)
 468     : mStride(0), mSurfaceAllocationSize(0) {
 469   Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect);
 470 }
 471
 472 AlphaBoxBlur::AlphaBoxBlur()
 473     : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
 474
 475 void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius,
 476                         const IntSize& aBlurRadius, const Rect* aDirtyRect,
 477                         const Rect* aSkipRect) {
 478   mSpreadRadius = aSpreadRadius;
 479   mBlurRadius = aBlurRadius;
 480
 481   Rect rect(aRect);
 482   rect.Inflate(Size(aBlurRadius + aSpreadRadius));
 483   rect.RoundOut();
 484
 485   if (aDirtyRect) {
 486     // If we get passed a dirty rect from layout, we can minimize the
 487     // shadow size and make painting faster.
 488     mHasDirtyRect = true;
 489     mDirtyRect = *aDirtyRect;
 490     Rect requiredBlurArea = mDirtyRect.Intersect(rect);
 491     requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
 492     rect = requiredBlurArea.Intersect(rect);
 493   } else {
 494     mHasDirtyRect = false;
 495   }
 496
 497   mRect = TruncatedToInt(rect);
 498   if (mRect.IsEmpty()) {
 499     return;
 500   }
 501
 502   if (aSkipRect) {
 503     // If we get passed a skip rect, we can lower the amount of
 504     // blurring/spreading we need to do. We convert it to IntRect to avoid
 505     // expensive int<->float conversions if we were to use Rect instead.
 506     Rect skipRect = *aSkipRect;
 507     skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
 508     mSkipRect = RoundedIn(skipRect);
 509     mSkipRect = mSkipRect.Intersect(mRect);
 510     if (mSkipRect.IsEqualInterior(mRect)) {
 511       return;
 512     }
 513
 514     mSkipRect -= mRect.TopLeft();
 515     // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
 516     // accesses later are aligned as well, see bug 1622113.
 517     mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4));
 518     mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4));
 519     if (mSkipRect.IsEmpty()) {
 520       mSkipRect = IntRect();
 521     }
 522   } else {
 523     mSkipRect = IntRect();
 524   }
 525
 526   CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.Width());
 527   if (stride.isValid()) {
 528     mStride = stride.value();
 529
 530     // We need to leave room for an additional 3 bytes for a potential overrun
 531     // in our blurring code.
 532     size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3);
 533     if (size != 0) {
 534       mSurfaceAllocationSize = size;
 535     }
 536   }
 537 }
 538
 539 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX,
 540                            float aSigmaY)
 541     : mRect(TruncatedToInt(aRect)),
 542
 543       mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
 544       mStride(aStride),
 545       mSurfaceAllocationSize(0),
 546       mHasDirtyRect(false) {
 547   IntRect intRect;
 548   if (aRect.ToIntRect(&intRect)) {
 549     size_t minDataSize =
 550         BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height());
 551     if (minDataSize != 0) {
 552       mSurfaceAllocationSize = minDataSize;
 553     }
 554   }
 555 }
 556
 557 AlphaBoxBlur::~AlphaBoxBlur() = default;
 558
 559 IntSize AlphaBoxBlur::GetSize() const {
 560   IntSize size(mRect.Width(), mRect.Height());
 561   return size;
 562 }
 563
 564 int32_t AlphaBoxBlur::GetStride() const { return mStride; }
 565
 566 IntRect AlphaBoxBlur::GetRect() const { return mRect; }
 567
 568 Rect* AlphaBoxBlur::GetDirtyRect() {
 569   if (mHasDirtyRect) {
 570     return &mDirtyRect;
 571   }
 572
 573   return nullptr;
 574 }
 575
 576 size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
 577   return mSurfaceAllocationSize;
 578 }
 579
 580 void AlphaBoxBlur::Blur(uint8_t* aData) const {
 581   if (!aData) {
 582     return;
 583   }
 584
 585   // no need to do all this if not blurring or spreading
 586   if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) {
 587     int32_t stride = GetStride();
 588
 589     IntSize size = GetSize();
 590
 591     if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
 592       // No need to use CheckedInt here - we have validated it in the
 593       // constructor.
 594       size_t szB = stride * size.height;
 595       uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
 596
 597       if (!tmpData) {
 598         return;
 599       }
 600
 601       memset(tmpData, 0, szB);
 602
 603       SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width,
 604                        size.height, stride, mSkipRect);
 605       SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width,
 606                      size.height, stride, mSkipRect);
 607
 608       delete[] tmpData;
 609     }
 610
 611     int32_t horizontalLobes[3][2];
 612     ComputeLobes(mBlurRadius.width, horizontalLobes);
 613     int32_t verticalLobes[3][2];
 614     ComputeLobes(mBlurRadius.height, verticalLobes);
 615
 616     // We want to allow for some extra space on the left for alignment reasons.
 617     int32_t maxLeftLobe =
 618         RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
 619
 620     IntSize integralImageSize(
 621         size.width + maxLeftLobe + horizontalLobes[1][1],
 622         size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
 623
 624     if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
 625       // Fallback to old blurring code when the surface is so large it may
 626       // overflow our integral image!
 627       if (mBlurRadius.width > 0) {
 628         BoxBlur<false>(aData, horizontalLobes, size.width, size.height, stride,
 629                        mSkipRect);
 630       }
 631       if (mBlurRadius.height > 0) {
 632         BoxBlur<true>(aData, verticalLobes, size.width, size.height, stride,
 633                       mSkipRect);
 634       }
 635     } else {
 636       size_t integralImageStride =
 637           GetAlignedStride<16>(integralImageSize.width, 4);
 638       if (integralImageStride == 0) {
 639         return;
 640       }
 641
 642       // We need to leave room for an additional 12 bytes for a maximum overrun
 643       // of 3 pixels in the blurring code.
 644       size_t bufLen = BufferSizeFromStrideAndHeight(
 645           integralImageStride, integralImageSize.height, 12);
 646       if (bufLen == 0) {
 647         return;
 648       }
 649       // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
 650       // we divide by 4.
 651       AlignedArray<uint32_t> integralImage((bufLen / 4) +
 652                                            ((bufLen % 4) ? 1 : 0));
 653
 654       if (!integralImage) {
 655         return;
 656       }
 657
 658 #ifdef USE_SSE2
 659       if (Factory::HasSSE2()) {
 660         BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 661                      verticalLobes[0][0], verticalLobes[0][1], integralImage,
 662                      integralImageStride);
 663         BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 664                      verticalLobes[1][0], verticalLobes[1][1], integralImage,
 665                      integralImageStride);
 666         BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 667                      verticalLobes[2][0], verticalLobes[2][1], integralImage,
 668                      integralImageStride);
 669       } else
 670 #endif
 671 #ifdef USE_NEON
 672           if (mozilla::supports_neon()) {
 673         BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 674                      verticalLobes[0][0], verticalLobes[0][1], integralImage,
 675                      integralImageStride);
 676         BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 677                      verticalLobes[1][0], verticalLobes[1][1], integralImage,
 678                      integralImageStride);
 679         BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 680                      verticalLobes[2][0], verticalLobes[2][1], integralImage,
 681                      integralImageStride);
 682       } else
 683 #endif
 684       {
 685 #ifdef _MIPS_ARCH_LOONGSON3A
 686         BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 687                     verticalLobes[0][0], verticalLobes[0][1], integralImage,
 688                     integralImageStride);
 689         BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 690                     verticalLobes[1][0], verticalLobes[1][1], integralImage,
 691                     integralImageStride);
 692         BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 693                     verticalLobes[2][0], verticalLobes[2][1], integralImage,
 694                     integralImageStride);
 695 #else
 696         BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 697                   verticalLobes[0][0], verticalLobes[0][1], integralImage,
 698                   integralImageStride);
 699         BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 700                   verticalLobes[1][0], verticalLobes[1][1], integralImage,
 701                   integralImageStride);
 702         BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 703                   verticalLobes[2][0], verticalLobes[2][1], integralImage,
 704                   integralImageStride);
 705 #endif
 706       }
 707     }
 708   }
 709 }
 710
 711 MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest,
 712                                            const uint8_t* aSource,
 713                                            uint32_t* aPreviousRow,
 714                                            const uint32_t& aSourceWidth,
 715                                            const uint32_t& aLeftInflation,
 716                                            const uint32_t& aRightInflation) {
 717   uint32_t currentRowSum = 0;
 718   uint32_t pixel = aSource[0];
 719   for (uint32_t x = 0; x < aLeftInflation; x++) {
 720     currentRowSum += pixel;
 721     *aDest++ = currentRowSum + *aPreviousRow++;
 722   }
 723   for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation);
 724        x += 4) {
 725     uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
 726 #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
 727     currentRowSum += (alphaValues >> 24) & 0xff;
 728     *aDest++ = *aPreviousRow++ + currentRowSum;
 729     currentRowSum += (alphaValues >> 16) & 0xff;
 730     *aDest++ = *aPreviousRow++ + currentRowSum;
 731     currentRowSum += (alphaValues >> 8) & 0xff;
 732     *aDest++ = *aPreviousRow++ + currentRowSum;
 733     currentRowSum += alphaValues & 0xff;
 734     *aDest++ = *aPreviousRow++ + currentRowSum;
 735 #else
 736     currentRowSum += alphaValues & 0xff;
 737     *aDest++ = *aPreviousRow++ + currentRowSum;
 738     alphaValues >>= 8;
 739     currentRowSum += alphaValues & 0xff;
 740     *aDest++ = *aPreviousRow++ + currentRowSum;
 741     alphaValues >>= 8;
 742     currentRowSum += alphaValues & 0xff;
 743     *aDest++ = *aPreviousRow++ + currentRowSum;
 744     alphaValues >>= 8;
 745     currentRowSum += alphaValues & 0xff;
 746     *aDest++ = *aPreviousRow++ + currentRowSum;
 747 #endif
 748   }
 749   pixel = aSource[aSourceWidth - 1];
 750   for (uint32_t x = (aSourceWidth + aLeftInflation);
 751        x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
 752     currentRowSum += pixel;
 753     *aDest++ = currentRowSum + *aPreviousRow++;
 754   }
 755 }
 756
 757 MOZ_ALWAYS_INLINE void GenerateIntegralImage_C(
 758     int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
 759     int32_t aBottomInflation, uint32_t* aIntegralImage,
 760     size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
 761     const IntSize& aSize) {
 762   uint32_t stride32bit = aIntegralImageStride / 4;
 763
 764   IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
 765                             aSize.height + aTopInflation + aBottomInflation);
 766
 767   memset(aIntegralImage, 0, aIntegralImageStride);
 768
 769   GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width,
 770                       aLeftInflation, aRightInflation);
 771   for (int y = 1; y < aTopInflation + 1; y++) {
 772     GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
 773                         aIntegralImage + (y - 1) * stride32bit, aSize.width,
 774                         aLeftInflation, aRightInflation);
 775   }
 776
 777   for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
 778     GenerateIntegralRow(aIntegralImage + (y * stride32bit),
 779                         aSource + aSourceStride * (y - aTopInflation),
 780                         aIntegralImage + (y - 1) * stride32bit, aSize.width,
 781                         aLeftInflation, aRightInflation);
 782   }
 783
 784   if (aBottomInflation) {
 785     for (int y = (aSize.height + aTopInflation); y < integralImageSize.height;
 786          y++) {
 787       GenerateIntegralRow(aIntegralImage + (y * stride32bit),
 788                           aSource + ((aSize.height - 1) * aSourceStride),
 789                           aIntegralImage + (y - 1) * stride32bit, aSize.width,
 790                           aLeftInflation, aRightInflation);
 791     }
 792   }
 793 }
 794
 795 /**
 796  * Attempt to do an in-place box blur using an integral image.
 797  */
 798 void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe,
 799                              int32_t aRightLobe, int32_t aTopLobe,
 800                              int32_t aBottomLobe, uint32_t* aIntegralImage,
 801                              size_t aIntegralImageStride) const {
 802   IntSize size = GetSize();
 803
 804   MOZ_ASSERT(size.width > 0);
 805
 806   // Our 'left' or 'top' lobe will include the current pixel. i.e. when
 807   // looking at an integral image the value of a pixel at 'x,y' is calculated
 808   // using the value of the integral image values above/below that.
 809   aLeftLobe++;
 810   aTopLobe++;
 811   int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
 812
 813   MOZ_ASSERT(boxSize > 0);
 814
 815   if (boxSize == 1) {
 816     return;
 817   }
 818
 819   int32_t stride32bit = aIntegralImageStride / 4;
 820
 821   int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
 822
 823   GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
 824                           aIntegralImage, aIntegralImageStride, aData, mStride,
 825                           size);
 826
 827   uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
 828
 829   uint32_t* innerIntegral =
 830       aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
 831
 832   // Storing these locally makes this about 30% faster! Presumably the compiler
 833   // can't be sure we're not altering the member variables in this loop.
 834   IntRect skipRect = mSkipRect;
 835   uint8_t* data = aData;
 836   int32_t stride = mStride;
 837   for (int32_t y = 0; y < size.height; y++) {
 838     // Not using ContainsY(y) because we do not skip y == skipRect.Y()
 839     // although that may not be done on purpose
 840     bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost();
 841
 842     uint32_t* topLeftBase =
 843         innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
 844     uint32_t* topRightBase =
 845         innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
 846     uint32_t* bottomRightBase =
 847         innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
 848     uint32_t* bottomLeftBase =
 849         innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
 850
 851     for (int32_t x = 0; x < size.width; x++) {
 852       // Not using ContainsX(x) because we do not skip x == skipRect.X()
 853       // although that may not be done on purpose
 854       if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) {
 855         x = skipRect.XMost() - 1;
 856         // Trigger early jump on coming loop iterations, this will be reset
 857         // next line anyway.
 858         inSkipRectY = false;
 859         continue;
 860       }
 861       int32_t topLeft = topLeftBase[x];
 862       int32_t topRight = topRightBase[x];
 863       int32_t bottomRight = bottomRightBase[x];
 864       int32_t bottomLeft = bottomLeftBase[x];
 865
 866       uint32_t value = bottomRight - topRight - bottomLeft;
 867       value += topLeft;
 868
 869       data[stride * y + x] =
 870           (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
 871     }
 872   }
 873 }
 874
 875 /**
 876  * Compute the box blur size (which we're calling the blur radius) from
 877  * the standard deviation.
 878  *
 879  * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
 880  * approximating a Gaussian using box blurs.  This yields quite a good
 881  * approximation for a Gaussian.  Then we multiply this by 1.5 since our
 882  * code wants the radius of the entire triple-box-blur kernel instead of
 883  * the diameter of an individual box blur.  For more details, see:
 884  *   http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
 885  *   https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
 886  */
 887 constexpr double sqrt_2_PI = 0x1.40d931ff62705p+1;  // sqrt is not constexpr
 888 static constexpr Float GAUSSIAN_SCALE_FACTOR = Float((3 * sqrt_2_PI / 4) * 1.5);
 889
 890 IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) {
 891   IntSize size(
 892       static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
 893       static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));
 894
 895   return size;
 896 }
 897
 898 Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) {
 899   return aBlurRadius / GAUSSIAN_SCALE_FACTOR;
 900 }
 901
 902 }  // namespace gfx
 903 }  // namespace mozilla