src/subtitles/Rasterizer.cpp

   1 /*
   2  *      Copyright (C) 2003-2006 Gabest
   3  *      http://www.gabest.org
   4  *
   5  *  This Program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2, or (at your option)
   8  *  any later version.
   9  *
  10  *  This Program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with GNU Make; see the file COPYING.  If not, write to
  17  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18  *  http://www.gnu.org/copyleft/gpl.html
  19  *
  20  */
  21
  22 #include "stdafx.h"
  23 #include <string.h>
  24 #include <cmath>
  25 #include <vector>
  26 #include <algorithm>
  27 #include "Rasterizer.h"
  28 #include "SeparableFilter.h"
  29 #include "xy_logger.h"
  30 #include <boost/flyweight/key_value.hpp>
  31 #include "xy_bitmap.h"
  32 #include "xy_widen_regoin.h"
  33
  // Min/max helper macros.  When _MAX is not defined yet, _MAX/_MIN are
  // defined as the parenthesized (std::max)/(std::min) and
  // _IMPL_MAX/_IMPL_MIN as the plain std::max/std::min.  When _MAX already
  // exists, _IMPL_MAX/_IMPL_MIN simply forward to _MAX/_MIN.
  34 #ifndef _MAX    /* avoid collision with common (nonconforming) macros */
  35 #define _MAX    (std::max)
  36 #define _MIN    (std::min)
  37 #define _IMPL_MAX std::max
  38 #define _IMPL_MIN std::min
  39 #else
  40 #define _IMPL_MAX _MAX
  41 #define _IMPL_MIN _MIN
  42 #endif
  43
     // CUINT8 is a const UINT8; PCUINT8 is a pointer to const UINT8.
  44 typedef const UINT8 CUINT8, *PCUINT8;
  45
  46 //NOTE: signed or unsigned affects the result seriously
  47 #define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|y)<<8)|u)<<8)|v)
  48
  49 #define SPLIT_AYUV(color, a, y, u, v) do { \
  50         *(v)=(color)&0xff; \
  51         *(u)=((color)>>8) &0xff; \
  52         *(y)=((color)>>16)&0xff;\
  53         *(a)=((color)>>24)&0xff;\
  54     } while(0)
  55
  56 class GaussianCoefficients
  57 {
  58 public:
  59     int g_r;
  60     int g_w;
  61     int g_w_ex;
  62     float *g_f;
  63
  64     double sigma;
  65 public:
  66     GaussianCoefficients(const double sigma)
  67     {
  68         g_r = 0;
  69         g_w = 0;
  70         g_w_ex = 0;
  71
  72         g_f = NULL;
  73
  74         this->sigma = 0;
  75         init(sigma);
  76     }
  77     GaussianCoefficients(const GaussianCoefficients& priv)
  78         :g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma),g_f(NULL)
  79         ,g_w_ex(priv.g_w_ex)
  80     {
  81         if (this->g_w_ex > 0 && this != &priv) {
  82             this->g_f = reinterpret_cast<float*>(xy_malloc(this->g_w_ex * sizeof(float)));
  83             ASSERT(this->g_f);
  84             memcpy(g_f, priv.g_f, this->g_w_ex * sizeof(g_f[0]));
  85         }
  86     }
  87
  88     ~GaussianCoefficients()
  89     {
  90         xy_free(g_f); g_f=NULL;
  91     }
  92
  93 private:
  94     int init(double sigma)
  95     {
  96         double a = -1 / (sigma * sigma * 2);
  97         double exp_a = exp(a);
  98
  99         double volume =  0;
 100
 101         if (this->sigma == sigma)
 102             return 0;
 103         else
 104             this->sigma = sigma;
 105
 106         this->g_w = (int)ceil(sigma*3) | 1;
 107         this->g_r = this->g_w / 2;
 108         this->g_w_ex = (this->g_w + 3) & ~3;
 109
 110         if (this->g_w_ex > 0) {
 111             xy_free(this->g_f);
 112             this->g_f = reinterpret_cast<float*>(xy_malloc(this->g_w_ex * sizeof(float)));
 113             if (this->g_f == NULL) {
 114                 return -1;
 115             }
 116         }
 117
 118         if (this->g_w > 0) {
 119             volume = 0;
 120
 121             double exp_0 = 1.0;
 122             double exp_1 = exp_a;
 123             double exp_2 = exp_1 * exp_1;
 124             volume = exp_0;
 125             this->g_f[this->g_r] = exp_0;
 126             float* p_left = this->g_f+this->g_r-1;
 127             float* p_right= this->g_f+this->g_r+1;
 128             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 129             {
 130                 exp_0 *= exp_1;
 131                 exp_1 *= exp_2;
 132
 133                 *p_left = exp_0;
 134                 *p_right = exp_0;
 135
 136                 volume += exp_0;
 137                 volume += exp_0;
 138             }
 139             //equivalent:
 140             //  for (i = 0; i < this->g_w; ++i) {
 141             //    this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 142             //    volume += this->g[i];
 143             //  }
 144             ASSERT(volume>0);
 145             for (int i=0;i<this->g_w;i++)
 146             {
 147                 this->g_f[i] /= volume;
 148             }
 149             for (int i=this->g_w;i<this->g_w_ex;i++)
 150             {
 151                 this->g_f[i] = 0;
 152             }
 153         }
 154         return 0;
 155     }
 156
 157 };
 158
 // Fixed-point Gaussian kernel plus a multiplication lookup table for one
 // sigma.  The tables are filled by generate_tables(); their layout
 // (value * width + tap) matches how ass_gauss_blur indexes its g_t_x/g_t_y
 // arguments.
 159 class ass_synth_priv
 160 {
 161 public:
         // Number of fractional bits of the fixed-point kernel: the kernel is
         // scaled so that its taps sum to roughly 1 << VOLUME_BITS.
 162     static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8
 163
         // Builds the tables for `sigma` (result of generate_tables is ignored).
 164     ass_synth_priv(const double sigma);
 165     ass_synth_priv(const ass_synth_priv& priv);
 166
 167     ~ass_synth_priv();
         // Rebuilds g and gt2 for `sigma`; returns 0 on success or when sigma
         // is unchanged, -1 when the tables cannot be allocated.
 168     int generate_tables(double sigma);
 169
         // Kernel radius (g_w / 2) and kernel width ((int)ceil(sigma*3) | 1).
 170     int g_r;
 171     int g_w;
 172
         // g:   g_w fixed-point kernel taps.
         // gt2: 256 * g_w entries, gt2[g_w * i + mx] == g[mx] * i.
 173     unsigned *g;
 174     unsigned *gt2;
 175
         // Sigma the tables were generated for.
 176     double sigma;
 177 };
 178
 179
 180 // GaussianFilter = GaussianCoefficients or ass_synth_priv
 181 template<typename GaussianFilter>
 182 struct GaussianFilterKey
 183 {
 184     const double& operator()(const GaussianFilter& x)const
 185     {
 186         return x.sigma;
 187     }
 188 };
 189
 // Owner of a malloc'ed scratch array of `size` unsigned values.
 190 struct ass_tmp_buf
 191 {
 192 public:
         // Allocates size * sizeof(unsigned) bytes; the result is not checked.
 193     ass_tmp_buf(size_t size);
         // Allocates a fresh buffer of the same size; contents are NOT copied.
 194     ass_tmp_buf(const ass_tmp_buf& buf);
         // Frees tmp.
 195     ~ass_tmp_buf();
         // Element count requested at construction.
 196     size_t size;
         // The scratch storage (may be NULL if malloc failed).
 197     unsigned *tmp;
 198 };
 199
 200 struct ass_tmp_buf_get_size
 201 {
 202     const size_t& operator()(const ass_tmp_buf& buf)const
 203     {
 204         return buf.size;
 205     }
 206 };
 207
 208 static const unsigned int maxcolor = 255;
 209 static const unsigned base = 256;
 210
 211 ass_synth_priv::ass_synth_priv(const double sigma)
 212 {
 213     g_r = 0;
 214     g_w = 0;
 215
 216     g = NULL;
 217     gt2 = NULL;
 218
 219     this->sigma = 0;
 220     generate_tables(sigma);
 221 }
 222
 223 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
 224 {
 225     if (this->g_w > 0 && this != &priv) {
 226         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 227         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 228         //if (this->g == null || this->gt2 == null) {
 229         //    return -1;
 230         //}
 231         memcpy(g, priv.g, this->g_w * sizeof(unsigned));
 232         memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
 233     }
 234 }
 235
 236 ass_synth_priv::~ass_synth_priv()
 237 {
 238     free(g); g=NULL;
 239     free(gt2); gt2=NULL;
 240 }
 241
 242 int ass_synth_priv::generate_tables(double sigma)
 243 {
 244     const int TARGET_VOLUME = 1<<VOLUME_BITS;
 245     const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
 246
 247     double a = -1 / (sigma * sigma * 2);
 248     double exp_a = exp(a);
 249
 250     double volume_factor = 0;
 251     double volume_start =  0, volume_end = 0;
 252     unsigned volume;
 253
 254     if (this->sigma == sigma)
 255         return 0;
 256     else
 257         this->sigma = sigma;
 258
 259     this->g_w = (int)ceil(sigma*3) | 1;
 260     this->g_r = this->g_w / 2;
 261
 262     if (this->g_w > 0) {
 263         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 264         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 265         if (this->g == NULL || this->gt2 == NULL) {
 266             return -1;
 267         }
 268     }
 269
 270     if (this->g_w > 0) {
 271         volume_start = 0;
 272
 273         double exp_0 = 1.0;
 274         double exp_1 = exp_a;
 275         double exp_2 = exp_1 * exp_1;
 276         volume_start += exp_0;
 277         for(int i=0;i<this->g_r;++i)
 278         {
 279             exp_0 *= exp_1;
 280             exp_1 *= exp_2;
 281             volume_start += exp_0;
 282             volume_start += exp_0;
 283         }
 284         //euqivalent:
 285         //  for (i = 0; i < this->g_w; ++i) {
 286         //      volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
 287         //  }
 288
 289         volume_end = (TARGET_VOLUME+g_w)/volume_start;
 290         volume_start = (TARGET_VOLUME-g_w)/volume_start;
 291
 292         volume = 0;
 293         while( volume_start+0.000001<volume_end )
 294         {
 295             volume_factor = (volume_start+volume_end)*0.5;
 296             volume = 0;
 297
 298             exp_0 = volume_factor;
 299             exp_1 = exp_a;
 300             exp_2 = exp_1 * exp_1;
 301
 302             volume = static_cast<int>(exp_0+.5);
 303             this->g[this->g_r] = volume;
 304
 305             unsigned* p_left = this->g+this->g_r-1;
 306             unsigned* p_right= this->g+this->g_r+1;
 307             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 308             {
 309                 exp_0 *= exp_1;
 310                 exp_1 *= exp_2;
 311                 *p_left = static_cast<int>(exp_0+.5);
 312                 *p_right = *p_left;
 313                 volume += (*p_left<<1);
 314             }
 315             //equivalent:
 316             //    for (i = 0; i < this->g_w; ++i) {
 317             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 318             //        volume += this->g[i];
 319             //    }
 320
 321             // volume don't have to be equal to TARGET_VOLUME,
 322             // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
 323             // max error introducing in later blur operation,
 324             // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
 325             // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
 326             // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
 327             //
 328             // NOTE: when it comes to rounding, no matter how small the error is,
 329             // it may result a different rounding output
 330             if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
 331                 break;
 332             else if(volume < TARGET_VOLUME)
 333             {
 334                 volume_start = volume_factor;
 335             }
 336             else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
 337             {
 338                 volume_end = volume_factor;
 339             }
 340         }
 341         if(volume==0)
 342         {
 343             volume_factor = volume_end;
 344
 345             exp_0 = volume_factor;
 346             exp_1 = exp_a;
 347             exp_2 = exp_1 * exp_1;
 348
 349             volume = static_cast<int>(exp_0+.5);
 350             this->g[this->g_r] = volume;
 351
 352             unsigned* p_left = this->g+this->g_r-1;
 353             unsigned* p_right= this->g+this->g_r+1;
 354             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 355             {
 356                 exp_0 *= exp_1;
 357                 exp_1 *= exp_2;
 358                 *p_left = static_cast<int>(exp_0+.5);
 359                 *p_right = *p_left;
 360                 volume += (*p_left<<1);
 361             }
 362             //equivalent:
 363             //    for (i = 0; i < this->g_w; ++i) {
 364             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 365             //        volume += this->g[i];
 366             //    }
 367         }
 368
 369         // gauss table:
 370         for (int mx = 0; mx < this->g_w; mx++) {
 371             int last_mul = 0;
 372             unsigned *p_gt2 = this->gt2 + mx;
 373             *p_gt2 = 0;
 374             for (int i = 1; i < 256; i++) {
 375                 last_mul = last_mul+this->g[mx];
 376                 p_gt2 += this->g_w;
 377                 *p_gt2 = last_mul;
 378                 //equivalent:
 379                 //    this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
 380             }
 381         }
 382     }
 383     return 0;
 384 }
 385
 386 ass_tmp_buf::ass_tmp_buf(size_t size)
 387 {
 388     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 389     this->size = size;
 390 }
 391
 392 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
 393     :size(buf.size)
 394 {
 395     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 396 }
 397
 398 ass_tmp_buf::~ass_tmp_buf()
 399 {
 400     free(tmp);
 401 }
 402
 403 /*
 404  * \brief gaussian blur.  an fast pure c implementation from libass.
 405  */
 406 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
 407                            int width, int height, int stride,
 408                            const unsigned *g_t_x, int g_r_x, int g_width_x,
 409                            const unsigned *g_t_y, int g_r_y, int g_width_y)
 410 {
 411
 412     int x, y;
 413
 414     unsigned char *s = buffer;
 415     unsigned *t = tmp2 + 1;
 416     for (y = 0; y < height; y++) {
 417         memset(t - 1, 0, (width + 1) * sizeof(*t));
 418         x = 0;
 419         if(x < g_r_x)//in case that r < 0
 420         {
 421             const int src = s[x];
 422             if (src) {
 423                 register unsigned *dstp = t + x - g_r_x;
 424                 int mx;
 425                 const unsigned *m3 = g_t_x + src * g_width_x;
 426                 unsigned sum = 0;
 427                 for (mx = g_width_x-1; mx >= g_r_x - x ; mx--) {
 428                     sum += m3[mx];
 429                     dstp[mx] += sum;
 430                 }
 431             }
 432         }
 433
 434         for (x = 1; x < g_r_x; x++) {
 435             const int src = s[x];
 436             if (src) {
 437                 register unsigned *dstp = t + x - g_r_x;
 438                 int mx;
 439                 const unsigned *m3 = g_t_x + src * g_width_x;
 440                 for (mx = g_r_x - x; mx < g_width_x; mx++) {
 441                     dstp[mx] += m3[mx];
 442                 }
 443             }
 444         }
 445
 446         for (; x < width - g_r_x; x++) {
 447             const int src = s[x];
 448             if (src) {
 449                 register unsigned *dstp = t + x - g_r_x;
 450                 int mx;
 451                 const unsigned *m3 = g_t_x + src * g_width_x;
 452                 for (mx = 0; mx < g_width_x; mx++) {
 453                     dstp[mx] += m3[mx];
 454                 }
 455             }
 456         }
 457
 458         for (; x < width-1; x++) {
 459             const int src = s[x];
 460             if (src) {
 461                 register unsigned *dstp = t + x - g_r_x;
 462                 int mx;
 463                 const int x2 = g_r_x + width - x;
 464                 const unsigned *m3 = g_t_x + src * g_width_x;
 465                 for (mx = 0; mx < x2; mx++) {
 466                     dstp[mx] += m3[mx];
 467                 }
 468             }
 469         }
 470         if(x==width-1) //important: x==width-1 failed, if r==0
 471         {
 472             const int src = s[x];
 473             if (src) {
 474                 register unsigned *dstp = t + x - g_r_x;
 475                 int mx;
 476                 const int x2 = g_r_x + width - x;
 477                 const unsigned *m3 = g_t_x + src * g_width_x;
 478                 unsigned sum = 0;
 479                 for (mx = 0; mx < x2; mx++) {
 480                     sum += m3[mx];
 481                     dstp[mx] += sum;
 482                 }
 483             }
 484         }
 485
 486         s += stride;
 487         t += width + 1;
 488     }
 489
 490     t = tmp2;
 491     for (x = 0; x < width; x++) {
 492         y = 0;
 493         if(y < g_r_y)//in case that r<0
 494         {
 495             unsigned *srcp = t + y * (width + 1) + 1;
 496             int src = *srcp;
 497             if (src) {
 498                 register unsigned *dstp = srcp - 1 + (g_width_y -g_r_y +y)*(width + 1);
 499                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 500                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 501                 unsigned sum = 0;
 502                 int mx;
 503                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 504                 for (mx = g_width_y-1; mx >=g_r_y - y ; mx--) {
 505                     sum += m3[mx];
 506                     *dstp += sum;
 507                     dstp -= width + 1;
 508                 }
 509             }
 510         }
 511         for (y = 1; y < g_r_y; y++) {
 512             unsigned *srcp = t + y * (width + 1) + 1;
 513             int src = *srcp;
 514             if (src) {
 515                 register unsigned *dstp = srcp - 1 + width + 1;
 516                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 517                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 518
 519                 int mx;
 520                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 521                 for (mx = g_r_y - y; mx < g_width_y; mx++) {
 522                     *dstp += m3[mx];
 523                     dstp += width + 1;
 524                 }
 525             }
 526         }
 527         for (; y < height - g_r_y; y++) {
 528             unsigned *srcp = t + y * (width + 1) + 1;
 529             int src = *srcp;
 530             if (src) {
 531                 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
 532                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 533                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 534
 535                 int mx;
 536                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 537                 for (mx = 0; mx < g_width_y; mx++) {
 538                     *dstp += m3[mx];
 539                     dstp += width + 1;
 540                 }
 541             }
 542         }
 543         for (; y < height-1; y++) {
 544             unsigned *srcp = t + y * (width + 1) + 1;
 545             int src = *srcp;
 546             if (src) {
 547                 const int y2 = g_r_y + height - y;
 548                 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
 549                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 550                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 551
 552                 int mx;
 553                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 554                 for (mx = 0; mx < y2; mx++) {
 555                     *dstp += m3[mx];
 556                     dstp += width + 1;
 557                 }
 558             }
 559         }
 560         if(y == height - 1)//important: y == height - 1 failed if r==0
 561         {
 562             unsigned *srcp = t + y * (width + 1) + 1;
 563             int src = *srcp;
 564             if (src) {
 565                 const int y2 = g_r_y + height - y;
 566                 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
 567                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 568                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 569                 unsigned sum = 0;
 570                 int mx;
 571                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 572                 for (mx = 0; mx < y2; mx++) {
 573                     sum += m3[mx];
 574                     *dstp += sum;
 575                     dstp += width + 1;
 576                 }
 577             }
 578         }
 579         t++;
 580     }
 581
 582     t = tmp2;
 583     s = buffer;
 584     for (y = 0; y < height; y++) {
 585         for (x = 0; x < width; x++) {
 586             s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
 587         }
 588         s += stride;
 589         t += width + 1;
 590     }
 591 }
 592
 // Forward declarations of blur routines whose definitions are not in this
 // part of the file.
 // xy_gaussian_blur takes separate source and destination bitmaps and one
 // float coefficient table, radius and extended table width per axis.
 593 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
 594     PCUINT8 src, int width, int height, int stride,
 595     const float *gt_x, int r_x, int gt_ex_width_x,
 596     const float *gt_y, int r_y, int gt_ex_width_y);
 597
     // xy_be_blur takes a single bitmap pointer plus a pass amount per axis
     // (presumably it works in place -- confirm at the definition).
 598 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y);
 599
 600 /**
 601  * \brief blur with [[1,2,1]. [2,4,2], [1,2,1]] kernel.
 602  */
 603 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
 604 {
 605     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 606     WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 607     if(!col_sum_buf_base || !col_pix_buf_base)
 608     {
 609         //ToDo: error handling
 610         return;
 611     }
 612     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 613     memset(col_sum_buf_base, 0, w*sizeof(WORD));
 614     WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
 615     WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
 616     {
 617         int y = 0;
 618         unsigned char *src=buf+y*stride;
 619
 620         int x = 2;
 621         int old_pix = src[x-1];
 622         int old_sum = old_pix + src[x-2];
 623         for ( ; x < w; x++) {
 624             int temp1 = src[x];
 625             int temp2 = old_pix + temp1;
 626             old_pix = temp1;
 627             temp1 = old_sum + temp2;
 628             old_sum = temp2;
 629             col_pix_buf[x] = temp1;
 630         }
 631     }
 632     {
 633         int y = 1;
 634         unsigned char *src=buf+y*stride;
 635
 636
 637         int x = 2;
 638         int old_pix = src[x-1];
 639         int old_sum = old_pix + src[x-2];
 640         for ( ; x < w; x++) {
 641             int temp1 = src[x];
 642             int temp2 = old_pix + temp1;
 643             old_pix = temp1;
 644             temp1 = old_sum + temp2;
 645             old_sum = temp2;
 646
 647             temp2 = col_pix_buf[x] + temp1;
 648             col_pix_buf[x] = temp1;
 649             //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 650             col_sum_buf[x] = temp2;
 651         }
 652     }
 653
 654     //__m128i round = _mm_set1_epi16(8);
 655     for (int y = 2; y < h; y++) {
 656         unsigned char *src=buf+y*stride;
 657         unsigned char *dst=buf+(y-1)*stride;
 658
 659
 660         int x = 2;
 661         __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
 662         __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
 663         for ( ; x < ((w-2)&(~7)); x+=8) {
 664             __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
 665             new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
 666             __m128i temp = _mm_slli_si128(new_pix,2);
 667             temp = _mm_add_epi16(temp, old_pix_128);
 668             temp = _mm_add_epi16(temp, new_pix);
 669             old_pix_128 = _mm_srli_si128(new_pix,14);
 670
 671             new_pix = _mm_slli_si128(temp,2);
 672             new_pix = _mm_add_epi16(new_pix, old_sum_128);
 673             new_pix = _mm_add_epi16(new_pix, temp);
 674             old_sum_128 = _mm_srli_si128(temp, 14);
 675
 676             __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
 677             __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
 678             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
 679             temp = _mm_add_epi16(new_pix, old_col_pix);
 680             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
 681
 682             old_col_sum = _mm_add_epi16(old_col_sum, temp);
 683             //old_col_sum = _mm_add_epi16(old_col_sum, round);
 684             old_col_sum = _mm_srli_epi16(old_col_sum, 4);
 685             old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
 686             _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
 687         }
 688         int old_pix = src[x-1];
 689         int old_sum = old_pix + src[x-2];
 690         for ( ; x < w; x++) {
 691             int temp1 = src[x];
 692             int temp2 = old_pix + temp1;
 693             old_pix = temp1;
 694             temp1 = old_sum + temp2;
 695             old_sum = temp2;
 696
 697             temp2 = col_pix_buf[x] + temp1;
 698             col_pix_buf[x] = temp1;
 699             dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 700             col_sum_buf[x] = temp2;
 701         }
 702     }
 703
 704     xy_free(col_sum_buf_base);
 705     xy_free(col_pix_buf_base);
 706 }
 707
 708 /**
 709  * see @be_blur
 710  */
 711 static void be_blur_c(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
 712 {
 713     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 714     WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 715     if(!col_sum_buf_base || !col_pix_buf_base)
 716     {
 717         //ToDo: error handling
 718         return;
 719     }
 720     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 721     memset(col_sum_buf_base, 0, w*sizeof(WORD));
 722     WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
 723     WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
 724     {
 725         int y = 0;
 726         unsigned char *src=buf+y*stride;
 727
 728         int x = 2;
 729         int old_pix = src[x-1];
 730         int old_sum = old_pix + src[x-2];
 731         for ( ; x < w; x++) {
 732             int temp1 = src[x];
 733             int temp2 = old_pix + temp1;
 734             old_pix = temp1;
 735             temp1 = old_sum + temp2;
 736             old_sum = temp2;
 737             col_pix_buf[x] = temp1;
 738         }
 739     }
 740     {
 741         int y = 1;
 742         unsigned char *src=buf+y*stride;
 743
 744
 745         int x = 2;
 746         int old_pix = src[x-1];
 747         int old_sum = old_pix + src[x-2];
 748         for ( ; x < w; x++) {
 749             int temp1 = src[x];
 750             int temp2 = old_pix + temp1;
 751             old_pix = temp1;
 752             temp1 = old_sum + temp2;
 753             old_sum = temp2;
 754
 755             temp2 = col_pix_buf[x] + temp1;
 756             col_pix_buf[x] = temp1;
 757             //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 758             col_sum_buf[x] = temp2;
 759         }
 760     }
 761
 762     for (int y = 2; y < h; y++) {
 763         unsigned char *src=buf+y*stride;
 764         unsigned char *dst=buf+(y-1)*stride;
 765
 766         int x = 2;
 767         int old_pix = src[x-1];
 768         int old_sum = old_pix + src[x-2];
 769         for ( ; x < w; x++) {
 770             int temp1 = src[x];
 771             int temp2 = old_pix + temp1;
 772             old_pix = temp1;
 773             temp1 = old_sum + temp2;
 774             old_sum = temp2;
 775
 776             temp2 = col_pix_buf[x] + temp1;
 777             col_pix_buf[x] = temp1;
 778             dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 779             col_sum_buf[x] = temp2;
 780         }
 781     }
 782
 783     xy_free(col_sum_buf_base);
 784     xy_free(col_pix_buf_base);
 785 }
 786
 787 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
 788 {
 789     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 790     if(!col_pix_buf_base)
 791     {
 792         //ToDo: error handling
 793         return;
 794     }
 795     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 796
 797     for (int y = 0; y < h; y++){
 798         unsigned char *src=buf+y*stride;
 799
 800         WORD *col_pix_buf = col_pix_buf_base;
 801         int last=0;
 802         for(int x = 0; x < w; x++)
 803         {
 804             int temp1 = src[x];
 805             int temp2 = temp1*x_factor;
 806             temp1 <<= 3;
 807             temp1 -= temp2;
 808             temp1 += last;
 809             last = temp2;
 810
 811             temp2 = temp1*y_factor;
 812             temp1 <<= 3;
 813             temp1 -= temp2;
 814             temp1 += col_pix_buf[x];
 815             src[x] = ((temp1+32)>>6);
 816             col_pix_buf[x] = temp2;
 817         }
 818     }
 819     xy_free(col_pix_buf_base);
 820 }
 821
 822 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
 823 {
 824     using namespace ::boost::flyweights;
 825
 826     if(!overlay)
 827     {
 828         return false;
 829     }
 830     overlay->CleanUp();
 831     const ScanLineData& scan_line_data = *scan_line_data2.m_scan_line_data;
 832     if(!scan_line_data.mWidth || !scan_line_data.mHeight)
 833     {
 834         return true;
 835     }
 836     xsub &= 7;
 837     ysub &= 7;
 838     //xsub = ysub = 0;
 839     int width = scan_line_data.mWidth + xsub;
 840     int height = scan_line_data.mHeight + ysub;
 841     overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
 842     if(!overlay->mfWideOutlineEmpty)
 843     {
 844         int wide_border = (scan_line_data2.mWideBorder+7)&~7;
 845
 846         width += 2*wide_border ;
 847         height += 2*wide_border ;
 848         xsub += wide_border ;
 849         ysub += wide_border ;
 850     }
 851     overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
 852     overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
 853
 854     overlay->mWidth = width;
 855     overlay->mHeight = height;
 856     overlay->mOverlayWidth = ((width+7)>>3) + 1;
 857     overlay->mOverlayHeight = ((height+7)>>3) + 1;
 858     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
 859
 860     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
 861     if( body==NULL )
 862     {
 863         return false;
 864     }
 865     overlay->mBody.reset(body, xy_free);
 866     memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
 867     BYTE* border = NULL;
 868     if (!overlay->mfWideOutlineEmpty)
 869     {
 870         border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
 871         if (border==NULL)
 872         {
 873             return false;
 874         }
 875         overlay->mBorder.reset(border, xy_free);
 876         memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
 877     }
 878
 879     // Are we doing a border?
 880     const tSpanBuffer* pOutline[2] = {&(scan_line_data.mOutline), &(scan_line_data2.mWideOutline)};
 881     for(int i = countof(pOutline)-1; i >= 0; i--)
 882     {
 883         tSpanBuffer::const_iterator it = pOutline[i]->begin();
 884         tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
 885         byte* plan_selected = i==0 ? body : border;
 886         int pitch = overlay->mOverlayPitch;
 887         for(; it!=itEnd; ++it)
 888         {
 889             int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
 890             int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
 891             int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
 892             if(x2 > x1)
 893             {
 894                 int first = x1>>3;
 895                 int last = (x2-1)>>3;
 896                 byte* dst = plan_selected + (pitch*(y>>3) + first);
 897                 if(first == last)
 898                     *dst += x2-x1;
 899                 else
 900                 {
 901                     *dst += ((first+1)<<3) - x1;
 902                     dst += 1;
 903                     while(++first < last)
 904                     {
 905                         *dst += 0x08;
 906                         dst += 1;
 907                     }
 908                     *dst += x2 - (last<<3);
 909                 }
 910             }
 911         }
 912     }
 913
 914     return true;
 915 }
 916
 917 const float Rasterizer::GAUSSIAN_BLUR_THREHOLD = 0.333333f;
 918
 919 bool Rasterizer::IsItReallyBlur( float be_strength, double gaussian_blur_strength )
 920 {
 921     if (be_strength<=0 && gaussian_blur_strength<=GAUSSIAN_BLUR_THREHOLD)
 922     {
 923         return false;
 924     }
 925     return true;
 926 }
 927
 928 // @return: true if actually a blur operation has done, or else false and output is leave unset.
 929 // To Do: rewrite it or delete it
 930 bool Rasterizer::OldFixedPointBlur(const Overlay& input_overlay, float be_strength, double gaussian_blur_strength,
 931     double target_scale_x, double target_scale_y, SharedPtrOverlay output_overlay)
 932 {
 933     using namespace ::boost::flyweights;
 934
 935     ASSERT(IsItReallyBlur(be_strength, gaussian_blur_strength));
 936     if(!output_overlay)
 937     {
 938         return false;
 939     }
 940     output_overlay->CleanUp();
 941
 942     output_overlay->mOffsetX = input_overlay.mOffsetX;
 943     output_overlay->mOffsetY = input_overlay.mOffsetY;
 944     output_overlay->mWidth = input_overlay.mWidth;
 945     output_overlay->mHeight = input_overlay.mHeight;
 946     output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
 947     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
 948     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
 949
 950     double gaussian_blur_strength_x = gaussian_blur_strength*target_scale_x;
 951     double gaussian_blur_strength_y = gaussian_blur_strength*target_scale_y;
 952
 953     int gaussian_blur_radius_x = (static_cast<int>( ceil(gaussian_blur_strength_x*3) ) | 1)/2;//fix me: rounding err?
 954     int gaussian_blur_radius_y = (static_cast<int>( ceil(gaussian_blur_strength_y*3) ) | 1)/2;//fix me: rounding err?
 955     if( gaussian_blur_radius_x < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
 956         gaussian_blur_radius_x = 1;//make sure that it really do a blur
 957     if( gaussian_blur_radius_y < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
 958         gaussian_blur_radius_y = 1;//make sure that it really do a blur
 959
 960     int bluradjust_x = 0, bluradjust_y = 0;
 961     if ( IsItReallyBlur(be_strength, gaussian_blur_strength) )
 962     {
 963         if (gaussian_blur_strength > 0)
 964         {
 965             bluradjust_x += gaussian_blur_radius_x * 8;
 966             bluradjust_y += gaussian_blur_radius_y * 8;
 967         }
 968         if (be_strength)
 969         {
 970             int be_adjust_x = static_cast<int>( target_scale_x*std::sqrt(be_strength*0.25f)+0.5 );//fix me: rounding err?
 971             be_adjust_x *= 8;
 972             int be_adjust_y = static_cast<int>(target_scale_y*std::sqrt(be_strength*0.25f)+0.5);//fix me: rounding err?
 973             be_adjust_y *= 8;
 974
 975             bluradjust_x += be_adjust_x;
 976             bluradjust_y += be_adjust_y;
 977         }
 978         // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
 979         bluradjust_x = (bluradjust_x+7)&~7;
 980         bluradjust_y = (bluradjust_y+7)&~7;
 981
 982         output_overlay->mOffsetX -= bluradjust_x;
 983         output_overlay->mOffsetY -= bluradjust_y;
 984         output_overlay->mWidth += (bluradjust_x<<1);
 985         output_overlay->mHeight += (bluradjust_y<<1);
 986         output_overlay->mOverlayWidth += (bluradjust_x>>2);
 987         output_overlay->mOverlayHeight += (bluradjust_y>>2);
 988     }
 989     else
 990     {
 991         return false;
 992     }
 993
 994     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
 995
 996     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
 997     if( body==NULL )
 998     {
 999         return false;
1000     }
1001     output_overlay->mBody.reset(body, xy_free);
1002     memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1003     BYTE* border = NULL;
1004     if (!output_overlay->mfWideOutlineEmpty)
1005     {
1006         border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1007         if (border==NULL)
1008         {
1009             return false;
1010         }
1011         output_overlay->mBorder.reset(border, xy_free);
1012         memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1013     }
1014
1015     //copy buffer
1016     for(int i = 1; i >= 0; i--)
1017     {
1018         byte* plan_selected = i==0 ? body : border;
1019         const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1020
1021         plan_selected += (bluradjust_x>>3) + (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1022         if ( plan_selected!=NULL && plan_input!=NULL )
1023         {
1024             for (int j=0;j<input_overlay.mOverlayHeight;j++)
1025             {
1026                 memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
1027                 plan_selected += output_overlay->mOverlayPitch;
1028                 plan_input += input_overlay.mOverlayPitch;
1029             }
1030         }
1031     }
1032
1033     ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
1034     //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
1035     // Do some gaussian blur magic
1036     if ( gaussian_blur_strength > GAUSSIAN_BLUR_THREHOLD )
1037     {
1038         byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
1039
1040         flyweight<key_value<double, ass_synth_priv, GaussianFilterKey<ass_synth_priv>>, no_locking>
1041             fw_priv_blur_x(gaussian_blur_strength_x);
1042         flyweight<key_value<double, ass_synth_priv, GaussianFilterKey<ass_synth_priv>>, no_locking>
1043             fw_priv_blur_y(gaussian_blur_strength_y);
1044
1045         const ass_synth_priv& priv_blur_x = fw_priv_blur_x.get();
1046         const ass_synth_priv& priv_blur_y = fw_priv_blur_y.get();
1047         if (output_overlay->mOverlayWidth>=priv_blur_x.g_w && output_overlay->mOverlayHeight>=priv_blur_y.g_w)
1048         {
1049             ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
1050                 priv_blur_x.gt2, priv_blur_x.g_r, priv_blur_x.g_w,
1051                 priv_blur_y.gt2, priv_blur_y.g_r, priv_blur_y.g_w);
1052         }
1053     }
1054
1055     float scaled_be_strength = be_strength * 0.5f * (target_scale_x+target_scale_y);
1056     int pass_num = static_cast<int>(scaled_be_strength);
1057     int pitch = output_overlay->mOverlayPitch;
1058     byte* blur_plan = output_overlay->mfWideOutlineEmpty ? body : border;
1059
1060     for (int pass = 0; pass < pass_num; pass++)
1061     {
1062         if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
1063         {
1064             if (g_cpuid.m_flags & CCpuID::sse2)
1065             {
1066                 be_blur(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1067             }
1068             else
1069             {
1070                 be_blur_c(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1071             }
1072         }
1073     }
1074     if (scaled_be_strength>pass_num)
1075     {
1076         xy_be_blur(blur_plan, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch,
1077             scaled_be_strength-pass_num, scaled_be_strength-pass_num);
1078     }
1079
1080     return true;
1081 }
1082
1083 // @return: true if actually a blur operation has done, or else false and output is leave unset.
1084 bool Rasterizer::Blur(const Overlay& input_overlay, float be_strength,
1085     double gaussian_blur_strength,
1086     double target_scale_x, double target_scale_y,
1087     SharedPtrOverlay output_overlay)
1088 {
1089     using namespace ::boost::flyweights;
1090
1091     ASSERT(IsItReallyBlur(be_strength, gaussian_blur_strength));
1092     if(!output_overlay || !IsItReallyBlur(be_strength, gaussian_blur_strength))
1093     {
1094         return false;
1095     }
1096     if (input_overlay.mOverlayWidth<=0 || input_overlay.mOverlayHeight<=0)
1097     {
1098         return true;
1099     }
1100
1101     if (!(g_cpuid.m_flags & CCpuID::sse2))
1102     {
1103         // C code path of floating point version is extremely slow,
1104         // so we fall back to fixed point version instead
1105         return Rasterizer::OldFixedPointBlur(input_overlay, be_strength,
1106             gaussian_blur_strength, target_scale_x, target_scale_y, output_overlay);//fix me: important!
1107     }
1108
1109     if (gaussian_blur_strength>0)
1110     {
1111         if (be_strength)//this insane thing should NEVER happen
1112         {
1113             SharedPtrOverlay tmp(new Overlay());
1114
1115             bool rv = GaussianBlur(input_overlay, gaussian_blur_strength, target_scale_x, target_scale_y, tmp);
1116             ASSERT(rv);
1117             rv = BeBlur(*tmp, be_strength, target_scale_x, target_scale_y, output_overlay);
1118             ASSERT(rv);
1119         }
1120         else
1121         {
1122             bool rv = GaussianBlur(input_overlay, gaussian_blur_strength, target_scale_x, target_scale_y, output_overlay);
1123             ASSERT(rv);
1124         }
1125     }
1126     else if (be_strength)
1127     {
1128         bool rv = BeBlur(input_overlay, be_strength, target_scale_x, target_scale_y, output_overlay);
1129         ASSERT(rv);
1130     }
1131     return true;
1132 }
1133
1134 bool Rasterizer::GaussianBlur( const Overlay& input_overlay, double gaussian_blur_strength,
1135     double target_scale_x, double target_scale_y,
1136     SharedPtrOverlay output_overlay )
1137 {
1138     using namespace ::boost::flyweights;
1139
1140     ASSERT(output_overlay);
1141     output_overlay->CleanUp();
1142     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
1143
1144     ASSERT(gaussian_blur_strength > 0);
1145
1146     double gaussian_blur_strength_x = gaussian_blur_strength*target_scale_x;
1147     double gaussian_blur_strength_y = gaussian_blur_strength*target_scale_y;
1148
1149     int gaussian_blur_radius_x = (static_cast<int>( ceil(gaussian_blur_strength_x*3) ) | 1)/2;//fix me: rounding err?
1150     int gaussian_blur_radius_y = (static_cast<int>( ceil(gaussian_blur_strength_y*3) ) | 1)/2;//fix me: rounding err?
1151     if( gaussian_blur_radius_x < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
1152         gaussian_blur_radius_x = 1;//make sure that it really do a blur
1153     if( gaussian_blur_radius_y < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
1154         gaussian_blur_radius_y = 1;//make sure that it really do a blur
1155
1156     flyweight<key_value<double, GaussianCoefficients, GaussianFilterKey<GaussianCoefficients>>, no_locking>
1157         fw_filter_x(gaussian_blur_strength_x);
1158     flyweight<key_value<double, GaussianCoefficients, GaussianFilterKey<GaussianCoefficients>>, no_locking>
1159         fw_filter_y(gaussian_blur_strength_y);
1160
1161     const GaussianCoefficients& filter_x = fw_filter_x.get();
1162     const GaussianCoefficients& filter_y = fw_filter_y.get();
1163
1164     int bluradjust_x = filter_x.g_r * 8;
1165     int bluradjust_y = filter_y.g_r * 8;
1166     output_overlay->mOffsetX       = input_overlay.mOffsetX - bluradjust_x;
1167     output_overlay->mOffsetY       = input_overlay.mOffsetY - bluradjust_y;
1168     output_overlay->mWidth         = input_overlay.mWidth + (bluradjust_x<<1);
1169     output_overlay->mHeight        = input_overlay.mHeight + (bluradjust_y<<1);
1170     output_overlay->mOverlayWidth  = input_overlay.mOverlayWidth + (bluradjust_x>>2);
1171     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust_y>>2);
1172
1173     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
1174
1175     BYTE* blur_plan = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1176     //memset(blur_plan, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1177
1178     const BYTE* plan_input = input_overlay.mfWideOutlineEmpty ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1179     ASSERT(output_overlay->mOverlayWidth>=filter_x.g_w && output_overlay->mOverlayHeight>=filter_y.g_w);
1180     xy_gaussian_blur(blur_plan, output_overlay->mOverlayPitch,
1181         plan_input, input_overlay.mOverlayWidth, input_overlay.mOverlayHeight, input_overlay.mOverlayPitch,
1182         filter_x.g_f, filter_x.g_r, filter_x.g_w_ex,
1183         filter_y.g_f, filter_y.g_r, filter_y.g_w_ex);
1184     if (input_overlay.mfWideOutlineEmpty)
1185     {
1186         output_overlay->mBody.reset(blur_plan, xy_free);
1187     }
1188     else
1189     {
1190         output_overlay->mBorder.reset(blur_plan, xy_free);
1191
1192         BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1193         if( body==NULL )
1194         {
1195             return false;
1196         }
1197         output_overlay->mBody.reset(body, xy_free);
1198         memset(body, 0, output_overlay->mOverlayPitch * (bluradjust_y>>3));
1199         body += (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1200         plan_input = input_overlay.mBody.get();
1201         ASSERT(plan_input);
1202         for (int j=0;j<input_overlay.mOverlayHeight;j++)
1203         {
1204             memset(body, 0, (bluradjust_x>>3));
1205             memcpy(body+(bluradjust_x>>3), plan_input, input_overlay.mOverlayWidth);
1206             memset(body+(bluradjust_x>>3)+input_overlay.mOverlayWidth, 0, (bluradjust_x>>3));
1207             body += output_overlay->mOverlayPitch;
1208             plan_input += input_overlay.mOverlayPitch;
1209         }
1210         memset(body, 0, output_overlay->mOverlayPitch * (bluradjust_y>>3));
1211     }
1212     return true;
1213 }
1214
1215 bool Rasterizer::BeBlur( const Overlay& input_overlay, float be_strength,
1216     float target_scale_x, float target_scale_y, SharedPtrOverlay output_overlay )
1217 {
1218     ASSERT(output_overlay);
1219     output_overlay->CleanUp();
1220     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
1221
1222     ASSERT(be_strength>0 && target_scale_x>0 && target_scale_y>0);
1223     int bluradjust_x = static_cast<int>( target_scale_x*std::sqrt(be_strength*0.25f)+0.5 );//fix me: rounding err?
1224     bluradjust_x *= 8;
1225     int bluradjust_y = static_cast<int>(target_scale_y*std::sqrt(be_strength*0.25f)+0.5);//fix me: rounding err?
1226     bluradjust_y *= 8;
1227
1228     output_overlay->mOffsetX       = input_overlay.mOffsetX - bluradjust_x;
1229     output_overlay->mOffsetY       = input_overlay.mOffsetY - bluradjust_y;
1230     output_overlay->mWidth         = input_overlay.mWidth + (bluradjust_x<<1);
1231     output_overlay->mHeight        = input_overlay.mHeight + (bluradjust_y<<1);
1232     output_overlay->mOverlayWidth  = input_overlay.mOverlayWidth + (bluradjust_x>>2);
1233     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust_y>>2);
1234
1235     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
1236
1237     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1238     if( body==NULL )
1239     {
1240         return false;
1241     }
1242     output_overlay->mBody.reset(body, xy_free);
1243     memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1244     BYTE* border = NULL;
1245     if (!output_overlay->mfWideOutlineEmpty)
1246     {
1247         border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1248         if (border==NULL)
1249         {
1250             return false;
1251         }
1252         output_overlay->mBorder.reset(border, xy_free);
1253         memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1254     }
1255
1256     //copy buffer
1257     for(int i = 1; i >= 0; i--)
1258     {
1259         byte* plan_selected = i==0 ? body : border;
1260         const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1261
1262         plan_selected += (bluradjust_x>>3) + (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1263         if ( plan_selected!=NULL && plan_input!=NULL )
1264         {
1265             for (int j=0;j<input_overlay.mOverlayHeight;j++)
1266             {
1267                 memcpy(plan_selected, plan_input, input_overlay.mOverlayWidth*sizeof(plan_input[0]));
1268                 plan_selected += output_overlay->mOverlayPitch;
1269                 plan_input += input_overlay.mOverlayPitch;
1270             }
1271         }
1272     }
1273     if (be_strength<=0)
1274     {
1275         return true;
1276     }
1277
1278     float scaled_be_strength = be_strength * 0.5f * (target_scale_x+target_scale_y);
1279     int pass_num = static_cast<int>(scaled_be_strength);
1280     int pitch = output_overlay->mOverlayPitch;
1281     byte* blur_plan = output_overlay->mfWideOutlineEmpty ? body : border;
1282     ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
1283     for (int pass = 0; pass < pass_num; pass++)
1284     {
1285         if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
1286         {
1287             if (g_cpuid.m_flags & CCpuID::sse2)
1288             {
1289                 be_blur(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1290             }
1291             else
1292             {
1293                 be_blur_c(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1294             }
1295         }
1296     }
1297     if (scaled_be_strength>pass_num)
1298     {
1299         xy_be_blur(blur_plan, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch,
1300             scaled_be_strength-pass_num, scaled_be_strength-pass_num);
1301     }
1302
1303     return true;
1304 }
1305
1306 ///////////////////////////////////////////////////////////////////////////
1307
1308 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
1309 {
1310     int a = alpha;
1311     // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
1312     int ia = 256-a;
1313     a+=1;
1314     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
1315            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
1316            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
1317 }
1318
1319 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
1320 {
1321     int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
1322     int ia = 256-a;
1323     a+=1;
1324     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
1325            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
1326            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
1327 }
1328
1329 #include <xmmintrin.h>
1330 #include <emmintrin.h>
1331
1332 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
1333 {
1334 //    alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
1335     color &= 0xffffff;
1336     __m128i zero = _mm_setzero_si128();
1337     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
1338     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
1339     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
1340     __m128i r = _mm_unpacklo_epi16(d, s);
1341     r = _mm_madd_epi16(r, a);
1342     r = _mm_srli_epi32(r, 8);
1343     r = _mm_packs_epi32(r, r);
1344     r = _mm_packus_epi16(r, r);
1345     *dst = (DWORD)_mm_cvtsi128_si32(r);
1346 }
1347
1348 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
1349 {
1350     int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
1351     color &= 0xffffff;
1352     __m128i zero = _mm_setzero_si128();
1353     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
1354     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
1355     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
1356     __m128i r = _mm_unpacklo_epi16(d, s);
1357     r = _mm_madd_epi16(r, a);
1358     r = _mm_srli_epi32(r, 8);
1359     r = _mm_packs_epi32(r, r);
1360     r = _mm_packus_epi16(r, r);
1361     *dst = (DWORD)_mm_cvtsi128_si32(r);
1362 }
1363
1364 #include <mmintrin.h>
1365
1366 // Calculate a - b clamping to 0 instead of underflowing
1367 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
1368 {
1369     __m64 ap = _mm_cvtsi32_si64(a);
1370     __m64 bp = _mm_cvtsi32_si64(b);
1371     __m64 rp = _mm_subs_pu16(ap, bp);
1372     DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
1373     _mm_empty();
1374     return r;
1375     //return (b > a) ? 0 : a - b;
1376 }
1377
1378 /***
1379  * No aligned requirement
1380  *
1381  **/
1382 void AlphaBlt(byte* pY,
1383     const byte* pAlphaMask,
1384     const byte Y,
1385     int h, int w, int src_stride, int dst_stride)
1386 {
1387     __m128i zero = _mm_setzero_si128();
1388     __m128i s = _mm_set1_epi16(Y);               //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
1389
1390     if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
1391     {
1392         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
1393         {
1394             const BYTE* sa = pAlphaMask;
1395             BYTE* dy = pY;
1396             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
1397             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1398             const BYTE* dy_end = pY + w;
1399
1400             for(;dy < dy_first_mod16; sa++, dy++)
1401             {
1402                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1403             }
1404             for(; dy < dy_end_mod16; sa+=8, dy+=16)
1405             {
1406                 __m128i a = _mm_loadl_epi64((__m128i*)sa);
1407
1408                 //Y
1409                 __m128i d = _mm_load_si128((__m128i*)dy);
1410
1411                 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
1412                 //__m128i ia = _mm_xor_si128(a,ones);        //ia   = ~a
1413                 //ia = _mm_unpacklo_epi8(ia,zero);           //ia   = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
1414                 a = _mm_unpacklo_epi8(a,zero);               //a= a0 0  a1 0  a2 0  a3 0  a4 0  a5 0  a6 0  a7 0
1415                 __m128i ones = _mm_set1_epi16(256);          //ones = 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1
1416                 __m128i ia = _mm_sub_epi16(ones, a);         //ia   = 256-a0 ... 256-a7
1417                 ones = _mm_srli_epi16(ones, 8);
1418                 a = _mm_add_epi16(a, ones);                  //a= 1+a0 ... 1+a7
1419
1420                 __m128i dl = _mm_unpacklo_epi8(d,zero);               //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
1421                 __m128i sl = _mm_mullo_epi16(s,a);            //sl   = c0*a0  c1*a1  ... c7*a7
1422
1423                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
1424
1425                 dl = _mm_add_epi16(dl,sl);                     //d   = d + sl
1426                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
1427
1428                 sa += 8;
1429                 a = _mm_loadl_epi64((__m128i*)sa);
1430
1431                 a = _mm_unpacklo_epi8(a,zero);
1432                 ones = _mm_slli_epi16(ones, 8);
1433                 ia = _mm_sub_epi16(ones, a);
1434                 ones = _mm_srli_epi16(ones, 8);
1435                 a = _mm_add_epi16(a,ones);
1436
1437                 d = _mm_unpackhi_epi8(d,zero);
1438                 sl = _mm_mullo_epi16(s,a);
1439                 d = _mm_mullo_epi16(d,ia);
1440                 d = _mm_add_epi16(d,sl);
1441                 d = _mm_srli_epi16(d, 8);
1442
1443                 dl = _mm_packus_epi16(dl,d);
1444
1445                 _mm_store_si128((__m128i*)dy, dl);
1446             }
1447             for(;dy < dy_end; sa++, dy++)
1448             {
1449                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1450             }
1451         }
1452     }
1453     else
1454     {
1455         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
1456         {
1457             const BYTE* sa = pAlphaMask;
1458             BYTE* dy = pY;
1459             const BYTE* dy_end = pY + w;
1460
1461             for(;dy < dy_end; sa++, dy++)
1462             {
1463                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1464             }
1465         }
1466     }
1467     //__asm emms;
1468 }
1469
1470 /***
1471  * No aligned requirement
1472  *
1473  **/
1474 void AlphaBlt(byte* pY,
1475     const byte alpha,
1476     const byte Y,
1477     int h, int w, int dst_stride)
1478 {
1479     int yPremul = Y*(alpha+1);
1480     int dstAlpha = 0x100 - alpha;
1481     if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
1482     {
1483         __m128i zero = _mm_setzero_si128();
1484         __m128i s = _mm_set1_epi16(yPremul);    //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
1485         __m128i ia = _mm_set1_epi16(dstAlpha);
1486         for( ; h>0; h--, pY += dst_stride )
1487         {
1488             BYTE* dy = pY;
1489             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
1490             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1491             const BYTE* dy_end = pY + w;
1492
1493             for(;dy < dy_first_mod16; dy++)
1494             {
1495                 *dy = (*dy * dstAlpha + yPremul)>>8;
1496             }
1497             for(; dy < dy_end_mod16; dy+=16)
1498             {
1499                 //Y
1500                 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
1501                 __m128i dl = _mm_unpacklo_epi8(d,zero);        //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
1502
1503                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
1504                 dl = _mm_adds_epu16(dl,s);                     //d   = d + s
1505                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
1506
1507                 d = _mm_unpackhi_epi8(d,zero);
1508                 d = _mm_mullo_epi16(d,ia);
1509                 d = _mm_adds_epu16(d,s);
1510                 d = _mm_srli_epi16(d, 8);
1511
1512                 dl = _mm_packus_epi16(dl,d);
1513
1514                 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1515             }
1516             for(;dy < dy_end; dy++)
1517             {
1518                 *dy = (*dy * dstAlpha + yPremul)>>8;
1519             }
1520         }
1521     }
1522     else
1523     {
1524         for( ; h>0; h--, pY += dst_stride )
1525         {
1526             BYTE* dy = pY;
1527             const BYTE* dy_end = pY + w;
1528
1529             for(;dy < dy_end; dy++)
1530             {
1531                 *dy = (*dy * dstAlpha + yPremul)>>8;
1532             }
1533         }
1534     }
1535     //__asm emms;
1536 }
1537
1538 /***
1539  * No aligned requirement
1540  *
1541  **/
1542 void AlphaBltC(byte* pY,
1543     const byte alpha,
1544     const byte Y,
1545     int h, int w, int dst_stride)
1546 {
1547     int yPremul = Y*(alpha+1);
1548     int dstAlpha = 0x100 - alpha;
1549
1550     for( ; h>0; h--, pY += dst_stride )
1551     {
1552         BYTE* dy = pY;
1553         const BYTE* dy_end = pY + w;
1554
1555         for(;dy < dy_end; dy++)
1556         {
1557             *dy = (*dy * dstAlpha + yPremul)>>8;
1558         }
1559     }
1560 }
1561
1562 // For CPUID usage in Rasterizer::Draw
1563 #include "../dsutil/vd.h"
1564
1565 void OverlapRegion(tSpanBuffer& dst, const tSpanBuffer& src, int dx, int dy)
1566 {
1567     tSpanBuffer temp;
1568     temp.reserve(dst.size() + src.size());
1569     dst.swap(temp);
1570     tSpanBuffer::iterator itA = temp.begin();
1571     tSpanBuffer::iterator itAE = temp.end();
1572     tSpanBuffer::const_iterator itB = src.begin();
1573     tSpanBuffer::const_iterator itBE = src.end();
1574     // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
1575     unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
1576     unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
1577     while(itA != itAE && itB != itBE)
1578     {
1579         if((*itB).first + offset1 < (*itA).first)
1580         {
1581             // B span is earlier.  Use it.
1582             unsigned __int64 x1 = (*itB).first + offset1;
1583             unsigned __int64 x2 = (*itB).second + offset2;
1584             ++itB;
1585             // B spans don't overlap, so begin merge loop with A first.
1586             for(;;)
1587             {
1588                 // If we run out of A spans or the A span doesn't overlap,
1589                 // then the next B span can't either (because B spans don't
1590                 // overlap) and we exit.
1591                 if(itA == itAE || (*itA).first > x2)
1592                     break;
1593                 do {x2 = _MAX(x2, (*itA++).second);}
1594                 while(itA != itAE && (*itA).first <= x2);
1595                 // If we run out of B spans or the B span doesn't overlap,
1596                 // then the next A span can't either (because A spans don't
1597                 // overlap) and we exit.
1598                 if(itB == itBE || (*itB).first + offset1 > x2)
1599                     break;
1600                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1601                 while(itB != itBE && (*itB).first + offset1 <= x2);
1602             }
1603             // Flush span.
1604             dst.push_back(tSpan(x1, x2));
1605         }
1606         else
1607         {
1608             // A span is earlier.  Use it.
1609             unsigned __int64 x1 = (*itA).first;
1610             unsigned __int64 x2 = (*itA).second;
1611             ++itA;
1612             // A spans don't overlap, so begin merge loop with B first.
1613             for(;;)
1614             {
1615                 // If we run out of B spans or the B span doesn't overlap,
1616                 // then the next A span can't either (because A spans don't
1617                 // overlap) and we exit.
1618                 if(itB == itBE || (*itB).first + offset1 > x2)
1619                     break;
1620                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1621                 while(itB != itBE && (*itB).first + offset1 <= x2);
1622                 // If we run out of A spans or the A span doesn't overlap,
1623                 // then the next B span can't either (because B spans don't
1624                 // overlap) and we exit.
1625                 if(itA == itAE || (*itA).first > x2)
1626                     break;
1627                 do {x2 = _MAX(x2, (*itA++).second);}
1628                 while(itA != itAE && (*itA).first <= x2);
1629             }
1630             // Flush span.
1631             dst.push_back(tSpan(x1, x2));
1632         }
1633     }
1634     // Copy over leftover spans.
1635     while(itA != itAE)
1636         dst.push_back(*itA++);
1637     while(itB != itBE)
1638     {
1639         dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
1640         ++itB;
1641     }
1642 }
1643
1644 // Render a subpicture onto a surface.
1645 // spd is the surface to render on.
1646 // clipRect is a rectangular clip region to render inside.
1647 // pAlphaMask is an alpha clipping mask.
1648 // xsub and ysub ???
1649 // switchpts seems to be an array of fill colours interlaced with coordinates.
1650 //    switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1651 // fBody tells whether to render the body of the subs.
1652 // fBorder tells whether to render the border of the subs.
1653 SharedPtrByte Rasterizer::CompositeAlphaMask(const SharedPtrOverlay& overlay, const CRect& clipRect,
1654     const GrayImage2* alpha_mask,
1655     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1656     CRect *outputDirtyRect)
1657 {
1658     //fix me: check and log error
1659     SharedPtrByte result;
1660     *outputDirtyRect = CRect(0, 0, 0, 0);
1661     if (!switchpts || !fBody && !fBorder) return result;
1662     if (fBorder && !overlay->mBorder) return result;
1663
1664     CRect r = clipRect;
1665     if (alpha_mask!=NULL)
1666     {
1667         r &= CRect(alpha_mask->left_top, alpha_mask->size);
1668     }
1669
1670     // Remember that all subtitle coordinates are specified in 1/8 pixels
1671     // (x+4)>>3 rounds to nearest whole pixel.
1672     // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1673     int x = (xsub + overlay->mOffsetX + 4)>>3;
1674     int y = (ysub + overlay->mOffsetY + 4)>>3;
1675     int w = overlay->mOverlayWidth;
1676     int h = overlay->mOverlayHeight;
1677     int xo = 0, yo = 0;
1678     // Again, limiting?
1679     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1680     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1681     if(x+w > r.right) w = r.right-x;
1682     if(y+h > r.bottom) h = r.bottom-y;
1683     // Check if there's actually anything to render
1684     if(w <= 0 || h <= 0) return(result);
1685     outputDirtyRect->SetRect(x, y, x+w, y+h);
1686
1687     bool fSingleColor = (switchpts[1]==0xffffffff);
1688
1689     // draw
1690     // Grab the first colour
1691     DWORD color = switchpts[0];
1692     byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1693     const byte* alpha_mask_data = alpha_mask != NULL ? alpha_mask->data.get() : NULL;
1694     const int alpha_mask_pitch = alpha_mask != NULL ? alpha_mask->pitch : 0;
1695     if(alpha_mask_data!=NULL )
1696         alpha_mask_data += alpha_mask->pitch * y + x - alpha_mask->left_top.y*alpha_mask->pitch - alpha_mask->left_top.x;
1697
1698     if(fSingleColor)
1699     {
1700         overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1701             alpha_mask_data, alpha_mask_pitch,
1702             color>>24 );
1703     }
1704     else
1705     {
1706         int last_x = xo;
1707         const DWORD *sw = switchpts;
1708         while( last_x<w+xo )
1709         {
1710             byte alpha = sw[0]>>24;
1711             while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1712             {
1713                 sw += 2;
1714             }
1715             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1716             overlay->FillAlphaMash(s_base, fBody, fBorder,
1717                 last_x, yo, new_x-last_x, h,
1718                 alpha_mask_data, alpha_mask_pitch,
1719                 alpha );
1720             last_x = new_x;
1721             sw += 2;
1722         }
1723     }
1724     result.reset( s_base, xy_free );
1725     return result;
1726 }
1727
1728
1729 // Blend an overlay's precomputed alpha plane (s_base) onto bitmap using the colour(s) in switchpts.
1730 // draw overlay[clipRect] to bitmap[0,0,w,h]; xsub/ysub are in 1/8 pixel units and bitmap must cover the clipped area (see the ASSERT below).
1731 // switchpts interleaves colours with x switch coordinates (switchpts[i*2] = colour, switchpts[i*2+1] = coordinate it applies from); switchpts[1]==0xffffffff selects the single-colour paths.
1732 void Rasterizer::Draw(XyBitmap* bitmap, SharedPtrOverlay overlay, const CRect& clipRect, byte* s_base,
1733     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1734 {
1735     if (!switchpts || !fBody && !fBorder) return; // nothing requested; fBody/fBorder are not consulted again below
1736     if (bitmap==NULL)
1737     {
1738         ASSERT(0);
1739         return;
1740     }
1741     // clip
1742     // Limit drawn area to rectangular clip area
1743     CRect r = clipRect;
1744     // Remember that all subtitle coordinates are specified in 1/8 pixels
1745     // (x+4)>>3 rounds to nearest whole pixel.
1746     int overlayPitch = overlay->mOverlayPitch;
1747     int x = (xsub + overlay->mOffsetX + 4)>>3;
1748     int y = (ysub + overlay->mOffsetY + 4)>>3;
1749     int w = overlay->mOverlayWidth;
1750     int h = overlay->mOverlayHeight;
1751     int xo = 0, yo = 0; // overlay columns/rows clipped off on the left/top
1752
1753     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1754     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1755     if(x+w > r.right) w = r.right-x;
1756     if(y+h > r.bottom) h = r.bottom-y;
1757     // Check if there's actually anything to render
1758     if (w <= 0 || h <= 0) return;
1759     // must have enough space to draw into
1760     ASSERT(x >= bitmap->x && y >= bitmap->y && x+w <= bitmap->x + bitmap->w && y+h <= bitmap->y + bitmap->h );
1761
1762     // CPUID from VDub
1763     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1764     bool fSingleColor = (switchpts[1]==0xffffffff);
1765     bool PLANAR = (bitmap->type==XyBitmap::PLANNA); // plans[0..3] are then addressed as separate A, Y, U, V byte planes
1766     int draw_method = 0; // bit set of DM flags; selects one of the eight cases of the switch below
1767     if(fSingleColor)
1768         draw_method |= DM::SINGLE_COLOR;
1769     if(fSSE2)
1770         draw_method |= DM::SSE2;
1771     if(PLANAR)
1772         draw_method |= DM::AYUV_PLANAR;
1773
1774     // draw
1775     // Grab the first colour
1776     DWORD color = switchpts[0];
1777     const byte* s = s_base + overlay->mOverlayPitch*yo + xo; // alpha byte of the first clipped pixel
1778
1779     int dst_offset = 0; // byte offset of pixel (x, y) inside a destination plane
1780     if (bitmap->type==XyBitmap::PLANNA)
1781         dst_offset = bitmap->pitch*(y-bitmap->y) + x - bitmap->x; // one byte per pixel and plane
1782     else
1783         dst_offset = bitmap->pitch*(y-bitmap->y) + (x - bitmap->x)*4; // four bytes per packed pixel
1784     unsigned long* dst = (unsigned long*)((BYTE*)bitmap->plans[0] + dst_offset); // packed-pixel pointer; only the non-planar cases use it
1785
1786     // Every remaining line in the bitmap to be rendered...
1787     switch(draw_method)
1788     {
1789     case   DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR : // packed pixels, one colour, pixmix_sse2
1790     {
1791         while(h--)
1792         {
1793             for(int wt=0; wt<w; ++wt)
1794                 // s[wt] is handed to pixmix_sse2 as the alpha value without any further
1795                 // scaling. NOTE(review): an earlier comment here described a "<<6"
1796                 // adjustment that the code no longer performs.
1797                 pixmix_sse2(&dst[wt], color, s[wt]);
1798             s += overlayPitch;
1799             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1800         }
1801     }
1802     break;
1803     case   DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR : // packed pixels, one colour, plain pixmix
1804     {
1805         while(h--)
1806         {
1807             for(int wt=0; wt<w; ++wt)
1808                 pixmix(&dst[wt], color, s[wt]);
1809             s += overlayPitch;
1810             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1811         }
1812     }
1813     break;
1814     case 0*DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR : // packed pixels, several colours, pixmix_sse2
1815     {
1816         while(h--)
1817         {
1818             const DWORD *sw = switchpts; // restart the colour list on every row
1819             for(int wt=0; wt<w; ++wt)
1820             {
1821                 // xo is the number of overlay columns clipped off on the left, so
1822                 // wt+xo is the x coordinate inside the overlay. Once it reaches the next
1823                 // switch coordinate sw[1], skip ahead and use the colour stored just before it.
1824                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1825                 pixmix_sse2(&dst[wt], color, s[wt]);
1826             }
1827             s += overlayPitch;
1828             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1829         }
1830     }
1831     break;
1832     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR : // packed pixels, several colours, plain pixmix
1833     {
1834         while(h--)
1835         {
1836             const DWORD *sw = switchpts;
1837             for(int wt=0; wt<w; ++wt)
1838             {
1839                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];} // same colour switching as in the SSE2 case above
1840                 pixmix(&dst[wt], color, s[wt]);
1841             }
1842             s += overlayPitch;
1843             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1844         }
1845     }
1846     break;
1847     case   DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR : // planar, one colour, one AlphaBlt call per plane
1848     {
1849         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1850         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1851         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1852         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1853
1854         AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, bitmap->pitch); // Y is bits 16..23 of the colour
1855         AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, bitmap->pitch); // U is bits 8..15
1856         AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, bitmap->pitch); // V is bits 0..7
1857         AlphaBlt(dst_A, s, 0, h, w, overlayPitch, bitmap->pitch); // the A plane is blended with the constant 0
1858     }
1859     break;
1860     case 0*DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR : // planar, several colours: one set of AlphaBlt calls per colour run
1861     {
1862         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1863         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1864         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1865         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1866
1867         const DWORD *sw = switchpts;
1868         int last_x = xo; // overlay x coordinate where the next run starts
1869         color = sw[0];
1870         while(last_x<w+xo)
1871         {
1872             int new_x = sw[3] < w+xo ? sw[3] : w+xo; // end of the current run, clamped to the clipped right edge
1873             color = sw[0];
1874             sw += 2;
1875             if( new_x < last_x ) // the run ends left of the current position: skip it
1876                 continue;
1877             AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1878             AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1879             AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1880             AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, bitmap->pitch);
1881
1882             dst_A += new_x - last_x; // move all four plane pointers to the start of the next run
1883             dst_Y += new_x - last_x;
1884             dst_U += new_x - last_x;
1885             dst_V += new_x - last_x;
1886             last_x = new_x;
1887         }
1888     }
1889     break;
1890     case   DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR : // planar, one colour, per-pixel pixmix on a packed temporary
1891     {
1892 //        char * debug_dst=(char*)dst;int h2 = h;
1893 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1894 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1895 //        debug_dst += spd.pitch*spd.h;
1896 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1897 //        debug_dst += spd.pitch*spd.h;
1898 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1899 //        debug_dst += spd.pitch*spd.h;
1900 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1901 //        debug_dst=(char*)dst;
1902
1903         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1904         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1905         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1906         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1907         while(h--)
1908         {
1909             for(int wt=0; wt<w; ++wt)
1910             {
1911                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]); // gather the four plane bytes into one packed pixel
1912                 pixmix(&temp, color, s[wt]);
1913                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt); // scatter the blended pixel back to the planes
1914             }
1915             s += overlayPitch;
1916             dst_A += bitmap->pitch;
1917             dst_Y += bitmap->pitch;
1918             dst_U += bitmap->pitch;
1919             dst_V += bitmap->pitch;
1920         }
1921 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1922 //        debug_dst += spd.pitch*spd.h;
1923 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1924 //        debug_dst += spd.pitch*spd.h;
1925 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1926 //        debug_dst += spd.pitch*spd.h;
1927 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1928     }
1929     break;
1930     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR : // planar, several colours, per-pixel pixmix on a packed temporary
1931     {
1932         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1933         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1934         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1935         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1936         while(h--)
1937         {
1938             const DWORD *sw = switchpts;
1939             for(int wt=0; wt<w; ++wt)
1940             {
1941                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1942                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1943                 pixmix(&temp, color, (s[wt]*(color>>24))>>8); // NOTE(review): only this path scales s[wt] by the colour's alpha byte; every other path passes s[wt] as-is -- confirm this is intended
1944                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1945             }
1946             s += overlayPitch;
1947             dst_A += bitmap->pitch;
1948             dst_Y += bitmap->pitch;
1949             dst_U += bitmap->pitch;
1950             dst_V += bitmap->pitch;
1951         }
1952     }
1953     break;
1954     }
1955     // Remember to EMMS!
1956     // Rendering fails in funny ways if we don't do this.
1957     _mm_empty();
1958     return;
1959 }
1960
1961 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1962 {
1963     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1964     bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1965     int draw_method = 0;
1966     if(fSSE2)
1967         draw_method |= DM::SSE2;
1968     if(AYUV_PLANAR)
1969         draw_method |= DM::AYUV_PLANAR;
1970
1971     switch (draw_method)
1972     {
1973     case   DM::SSE2 | 0*DM::AYUV_PLANAR :
1974     {
1975         for (int wy=y; wy<y+nHeight; wy++) {
1976             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1977             for(int wt=0; wt<nWidth; ++wt) {
1978                 pixmix_sse2(&dst[wt], argb, argb>>24);
1979             }
1980         }
1981     }
1982     break;
1983     case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1984     {
1985         for (int wy=y; wy<y+nHeight; wy++) {
1986             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1987             for(int wt=0; wt<nWidth; ++wt) {
1988                 pixmix(&dst[wt], argb,  argb>>24);
1989             }
1990         }
1991     }
1992     break;
1993     case   DM::SSE2 |   DM::AYUV_PLANAR :
1994     {
1995         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1996         BYTE* dst_A = dst;
1997         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
1998         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
1999         BYTE* dst_V = dst_U + spd.pitch*spd.h;
2000         AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
2001         AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
2002         AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
2003         AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
2004     }
2005     break;
2006     case 0*DM::SSE2 |   DM::AYUV_PLANAR :
2007     {
2008         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
2009         BYTE* dst_A = dst;
2010         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
2011         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
2012         BYTE* dst_V = dst_U + spd.pitch*spd.h;
2013         AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
2014         AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
2015         AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
2016         AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
2017     }
2018     break;
2019     }
2020     _mm_empty();
2021 }
2022
2023
2024 ///////////////////////////////////////////////////////////////
2025
2026 // Overlay
2027
2028 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
2029     const byte* pAlphaMask, int pitch, DWORD color_alpha )
2030 {
2031     if (g_cpuid.m_flags & CCpuID::sse2)
2032     {
2033         pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
2034         pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
2035         byte* dst = outputAlphaMask + y*mOverlayPitch + x;
2036
2037         const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
2038             ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
2039         const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
2040             ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
2041         const int x_end00  = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
2042         const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
2043         const int x_end = w;
2044
2045         __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
2046         __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
2047
2048         if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
2049         {
2050             /*
2051             __asm
2052             {
2053             mov        eax, color_alpha
2054             movd           XMM3, eax
2055             punpcklwd  XMM3, XMM3
2056             pshufd         XMM3, XMM3, 0
2057             }
2058             */
2059             while(h--)
2060             {
2061                 int j=0;
2062                 for( ; j<x0; j++ )
2063                 {
2064                     int temp = pBorder[j]-pBody[j];
2065                     temp = temp<0 ? 0 : temp;
2066                     dst[j] = (temp * color_alpha)>>6;
2067                 }
2068                 for( ;j<x00;j+=4 )
2069                 {
2070                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2071                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2072                     border = _mm_subs_pu8(border, body);
2073                     __m64 zero = _mm_setzero_si64();
2074                     border = _mm_unpacklo_pi8(border, zero);
2075                     border = _mm_mullo_pi16(border, color_alpha_64);
2076                     border = _mm_srli_pi16(border, 6);
2077                     border = _mm_packs_pu16(border,border);
2078                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2079                 }
2080                 __m128i zero = _mm_setzero_si128();
2081                 for( ;j<x_end00;j+=16)
2082                 {
2083                     __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
2084                     __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
2085                     border = _mm_subs_epu8(border,body);
2086                     __m128i srchi = border;
2087                     border = _mm_unpacklo_epi8(border, zero);
2088                     srchi = _mm_unpackhi_epi8(srchi, zero);
2089                     border = _mm_mullo_epi16(border, color_alpha_128);
2090                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2091                     border = _mm_srli_epi16(border, 6);
2092                     srchi = _mm_srli_epi16(srchi, 6);
2093                     border = _mm_packus_epi16(border, srchi);
2094                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
2095                 }
2096                 for( ;j<x_end0;j+=4)
2097                 {
2098                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2099                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2100                     border = _mm_subs_pu8(border, body);
2101                     __m64 zero = _mm_setzero_si64();
2102                     border = _mm_unpacklo_pi8(border, zero);
2103                     border = _mm_mullo_pi16(border, color_alpha_64);
2104                     border = _mm_srli_pi16(border, 6);
2105                     border = _mm_packs_pu16(border,border);
2106                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2107                 }
2108                 for( ;j<x_end;j++)
2109                 {
2110                     int temp = pBorder[j]-pBody[j];
2111                     temp = temp<0 ? 0 : temp;
2112                     dst[j] = (temp * color_alpha)>>6;
2113                 }
2114                 pBody += mOverlayPitch;
2115                 pBorder += mOverlayPitch;
2116                 //pAlphaMask += pitch;
2117                 dst += mOverlayPitch;
2118             }
2119         }
2120         else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
2121         {
2122             const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2123             while(h--)
2124             {
2125                 int j=0;
2126                 for( ; j<x0; j++ )
2127                 {
2128                     dst[j] = (src1[j] * color_alpha)>>6;
2129                 }
2130                 for( ;j<x00;j+=4 )
2131                 {
2132                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2133                     __m64 zero = _mm_setzero_si64();
2134                     src = _mm_unpacklo_pi8(src, zero);
2135                     src = _mm_mullo_pi16(src, color_alpha_64);
2136                     src = _mm_srli_pi16(src, 6);
2137                     src = _mm_packs_pu16(src,src);
2138                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2139                 }
2140                 __m128i zero = _mm_setzero_si128();
2141                 for( ;j<x_end00;j+=16)
2142                 {
2143                     __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
2144                     __m128i srchi = src;
2145                     src = _mm_unpacklo_epi8(src, zero);
2146                     srchi = _mm_unpackhi_epi8(srchi, zero);
2147                     src = _mm_mullo_epi16(src, color_alpha_128);
2148                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2149                     src = _mm_srli_epi16(src, 6);
2150                     srchi = _mm_srli_epi16(srchi, 6);
2151                     src = _mm_packus_epi16(src, srchi);
2152                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
2153                 }
2154                 for( ;j<x_end0;j+=4)
2155                 {
2156                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2157                     __m64 zero = _mm_setzero_si64();
2158                     src = _mm_unpacklo_pi8(src, zero);
2159                     src = _mm_mullo_pi16(src, color_alpha_64);
2160                     src = _mm_srli_pi16(src, 6);
2161                     src = _mm_packs_pu16(src,src);
2162                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2163                 }
2164                 for( ;j<x_end;j++)
2165                 {
2166                     dst[j] = (src1[j] * color_alpha)>>6;
2167                 }
2168                 src1 += mOverlayPitch;
2169                 //pAlphaMask += pitch;
2170                 dst += mOverlayPitch;
2171             }
2172         }
2173         else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
2174         {
2175             const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2176             while(h--)
2177             {
2178                 int j=0;
2179                 for( ; j<x0; j++ )
2180                 {
2181                     dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2182                 }
2183                 for( ;j<x00;j+=4 )
2184                 {
2185                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2186                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2187                     __m64 zero = _mm_setzero_si64();
2188                     src = _mm_unpacklo_pi8(src, zero);
2189                     src = _mm_mullo_pi16(src, color_alpha_64);
2190                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2191                     src = _mm_mulhi_pi16(src, mask); //important!
2192                     src = _mm_srli_pi16(src, 12+8-16); //important!
2193                     src = _mm_packs_pu16(src,src);
2194                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2195                 }
2196                 __m128i zero = _mm_setzero_si128();
2197                 for( ;j<x_end00;j+=16)
2198                 {
2199                     __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
2200                     __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
2201                     __m128i srchi = src;
2202                     __m128i maskhi = mask;
2203                     src = _mm_unpacklo_epi8(src, zero);
2204                     srchi = _mm_unpackhi_epi8(srchi, zero);
2205                     mask = _mm_unpacklo_epi8(zero, mask); //important!
2206                     maskhi = _mm_unpackhi_epi8(zero, maskhi);
2207                     src = _mm_mullo_epi16(src, color_alpha_128);
2208                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2209                     src = _mm_mulhi_epu16(src, mask); //important!
2210                     srchi = _mm_mulhi_epu16(srchi, maskhi);
2211                     src = _mm_srli_epi16(src, 12+8-16); //important!
2212                     srchi = _mm_srli_epi16(srchi, 12+8-16);
2213                     src = _mm_packus_epi16(src, srchi);
2214                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
2215                 }
2216                 for( ;j<x_end0;j+=4)
2217                 {
2218                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2219                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2220                     __m64 zero = _mm_setzero_si64();
2221                     src = _mm_unpacklo_pi8(src, zero);
2222                     src = _mm_mullo_pi16(src, color_alpha_64);
2223                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2224                     src = _mm_mulhi_pi16(src, mask); //important!
2225                     src = _mm_srli_pi16(src, 12+8-16); //important!
2226                     src = _mm_packs_pu16(src,src);
2227                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2228                 }
2229                 for( ;j<x_end;j++)
2230                 {
2231                     dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2232                 }
2233                 src1 += mOverlayPitch;
2234                 pAlphaMask += pitch;
2235                 dst += mOverlayPitch;
2236             }
2237         }
2238         else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
2239         {
2240             while(h--)
2241             {
2242                 int j=0;
2243                 for( ; j<x0; j++ )
2244                 {
2245                     int temp = pBorder[j]-pBody[j];
2246                     temp = temp<0 ? 0 : temp;
2247                     dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2248                 }
2249                 for( ;j<x00;j+=4 )
2250                 {
2251                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2252                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2253                     border = _mm_subs_pu8(border, body);
2254                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2255                     __m64 zero = _mm_setzero_si64();
2256                     border = _mm_unpacklo_pi8(border, zero);
2257                     border = _mm_mullo_pi16(border, color_alpha_64);
2258                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2259                     border = _mm_mulhi_pi16(border, mask); //important!
2260                     border = _mm_srli_pi16(border, 12+8-16); //important!
2261                     border = _mm_packs_pu16(border,border);
2262                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2263                 }
2264                 __m128i zero = _mm_setzero_si128();
2265                 for( ;j<x_end00;j+=16)
2266                 {
2267                     __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
2268                     __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
2269                     border = _mm_subs_epu8(border,body);
2270
2271                     __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
2272                     __m128i srchi = border;
2273                     __m128i maskhi = mask;
2274                     border = _mm_unpacklo_epi8(border, zero);
2275                     srchi = _mm_unpackhi_epi8(srchi, zero);
2276                     mask = _mm_unpacklo_epi8(zero, mask); //important!
2277                     maskhi = _mm_unpackhi_epi8(zero, maskhi);
2278                     border = _mm_mullo_epi16(border, color_alpha_128);
2279                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2280                     border = _mm_mulhi_epu16(border, mask); //important!
2281                     srchi = _mm_mulhi_epu16(srchi, maskhi);
2282                     border = _mm_srli_epi16(border, 12+8-16); //important!
2283                     srchi = _mm_srli_epi16(srchi, 12+8-16);
2284                     border = _mm_packus_epi16(border, srchi);
2285                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
2286                 }
2287                 for( ;j<x_end0;j+=4)
2288                 {
2289                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2290                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2291                     border = _mm_subs_pu8(border, body);
2292                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2293                     __m64 zero = _mm_setzero_si64();
2294                     border = _mm_unpacklo_pi8(border, zero);
2295                     border = _mm_mullo_pi16(border, color_alpha_64);
2296                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2297                     border = _mm_mulhi_pi16(border, mask); //important!
2298                     border = _mm_srli_pi16(border, 12+8-16); //important!
2299                     border = _mm_packs_pu16(border,border);
2300                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2301                 }
2302                 for( ;j<x_end;j++)
2303                 {
2304                     int temp = pBorder[j]-pBody[j];
2305                     temp = temp<0 ? 0 : temp;
2306                     dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2307                 }
2308                 pBody += mOverlayPitch;
2309                 pBorder += mOverlayPitch;
2310                 pAlphaMask += pitch;
2311                 dst += mOverlayPitch;
2312             }
2313         }
2314         else
2315         {
2316             //should NOT happen!
2317             ASSERT(0);
2318             while(h--)
2319             {
2320                 for(int j=0;j<x_end;j++)
2321                 {
2322                     dst[j] = 0;
2323                 }
2324                 dst += mOverlayPitch;
2325             }
2326         }
2327     }
2328     else
2329     {
2330         _DoFillAlphaMash_c(outputAlphaMask, pBody, pBorder, x, y, w, h, pAlphaMask, pitch, color_alpha);
2331         return;
2332     }
2333 }
2334
2335 void Overlay::_DoFillAlphaMash_c(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
2336     const byte* pAlphaMask, int pitch, DWORD color_alpha )
2337 {
2338     pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
2339     pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
2340     byte* dst = outputAlphaMask + y*mOverlayPitch + x;
2341
2342     if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
2343     {
2344         while(h--)
2345         {
2346             int j=0;
2347             for( ;j<w;j++)
2348             {
2349                 int temp = pBorder[j]-pBody[j];
2350                 temp = temp<0 ? 0 : temp;
2351                 dst[j] = (temp * color_alpha)>>6;
2352             }
2353             pBody += mOverlayPitch;
2354             pBorder += mOverlayPitch;
2355             //pAlphaMask += pitch;
2356             dst += mOverlayPitch;
2357         }
2358     }
2359     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
2360     {
2361         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2362         while(h--)
2363         {
2364             int j=0;
2365             for( ; j<w; j++ )
2366             {
2367                 dst[j] = (src1[j] * color_alpha)>>6;
2368             }
2369             src1 += mOverlayPitch;
2370             //pAlphaMask += pitch;
2371             dst += mOverlayPitch;
2372         }
2373     }
2374     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
2375     {
2376         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2377         while(h--)
2378         {
2379             int j=0;
2380             for( ; j<w; j++ )
2381             {
2382                 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2383             }
2384             src1 += mOverlayPitch;
2385             pAlphaMask += pitch;
2386             dst += mOverlayPitch;
2387         }
2388     }
2389     else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
2390     {
2391         while(h--)
2392         {
2393             int j=0;
2394             for( ; j<w; j++ )
2395             {
2396                 int temp = pBorder[j]-pBody[j];
2397                 temp = temp<0 ? 0 : temp;
2398                 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2399             }
2400             pBody += mOverlayPitch;
2401             pBorder += mOverlayPitch;
2402             pAlphaMask += pitch;
2403             dst += mOverlayPitch;
2404         }
2405     }
2406     else
2407     {
2408         //should NOT happen!
2409         ASSERT(0);
2410         while(h--)
2411         {
2412             for(int j=0;j<w;j++)
2413             {
2414                 dst[j] = 0;
2415             }
2416             dst += mOverlayPitch;
2417         }
2418     }
2419 }
2420
2421 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
2422 {
2423     if(!fBorder && fBody && pAlphaMask==NULL)
2424     {
2425         _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
2426     }
2427     else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
2428     {
2429         _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2430     }
2431     else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
2432     {
2433         _DoFillAlphaMash(outputAlphaMask, mBody.get(), mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2434     }
2435     else if(!fBorder && fBody && pAlphaMask!=NULL)
2436     {
2437         _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
2438     }
2439     else if(fBorder && fBody && pAlphaMask!=NULL)
2440     {
2441         _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2442     }
2443     else
2444     {
2445         //should NOT happen
2446         ASSERT(0);
2447     }
2448 }
2449
2450 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
2451 {
2452     Overlay* overlay = new Overlay();
2453     if(!overlay)
2454     {
2455         return NULL;
2456     }
2457     xshift &= 7;
2458     yshift &= 7;
2459
2460     overlay->mOffsetX = mOffsetX - xshift;
2461     overlay->mOffsetY = mOffsetY - yshift;
2462     overlay->mWidth = mWidth + xshift;
2463     overlay->mHeight = mHeight + yshift;
2464
2465     overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
2466     overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
2467     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
2468
2469
2470     overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
2471
2472     if (overlay->mOverlayPitch * overlay->mOverlayHeight<=0)
2473     {
2474         return NULL;
2475     }
2476
2477     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
2478     if( body==NULL )
2479     {
2480         return NULL;
2481     }
2482     overlay->mBody.reset(body, xy_free);
2483     BYTE* border = NULL;
2484     if (!overlay->mfWideOutlineEmpty)
2485     {
2486         border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
2487         if (border==NULL)
2488         {
2489             return NULL;
2490         }
2491         overlay->mBorder.reset(border, xy_free);
2492     }
2493
2494     if(overlay->mOverlayPitch==mOverlayPitch && overlay->mOverlayHeight>=mOverlayHeight)
2495     {
2496         if (body && mBody)
2497         {
2498             memcpy(body, mBody.get(), mOverlayPitch * mOverlayHeight);
2499             memset(body+mOverlayPitch*mOverlayHeight, 0, mOverlayPitch * (overlay->mOverlayHeight-mOverlayHeight));
2500         }
2501         else if ( (!!body)!=(!!mBody)/*==NULL*/)
2502         {
2503             return NULL;
2504         }
2505
2506         if (border && mBorder)
2507         {
2508             memcpy(border, mBorder.get(), mOverlayPitch * mOverlayHeight);
2509             memset(border+mOverlayPitch*mOverlayHeight, 0, mOverlayPitch * (overlay->mOverlayHeight-mOverlayHeight));
2510         }
2511         else if ( (!!border)!=(!!mBorder)/*==NULL*/ )
2512         {
2513             return NULL;
2514         }
2515     }
2516     else
2517     {
2518         memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
2519         byte* dst = body;
2520         const byte* src = mBody.get();
2521         for (int i=0;i<mOverlayHeight;i++)
2522         {
2523             memcpy(dst, src, mOverlayPitch);
2524             dst += overlay->mOverlayPitch;
2525             src += mOverlayPitch;
2526         }
2527         if (!overlay->mfWideOutlineEmpty)
2528         {
2529             ASSERT(border && mBorder);
2530             memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
2531             dst = border;
2532             src = mBorder.get();
2533             for (int i=0;i<mOverlayHeight;i++)
2534             {
2535                 memcpy(dst, src, mOverlayPitch);
2536                 dst += overlay->mOverlayPitch;
2537                 src += mOverlayPitch;
2538             }
2539         }
2540     }
2541     //not equal
2542     //  Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2543     Bilinear(body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2544     if (!overlay->mfWideOutlineEmpty)
2545     {
2546         Bilinear(border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2547     }
2548     return overlay;
2549 }
2550
2551 ///////////////////////////////////////////////////////////////
2552
2553 // PathData
2554
2555 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
2556 {
2557 }
2558
2559 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
2560 {
2561     //TODO: deal with the case that src.mPathPoints<0
2562     if(mPathPoints>0)
2563     {
2564         mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2565         mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
2566     }
2567     if(mPathPoints>0)
2568     {
2569         memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2570         memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2571     }
2572 }
2573
2574 const PathData& PathData::operator=( const PathData& src )
2575 {
2576     if(this!=&src)
2577     {
2578         if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
2579         {
2580             _TrashPath();
2581             mPathPoints = src.mPathPoints;
2582             mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2583             mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
2584         }
2585         if(src.mPathPoints>0)
2586         {
2587             memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2588             memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2589         }
2590     }
2591     return *this;
2592 }
2593
2594 PathData::~PathData()
2595 {
2596     _TrashPath();
2597 }
2598
2599 bool PathData::operator==( const PathData& rhs ) const
2600 {
2601     return (this==&rhs) || (
2602         mPathPoints==rhs.mPathPoints
2603         && !memcmp(mpPathTypes, rhs.mpPathTypes, mPathPoints * sizeof(BYTE) )
2604         && !memcmp(mpPathPoints, rhs.mpPathPoints, mPathPoints * sizeof(POINT) )
2605         );
2606 }
2607
2608 void PathData::_TrashPath()
2609 {
2610     if (mpPathTypes)
2611     {
2612         free(mpPathTypes);
2613         mpPathTypes = NULL;
2614     }
2615     if (mpPathPoints)
2616     {
2617         free(mpPathPoints);
2618         mpPathPoints = NULL;
2619     }
2620     mPathPoints = 0;
2621 }
2622
2623 bool PathData::BeginPath(HDC hdc)
2624 {
2625     _TrashPath();
2626     return !!::BeginPath(hdc);
2627 }
2628
2629 bool PathData::EndPath(HDC hdc)
2630 {
2631     ::CloseFigure(hdc);
2632     if(::EndPath(hdc))
2633     {
2634         mPathPoints = GetPath(hdc, NULL, NULL, 0);
2635         if(!mPathPoints)
2636             return true;
2637         mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
2638         mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
2639         if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
2640             return true;
2641     }
2642     ::AbortPath(hdc);
2643     return false;
2644 }
2645
2646 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
2647 {
2648     if(bClearPath)
2649         _TrashPath();
2650     return !!::BeginPath(hdc);
2651 }
2652
2653 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
2654 {
2655     ::CloseFigure(hdc);
2656     if(::EndPath(hdc))
2657     {
2658         int nPoints;
2659         BYTE* pNewTypes;
2660         POINT* pNewPoints;
2661         nPoints = GetPath(hdc, NULL, NULL, 0);
2662         if(!nPoints)
2663             return true;
2664         pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
2665         pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
2666         if(pNewTypes)
2667             mpPathTypes = pNewTypes;
2668         if(pNewPoints)
2669             mpPathPoints = pNewPoints;
2670         BYTE* pTypes = new BYTE[nPoints];
2671         POINT* pPoints = new POINT[nPoints];
2672         if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
2673         {
2674             for(int i = 0; i < nPoints; ++i)
2675             {
2676                 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
2677                 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
2678                 mpPathTypes[mPathPoints + i] = pTypes[i];
2679             }
2680             mPathPoints += nPoints;
2681             delete[] pTypes;
2682             delete[] pPoints;
2683             return true;
2684         }
2685         else
2686             DebugBreak();
2687         delete[] pTypes;
2688         delete[] pPoints;
2689     }
2690     ::AbortPath(hdc);
2691     return false;
2692 }
2693
2694 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
2695 {
2696     int minx = INT_MAX;
2697     int miny = INT_MAX;
2698     int maxx = INT_MIN;
2699     int maxy = INT_MIN;
2700     for(int i=0; i<mPathPoints; ++i)
2701     {
2702         int ix = mpPathPoints[i].x;
2703         int iy = mpPathPoints[i].y;
2704         if(ix < minx) minx = ix;
2705         if(ix > maxx) maxx = ix;
2706         if(iy < miny) miny = iy;
2707         if(iy > maxy) maxy = iy;
2708     }
2709     if(minx > maxx || miny > maxy)
2710     {
2711         _TrashPath();
2712         *left_top = CPoint(0, 0);
2713         *size = CSize(0, 0);
2714         return;
2715     }
2716     minx = (minx >> 3) & ~7;
2717     miny = (miny >> 3) & ~7;
2718     maxx = (maxx + 7) >> 3;
2719     maxy = (maxy + 7) >> 3;
2720     for(int i=0; i<mPathPoints; ++i)
2721     {
2722         mpPathPoints[i].x -= minx*8;
2723         mpPathPoints[i].y -= miny*8;
2724     }
2725     *left_top = CPoint(minx, miny);
2726     *size = CSize(maxx+1-minx, maxy+1-miny);
2727     return;
2728 }
2729
2730 //////////////////////////////////////////////////////////////////////////
2731
2732 // ScanLineData
2733
2734 ScanLineData::ScanLineData()
2735 {
2736 }
2737
2738 ScanLineData::~ScanLineData()
2739 {
2740 }
2741
2742 void ScanLineData::_ReallocEdgeBuffer(int edges)
2743 {
2744     mEdgeHeapSize = edges;
2745     mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2746 }
2747
2748 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2749 {
2750     const POINT* pt0 = path_data.mpPathPoints + ptbase;
2751     const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2752     const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2753     const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2754     double x0 = pt0->x;
2755     double x1 = pt1->x;
2756     double x2 = pt2->x;
2757     double x3 = pt3->x;
2758     double y0 = pt0->y;
2759     double y1 = pt1->y;
2760     double y2 = pt2->y;
2761     double y3 = pt3->y;
2762     double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2763     if(fBSpline)
2764     {
2765         // 1   [-1 +3 -3 +1]
2766         // - * [+3 -6 +3  0]
2767         // 6   [-3  0 +3  0]
2768         //         [+1 +4 +1  0]
2769         double _1div6 = 1.0/6.0;
2770         cx3 = _1div6*(-  x0+3*x1-3*x2+x3);
2771         cx2 = _1div6*( 3*x0-6*x1+3*x2);
2772         cx1 = _1div6*(-3*x0        +3*x2);
2773         cx0 = _1div6*(   x0+4*x1+1*x2);
2774         cy3 = _1div6*(-  y0+3*y1-3*y2+y3);
2775         cy2 = _1div6*( 3*y0-6*y1+3*y2);
2776         cy1 = _1div6*(-3*y0     +3*y2);
2777         cy0 = _1div6*(   y0+4*y1+1*y2);
2778     }
2779     else // bezier
2780     {
2781         // [-1 +3 -3 +1]
2782         // [+3 -6 +3  0]
2783         // [-3 +3  0  0]
2784         // [+1  0  0  0]
2785         cx3 = -  x0+3*x1-3*x2+x3;
2786         cx2 =  3*x0-6*x1+3*x2;
2787         cx1 = -3*x0+3*x1;
2788         cx0 =    x0;
2789         cy3 = -  y0+3*y1-3*y2+y3;
2790         cy2 =  3*y0-6*y1+3*y2;
2791         cy1 = -3*y0+3*y1;
2792         cy0 =    y0;
2793     }
2794     //
2795     // This equation is from Graphics Gems I.
2796     //
2797     // The idea is that since we're approximating a cubic curve with lines,
2798     // any error we incur is due to the curvature of the line, which we can
2799     // estimate by calculating the maximum acceleration of the curve.  For
2800     // a cubic, the acceleration (second derivative) is a line, meaning that
2801     // the absolute maximum acceleration must occur at either the beginning
2802     // (|c2|) or the end (|c2+c3|).  Our bounds here are a little more
2803     // conservative than that, but that's okay.
2804     //
2805     // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2806     // that component of the curve is linear and does not incur any error.
2807     // If a=0 for both X and Y, the curve is a line segment and we can
2808     // use a step size of 1.
2809     double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2810     double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2811     double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2812     double h = 1.0;
2813     if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2814     if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2815     for(double t = 0; t < 1.0; t += h)
2816     {
2817         double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2818         double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2819         _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2820     }
2821     double x = cx0 + cx1 + cx2 + cx3;
2822     double y = cy0 + cy1 + cy2 + cy3;
2823     _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2824 }
2825
2826 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2827 {
2828     const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2829     const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2830     _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2831 }
2832
2833 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2834 {
2835     if(lastp.x != x0 || lastp.y != y0)
2836     {
2837         _EvaluateLine(lastp.x, lastp.y, x0, y0);
2838     }
2839     if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2840     lastp.x = x1;
2841     lastp.y = y1;
2842     if(y1 > y0) // down
2843     {
2844         __int64 xacc = (__int64)x0 << 13;
2845         // prestep y0 down
2846         int dy = y1 - y0;
2847         int y = ((y0 + 3)&~7) + 4;
2848         int iy = y >> 3;
2849         y1 = (y1 - 5) >> 3;
2850         if(iy <= y1)
2851         {
2852             __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2853             while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2854                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2855             xacc += (invslope * (y - y0)) >> 3;
2856             while(iy <= y1)
2857             {
2858                 int ix = (int)((xacc + 32768) >> 16);
2859                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2860                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2861                 mpScanBuffer[iy] = mEdgeNext++;
2862                 ++iy;
2863                 xacc += invslope;
2864             }
2865         }
2866     }
2867     else if(y1 < y0) // up
2868     {
2869         __int64 xacc = (__int64)x1 << 13;
2870         // prestep y1 down
2871         int dy = y0 - y1;
2872         int y = ((y1 + 3)&~7) + 4;
2873         int iy = y >> 3;
2874         y0 = (y0 - 5) >> 3;
2875         if(iy <= y0)
2876         {
2877             __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2878             while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2879                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2880             xacc += (invslope * (y - y1)) >> 3;
2881             while(iy <= y0)
2882             {
2883                 int ix = (int)((xacc + 32768) >> 16);
2884                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2885                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2886                 mpScanBuffer[iy] = mEdgeNext++;
2887                 ++iy;
2888                 xacc += invslope;
2889             }
2890         }
2891     }
2892 }
2893
2894 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2895 {
2896     int lastmoveto = -1;
2897     int i;
2898     // Drop any outlines we may have.
2899     mOutline.clear();
2900     // Determine bounding box
2901     if(!path_data.mPathPoints)
2902     {
2903         mWidth = mHeight = 0;
2904         return false;
2905     }
2906     mWidth = size.cx;
2907     mHeight = size.cy;
2908     // Initialize edge buffer.  We use edge 0 as a sentinel.
2909     mEdgeNext = 1;
2910     mEdgeHeapSize = 2048;
2911     mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2912     // Initialize scanline list.
2913     mpScanBuffer = new unsigned int[mHeight];
2914     memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2915     // Scan convert the outline.  Yuck, Bezier curves....
2916     // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2917     // paths with all but the first figure left open, so we can't rely
2918     // on the PT_CLOSEFIGURE flag being used appropriately.
2919     fFirstSet = false;
2920     firstp.x = firstp.y = 0;
2921     lastp.x = lastp.y = 0;
2922     for(i=0; i<path_data.mPathPoints; ++i)
2923     {
2924         BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2925         switch(t)
2926         {
2927         case PT_MOVETO:
2928             if(lastmoveto >= 0 && firstp != lastp)
2929                 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2930             lastmoveto = i;
2931             fFirstSet = false;
2932             lastp = path_data.mpPathPoints[i];
2933             break;
2934         case PT_MOVETONC:
2935             break;
2936         case PT_LINETO:
2937             if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2938             break;
2939         case PT_BEZIERTO:
2940             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2941             i += 2;
2942             break;
2943         case PT_BSPLINETO:
2944             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2945             i += 2;
2946             break;
2947         case PT_BSPLINEPATCHTO:
2948             if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2949             break;
2950         }
2951     }
2952     if(lastmoveto >= 0 && firstp != lastp)
2953         _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2954     // Convert the edges to spans.  We couldn't do this before because some of
2955     // the regions may have winding numbers >+1 and it would have been a pain
2956     // to try to adjust the spans on the fly.  We use one heap to detangle
2957     // a scanline's worth of edges from the singly-linked lists, and another
2958     // to collect the actual scans.
2959     std::vector<int> heap;
2960     mOutline.reserve(mEdgeNext / 2);
2961     __int64 y = 0;
2962     for(y=0; y<mHeight; ++y)
2963     {
2964         int count = 0;
2965         // Detangle scanline into edge heap.
2966         for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2967         {
2968             heap.push_back(mpEdgeBuffer[ptr].posandflag);
2969         }
2970         // Sort edge heap.  Note that we conveniently made the opening edges
2971         // one more than closing edges at the same spot, so we won't have any
2972         // problems with abutting spans.
2973         std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2974         // Process edges and add spans.  Since we only check for a non-zero
2975         // winding number, it doesn't matter which way the outlines go!
2976         std::vector<int>::iterator itX1 = heap.begin();
2977         std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2978         int x1, x2;
2979         for(; itX1 != itX2; ++itX1)
2980         {
2981             int x = *itX1;
2982             if(!count)
2983                 x1 = (x>>1);
2984             if(x&1)
2985                 ++count;
2986             else
2987                 --count;
2988             if(!count)
2989             {
2990                 x2 = (x>>1);
2991                 if(x2>x1)
2992                     mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2993             }
2994         }
2995         heap.clear();
2996     }
2997     // Dump the edge and scan buffers, since we no longer need them.
2998     free(mpEdgeBuffer);
2999     delete [] mpScanBuffer;
3000     // All done!
3001     return true;
3002 }
3003
3004 void ScanLineData::DeleteOutlines()
3005 {
3006     mOutline.clear();
3007 }
3008
3009 bool ScanLineData2::CreateWidenedRegion(int rx, int ry)
3010 {
3011     if(rx < 0) rx = 0;
3012     if(ry < 0) ry = 0;
3013     mWideBorder = max(rx,ry);
3014     mWideOutline.clear();
3015
3016     const tSpanBuffer& out_line = m_scan_line_data->mOutline;
3017     if (ry > 0)
3018     {
3019         WidenRegionCreater *widen_region_creater = WidenRegionCreater::GetDefaultWidenRegionCreater();
3020         widen_region_creater->xy_overlap_region(&mWideOutline, out_line, rx, ry);
3021     }
3022     else if (ry == 0 && rx > 0)
3023     {
3024         // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
3025         OverlapRegion(mWideOutline, out_line, rx, 0);
3026         OverlapRegion(mWideOutline, out_line, rx, 0);
3027     }
3028     return true;
3029 }