src/subtitles/Rasterizer.cpp

   1 /*
   2  *      Copyright (C) 2003-2006 Gabest
   3  *      http://www.gabest.org
   4  *
   5  *  This Program is free software; you can redistribute it and/or modify
   6  *  it under the terms of the GNU General Public License as published by
   7  *  the Free Software Foundation; either version 2, or (at your option)
   8  *  any later version.
   9  *
  10  *  This Program is distributed in the hope that it will be useful,
  11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13  *  GNU General Public License for more details.
  14  *
  15  *  You should have received a copy of the GNU General Public License
  16  *  along with GNU Make; see the file COPYING.  If not, write to
  17  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  18  *  http://www.gnu.org/copyleft/gpl.html
  19  *
  20  */
  21
  22 #include "stdafx.h"
  23 #include <string.h>
  24 #include <cmath>
  25 #include <vector>
  26 #include <algorithm>
  27 #include "Rasterizer.h"
  28 #include "SeparableFilter.h"
  29 #include "xy_logger.h"
  30 #include <boost/flyweight/key_value.hpp>
  31 #include "xy_bitmap.h"
  32 #include "xy_widen_regoin.h"
  33
  34 #ifndef _MAX    /* avoid collision with common (nonconforming) macros */
  35 #define _MAX    (std::max)
  36 #define _MIN    (std::min)
  37 #define _IMPL_MAX std::max
  38 #define _IMPL_MIN std::min
  39 #else
  40 #define _IMPL_MAX _MAX
  41 #define _IMPL_MIN _MIN
  42 #endif
  43
  44 typedef const UINT8 CUINT8, *PCUINT8;
  45
  46 //NOTE: signed or unsigned affects the result seriously
  47 #define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|y)<<8)|u)<<8)|v)
  48
  49 #define SPLIT_AYUV(color, a, y, u, v) do { \
  50         *(v)=(color)&0xff; \
  51         *(u)=((color)>>8) &0xff; \
  52         *(y)=((color)>>16)&0xff;\
  53         *(a)=((color)>>24)&0xff;\
  54     } while(0)
  55
  56 class GaussianCoefficients
  57 {
  58 public:
  59     int g_r;
  60     int g_w;
  61     int g_w_ex;
  62     float *g_f;
  63
  64     double sigma;
  65 public:
  66     GaussianCoefficients(const double sigma)
  67     {
  68         g_r = 0;
  69         g_w = 0;
  70         g_w_ex = 0;
  71
  72         g_f = NULL;
  73
  74         this->sigma = 0;
  75         init(sigma);
  76     }
  77     GaussianCoefficients(const GaussianCoefficients& priv)
  78         :g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma),g_f(NULL)
  79         ,g_w_ex(priv.g_w_ex)
  80     {
  81         if (this->g_w_ex > 0 && this != &priv) {
  82             this->g_f = reinterpret_cast<float*>(xy_malloc(this->g_w_ex * sizeof(float)));
  83             ASSERT(this->g_f);
  84             memcpy(g_f, priv.g_f, this->g_w_ex * sizeof(g_f[0]));
  85         }
  86     }
  87
  88     ~GaussianCoefficients()
  89     {
  90         xy_free(g_f); g_f=NULL;
  91     }
  92
  93 private:
  94     int init(double sigma)
  95     {
  96         double a = -1 / (sigma * sigma * 2);
  97         double exp_a = exp(a);
  98
  99         double volume =  0;
 100
 101         if (this->sigma == sigma)
 102             return 0;
 103         else
 104             this->sigma = sigma;
 105
 106         this->g_w = (int)ceil(sigma*3) | 1;
 107         this->g_r = this->g_w / 2;
 108         this->g_w_ex = (this->g_w + 3) & ~3;
 109
 110         if (this->g_w_ex > 0) {
 111             xy_free(this->g_f);
 112             this->g_f = reinterpret_cast<float*>(xy_malloc(this->g_w_ex * sizeof(float)));
 113             if (this->g_f == NULL) {
 114                 return -1;
 115             }
 116         }
 117
 118         if (this->g_w > 0) {
 119             volume = 0;
 120
 121             double exp_0 = 1.0;
 122             double exp_1 = exp_a;
 123             double exp_2 = exp_1 * exp_1;
 124             volume = exp_0;
 125             this->g_f[this->g_r] = exp_0;
 126             float* p_left = this->g_f+this->g_r-1;
 127             float* p_right= this->g_f+this->g_r+1;
 128             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 129             {
 130                 exp_0 *= exp_1;
 131                 exp_1 *= exp_2;
 132
 133                 *p_left = exp_0;
 134                 *p_right = exp_0;
 135
 136                 volume += exp_0;
 137                 volume += exp_0;
 138             }
 139             //equivalent:
 140             //  for (i = 0; i < this->g_w; ++i) {
 141             //    this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 142             //    volume += this->g[i];
 143             //  }
 144             ASSERT(volume>0);
 145             for (int i=0;i<this->g_w;i++)
 146             {
 147                 this->g_f[i] /= volume;
 148             }
 149             for (int i=this->g_w;i<this->g_w_ex;i++)
 150             {
 151                 this->g_f[i] = 0;
 152             }
 153         }
 154         return 0;
 155     }
 156
 157 };
 158
 159 class ass_synth_priv
 160 {
 161 public:
 162     static const int VOLUME_BITS = 22;//should not exceed 32-8, and better not exceed 31-8
 163
 164     ass_synth_priv(const double sigma);
 165     ass_synth_priv(const ass_synth_priv& priv);
 166
 167     ~ass_synth_priv();
 168     int generate_tables(double sigma);
 169
 170     int g_r;
 171     int g_w;
 172
 173     unsigned *g;
 174     unsigned *gt2;
 175
 176     double sigma;
 177 };
 178
 179
 180 // GaussianFilter = GaussianCoefficients or ass_synth_priv
 181 template<typename GaussianFilter>
 182 struct GaussianFilterKey
 183 {
 184     const double& operator()(const GaussianFilter& x)const
 185     {
 186         return x.sigma;
 187     }
 188 };
 189
 190 struct ass_tmp_buf
 191 {
 192 public:
 193     ass_tmp_buf(size_t size);
 194     ass_tmp_buf(const ass_tmp_buf& buf);
 195     ~ass_tmp_buf();
 196     size_t size;
 197     unsigned *tmp;
 198 };
 199
 200 struct ass_tmp_buf_get_size
 201 {
 202     const size_t& operator()(const ass_tmp_buf& buf)const
 203     {
 204         return buf.size;
 205     }
 206 };
 207
 208 static const unsigned int maxcolor = 255;
 209 static const unsigned base = 256;
 210
 211 ass_synth_priv::ass_synth_priv(const double sigma)
 212 {
 213     g_r = 0;
 214     g_w = 0;
 215
 216     g = NULL;
 217     gt2 = NULL;
 218
 219     this->sigma = 0;
 220     generate_tables(sigma);
 221 }
 222
 223 ass_synth_priv::ass_synth_priv(const ass_synth_priv& priv):g_r(priv.g_r),g_w(priv.g_w),sigma(priv.sigma)
 224 {
 225     if (this->g_w > 0 && this != &priv) {
 226         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 227         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 228         //if (this->g == null || this->gt2 == null) {
 229         //    return -1;
 230         //}
 231         memcpy(g, priv.g, this->g_w * sizeof(unsigned));
 232         memcpy(gt2, priv.gt2, 256 * this->g_w * sizeof(unsigned));
 233     }
 234 }
 235
 236 ass_synth_priv::~ass_synth_priv()
 237 {
 238     free(g); g=NULL;
 239     free(gt2); gt2=NULL;
 240 }
 241
 242 int ass_synth_priv::generate_tables(double sigma)
 243 {
 244     const int TARGET_VOLUME = 1<<VOLUME_BITS;
 245     const int MAX_VOLUME_ERROR = VOLUME_BITS>=22 ? 16 : 1;
 246
 247     double a = -1 / (sigma * sigma * 2);
 248     double exp_a = exp(a);
 249
 250     double volume_factor = 0;
 251     double volume_start =  0, volume_end = 0;
 252     unsigned volume;
 253
 254     if (this->sigma == sigma)
 255         return 0;
 256     else
 257         this->sigma = sigma;
 258
 259     this->g_w = (int)ceil(sigma*3) | 1;
 260     this->g_r = this->g_w / 2;
 261
 262     if (this->g_w > 0) {
 263         this->g = (unsigned*)realloc(this->g, this->g_w * sizeof(unsigned));
 264         this->gt2 = (unsigned*)realloc(this->gt2, 256 * this->g_w * sizeof(unsigned));
 265         if (this->g == NULL || this->gt2 == NULL) {
 266             return -1;
 267         }
 268     }
 269
 270     if (this->g_w > 0) {
 271         volume_start = 0;
 272
 273         double exp_0 = 1.0;
 274         double exp_1 = exp_a;
 275         double exp_2 = exp_1 * exp_1;
 276         volume_start += exp_0;
 277         for(int i=0;i<this->g_r;++i)
 278         {
 279             exp_0 *= exp_1;
 280             exp_1 *= exp_2;
 281             volume_start += exp_0;
 282             volume_start += exp_0;
 283         }
 284         //euqivalent:
 285         //  for (i = 0; i < this->g_w; ++i) {
 286         //      volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
 287         //  }
 288
 289         volume_end = (TARGET_VOLUME+g_w)/volume_start;
 290         volume_start = (TARGET_VOLUME-g_w)/volume_start;
 291
 292         volume = 0;
 293         while( volume_start+0.000001<volume_end )
 294         {
 295             volume_factor = (volume_start+volume_end)*0.5;
 296             volume = 0;
 297
 298             exp_0 = volume_factor;
 299             exp_1 = exp_a;
 300             exp_2 = exp_1 * exp_1;
 301
 302             volume = static_cast<int>(exp_0+.5);
 303             this->g[this->g_r] = volume;
 304
 305             unsigned* p_left = this->g+this->g_r-1;
 306             unsigned* p_right= this->g+this->g_r+1;
 307             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 308             {
 309                 exp_0 *= exp_1;
 310                 exp_1 *= exp_2;
 311                 *p_left = static_cast<int>(exp_0+.5);
 312                 *p_right = *p_left;
 313                 volume += (*p_left<<1);
 314             }
 315             //equivalent:
 316             //    for (i = 0; i < this->g_w; ++i) {
 317             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 318             //        volume += this->g[i];
 319             //    }
 320
 321             // volume don't have to be equal to TARGET_VOLUME,
 322             // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
 323             // max error introducing in later blur operation,
 324             // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
 325             // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
 326             // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
 327             //
 328             // NOTE: when it comes to rounding, no matter how small the error is,
 329             // it may result a different rounding output
 330             if( volume>=TARGET_VOLUME && volume< (TARGET_VOLUME+MAX_VOLUME_ERROR) )
 331                 break;
 332             else if(volume < TARGET_VOLUME)
 333             {
 334                 volume_start = volume_factor;
 335             }
 336             else if(volume >= TARGET_VOLUME+MAX_VOLUME_ERROR)
 337             {
 338                 volume_end = volume_factor;
 339             }
 340         }
 341         if(volume==0)
 342         {
 343             volume_factor = volume_end;
 344
 345             exp_0 = volume_factor;
 346             exp_1 = exp_a;
 347             exp_2 = exp_1 * exp_1;
 348
 349             volume = static_cast<int>(exp_0+.5);
 350             this->g[this->g_r] = volume;
 351
 352             unsigned* p_left = this->g+this->g_r-1;
 353             unsigned* p_right= this->g+this->g_r+1;
 354             for(int i=0; i<this->g_r;++i,p_left--,p_right++)
 355             {
 356                 exp_0 *= exp_1;
 357                 exp_1 *= exp_2;
 358                 *p_left = static_cast<int>(exp_0+.5);
 359                 *p_right = *p_left;
 360                 volume += (*p_left<<1);
 361             }
 362             //equivalent:
 363             //    for (i = 0; i < this->g_w; ++i) {
 364             //        this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
 365             //        volume += this->g[i];
 366             //    }
 367         }
 368
 369         // gauss table:
 370         for (int mx = 0; mx < this->g_w; mx++) {
 371             int last_mul = 0;
 372             unsigned *p_gt2 = this->gt2 + mx;
 373             *p_gt2 = 0;
 374             for (int i = 1; i < 256; i++) {
 375                 last_mul = last_mul+this->g[mx];
 376                 p_gt2 += this->g_w;
 377                 *p_gt2 = last_mul;
 378                 //equivalent:
 379                 //    this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
 380             }
 381         }
 382     }
 383     return 0;
 384 }
 385
 386 ass_tmp_buf::ass_tmp_buf(size_t size)
 387 {
 388     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 389     this->size = size;
 390 }
 391
 392 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf& buf)
 393     :size(buf.size)
 394 {
 395     tmp = (unsigned *)malloc(size * sizeof(unsigned));
 396 }
 397
 398 ass_tmp_buf::~ass_tmp_buf()
 399 {
 400     free(tmp);
 401 }
 402
 403 /*
 404  * \brief gaussian blur.  an fast pure c implementation from libass.
 405  */
 406 static void ass_gauss_blur(unsigned char *buffer, unsigned *tmp2,
 407                            int width, int height, int stride,
 408                            const unsigned *g_t_x, int g_r_x, int g_width_x,
 409                            const unsigned *g_t_y, int g_r_y, int g_width_y)
 410 {
 411
 412     int x, y;
 413
 414     unsigned char *s = buffer;
 415     unsigned *t = tmp2 + 1;
 416     for (y = 0; y < height; y++) {
 417         memset(t - 1, 0, (width + 1) * sizeof(*t));
 418         x = 0;
 419         if(x < g_r_x)//in case that r < 0
 420         {
 421             const int src = s[x];
 422             if (src) {
 423                 register unsigned *dstp = t + x - g_r_x;
 424                 int mx;
 425                 const unsigned *m3 = g_t_x + src * g_width_x;
 426                 unsigned sum = 0;
 427                 for (mx = g_width_x-1; mx >= g_r_x - x ; mx--) {
 428                     sum += m3[mx];
 429                     dstp[mx] += sum;
 430                 }
 431             }
 432         }
 433
 434         for (x = 1; x < g_r_x; x++) {
 435             const int src = s[x];
 436             if (src) {
 437                 register unsigned *dstp = t + x - g_r_x;
 438                 int mx;
 439                 const unsigned *m3 = g_t_x + src * g_width_x;
 440                 for (mx = g_r_x - x; mx < g_width_x; mx++) {
 441                     dstp[mx] += m3[mx];
 442                 }
 443             }
 444         }
 445
 446         for (; x < width - g_r_x; x++) {
 447             const int src = s[x];
 448             if (src) {
 449                 register unsigned *dstp = t + x - g_r_x;
 450                 int mx;
 451                 const unsigned *m3 = g_t_x + src * g_width_x;
 452                 for (mx = 0; mx < g_width_x; mx++) {
 453                     dstp[mx] += m3[mx];
 454                 }
 455             }
 456         }
 457
 458         for (; x < width-1; x++) {
 459             const int src = s[x];
 460             if (src) {
 461                 register unsigned *dstp = t + x - g_r_x;
 462                 int mx;
 463                 const int x2 = g_r_x + width - x;
 464                 const unsigned *m3 = g_t_x + src * g_width_x;
 465                 for (mx = 0; mx < x2; mx++) {
 466                     dstp[mx] += m3[mx];
 467                 }
 468             }
 469         }
 470         if(x==width-1) //important: x==width-1 failed, if r==0
 471         {
 472             const int src = s[x];
 473             if (src) {
 474                 register unsigned *dstp = t + x - g_r_x;
 475                 int mx;
 476                 const int x2 = g_r_x + width - x;
 477                 const unsigned *m3 = g_t_x + src * g_width_x;
 478                 unsigned sum = 0;
 479                 for (mx = 0; mx < x2; mx++) {
 480                     sum += m3[mx];
 481                     dstp[mx] += sum;
 482                 }
 483             }
 484         }
 485
 486         s += stride;
 487         t += width + 1;
 488     }
 489
 490     t = tmp2;
 491     for (x = 0; x < width; x++) {
 492         y = 0;
 493         if(y < g_r_y)//in case that r<0
 494         {
 495             unsigned *srcp = t + y * (width + 1) + 1;
 496             int src = *srcp;
 497             if (src) {
 498                 register unsigned *dstp = srcp - 1 + (g_width_y -g_r_y +y)*(width + 1);
 499                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 500                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 501                 unsigned sum = 0;
 502                 int mx;
 503                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 504                 for (mx = g_width_y-1; mx >=g_r_y - y ; mx--) {
 505                     sum += m3[mx];
 506                     *dstp += sum;
 507                     dstp -= width + 1;
 508                 }
 509             }
 510         }
 511         for (y = 1; y < g_r_y; y++) {
 512             unsigned *srcp = t + y * (width + 1) + 1;
 513             int src = *srcp;
 514             if (src) {
 515                 register unsigned *dstp = srcp - 1 + width + 1;
 516                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 517                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 518
 519                 int mx;
 520                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 521                 for (mx = g_r_y - y; mx < g_width_y; mx++) {
 522                     *dstp += m3[mx];
 523                     dstp += width + 1;
 524                 }
 525             }
 526         }
 527         for (; y < height - g_r_y; y++) {
 528             unsigned *srcp = t + y * (width + 1) + 1;
 529             int src = *srcp;
 530             if (src) {
 531                 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
 532                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 533                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 534
 535                 int mx;
 536                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 537                 for (mx = 0; mx < g_width_y; mx++) {
 538                     *dstp += m3[mx];
 539                     dstp += width + 1;
 540                 }
 541             }
 542         }
 543         for (; y < height-1; y++) {
 544             unsigned *srcp = t + y * (width + 1) + 1;
 545             int src = *srcp;
 546             if (src) {
 547                 const int y2 = g_r_y + height - y;
 548                 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
 549                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 550                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 551
 552                 int mx;
 553                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 554                 for (mx = 0; mx < y2; mx++) {
 555                     *dstp += m3[mx];
 556                     dstp += width + 1;
 557                 }
 558             }
 559         }
 560         if(y == height - 1)//important: y == height - 1 failed if r==0
 561         {
 562             unsigned *srcp = t + y * (width + 1) + 1;
 563             int src = *srcp;
 564             if (src) {
 565                 const int y2 = g_r_y + height - y;
 566                 register unsigned *dstp = srcp - 1 - g_r_y * (width + 1);
 567                 const int src2 = (src + (1<<(ass_synth_priv::VOLUME_BITS-1))) >> ass_synth_priv::VOLUME_BITS;
 568                 const unsigned *m3 = g_t_y + src2 * g_width_y;
 569                 unsigned sum = 0;
 570                 int mx;
 571                 *srcp = (1<<(ass_synth_priv::VOLUME_BITS-1));
 572                 for (mx = 0; mx < y2; mx++) {
 573                     sum += m3[mx];
 574                     *dstp += sum;
 575                     dstp += width + 1;
 576                 }
 577             }
 578         }
 579         t++;
 580     }
 581
 582     t = tmp2;
 583     s = buffer;
 584     for (y = 0; y < height; y++) {
 585         for (x = 0; x < width; x++) {
 586             s[x] = t[x] >> ass_synth_priv::VOLUME_BITS;
 587         }
 588         s += stride;
 589         t += width + 1;
 590     }
 591 }
 592
 593 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
 594     PCUINT8 src, int width, int height, int stride,
 595     const float *gt_x, int r_x, int gt_ex_width_x,
 596     const float *gt_y, int r_y, int gt_ex_width_y);
 597
 598 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y);
 599
 600 /**
 601  * \brief blur with [[1,2,1]. [2,4,2], [1,2,1]] kernel.
 602  */
 603 static void be_blur(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
 604 {
 605     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 606     WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 607     if(!col_sum_buf_base || !col_pix_buf_base)
 608     {
 609         //ToDo: error handling
 610         return;
 611     }
 612     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 613     memset(col_sum_buf_base, 0, w*sizeof(WORD));
 614     WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
 615     WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
 616     {
 617         int y = 0;
 618         unsigned char *src=buf+y*stride;
 619
 620         int x = 2;
 621         int old_pix = src[x-1];
 622         int old_sum = old_pix + src[x-2];
 623         for ( ; x < w; x++) {
 624             int temp1 = src[x];
 625             int temp2 = old_pix + temp1;
 626             old_pix = temp1;
 627             temp1 = old_sum + temp2;
 628             old_sum = temp2;
 629             col_pix_buf[x] = temp1;
 630         }
 631     }
 632     {
 633         int y = 1;
 634         unsigned char *src=buf+y*stride;
 635
 636
 637         int x = 2;
 638         int old_pix = src[x-1];
 639         int old_sum = old_pix + src[x-2];
 640         for ( ; x < w; x++) {
 641             int temp1 = src[x];
 642             int temp2 = old_pix + temp1;
 643             old_pix = temp1;
 644             temp1 = old_sum + temp2;
 645             old_sum = temp2;
 646
 647             temp2 = col_pix_buf[x] + temp1;
 648             col_pix_buf[x] = temp1;
 649             //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 650             col_sum_buf[x] = temp2;
 651         }
 652     }
 653
 654     //__m128i round = _mm_set1_epi16(8);
 655     for (int y = 2; y < h; y++) {
 656         unsigned char *src=buf+y*stride;
 657         unsigned char *dst=buf+(y-1)*stride;
 658
 659
 660         int x = 2;
 661         __m128i old_pix_128 = _mm_cvtsi32_si128(src[1]);
 662         __m128i old_sum_128 = _mm_cvtsi32_si128(src[0]+src[1]);
 663         for ( ; x < ((w-2)&(~7)); x+=8) {
 664             __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(src+x));
 665             new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
 666             __m128i temp = _mm_slli_si128(new_pix,2);
 667             temp = _mm_add_epi16(temp, old_pix_128);
 668             temp = _mm_add_epi16(temp, new_pix);
 669             old_pix_128 = _mm_srli_si128(new_pix,14);
 670
 671             new_pix = _mm_slli_si128(temp,2);
 672             new_pix = _mm_add_epi16(new_pix, old_sum_128);
 673             new_pix = _mm_add_epi16(new_pix, temp);
 674             old_sum_128 = _mm_srli_si128(temp, 14);
 675
 676             __m128i old_col_pix = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_pix_buf+x) );
 677             __m128i old_col_sum = _mm_loadu_si128( reinterpret_cast<const __m128i*>(col_sum_buf+x) );
 678             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_pix_buf+x), new_pix );
 679             temp = _mm_add_epi16(new_pix, old_col_pix);
 680             _mm_storeu_si128( reinterpret_cast<__m128i*>(col_sum_buf+x), temp );
 681
 682             old_col_sum = _mm_add_epi16(old_col_sum, temp);
 683             //old_col_sum = _mm_add_epi16(old_col_sum, round);
 684             old_col_sum = _mm_srli_epi16(old_col_sum, 4);
 685             old_col_sum = _mm_packus_epi16(old_col_sum, old_col_sum);
 686             _mm_storel_epi64( reinterpret_cast<__m128i*>(dst+x-1), old_col_sum );
 687         }
 688         int old_pix = src[x-1];
 689         int old_sum = old_pix + src[x-2];
 690         for ( ; x < w; x++) {
 691             int temp1 = src[x];
 692             int temp2 = old_pix + temp1;
 693             old_pix = temp1;
 694             temp1 = old_sum + temp2;
 695             old_sum = temp2;
 696
 697             temp2 = col_pix_buf[x] + temp1;
 698             col_pix_buf[x] = temp1;
 699             dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 700             col_sum_buf[x] = temp2;
 701         }
 702     }
 703
 704     xy_free(col_sum_buf_base);
 705     xy_free(col_pix_buf_base);
 706 }
 707
 708 /**
 709  * see @be_blur
 710  */
 711 static void be_blur_c(unsigned char *buf, unsigned *tmp_base, int w, int h, int stride)
 712 {
 713     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 714     WORD *col_sum_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 715     if(!col_sum_buf_base || !col_pix_buf_base)
 716     {
 717         //ToDo: error handling
 718         return;
 719     }
 720     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 721     memset(col_sum_buf_base, 0, w*sizeof(WORD));
 722     WORD *col_pix_buf = col_pix_buf_base-2;//for aligment;
 723     WORD *col_sum_buf = col_sum_buf_base-2;//for aligment;
 724     {
 725         int y = 0;
 726         unsigned char *src=buf+y*stride;
 727
 728         int x = 2;
 729         int old_pix = src[x-1];
 730         int old_sum = old_pix + src[x-2];
 731         for ( ; x < w; x++) {
 732             int temp1 = src[x];
 733             int temp2 = old_pix + temp1;
 734             old_pix = temp1;
 735             temp1 = old_sum + temp2;
 736             old_sum = temp2;
 737             col_pix_buf[x] = temp1;
 738         }
 739     }
 740     {
 741         int y = 1;
 742         unsigned char *src=buf+y*stride;
 743
 744
 745         int x = 2;
 746         int old_pix = src[x-1];
 747         int old_sum = old_pix + src[x-2];
 748         for ( ; x < w; x++) {
 749             int temp1 = src[x];
 750             int temp2 = old_pix + temp1;
 751             old_pix = temp1;
 752             temp1 = old_sum + temp2;
 753             old_sum = temp2;
 754
 755             temp2 = col_pix_buf[x] + temp1;
 756             col_pix_buf[x] = temp1;
 757             //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 758             col_sum_buf[x] = temp2;
 759         }
 760     }
 761
 762     for (int y = 2; y < h; y++) {
 763         unsigned char *src=buf+y*stride;
 764         unsigned char *dst=buf+(y-1)*stride;
 765
 766         int x = 2;
 767         int old_pix = src[x-1];
 768         int old_sum = old_pix + src[x-2];
 769         for ( ; x < w; x++) {
 770             int temp1 = src[x];
 771             int temp2 = old_pix + temp1;
 772             old_pix = temp1;
 773             temp1 = old_sum + temp2;
 774             old_sum = temp2;
 775
 776             temp2 = col_pix_buf[x] + temp1;
 777             col_pix_buf[x] = temp1;
 778             dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
 779             col_sum_buf[x] = temp2;
 780         }
 781     }
 782
 783     xy_free(col_sum_buf_base);
 784     xy_free(col_pix_buf_base);
 785 }
 786
 787 static void Bilinear(unsigned char *buf, int w, int h, int stride, int x_factor, int y_factor)
 788 {
 789     WORD *col_pix_buf_base = reinterpret_cast<WORD*>(xy_malloc(w*sizeof(WORD)));
 790     if(!col_pix_buf_base)
 791     {
 792         //ToDo: error handling
 793         return;
 794     }
 795     memset(col_pix_buf_base, 0, w*sizeof(WORD));
 796
 797     for (int y = 0; y < h; y++){
 798         unsigned char *src=buf+y*stride;
 799
 800         WORD *col_pix_buf = col_pix_buf_base;
 801         int last=0;
 802         for(int x = 0; x < w; x++)
 803         {
 804             int temp1 = src[x];
 805             int temp2 = temp1*x_factor;
 806             temp1 <<= 3;
 807             temp1 -= temp2;
 808             temp1 += last;
 809             last = temp2;
 810
 811             temp2 = temp1*y_factor;
 812             temp1 <<= 3;
 813             temp1 -= temp2;
 814             temp1 += col_pix_buf[x];
 815             src[x] = ((temp1+32)>>6);
 816             col_pix_buf[x] = temp2;
 817         }
 818     }
 819     xy_free(col_pix_buf_base);
 820 }
 821
 822 bool Rasterizer::Rasterize(const ScanLineData2& scan_line_data2, int xsub, int ysub, SharedPtrOverlay overlay)
 823 {
 824     using namespace ::boost::flyweights;
 825
 826     if(!overlay)
 827     {
 828         return false;
 829     }
 830     overlay->CleanUp();
 831     const ScanLineData& scan_line_data = *scan_line_data2.m_scan_line_data;
 832     if(!scan_line_data.mWidth || !scan_line_data.mHeight)
 833     {
 834         return true;
 835     }
 836     xsub &= 7;
 837     ysub &= 7;
 838     //xsub = ysub = 0;
 839     int width = scan_line_data.mWidth + xsub;
 840     int height = scan_line_data.mHeight + ysub;
 841     overlay->mfWideOutlineEmpty = scan_line_data2.mWideOutline.empty();
 842     if(!overlay->mfWideOutlineEmpty)
 843     {
 844         int wide_border = (scan_line_data2.mWideBorder+7)&~7;
 845
 846         width += 2*wide_border ;
 847         height += 2*wide_border ;
 848         xsub += wide_border ;
 849         ysub += wide_border ;
 850     }
 851     overlay->mOffsetX = scan_line_data2.mPathOffsetX - xsub;
 852     overlay->mOffsetY = scan_line_data2.mPathOffsetY - ysub;
 853
 854     overlay->mWidth = width;
 855     overlay->mHeight = height;
 856     overlay->mOverlayWidth = ((width+7)>>3) + 1;
 857     overlay->mOverlayHeight = ((height+7)>>3) + 1;
 858     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
 859
 860     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
 861     if( body==NULL )
 862     {
 863         return false;
 864     }
 865     overlay->mBody.reset(body, xy_free);
 866     memset(body, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
 867     BYTE* border = NULL;
 868     if (!overlay->mfWideOutlineEmpty)
 869     {
 870         border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
 871         if (border==NULL)
 872         {
 873             return false;
 874         }
 875         overlay->mBorder.reset(border, xy_free);
 876         memset(border, 0, overlay->mOverlayPitch * overlay->mOverlayHeight);
 877     }
 878
 879     // Are we doing a border?
 880     const tSpanBuffer* pOutline[2] = {&(scan_line_data.mOutline), &(scan_line_data2.mWideOutline)};
 881     for(int i = countof(pOutline)-1; i >= 0; i--)
 882     {
 883         tSpanBuffer::const_iterator it = pOutline[i]->begin();
 884         tSpanBuffer::const_iterator itEnd = pOutline[i]->end();
 885         byte* plan_selected = i==0 ? body : border;
 886         int pitch = overlay->mOverlayPitch;
 887         for(; it!=itEnd; ++it)
 888         {
 889             int y = (int)(((*it).first >> 32) - 0x40000000 + ysub);
 890             int x1 = (int)(((*it).first & 0xffffffff) - 0x40000000 + xsub);
 891             int x2 = (int)(((*it).second & 0xffffffff) - 0x40000000 + xsub);
 892             if(x2 > x1)
 893             {
 894                 int first = x1>>3;
 895                 int last = (x2-1)>>3;
 896                 byte* dst = plan_selected + (pitch*(y>>3) + first);
 897                 if(first == last)
 898                     *dst += x2-x1;
 899                 else
 900                 {
 901                     *dst += ((first+1)<<3) - x1;
 902                     dst += 1;
 903                     while(++first < last)
 904                     {
 905                         *dst += 0x08;
 906                         dst += 1;
 907                     }
 908                     *dst += x2 - (last<<3);
 909                 }
 910             }
 911         }
 912     }
 913
 914     return true;
 915 }
 916
// Gaussian blur strengths at or below this value do not count as a blur on
// their own: IsItReallyBlur() and the blur routines compare against it.
const float Rasterizer::GAUSSIAN_BLUR_THREHOLD = 0.333333f;
 918
 919 bool Rasterizer::IsItReallyBlur( float be_strength, double gaussian_blur_strength )
 920 {
 921     if (be_strength<=0 && gaussian_blur_strength<=GAUSSIAN_BLUR_THREHOLD)
 922     {
 923         return false;
 924     }
 925     return true;
 926 }
 927
 928 // @return: true if actually a blur operation has done, or else false and output is leave unset.
 929 // To Do: rewrite it or delete it
 930 bool Rasterizer::OldFixedPointBlur(const Overlay& input_overlay, float be_strength, double gaussian_blur_strength,
 931     double target_scale_x, double target_scale_y, SharedPtrOverlay output_overlay)
 932 {
 933     using namespace ::boost::flyweights;
 934
 935     ASSERT(IsItReallyBlur(be_strength, gaussian_blur_strength));
 936     if(!output_overlay)
 937     {
 938         return false;
 939     }
 940     output_overlay->CleanUp();
 941
 942     output_overlay->mOffsetX = input_overlay.mOffsetX;
 943     output_overlay->mOffsetY = input_overlay.mOffsetY;
 944     output_overlay->mWidth = input_overlay.mWidth;
 945     output_overlay->mHeight = input_overlay.mHeight;
 946     output_overlay->mOverlayWidth = input_overlay.mOverlayWidth;
 947     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight;
 948     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
 949
 950     double gaussian_blur_strength_x = gaussian_blur_strength*target_scale_x;
 951     double gaussian_blur_strength_y = gaussian_blur_strength*target_scale_y;
 952
 953     int gaussian_blur_radius_x = (static_cast<int>( ceil(gaussian_blur_strength_x*3) ) | 1)/2;//fix me: rounding err?
 954     int gaussian_blur_radius_y = (static_cast<int>( ceil(gaussian_blur_strength_y*3) ) | 1)/2;//fix me: rounding err?
 955     if( gaussian_blur_radius_x < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
 956         gaussian_blur_radius_x = 1;//make sure that it really do a blur
 957     if( gaussian_blur_radius_y < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
 958         gaussian_blur_radius_y = 1;//make sure that it really do a blur
 959
 960     int bluradjust_x = 0, bluradjust_y = 0;
 961     if ( IsItReallyBlur(be_strength, gaussian_blur_strength) )
 962     {
 963         if (gaussian_blur_strength > 0)
 964         {
 965             bluradjust_x += gaussian_blur_radius_x * 8;
 966             bluradjust_y += gaussian_blur_radius_y * 8;
 967         }
 968         if (be_strength)
 969         {
 970             int be_adjust_x = static_cast<int>( target_scale_x*std::sqrt(be_strength*0.25f)+0.5 );//fix me: rounding err?
 971             be_adjust_x *= 8;
 972             int be_adjust_y = static_cast<int>(target_scale_y*std::sqrt(be_strength*0.25f)+0.5);//fix me: rounding err?
 973             be_adjust_y *= 8;
 974
 975             bluradjust_x += be_adjust_x;
 976             bluradjust_y += be_adjust_y;
 977         }
 978         // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
 979         bluradjust_x = (bluradjust_x+7)&~7;
 980         bluradjust_y = (bluradjust_y+7)&~7;
 981
 982         output_overlay->mOffsetX -= bluradjust_x;
 983         output_overlay->mOffsetY -= bluradjust_y;
 984         output_overlay->mWidth += (bluradjust_x<<1);
 985         output_overlay->mHeight += (bluradjust_y<<1);
 986         output_overlay->mOverlayWidth += (bluradjust_x>>2);
 987         output_overlay->mOverlayHeight += (bluradjust_y>>2);
 988     }
 989     else
 990     {
 991         return false;
 992     }
 993
 994     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
 995
 996     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
 997     if( body==NULL )
 998     {
 999         return false;
1000     }
1001     output_overlay->mBody.reset(body, xy_free);
1002     memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1003     BYTE* border = NULL;
1004     if (!output_overlay->mfWideOutlineEmpty)
1005     {
1006         border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1007         if (border==NULL)
1008         {
1009             return false;
1010         }
1011         output_overlay->mBorder.reset(border, xy_free);
1012         memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1013     }
1014
1015     //copy buffer
1016     for(int i = 1; i >= 0; i--)
1017     {
1018         byte* plan_selected = i==0 ? body : border;
1019         const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1020
1021         plan_selected += (bluradjust_x>>3) + (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1022         if ( plan_selected!=NULL && plan_input!=NULL )
1023         {
1024             for (int j=0;j<input_overlay.mOverlayHeight;j++)
1025             {
1026                 memcpy(plan_selected, plan_input, input_overlay.mOverlayPitch);
1027                 plan_selected += output_overlay->mOverlayPitch;
1028                 plan_input += input_overlay.mOverlayPitch;
1029             }
1030         }
1031     }
1032
1033     ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
1034     //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
1035     // Do some gaussian blur magic
1036     if ( gaussian_blur_strength > GAUSSIAN_BLUR_THREHOLD )
1037     {
1038         byte* plan_selected= output_overlay->mfWideOutlineEmpty ? body : border;
1039
1040         flyweight<key_value<double, ass_synth_priv, GaussianFilterKey<ass_synth_priv>>, no_locking>
1041             fw_priv_blur_x(gaussian_blur_strength_x);
1042         flyweight<key_value<double, ass_synth_priv, GaussianFilterKey<ass_synth_priv>>, no_locking>
1043             fw_priv_blur_y(gaussian_blur_strength_y);
1044
1045         const ass_synth_priv& priv_blur_x = fw_priv_blur_x.get();
1046         const ass_synth_priv& priv_blur_y = fw_priv_blur_y.get();
1047         if (output_overlay->mOverlayWidth>=priv_blur_x.g_w && output_overlay->mOverlayHeight>=priv_blur_y.g_w)
1048         {
1049             ass_gauss_blur(plan_selected, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, output_overlay->mOverlayPitch,
1050                 priv_blur_x.gt2, priv_blur_x.g_r, priv_blur_x.g_w,
1051                 priv_blur_y.gt2, priv_blur_y.g_r, priv_blur_y.g_w);
1052         }
1053     }
1054
1055     float scaled_be_strength = be_strength * 0.5f * (target_scale_x+target_scale_y);
1056     int pass_num = static_cast<int>(scaled_be_strength);
1057     int pitch = output_overlay->mOverlayPitch;
1058     byte* blur_plan = output_overlay->mfWideOutlineEmpty ? body : border;
1059
1060     for (int pass = 0; pass < pass_num; pass++)
1061     {
1062         if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
1063         {
1064             if (g_cpuid.m_flags & CCpuID::sse2)
1065             {
1066                 be_blur(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1067             }
1068             else
1069             {
1070                 be_blur_c(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1071             }
1072         }
1073     }
1074     if (scaled_be_strength>pass_num)
1075     {
1076         xy_be_blur(blur_plan, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch,
1077             scaled_be_strength-pass_num, scaled_be_strength-pass_num);
1078     }
1079
1080     return true;
1081 }
1082
1083 // @return: true if actually a blur operation has done, or else false and output is leave unset.
1084 bool Rasterizer::Blur(const Overlay& input_overlay, float be_strength,
1085     double gaussian_blur_strength,
1086     double target_scale_x, double target_scale_y,
1087     SharedPtrOverlay output_overlay)
1088 {
1089     using namespace ::boost::flyweights;
1090
1091     ASSERT(IsItReallyBlur(be_strength, gaussian_blur_strength));
1092     if(!output_overlay || !IsItReallyBlur(be_strength, gaussian_blur_strength))
1093     {
1094         return false;
1095     }
1096     if (input_overlay.mOverlayWidth<=0 || input_overlay.mOverlayHeight<=0)
1097     {
1098         return true;
1099     }
1100
1101     if (!(g_cpuid.m_flags & CCpuID::sse2))
1102     {
1103         // C code path of floating point version is extremely slow,
1104         // so we fall back to fixed point version instead
1105         return Rasterizer::OldFixedPointBlur(input_overlay, be_strength,
1106             gaussian_blur_strength, target_scale_x, target_scale_y, output_overlay);//fix me: important!
1107     }
1108
1109     if (gaussian_blur_strength>0)
1110     {
1111         if (be_strength)//this insane thing should NEVER happen
1112         {
1113             SharedPtrOverlay tmp(new Overlay());
1114
1115             bool rv = GaussianBlur(input_overlay, gaussian_blur_strength, target_scale_x, target_scale_y, tmp);
1116             ASSERT(rv);
1117             rv = BeBlur(*tmp, be_strength, target_scale_x, target_scale_y, output_overlay);
1118             ASSERT(rv);
1119         }
1120         else
1121         {
1122             bool rv = GaussianBlur(input_overlay, gaussian_blur_strength, target_scale_x, target_scale_y, output_overlay);
1123             ASSERT(rv);
1124         }
1125     }
1126     else if (be_strength)
1127     {
1128         bool rv = BeBlur(input_overlay, be_strength, target_scale_x, target_scale_y, output_overlay);
1129         ASSERT(rv);
1130     }
1131     return true;
1132 }
1133
1134 bool Rasterizer::GaussianBlur( const Overlay& input_overlay, double gaussian_blur_strength,
1135     double target_scale_x, double target_scale_y,
1136     SharedPtrOverlay output_overlay )
1137 {
1138     using namespace ::boost::flyweights;
1139
1140     ASSERT(output_overlay);
1141     output_overlay->CleanUp();
1142     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
1143
1144     ASSERT(gaussian_blur_strength > 0);
1145
1146     double gaussian_blur_strength_x = gaussian_blur_strength*target_scale_x;
1147     double gaussian_blur_strength_y = gaussian_blur_strength*target_scale_y;
1148
1149     int gaussian_blur_radius_x = (static_cast<int>( ceil(gaussian_blur_strength_x*3) ) | 1)/2;//fix me: rounding err?
1150     int gaussian_blur_radius_y = (static_cast<int>( ceil(gaussian_blur_strength_y*3) ) | 1)/2;//fix me: rounding err?
1151     if( gaussian_blur_radius_x < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
1152         gaussian_blur_radius_x = 1;//make sure that it really do a blur
1153     if( gaussian_blur_radius_y < 1 && gaussian_blur_strength>GAUSSIAN_BLUR_THREHOLD )
1154         gaussian_blur_radius_y = 1;//make sure that it really do a blur
1155
1156     flyweight<key_value<double, GaussianCoefficients, GaussianFilterKey<GaussianCoefficients>>, no_locking>
1157         fw_filter_x(gaussian_blur_strength_x);
1158     flyweight<key_value<double, GaussianCoefficients, GaussianFilterKey<GaussianCoefficients>>, no_locking>
1159         fw_filter_y(gaussian_blur_strength_y);
1160
1161     const GaussianCoefficients& filter_x = fw_filter_x.get();
1162     const GaussianCoefficients& filter_y = fw_filter_y.get();
1163
1164     int bluradjust_x = filter_x.g_r * 8;
1165     int bluradjust_y = filter_y.g_r * 8;
1166     output_overlay->mOffsetX       = input_overlay.mOffsetX - bluradjust_x;
1167     output_overlay->mOffsetY       = input_overlay.mOffsetY - bluradjust_y;
1168     output_overlay->mWidth         = input_overlay.mWidth + (bluradjust_x<<1);
1169     output_overlay->mHeight        = input_overlay.mHeight + (bluradjust_y<<1);
1170     output_overlay->mOverlayWidth  = input_overlay.mOverlayWidth + (bluradjust_x>>2);
1171     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust_y>>2);
1172
1173     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
1174
1175     BYTE* blur_plan = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1176     //memset(blur_plan, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1177
1178     const BYTE* plan_input = input_overlay.mfWideOutlineEmpty ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1179     ASSERT(output_overlay->mOverlayWidth>=filter_x.g_w && output_overlay->mOverlayHeight>=filter_y.g_w);
1180     xy_gaussian_blur(blur_plan, output_overlay->mOverlayPitch,
1181         plan_input, input_overlay.mOverlayWidth, input_overlay.mOverlayHeight, input_overlay.mOverlayPitch,
1182         filter_x.g_f, filter_x.g_r, filter_x.g_w_ex,
1183         filter_y.g_f, filter_y.g_r, filter_y.g_w_ex);
1184     if (input_overlay.mfWideOutlineEmpty)
1185     {
1186         output_overlay->mBody.reset(blur_plan, xy_free);
1187     }
1188     else
1189     {
1190         output_overlay->mBorder.reset(blur_plan, xy_free);
1191
1192         BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1193         if( body==NULL )
1194         {
1195             return false;
1196         }
1197         output_overlay->mBody.reset(body, xy_free);
1198         memset(body, 0, output_overlay->mOverlayPitch * (bluradjust_y>>3));
1199         body += (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1200         plan_input = input_overlay.mBody.get();
1201         ASSERT(plan_input);
1202         for (int j=0;j<input_overlay.mOverlayHeight;j++)
1203         {
1204             memset(body, 0, (bluradjust_x>>3));
1205             memcpy(body+(bluradjust_x>>3), plan_input, input_overlay.mOverlayWidth);
1206             memset(body+(bluradjust_x>>3)+input_overlay.mOverlayWidth, 0, (bluradjust_x>>3));
1207             body += output_overlay->mOverlayPitch;
1208             plan_input += input_overlay.mOverlayPitch;
1209         }
1210         memset(body, 0, output_overlay->mOverlayPitch * (bluradjust_y>>3));
1211     }
1212     return true;
1213 }
1214
1215 bool Rasterizer::BeBlur( const Overlay& input_overlay, float be_strength,
1216     float target_scale_x, float target_scale_y, SharedPtrOverlay output_overlay )
1217 {
1218     ASSERT(output_overlay);
1219     output_overlay->CleanUp();
1220     output_overlay->mfWideOutlineEmpty = input_overlay.mfWideOutlineEmpty;
1221
1222     ASSERT(be_strength>0 && target_scale_x>0 && target_scale_y>0);
1223     int bluradjust_x = static_cast<int>( target_scale_x*std::sqrt(be_strength*0.25f)+0.5 );//fix me: rounding err?
1224     bluradjust_x *= 8;
1225     int bluradjust_y = static_cast<int>(target_scale_y*std::sqrt(be_strength*0.25f)+0.5);//fix me: rounding err?
1226     bluradjust_y *= 8;
1227
1228     output_overlay->mOffsetX       = input_overlay.mOffsetX - bluradjust_x;
1229     output_overlay->mOffsetY       = input_overlay.mOffsetY - bluradjust_y;
1230     output_overlay->mWidth         = input_overlay.mWidth + (bluradjust_x<<1);
1231     output_overlay->mHeight        = input_overlay.mHeight + (bluradjust_y<<1);
1232     output_overlay->mOverlayWidth  = input_overlay.mOverlayWidth + (bluradjust_x>>2);
1233     output_overlay->mOverlayHeight = input_overlay.mOverlayHeight + (bluradjust_y>>2);
1234
1235     output_overlay->mOverlayPitch = (output_overlay->mOverlayWidth+15)&~15;
1236
1237     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1238     if( body==NULL )
1239     {
1240         return false;
1241     }
1242     output_overlay->mBody.reset(body, xy_free);
1243     memset(body, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1244     BYTE* border = NULL;
1245     if (!output_overlay->mfWideOutlineEmpty)
1246     {
1247         border = reinterpret_cast<BYTE*>(xy_malloc(output_overlay->mOverlayPitch * output_overlay->mOverlayHeight));
1248         if (border==NULL)
1249         {
1250             return false;
1251         }
1252         output_overlay->mBorder.reset(border, xy_free);
1253         memset(border, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1254     }
1255
1256     //copy buffer
1257     for(int i = 1; i >= 0; i--)
1258     {
1259         byte* plan_selected = i==0 ? body : border;
1260         const byte* plan_input = i==0 ? input_overlay.mBody.get() : input_overlay.mBorder.get();
1261
1262         plan_selected += (bluradjust_x>>3) + (bluradjust_y>>3)*output_overlay->mOverlayPitch;
1263         if ( plan_selected!=NULL && plan_input!=NULL )
1264         {
1265             for (int j=0;j<input_overlay.mOverlayHeight;j++)
1266             {
1267                 memcpy(plan_selected, plan_input, input_overlay.mOverlayWidth*sizeof(plan_input[0]));
1268                 plan_selected += output_overlay->mOverlayPitch;
1269                 plan_input += input_overlay.mOverlayPitch;
1270             }
1271         }
1272     }
1273     if (be_strength<=0)
1274     {
1275         return true;
1276     }
1277
1278     float scaled_be_strength = be_strength * 0.5f * (target_scale_x+target_scale_y);
1279     int pass_num = static_cast<int>(scaled_be_strength);
1280     int pitch = output_overlay->mOverlayPitch;
1281     byte* blur_plan = output_overlay->mfWideOutlineEmpty ? body : border;
1282     ass_tmp_buf tmp_buf( max((output_overlay->mOverlayPitch+1)*(output_overlay->mOverlayHeight+1),0) );
1283     for (int pass = 0; pass < pass_num; pass++)
1284     {
1285         if(output_overlay->mOverlayWidth >= 3 && output_overlay->mOverlayHeight >= 3)
1286         {
1287             if (g_cpuid.m_flags & CCpuID::sse2)
1288             {
1289                 be_blur(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1290             }
1291             else
1292             {
1293                 be_blur_c(blur_plan, tmp_buf.tmp, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch);
1294             }
1295         }
1296     }
1297     if (scaled_be_strength>pass_num)
1298     {
1299         xy_be_blur(blur_plan, output_overlay->mOverlayWidth, output_overlay->mOverlayHeight, pitch,
1300             scaled_be_strength-pass_num, scaled_be_strength-pass_num);
1301     }
1302
1303     return true;
1304 }
1305
1306 ///////////////////////////////////////////////////////////////////////////
1307
1308 static __forceinline void pixmix(DWORD *dst, DWORD color, DWORD alpha)
1309 {
1310     int a = alpha;
1311     // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
1312     int ia = 256-a;
1313     a+=1;
1314     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
1315            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
1316            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
1317 }
1318
1319 static __forceinline void pixmix2(DWORD *dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
1320 {
1321     int a = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
1322     int ia = 256-a;
1323     a+=1;
1324     *dst = ((((*dst&0x00ff00ff)*ia + (color&0x00ff00ff)*a)&0xff00ff00)>>8)
1325            | ((((*dst&0x0000ff00)*ia + (color&0x0000ff00)*a)&0x00ff0000)>>8)
1326            | ((((*dst>>8)&0x00ff0000)*ia)&0xff000000);
1327 }
1328
1329 #include <xmmintrin.h>
1330 #include <emmintrin.h>
1331
1332 static __forceinline void pixmix_sse2(DWORD* dst, DWORD color, DWORD alpha)
1333 {
1334 //    alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
1335     color &= 0xffffff;
1336     __m128i zero = _mm_setzero_si128();
1337     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
1338     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
1339     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
1340     __m128i r = _mm_unpacklo_epi16(d, s);
1341     r = _mm_madd_epi16(r, a);
1342     r = _mm_srli_epi32(r, 8);
1343     r = _mm_packs_epi32(r, r);
1344     r = _mm_packus_epi16(r, r);
1345     *dst = (DWORD)_mm_cvtsi128_si32(r);
1346 }
1347
1348 static __forceinline void pixmix2_sse2(DWORD* dst, DWORD color, DWORD shapealpha, DWORD clipalpha)
1349 {
1350     int alpha = (((shapealpha)*(clipalpha)*(color>>24))>>12)&0xff;
1351     color &= 0xffffff;
1352     __m128i zero = _mm_setzero_si128();
1353     __m128i a = _mm_set1_epi32(((alpha+1) << 16) | (0x100 - alpha));
1354     __m128i d = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst), zero);
1355     __m128i s = _mm_unpacklo_epi8(_mm_cvtsi32_si128(color), zero);
1356     __m128i r = _mm_unpacklo_epi16(d, s);
1357     r = _mm_madd_epi16(r, a);
1358     r = _mm_srli_epi32(r, 8);
1359     r = _mm_packs_epi32(r, r);
1360     r = _mm_packus_epi16(r, r);
1361     *dst = (DWORD)_mm_cvtsi128_si32(r);
1362 }
1363
1364 #include <mmintrin.h>
1365
1366 // Calculate a - b clamping to 0 instead of underflowing
1367 static __forceinline DWORD safe_subtract(DWORD a, DWORD b)
1368 {
1369 #ifndef _WIN64
1370     __m64 ap = _mm_cvtsi32_si64(a);
1371     __m64 bp = _mm_cvtsi32_si64(b);
1372     __m64 rp = _mm_subs_pu16(ap, bp);
1373     DWORD r = (DWORD)_mm_cvtsi64_si32(rp);
1374     _mm_empty();
1375     return r;
1376 #else
1377     return (b > a) ? 0 : a - b;
1378 #endif
1379 }
1380
1381 /***
1382  * No aligned requirement
1383  *
1384  **/
1385 void AlphaBlt(byte* pY,
1386     const byte* pAlphaMask,
1387     const byte Y,
1388     int h, int w, int src_stride, int dst_stride)
1389 {
1390     __m128i zero = _mm_setzero_si128();
1391     __m128i s = _mm_set1_epi16(Y);               //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
1392
1393     __m128i ones;
1394 #ifdef _DEBUG
1395     ones = _mm_setzero_si128();
1396 #endif // _DEBUG
1397     ones = _mm_cmpeq_epi32(ones, ones);
1398     ones = _mm_srli_epi16(ones, 15);
1399     ones = _mm_slli_epi16(ones, 8);
1400
1401     if( w>16 )//IMPORTANT! The result of the following code is undefined with w<15.
1402     {
1403         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
1404         {
1405             const BYTE* sa = pAlphaMask;
1406             BYTE* dy = pY;
1407             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
1408             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1409             const BYTE* dy_end = pY + w;
1410
1411             for(;dy < dy_first_mod16; sa++, dy++)
1412             {
1413                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1414             }
1415             for(; dy < dy_end_mod16; sa+=8, dy+=16)
1416             {
1417                 __m128i a = _mm_loadl_epi64((__m128i*)sa);
1418
1419                 //Y
1420                 __m128i d = _mm_load_si128((__m128i*)dy);
1421
1422                 a = _mm_unpacklo_epi8(a,zero);               //a= a0 0  a1 0  a2 0  a3 0  a4 0  a5 0  a6 0  a7 0
1423                 __m128i ia = _mm_sub_epi16(ones,a);         //ia   = 256-a0 ... 256-a7
1424
1425                 __m128i dl = _mm_unpacklo_epi8(d,zero);               //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
1426                 __m128i sl = _mm_mullo_epi16(s,a);            //sl   = c0*a0  c1*a1  ... c7*a7
1427                 sl = _mm_add_epi16(sl,s);
1428
1429                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
1430
1431                 dl = _mm_add_epi16(dl,sl);                     //d   = (256-a)*d + s + a*s
1432                 dl = _mm_srli_epi16(dl,8);                    //d   = d>>8
1433
1434                 sa += 8;
1435                 a = _mm_loadl_epi64((__m128i*)sa);
1436
1437                 a = _mm_unpacklo_epi8(a,zero);
1438                 ia = _mm_sub_epi16(ones,a);
1439
1440                 d = _mm_unpackhi_epi8(d,zero);
1441                 sl = _mm_mullo_epi16(s,a);
1442                 sl = _mm_add_epi16(sl,s);
1443
1444                 d = _mm_mullo_epi16(d,ia);
1445                 d = _mm_add_epi16(d,sl);
1446                 d = _mm_srli_epi16(d, 8);
1447
1448                 dl = _mm_packus_epi16(dl,d);
1449
1450                 _mm_store_si128((__m128i*)dy, dl);
1451             }
1452             for(;dy < dy_end; sa++, dy++)
1453             {
1454                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1455             }
1456         }
1457     }
1458     else
1459     {
1460         for( ; h>0; h--, pAlphaMask += src_stride, pY += dst_stride )
1461         {
1462             const BYTE* sa = pAlphaMask;
1463             BYTE* dy = pY;
1464             const BYTE* dy_end = pY + w;
1465
1466             for(;dy < dy_end; sa++, dy++)
1467             {
1468                 *dy = (*dy * (256 - *sa)+ Y*(*sa+1))>>8;
1469             }
1470         }
1471     }
1472     //__asm emms;
1473 }
1474
1475 /***
1476  * No aligned requirement
1477  *
1478  **/
1479 void AlphaBlt(byte* pY,
1480     const byte alpha,
1481     const byte Y,
1482     int h, int w, int dst_stride)
1483 {
1484     int yPremul = Y*(alpha+1);
1485     int dstAlpha = 0x100 - alpha;
1486     if( w>32 )//IMPORTANT! The result of the following code is undefined with w<15.
1487     {
1488         __m128i zero = _mm_setzero_si128();
1489         __m128i s = _mm_set1_epi16(yPremul);    //s = c  0  c  0  c  0  c  0  c  0  c  0  c  0  c  0
1490         __m128i ia = _mm_set1_epi16(dstAlpha);
1491         for( ; h>0; h--, pY += dst_stride )
1492         {
1493             BYTE* dy = pY;
1494             const BYTE* dy_first_mod16 = reinterpret_cast<BYTE*>((reinterpret_cast<int>(pY)+15)&~15);  //IMPORTANT! w must >= 15
1495             const BYTE* dy_end_mod16 = reinterpret_cast<BYTE*>(reinterpret_cast<int>(pY+w)&~15);
1496             const BYTE* dy_end = pY + w;
1497
1498             for(;dy < dy_first_mod16; dy++)
1499             {
1500                 *dy = (*dy * dstAlpha + yPremul)>>8;
1501             }
1502             for(; dy < dy_end_mod16; dy+=16)
1503             {
1504                 //Y
1505                 __m128i d = _mm_load_si128(reinterpret_cast<const __m128i*>(dy));
1506                 __m128i dl = _mm_unpacklo_epi8(d,zero);        //d    = b0 0  b1 0  b2 0  b3 0  b4 0  b5 0  b6 0  b7 0
1507
1508                 dl = _mm_mullo_epi16(dl,ia);                   //d    = b0*~a0 b1*~a1 ... b7*~a7
1509                 dl = _mm_adds_epu16(dl,s);                     //d   = d + s
1510                 dl = _mm_srli_epi16(dl, 8);                    //d   = d>>8
1511
1512                 d = _mm_unpackhi_epi8(d,zero);
1513                 d = _mm_mullo_epi16(d,ia);
1514                 d = _mm_adds_epu16(d,s);
1515                 d = _mm_srli_epi16(d, 8);
1516
1517                 dl = _mm_packus_epi16(dl,d);
1518
1519                 _mm_store_si128(reinterpret_cast<__m128i*>(dy), dl);
1520             }
1521             for(;dy < dy_end; dy++)
1522             {
1523                 *dy = (*dy * dstAlpha + yPremul)>>8;
1524             }
1525         }
1526     }
1527     else
1528     {
1529         for( ; h>0; h--, pY += dst_stride )
1530         {
1531             BYTE* dy = pY;
1532             const BYTE* dy_end = pY + w;
1533
1534             for(;dy < dy_end; dy++)
1535             {
1536                 *dy = (*dy * dstAlpha + yPremul)>>8;
1537             }
1538         }
1539     }
1540     //__asm emms;
1541 }
1542
1543 /***
1544  * No aligned requirement
1545  *
1546  **/
1547 void AlphaBltC(byte* pY,
1548     const byte alpha,
1549     const byte Y,
1550     int h, int w, int dst_stride)
1551 {
1552     int yPremul = Y*(alpha+1);
1553     int dstAlpha = 0x100 - alpha;
1554
1555     for( ; h>0; h--, pY += dst_stride )
1556     {
1557         BYTE* dy = pY;
1558         const BYTE* dy_end = pY + w;
1559
1560         for(;dy < dy_end; dy++)
1561         {
1562             *dy = (*dy * dstAlpha + yPremul)>>8;
1563         }
1564     }
1565 }
1566
1567 // For CPUID usage in Rasterizer::Draw
1568 #include "../dsutil/vd.h"
1569
1570 void OverlapRegion(tSpanBuffer& dst, const tSpanBuffer& src, int dx, int dy)
1571 {
1572     tSpanBuffer temp;
1573     temp.reserve(dst.size() + src.size());
1574     dst.swap(temp);
1575     tSpanBuffer::iterator itA = temp.begin();
1576     tSpanBuffer::iterator itAE = temp.end();
1577     tSpanBuffer::const_iterator itB = src.begin();
1578     tSpanBuffer::const_iterator itBE = src.end();
1579     // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
1580     unsigned __int64 offset1 = (((__int64)dy)<<32) - dx;
1581     unsigned __int64 offset2 = (((__int64)dy)<<32) + dx;
1582     while(itA != itAE && itB != itBE)
1583     {
1584         if((*itB).first + offset1 < (*itA).first)
1585         {
1586             // B span is earlier.  Use it.
1587             unsigned __int64 x1 = (*itB).first + offset1;
1588             unsigned __int64 x2 = (*itB).second + offset2;
1589             ++itB;
1590             // B spans don't overlap, so begin merge loop with A first.
1591             for(;;)
1592             {
1593                 // If we run out of A spans or the A span doesn't overlap,
1594                 // then the next B span can't either (because B spans don't
1595                 // overlap) and we exit.
1596                 if(itA == itAE || (*itA).first > x2)
1597                     break;
1598                 do {x2 = _MAX(x2, (*itA++).second);}
1599                 while(itA != itAE && (*itA).first <= x2);
1600                 // If we run out of B spans or the B span doesn't overlap,
1601                 // then the next A span can't either (because A spans don't
1602                 // overlap) and we exit.
1603                 if(itB == itBE || (*itB).first + offset1 > x2)
1604                     break;
1605                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1606                 while(itB != itBE && (*itB).first + offset1 <= x2);
1607             }
1608             // Flush span.
1609             dst.push_back(tSpan(x1, x2));
1610         }
1611         else
1612         {
1613             // A span is earlier.  Use it.
1614             unsigned __int64 x1 = (*itA).first;
1615             unsigned __int64 x2 = (*itA).second;
1616             ++itA;
1617             // A spans don't overlap, so begin merge loop with B first.
1618             for(;;)
1619             {
1620                 // If we run out of B spans or the B span doesn't overlap,
1621                 // then the next A span can't either (because A spans don't
1622                 // overlap) and we exit.
1623                 if(itB == itBE || (*itB).first + offset1 > x2)
1624                     break;
1625                 do {x2 = _MAX(x2, (*itB++).second + offset2);}
1626                 while(itB != itBE && (*itB).first + offset1 <= x2);
1627                 // If we run out of A spans or the A span doesn't overlap,
1628                 // then the next B span can't either (because B spans don't
1629                 // overlap) and we exit.
1630                 if(itA == itAE || (*itA).first > x2)
1631                     break;
1632                 do {x2 = _MAX(x2, (*itA++).second);}
1633                 while(itA != itAE && (*itA).first <= x2);
1634             }
1635             // Flush span.
1636             dst.push_back(tSpan(x1, x2));
1637         }
1638     }
1639     // Copy over leftover spans.
1640     while(itA != itAE)
1641         dst.push_back(*itA++);
1642     while(itB != itBE)
1643     {
1644         dst.push_back(tSpan((*itB).first + offset1, (*itB).second + offset2));
1645         ++itB;
1646     }
1647 }
1648
1649 // Render a subpicture onto a surface.
1650 // spd is the surface to render on.
1651 // clipRect is a rectangular clip region to render inside.
1652 // pAlphaMask is an alpha clipping mask.
1653 // xsub and ysub ???
1654 // switchpts seems to be an array of fill colours interlaced with coordinates.
1655 //    switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from
1656 // fBody tells whether to render the body of the subs.
1657 // fBorder tells whether to render the border of the subs.
1658 SharedPtrByte Rasterizer::CompositeAlphaMask(const SharedPtrOverlay& overlay, const CRect& clipRect,
1659     const GrayImage2* alpha_mask,
1660     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder,
1661     CRect *outputDirtyRect)
1662 {
1663     //fix me: check and log error
1664     SharedPtrByte result;
1665     *outputDirtyRect = CRect(0, 0, 0, 0);
1666     if (!switchpts || !fBody && !fBorder) return result;
1667     if (fBorder && !overlay->mBorder) return result;
1668
1669     CRect r = clipRect;
1670     if (alpha_mask!=NULL)
1671     {
1672         r &= CRect(alpha_mask->left_top, alpha_mask->size);
1673     }
1674
1675     // Remember that all subtitle coordinates are specified in 1/8 pixels
1676     // (x+4)>>3 rounds to nearest whole pixel.
1677     // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1678     int x = (xsub + overlay->mOffsetX + 4)>>3;
1679     int y = (ysub + overlay->mOffsetY + 4)>>3;
1680     int w = overlay->mOverlayWidth;
1681     int h = overlay->mOverlayHeight;
1682     int xo = 0, yo = 0;
1683     // Again, limiting?
1684     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1685     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1686     if(x+w > r.right) w = r.right-x;
1687     if(y+h > r.bottom) h = r.bottom-y;
1688     // Check if there's actually anything to render
1689     if(w <= 0 || h <= 0) return(result);
1690     outputDirtyRect->SetRect(x, y, x+w, y+h);
1691
1692     bool fSingleColor = (switchpts[1]==0xffffffff);
1693
1694     // draw
1695     // Grab the first colour
1696     DWORD color = switchpts[0];
1697     byte* s_base = (byte*)xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight);
1698     const byte* alpha_mask_data = alpha_mask != NULL ? alpha_mask->data.get() : NULL;
1699     const int alpha_mask_pitch = alpha_mask != NULL ? alpha_mask->pitch : 0;
1700     if(alpha_mask_data!=NULL )
1701         alpha_mask_data += alpha_mask->pitch * y + x - alpha_mask->left_top.y*alpha_mask->pitch - alpha_mask->left_top.x;
1702
1703     if(fSingleColor)
1704     {
1705         overlay->FillAlphaMash(s_base, fBody, fBorder, xo, yo, w, h,
1706             alpha_mask_data, alpha_mask_pitch,
1707             color>>24 );
1708     }
1709     else
1710     {
1711         int last_x = xo;
1712         const DWORD *sw = switchpts;
1713         while( last_x<w+xo )
1714         {
1715             byte alpha = sw[0]>>24;
1716             while( sw[3]<w+xo && (sw[2]>>24)==alpha )
1717             {
1718                 sw += 2;
1719             }
1720             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1721             overlay->FillAlphaMash(s_base, fBody, fBorder,
1722                 last_x, yo, new_x-last_x, h,
1723                 alpha_mask_data, alpha_mask_pitch,
1724                 alpha );
1725             last_x = new_x;
1726             sw += 2;
1727         }
1728     }
1729     result.reset( s_base, xy_free );
1730     return result;
1731 }
1732
1733
1734 //
1735 // draw overlay[clipRect] to bitmap[0,0,w,h]
1736 //
1737 void Rasterizer::Draw(XyBitmap* bitmap, SharedPtrOverlay overlay, const CRect& clipRect, byte* s_base,
1738     int xsub, int ysub, const DWORD* switchpts, bool fBody, bool fBorder)
1739 {
1740     if (!switchpts || !fBody && !fBorder) return;
1741     if (bitmap==NULL)
1742     {
1743         ASSERT(0);
1744         return;
1745     }
1746     // clip
1747     // Limit drawn area to rectangular clip area
1748     CRect r = clipRect;
1749     // Remember that all subtitle coordinates are specified in 1/8 pixels
1750     // (x+4)>>3 rounds to nearest whole pixel.
1751     int overlayPitch = overlay->mOverlayPitch;
1752     int x = (xsub + overlay->mOffsetX + 4)>>3;
1753     int y = (ysub + overlay->mOffsetY + 4)>>3;
1754     int w = overlay->mOverlayWidth;
1755     int h = overlay->mOverlayHeight;
1756     int xo = 0, yo = 0;
1757
1758     if(x < r.left) {xo = r.left-x; w -= r.left-x; x = r.left;}
1759     if(y < r.top) {yo = r.top-y; h -= r.top-y; y = r.top;}
1760     if(x+w > r.right) w = r.right-x;
1761     if(y+h > r.bottom) h = r.bottom-y;
1762     // Check if there's actually anything to render
1763     if (w <= 0 || h <= 0) return;
1764     // must have enough space to draw into
1765     ASSERT(x >= bitmap->x && y >= bitmap->y && x+w <= bitmap->x + bitmap->w && y+h <= bitmap->y + bitmap->h );
1766
1767     // CPUID from VDub
1768     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1769     bool fSingleColor = (switchpts[1]==0xffffffff);
1770     bool PLANAR = (bitmap->type==XyBitmap::PLANNA);
1771     int draw_method = 0;
1772     if(fSingleColor)
1773         draw_method |= DM::SINGLE_COLOR;
1774     if(fSSE2)
1775         draw_method |= DM::SSE2;
1776     if(PLANAR)
1777         draw_method |= DM::AYUV_PLANAR;
1778
1779     // draw
1780     // Grab the first colour
1781     DWORD color = switchpts[0];
1782     const byte* s = s_base + overlay->mOverlayPitch*yo + xo;
1783
1784     int dst_offset = 0;
1785     if (bitmap->type==XyBitmap::PLANNA)
1786         dst_offset = bitmap->pitch*(y-bitmap->y) + x - bitmap->x;
1787     else
1788         dst_offset = bitmap->pitch*(y-bitmap->y) + (x - bitmap->x)*4;
1789     unsigned long* dst = (unsigned long*)((BYTE*)bitmap->plans[0] + dst_offset);
1790
1791     // Every remaining line in the bitmap to be rendered...
1792     switch(draw_method)
1793     {
1794     case   DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR :
1795     {
1796         while(h--)
1797         {
1798             for(int wt=0; wt<w; ++wt)
1799                 // The <<6 is due to pixmix expecting the alpha parameter to be
1800                 // the multiplication of two 6-bit unsigned numbers but we
1801                 // only have one here. (No alpha mask.)
1802                 pixmix_sse2(&dst[wt], color, s[wt]);
1803             s += overlayPitch;
1804             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1805         }
1806     }
1807     break;
1808     case   DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1809     {
1810         while(h--)
1811         {
1812             for(int wt=0; wt<w; ++wt)
1813                 pixmix(&dst[wt], color, s[wt]);
1814             s += overlayPitch;
1815             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1816         }
1817     }
1818     break;
1819     case 0*DM::SINGLE_COLOR |   DM::SSE2 | 0*DM::AYUV_PLANAR :
1820     {
1821         while(h--)
1822         {
1823             const DWORD *sw = switchpts;
1824             for(int wt=0; wt<w; ++wt)
1825             {
1826                 // xo is the offset (usually negative) we have moved into the image
1827                 // So if we have passed the switchpoint (?) switch to another colour
1828                 // (So switchpts stores both colours *and* coordinates?)
1829                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1830                 pixmix_sse2(&dst[wt], color, s[wt]);
1831             }
1832             s += overlayPitch;
1833             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1834         }
1835     }
1836     break;
1837     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1838     {
1839         while(h--)
1840         {
1841             const DWORD *sw = switchpts;
1842             for(int wt=0; wt<w; ++wt)
1843             {
1844                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1845                 pixmix(&dst[wt], color, s[wt]);
1846             }
1847             s += overlayPitch;
1848             dst = (unsigned long *)((char *)dst + bitmap->pitch);
1849         }
1850     }
1851     break;
1852     case   DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR :
1853     {
1854         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1855         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1856         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1857         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1858
1859         AlphaBlt(dst_Y, s, ((color)>>16)&0xff, h, w, overlayPitch, bitmap->pitch);
1860         AlphaBlt(dst_U, s, ((color)>>8)&0xff, h, w, overlayPitch, bitmap->pitch);
1861         AlphaBlt(dst_V, s, ((color))&0xff, h, w, overlayPitch, bitmap->pitch);
1862         AlphaBlt(dst_A, s, 0, h, w, overlayPitch, bitmap->pitch);
1863     }
1864     break;
1865     case 0*DM::SINGLE_COLOR |   DM::SSE2 |   DM::AYUV_PLANAR :
1866     {
1867         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1868         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1869         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1870         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1871
1872         const DWORD *sw = switchpts;
1873         int last_x = xo;
1874         color = sw[0];
1875         while(last_x<w+xo)
1876         {
1877             int new_x = sw[3] < w+xo ? sw[3] : w+xo;
1878             color = sw[0];
1879             sw += 2;
1880             if( new_x < last_x )
1881                 continue;
1882             AlphaBlt(dst_Y, s + last_x - xo, (color>>16)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1883             AlphaBlt(dst_U, s + last_x - xo, (color>>8)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1884             AlphaBlt(dst_V, s + last_x - xo, (color)&0xff, h, new_x-last_x, overlayPitch, bitmap->pitch);
1885             AlphaBlt(dst_A, s + last_x - xo, 0, h, new_x-last_x, overlayPitch, bitmap->pitch);
1886
1887             dst_A += new_x - last_x;
1888             dst_Y += new_x - last_x;
1889             dst_U += new_x - last_x;
1890             dst_V += new_x - last_x;
1891             last_x = new_x;
1892         }
1893     }
1894     break;
1895     case   DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR :
1896     {
1897 //        char * debug_dst=(char*)dst;int h2 = h;
1898 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1899 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1900 //        debug_dst += spd.pitch*spd.h;
1901 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1902 //        debug_dst += spd.pitch*spd.h;
1903 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1904 //        debug_dst += spd.pitch*spd.h;
1905 //        XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1906 //        debug_dst=(char*)dst;
1907
1908         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1909         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1910         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1911         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1912         while(h--)
1913         {
1914             for(int wt=0; wt<w; ++wt)
1915             {
1916                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1917                 pixmix(&temp, color, s[wt]);
1918                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1919             }
1920             s += overlayPitch;
1921             dst_A += bitmap->pitch;
1922             dst_Y += bitmap->pitch;
1923             dst_U += bitmap->pitch;
1924             dst_V += bitmap->pitch;
1925         }
1926 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1927 //        debug_dst += spd.pitch*spd.h;
1928 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1929 //        debug_dst += spd.pitch*spd.h;
1930 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1931 //        debug_dst += spd.pitch*spd.h;
1932 //        XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1933     }
1934     break;
1935     case 0*DM::SINGLE_COLOR | 0*DM::SSE2 |   DM::AYUV_PLANAR :
1936     {
1937         unsigned char* dst_A = bitmap->plans[0] + dst_offset;
1938         unsigned char* dst_Y = bitmap->plans[1] + dst_offset;
1939         unsigned char* dst_U = bitmap->plans[2] + dst_offset;
1940         unsigned char* dst_V = bitmap->plans[3] + dst_offset;
1941         while(h--)
1942         {
1943             const DWORD *sw = switchpts;
1944             for(int wt=0; wt<w; ++wt)
1945             {
1946                 if(wt+xo >= sw[1]) {while(wt+xo >= sw[1]) sw += 2; color = sw[-2];}
1947                 DWORD temp = COMBINE_AYUV(dst_A[wt], dst_Y[wt], dst_U[wt], dst_V[wt]);
1948                 pixmix(&temp, color, (s[wt]*(color>>24))>>8);
1949                 SPLIT_AYUV(temp, dst_A+wt, dst_Y+wt, dst_U+wt, dst_V+wt);
1950             }
1951             s += overlayPitch;
1952             dst_A += bitmap->pitch;
1953             dst_Y += bitmap->pitch;
1954             dst_U += bitmap->pitch;
1955             dst_V += bitmap->pitch;
1956         }
1957     }
1958     break;
1959     }
1960     return;
1961 }
1962
1963 void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nHeight, DWORD argb)
1964 {
1965     bool fSSE2 = !!(g_cpuid.m_flags & CCpuID::sse2);
1966     bool AYUV_PLANAR = (spd.type==MSP_AYUV_PLANAR);
1967     int draw_method = 0;
1968     if(fSSE2)
1969         draw_method |= DM::SSE2;
1970     if(AYUV_PLANAR)
1971         draw_method |= DM::AYUV_PLANAR;
1972
1973     switch (draw_method)
1974     {
1975     case   DM::SSE2 | 0*DM::AYUV_PLANAR :
1976     {
1977         for (int wy=y; wy<y+nHeight; wy++) {
1978             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1979             for(int wt=0; wt<nWidth; ++wt) {
1980                 pixmix_sse2(&dst[wt], argb, argb>>24);
1981             }
1982         }
1983     }
1984     break;
1985     case 0*DM::SSE2 | 0*DM::AYUV_PLANAR :
1986     {
1987         for (int wy=y; wy<y+nHeight; wy++) {
1988             DWORD* dst = (DWORD*)((BYTE*)spd.bits + spd.pitch * wy) + x;
1989             for(int wt=0; wt<nWidth; ++wt) {
1990                 pixmix(&dst[wt], argb,  argb>>24);
1991             }
1992         }
1993     }
1994     break;
1995     case   DM::SSE2 |   DM::AYUV_PLANAR :
1996     {
1997         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
1998         BYTE* dst_A = dst;
1999         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
2000         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
2001         BYTE* dst_V = dst_U + spd.pitch*spd.h;
2002         AlphaBlt(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
2003         AlphaBlt(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
2004         AlphaBlt(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
2005         AlphaBlt(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
2006     }
2007     break;
2008     case 0*DM::SSE2 |   DM::AYUV_PLANAR :
2009     {
2010         BYTE* dst = reinterpret_cast<BYTE*>(spd.bits) + spd.pitch * y + x;
2011         BYTE* dst_A = dst;
2012         BYTE* dst_Y = dst_A + spd.pitch*spd.h;
2013         BYTE* dst_U = dst_Y + spd.pitch*spd.h;
2014         BYTE* dst_V = dst_U + spd.pitch*spd.h;
2015         AlphaBltC(dst_Y, argb>>24, ((argb)>>16)&0xff, nHeight, nWidth, spd.pitch);
2016         AlphaBltC(dst_U, argb>>24, ((argb)>>8)&0xff, nHeight, nWidth, spd.pitch);
2017         AlphaBltC(dst_V, argb>>24, ((argb))&0xff, nHeight, nWidth, spd.pitch);
2018         AlphaBltC(dst_A, argb>>24, 0, nHeight, nWidth, spd.pitch);
2019     }
2020     break;
2021     }
2022 }
2023
2024
2025 ///////////////////////////////////////////////////////////////
2026
2027 // Overlay
2028
2029 void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
2030     const byte* pAlphaMask, int pitch, DWORD color_alpha )
2031 {
2032 #ifndef _WIN64
2033     if (g_cpuid.m_flags & CCpuID::sse2)
2034     {
2035         pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
2036         pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
2037         byte* dst = outputAlphaMask + y*mOverlayPitch + x;
2038
2039         const int x0 = ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) < w ?
2040             ((reinterpret_cast<int>(dst)+3)&~3) - reinterpret_cast<int>(dst) : w; //IMPORTANT! Should not exceed w.
2041         const int x00 = ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) < w ?
2042             ((reinterpret_cast<int>(dst)+15)&~15) - reinterpret_cast<int>(dst) : w;//IMPORTANT! Should not exceed w.
2043         const int x_end00  = ((reinterpret_cast<int>(dst)+w)&~15) - reinterpret_cast<int>(dst);
2044         const int x_end0 = ((reinterpret_cast<int>(dst)+w)&~3) - reinterpret_cast<int>(dst);
2045         const int x_end = w;
2046
2047         __m64 color_alpha_64 = _mm_set1_pi16(color_alpha);
2048         __m128i color_alpha_128 = _mm_set1_epi16(color_alpha);
2049
2050         if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
2051         {
2052             /*
2053             __asm
2054             {
2055             mov        eax, color_alpha
2056             movd           XMM3, eax
2057             punpcklwd  XMM3, XMM3
2058             pshufd         XMM3, XMM3, 0
2059             }
2060             */
2061             while(h--)
2062             {
2063                 int j=0;
2064                 for( ; j<x0; j++ )
2065                 {
2066                     int temp = pBorder[j]-pBody[j];
2067                     temp = temp<0 ? 0 : temp;
2068                     dst[j] = (temp * color_alpha)>>6;
2069                 }
2070                 for( ;j<x00;j+=4 )
2071                 {
2072                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2073                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2074                     border = _mm_subs_pu8(border, body);
2075                     __m64 zero = _mm_setzero_si64();
2076                     border = _mm_unpacklo_pi8(border, zero);
2077                     border = _mm_mullo_pi16(border, color_alpha_64);
2078                     border = _mm_srli_pi16(border, 6);
2079                     border = _mm_packs_pu16(border,border);
2080                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2081                 }
2082                 __m128i zero = _mm_setzero_si128();
2083                 for( ;j<x_end00;j+=16)
2084                 {
2085                     __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
2086                     __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
2087                     border = _mm_subs_epu8(border,body);
2088                     __m128i srchi = border;
2089                     border = _mm_unpacklo_epi8(border, zero);
2090                     srchi = _mm_unpackhi_epi8(srchi, zero);
2091                     border = _mm_mullo_epi16(border, color_alpha_128);
2092                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2093                     border = _mm_srli_epi16(border, 6);
2094                     srchi = _mm_srli_epi16(srchi, 6);
2095                     border = _mm_packus_epi16(border, srchi);
2096                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
2097                 }
2098                 for( ;j<x_end0;j+=4)
2099                 {
2100                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2101                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2102                     border = _mm_subs_pu8(border, body);
2103                     __m64 zero = _mm_setzero_si64();
2104                     border = _mm_unpacklo_pi8(border, zero);
2105                     border = _mm_mullo_pi16(border, color_alpha_64);
2106                     border = _mm_srli_pi16(border, 6);
2107                     border = _mm_packs_pu16(border,border);
2108                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2109                 }
2110                 for( ;j<x_end;j++)
2111                 {
2112                     int temp = pBorder[j]-pBody[j];
2113                     temp = temp<0 ? 0 : temp;
2114                     dst[j] = (temp * color_alpha)>>6;
2115                 }
2116                 pBody += mOverlayPitch;
2117                 pBorder += mOverlayPitch;
2118                 //pAlphaMask += pitch;
2119                 dst += mOverlayPitch;
2120             }
2121         }
2122         else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
2123         {
2124             const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2125             while(h--)
2126             {
2127                 int j=0;
2128                 for( ; j<x0; j++ )
2129                 {
2130                     dst[j] = (src1[j] * color_alpha)>>6;
2131                 }
2132                 for( ;j<x00;j+=4 )
2133                 {
2134                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2135                     __m64 zero = _mm_setzero_si64();
2136                     src = _mm_unpacklo_pi8(src, zero);
2137                     src = _mm_mullo_pi16(src, color_alpha_64);
2138                     src = _mm_srli_pi16(src, 6);
2139                     src = _mm_packs_pu16(src,src);
2140                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2141                 }
2142                 __m128i zero = _mm_setzero_si128();
2143                 for( ;j<x_end00;j+=16)
2144                 {
2145                     __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
2146                     __m128i srchi = src;
2147                     src = _mm_unpacklo_epi8(src, zero);
2148                     srchi = _mm_unpackhi_epi8(srchi, zero);
2149                     src = _mm_mullo_epi16(src, color_alpha_128);
2150                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2151                     src = _mm_srli_epi16(src, 6);
2152                     srchi = _mm_srli_epi16(srchi, 6);
2153                     src = _mm_packus_epi16(src, srchi);
2154                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
2155                 }
2156                 for( ;j<x_end0;j+=4)
2157                 {
2158                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2159                     __m64 zero = _mm_setzero_si64();
2160                     src = _mm_unpacklo_pi8(src, zero);
2161                     src = _mm_mullo_pi16(src, color_alpha_64);
2162                     src = _mm_srli_pi16(src, 6);
2163                     src = _mm_packs_pu16(src,src);
2164                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2165                 }
2166                 for( ;j<x_end;j++)
2167                 {
2168                     dst[j] = (src1[j] * color_alpha)>>6;
2169                 }
2170                 src1 += mOverlayPitch;
2171                 //pAlphaMask += pitch;
2172                 dst += mOverlayPitch;
2173             }
2174         }
2175         else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
2176         {
2177             const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2178             while(h--)
2179             {
2180                 int j=0;
2181                 for( ; j<x0; j++ )
2182                 {
2183                     dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2184                 }
2185                 for( ;j<x00;j+=4 )
2186                 {
2187                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2188                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2189                     __m64 zero = _mm_setzero_si64();
2190                     src = _mm_unpacklo_pi8(src, zero);
2191                     src = _mm_mullo_pi16(src, color_alpha_64);
2192                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2193                     src = _mm_mulhi_pi16(src, mask); //important!
2194                     src = _mm_srli_pi16(src, 12+8-16); //important!
2195                     src = _mm_packs_pu16(src,src);
2196                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2197                 }
2198                 __m128i zero = _mm_setzero_si128();
2199                 for( ;j<x_end00;j+=16)
2200                 {
2201                     __m128i src = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src1+j));
2202                     __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
2203                     __m128i srchi = src;
2204                     __m128i maskhi = mask;
2205                     src = _mm_unpacklo_epi8(src, zero);
2206                     srchi = _mm_unpackhi_epi8(srchi, zero);
2207                     mask = _mm_unpacklo_epi8(zero, mask); //important!
2208                     maskhi = _mm_unpackhi_epi8(zero, maskhi);
2209                     src = _mm_mullo_epi16(src, color_alpha_128);
2210                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2211                     src = _mm_mulhi_epu16(src, mask); //important!
2212                     srchi = _mm_mulhi_epu16(srchi, maskhi);
2213                     src = _mm_srli_epi16(src, 12+8-16); //important!
2214                     srchi = _mm_srli_epi16(srchi, 12+8-16);
2215                     src = _mm_packus_epi16(src, srchi);
2216                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src);
2217                 }
2218                 for( ;j<x_end0;j+=4)
2219                 {
2220                     __m64 src = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1+j));
2221                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2222                     __m64 zero = _mm_setzero_si64();
2223                     src = _mm_unpacklo_pi8(src, zero);
2224                     src = _mm_mullo_pi16(src, color_alpha_64);
2225                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2226                     src = _mm_mulhi_pi16(src, mask); //important!
2227                     src = _mm_srli_pi16(src, 12+8-16); //important!
2228                     src = _mm_packs_pu16(src,src);
2229                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(src);
2230                 }
2231                 for( ;j<x_end;j++)
2232                 {
2233                     dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2234                 }
2235                 src1 += mOverlayPitch;
2236                 pAlphaMask += pitch;
2237                 dst += mOverlayPitch;
2238             }
2239         }
2240         else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
2241         {
2242             while(h--)
2243             {
2244                 int j=0;
2245                 for( ; j<x0; j++ )
2246                 {
2247                     int temp = pBorder[j]-pBody[j];
2248                     temp = temp<0 ? 0 : temp;
2249                     dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2250                 }
2251                 for( ;j<x00;j+=4 )
2252                 {
2253                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2254                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2255                     border = _mm_subs_pu8(border, body);
2256                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2257                     __m64 zero = _mm_setzero_si64();
2258                     border = _mm_unpacklo_pi8(border, zero);
2259                     border = _mm_mullo_pi16(border, color_alpha_64);
2260                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2261                     border = _mm_mulhi_pi16(border, mask); //important!
2262                     border = _mm_srli_pi16(border, 12+8-16); //important!
2263                     border = _mm_packs_pu16(border,border);
2264                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2265                 }
2266                 __m128i zero = _mm_setzero_si128();
2267                 for( ;j<x_end00;j+=16)
2268                 {
2269                     __m128i border = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBorder+j));
2270                     __m128i body = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pBody+j));
2271                     border = _mm_subs_epu8(border,body);
2272
2273                     __m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pAlphaMask+j));
2274                     __m128i srchi = border;
2275                     __m128i maskhi = mask;
2276                     border = _mm_unpacklo_epi8(border, zero);
2277                     srchi = _mm_unpackhi_epi8(srchi, zero);
2278                     mask = _mm_unpacklo_epi8(zero, mask); //important!
2279                     maskhi = _mm_unpackhi_epi8(zero, maskhi);
2280                     border = _mm_mullo_epi16(border, color_alpha_128);
2281                     srchi = _mm_mullo_epi16(srchi, color_alpha_128);
2282                     border = _mm_mulhi_epu16(border, mask); //important!
2283                     srchi = _mm_mulhi_epu16(srchi, maskhi);
2284                     border = _mm_srli_epi16(border, 12+8-16); //important!
2285                     srchi = _mm_srli_epi16(srchi, 12+8-16);
2286                     border = _mm_packus_epi16(border, srchi);
2287                     _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border);
2288                 }
2289                 for( ;j<x_end0;j+=4)
2290                 {
2291                     __m64 border = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder+j));
2292                     __m64 body = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody+j));
2293                     border = _mm_subs_pu8(border, body);
2294                     __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask+j));
2295                     __m64 zero = _mm_setzero_si64();
2296                     border = _mm_unpacklo_pi8(border, zero);
2297                     border = _mm_mullo_pi16(border, color_alpha_64);
2298                     mask = _mm_unpacklo_pi8(zero, mask); //important!
2299                     border = _mm_mulhi_pi16(border, mask); //important!
2300                     border = _mm_srli_pi16(border, 12+8-16); //important!
2301                     border = _mm_packs_pu16(border,border);
2302                     *reinterpret_cast<int*>(dst+j) = _mm_cvtsi64_si32(border);
2303                 }
2304                 for( ;j<x_end;j++)
2305                 {
2306                     int temp = pBorder[j]-pBody[j];
2307                     temp = temp<0 ? 0 : temp;
2308                     dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2309                 }
2310                 pBody += mOverlayPitch;
2311                 pBorder += mOverlayPitch;
2312                 pAlphaMask += pitch;
2313                 dst += mOverlayPitch;
2314             }
2315         }
2316         else
2317         {
2318             //should NOT happen!
2319             ASSERT(0);
2320             while(h--)
2321             {
2322                 for(int j=0;j<x_end;j++)
2323                 {
2324                     dst[j] = 0;
2325                 }
2326                 dst += mOverlayPitch;
2327             }
2328         }
2329         _mm_empty();
2330     }
2331     else
2332     {
2333         _DoFillAlphaMash_c(outputAlphaMask, pBody, pBorder, x, y, w, h, pAlphaMask, pitch, color_alpha);
2334         return;
2335     }
2336 #else
2337     _DoFillAlphaMash_c(outputAlphaMask, pBody, pBorder, x, y, w, h, pAlphaMask, pitch, color_alpha);
2338     return;
2339 #endif
2340 }
2341
2342 void Overlay::_DoFillAlphaMash_c(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h,
2343     const byte* pAlphaMask, int pitch, DWORD color_alpha )
2344 {
2345     pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL;
2346     pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL;
2347     byte* dst = outputAlphaMask + y*mOverlayPitch + x;
2348
2349     if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL)
2350     {
2351         while(h--)
2352         {
2353             int j=0;
2354             for( ;j<w;j++)
2355             {
2356                 int temp = pBorder[j]-pBody[j];
2357                 temp = temp<0 ? 0 : temp;
2358                 dst[j] = (temp * color_alpha)>>6;
2359             }
2360             pBody += mOverlayPitch;
2361             pBorder += mOverlayPitch;
2362             //pAlphaMask += pitch;
2363             dst += mOverlayPitch;
2364         }
2365     }
2366     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL)
2367     {
2368         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2369         while(h--)
2370         {
2371             int j=0;
2372             for( ; j<w; j++ )
2373             {
2374                 dst[j] = (src1[j] * color_alpha)>>6;
2375             }
2376             src1 += mOverlayPitch;
2377             //pAlphaMask += pitch;
2378             dst += mOverlayPitch;
2379         }
2380     }
2381     else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask!=NULL)
2382     {
2383         const BYTE* src1 = pBody!=NULL ? pBody : pBorder;
2384         while(h--)
2385         {
2386             int j=0;
2387             for( ; j<w; j++ )
2388             {
2389                 dst[j] = (src1[j] * pAlphaMask[j] * color_alpha)>>12;
2390             }
2391             src1 += mOverlayPitch;
2392             pAlphaMask += pitch;
2393             dst += mOverlayPitch;
2394         }
2395     }
2396     else if( pAlphaMask!=NULL && pBody!=NULL && pBorder!=NULL )
2397     {
2398         while(h--)
2399         {
2400             int j=0;
2401             for( ; j<w; j++ )
2402             {
2403                 int temp = pBorder[j]-pBody[j];
2404                 temp = temp<0 ? 0 : temp;
2405                 dst[j] = (temp * pAlphaMask[j] * color_alpha)>>12;
2406             }
2407             pBody += mOverlayPitch;
2408             pBorder += mOverlayPitch;
2409             pAlphaMask += pitch;
2410             dst += mOverlayPitch;
2411         }
2412     }
2413     else
2414     {
2415         //should NOT happen!
2416         ASSERT(0);
2417         while(h--)
2418         {
2419             for(int j=0;j<w;j++)
2420             {
2421                 dst[j] = 0;
2422             }
2423             dst += mOverlayPitch;
2424         }
2425     }
2426 }
2427
2428 void Overlay::FillAlphaMash( byte* outputAlphaMask, bool fBody, bool fBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha)
2429 {
2430     if(!fBorder && fBody && pAlphaMask==NULL)
2431     {
2432         _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
2433     }
2434     else if(/*fBorder &&*/ fBody && pAlphaMask==NULL)
2435     {
2436         _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2437     }
2438     else if(!fBody && fBorder /* pAlphaMask==NULL or not*/)
2439     {
2440         _DoFillAlphaMash(outputAlphaMask, mBody.get(), mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2441     }
2442     else if(!fBorder && fBody && pAlphaMask!=NULL)
2443     {
2444         _DoFillAlphaMash(outputAlphaMask, mBody.get(), NULL, x, y, w, h, pAlphaMask, pitch, color_alpha);
2445     }
2446     else if(fBorder && fBody && pAlphaMask!=NULL)
2447     {
2448         _DoFillAlphaMash(outputAlphaMask, NULL, mBorder.get(), x, y, w, h, pAlphaMask, pitch, color_alpha);
2449     }
2450     else
2451     {
2452         //should NOT happen
2453         ASSERT(0);
2454     }
2455 }
2456
2457 Overlay* Overlay::GetSubpixelVariance(unsigned int xshift, unsigned int yshift)
2458 {
2459     Overlay* overlay = new Overlay();
2460     if(!overlay)
2461     {
2462         return NULL;
2463     }
2464     xshift &= 7;
2465     yshift &= 7;
2466
2467     overlay->mOffsetX = mOffsetX - xshift;
2468     overlay->mOffsetY = mOffsetY - yshift;
2469     overlay->mWidth = mWidth + xshift;
2470     overlay->mHeight = mHeight + yshift;
2471
2472     overlay->mOverlayWidth = ((overlay->mWidth+7)>>3) + 1;
2473     overlay->mOverlayHeight = ((overlay->mHeight + 7)>>3) + 1;
2474     overlay->mOverlayPitch = (overlay->mOverlayWidth+15)&~15;
2475
2476
2477     overlay->mfWideOutlineEmpty = mfWideOutlineEmpty;
2478
2479     if (overlay->mOverlayPitch * overlay->mOverlayHeight<=0)
2480     {
2481         return NULL;
2482     }
2483
2484     BYTE* body = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
2485     if( body==NULL )
2486     {
2487         return NULL;
2488     }
2489     overlay->mBody.reset(body, xy_free);
2490     memset(body, 0, overlay->mOverlayPitch*overlay->mOverlayHeight);
2491     BYTE* border = NULL;
2492     if (!overlay->mfWideOutlineEmpty)
2493     {
2494         border = reinterpret_cast<BYTE*>(xy_malloc(overlay->mOverlayPitch * overlay->mOverlayHeight));
2495         if (border==NULL)
2496         {
2497             return NULL;
2498         }
2499         overlay->mBorder.reset(border, xy_free);
2500         memset(border, 0, overlay->mOverlayPitch*overlay->mOverlayHeight);
2501     }
2502
2503     if( overlay->mOverlayPitch==mOverlayPitch && overlay->mOverlayWidth==mOverlayWidth &&
2504         overlay->mOverlayHeight>=mOverlayHeight )
2505     {
2506         if (body && mBody)
2507         {
2508             memcpy(body, mBody.get(), mOverlayPitch * mOverlayHeight);
2509         }
2510         else if ( (!!body)!=(!!mBody)/*==NULL*/)
2511         {
2512             return NULL;
2513         }
2514
2515         if (border && mBorder)
2516         {
2517             memcpy(border, mBorder.get(), mOverlayPitch * mOverlayHeight);
2518         }
2519         else if ( (!!border)!=(!!mBorder)/*==NULL*/ )
2520         {
2521             return NULL;
2522         }
2523     }
2524     else
2525     {
2526         byte* dst = body;
2527         const byte* src = mBody.get();
2528         for (int i=0;i<mOverlayHeight;i++)
2529         {
2530             memcpy(dst, src, mOverlayWidth);
2531             dst += overlay->mOverlayPitch;
2532             src += mOverlayPitch;
2533         }
2534         if (!overlay->mfWideOutlineEmpty)
2535         {
2536             ASSERT(border && mBorder);
2537             dst = border;
2538             src = mBorder.get();
2539             for (int i=0;i<mOverlayHeight;i++)
2540             {
2541                 memcpy(dst, src, mOverlayWidth);
2542                 dst += overlay->mOverlayPitch;
2543                 src += mOverlayPitch;
2544             }
2545         }
2546     }
2547     //not equal
2548     //  Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2549     Bilinear(body, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2550     if (!overlay->mfWideOutlineEmpty)
2551     {
2552         Bilinear(border, overlay->mOverlayWidth, overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2553     }
2554     return overlay;
2555 }
2556
2557 ///////////////////////////////////////////////////////////////
2558
2559 // PathData
2560
2561 PathData::PathData():mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(0)
2562 {
2563 }
2564
2565 PathData::PathData( const PathData& src ):mpPathTypes(NULL), mpPathPoints(NULL), mPathPoints(src.mPathPoints)
2566 {
2567     //TODO: deal with the case that src.mPathPoints<0
2568     if(mPathPoints>0)
2569     {
2570         mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2571         mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));
2572     }
2573     if(mPathPoints>0)
2574     {
2575         memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2576         memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2577     }
2578 }
2579
2580 const PathData& PathData::operator=( const PathData& src )
2581 {
2582     if(this!=&src)
2583     {
2584         if(mPathPoints!=src.mPathPoints && src.mPathPoints>0)
2585         {
2586             _TrashPath();
2587             mPathPoints = src.mPathPoints;
2588             mpPathTypes = static_cast<BYTE*>(malloc(mPathPoints * sizeof(BYTE)));
2589             mpPathPoints = static_cast<POINT*>(malloc(mPathPoints * sizeof(POINT)));//better than realloc
2590         }
2591         if(src.mPathPoints>0)
2592         {
2593             memcpy(mpPathTypes, src.mpPathTypes, mPathPoints*sizeof(BYTE));
2594             memcpy(mpPathPoints, src.mpPathPoints, mPathPoints*sizeof(POINT));
2595         }
2596     }
2597     return *this;
2598 }
2599
2600 PathData::~PathData()
2601 {
2602     _TrashPath();
2603 }
2604
2605 bool PathData::operator==( const PathData& rhs ) const
2606 {
2607     return (this==&rhs) || (
2608         mPathPoints==rhs.mPathPoints
2609         && !memcmp(mpPathTypes, rhs.mpPathTypes, mPathPoints * sizeof(BYTE) )
2610         && !memcmp(mpPathPoints, rhs.mpPathPoints, mPathPoints * sizeof(POINT) )
2611         );
2612 }
2613
2614 void PathData::_TrashPath()
2615 {
2616     if (mpPathTypes)
2617     {
2618         free(mpPathTypes);
2619         mpPathTypes = NULL;
2620     }
2621     if (mpPathPoints)
2622     {
2623         free(mpPathPoints);
2624         mpPathPoints = NULL;
2625     }
2626     mPathPoints = 0;
2627 }
2628
2629 bool PathData::BeginPath(HDC hdc)
2630 {
2631     _TrashPath();
2632     return !!::BeginPath(hdc);
2633 }
2634
2635 bool PathData::EndPath(HDC hdc)
2636 {
2637     ::CloseFigure(hdc);
2638     if(::EndPath(hdc))
2639     {
2640         mPathPoints = GetPath(hdc, NULL, NULL, 0);
2641         if(!mPathPoints)
2642             return true;
2643         mpPathTypes = (BYTE*)malloc(sizeof(BYTE) * mPathPoints);
2644         mpPathPoints = (POINT*)malloc(sizeof(POINT) * mPathPoints);
2645         if(mPathPoints == GetPath(hdc, mpPathPoints, mpPathTypes, mPathPoints))
2646             return true;
2647     }
2648     ::AbortPath(hdc);
2649     return false;
2650 }
2651
2652 bool PathData::PartialBeginPath(HDC hdc, bool bClearPath)
2653 {
2654     if(bClearPath)
2655         _TrashPath();
2656     return !!::BeginPath(hdc);
2657 }
2658
2659 bool PathData::PartialEndPath(HDC hdc, long dx, long dy)
2660 {
2661     ::CloseFigure(hdc);
2662     if(::EndPath(hdc))
2663     {
2664         int nPoints;
2665         BYTE* pNewTypes;
2666         POINT* pNewPoints;
2667         nPoints = GetPath(hdc, NULL, NULL, 0);
2668         if(!nPoints)
2669             return true;
2670         pNewTypes = (BYTE*)realloc(mpPathTypes, (mPathPoints + nPoints) * sizeof(BYTE));
2671         pNewPoints = (POINT*)realloc(mpPathPoints, (mPathPoints + nPoints) * sizeof(POINT));
2672         if(pNewTypes)
2673             mpPathTypes = pNewTypes;
2674         if(pNewPoints)
2675             mpPathPoints = pNewPoints;
2676         BYTE* pTypes = new BYTE[nPoints];
2677         POINT* pPoints = new POINT[nPoints];
2678         if(pNewTypes && pNewPoints && nPoints == GetPath(hdc, pPoints, pTypes, nPoints))
2679         {
2680             for(int i = 0; i < nPoints; ++i)
2681             {
2682                 mpPathPoints[mPathPoints + i].x = pPoints[i].x + dx;
2683                 mpPathPoints[mPathPoints + i].y = pPoints[i].y + dy;
2684                 mpPathTypes[mPathPoints + i] = pTypes[i];
2685             }
2686             mPathPoints += nPoints;
2687             delete[] pTypes;
2688             delete[] pPoints;
2689             return true;
2690         }
2691         else
2692             DebugBreak();
2693         delete[] pTypes;
2694         delete[] pPoints;
2695     }
2696     ::AbortPath(hdc);
2697     return false;
2698 }
2699
2700 void PathData::AlignLeftTop(CPoint *left_top, CSize *size)
2701 {
2702     int minx = INT_MAX;
2703     int miny = INT_MAX;
2704     int maxx = INT_MIN;
2705     int maxy = INT_MIN;
2706     for(int i=0; i<mPathPoints; ++i)
2707     {
2708         int ix = mpPathPoints[i].x;
2709         int iy = mpPathPoints[i].y;
2710         if(ix < minx) minx = ix;
2711         if(ix > maxx) maxx = ix;
2712         if(iy < miny) miny = iy;
2713         if(iy > maxy) maxy = iy;
2714     }
2715     if(minx > maxx || miny > maxy)
2716     {
2717         _TrashPath();
2718         *left_top = CPoint(0, 0);
2719         *size = CSize(0, 0);
2720         return;
2721     }
2722     minx = (minx >> 3) & ~7;
2723     miny = (miny >> 3) & ~7;
2724     maxx = (maxx + 7) >> 3;
2725     maxy = (maxy + 7) >> 3;
2726     for(int i=0; i<mPathPoints; ++i)
2727     {
2728         mpPathPoints[i].x -= minx*8;
2729         mpPathPoints[i].y -= miny*8;
2730     }
2731     *left_top = CPoint(minx, miny);
2732     *size = CSize(maxx+1-minx, maxy+1-miny);
2733     return;
2734 }
2735
2736 //////////////////////////////////////////////////////////////////////////
2737
2738 // ScanLineData
2739
2740 ScanLineData::ScanLineData()
2741 {
2742 }
2743
2744 ScanLineData::~ScanLineData()
2745 {
2746 }
2747
2748 void ScanLineData::_ReallocEdgeBuffer(int edges)
2749 {
2750     mEdgeHeapSize = edges;
2751     mpEdgeBuffer = (Edge*)realloc(mpEdgeBuffer, sizeof(Edge)*edges);
2752 }
2753
2754 void ScanLineData::_EvaluateBezier(const PathData& path_data, int ptbase, bool fBSpline)
2755 {
2756     const POINT* pt0 = path_data.mpPathPoints + ptbase;
2757     const POINT* pt1 = path_data.mpPathPoints + ptbase + 1;
2758     const POINT* pt2 = path_data.mpPathPoints + ptbase + 2;
2759     const POINT* pt3 = path_data.mpPathPoints + ptbase + 3;
2760     double x0 = pt0->x;
2761     double x1 = pt1->x;
2762     double x2 = pt2->x;
2763     double x3 = pt3->x;
2764     double y0 = pt0->y;
2765     double y1 = pt1->y;
2766     double y2 = pt2->y;
2767     double y3 = pt3->y;
2768     double cx3, cx2, cx1, cx0, cy3, cy2, cy1, cy0;
2769     if(fBSpline)
2770     {
2771         // 1   [-1 +3 -3 +1]
2772         // - * [+3 -6 +3  0]
2773         // 6   [-3  0 +3  0]
2774         //         [+1 +4 +1  0]
2775         double _1div6 = 1.0/6.0;
2776         cx3 = _1div6*(-  x0+3*x1-3*x2+x3);
2777         cx2 = _1div6*( 3*x0-6*x1+3*x2);
2778         cx1 = _1div6*(-3*x0        +3*x2);
2779         cx0 = _1div6*(   x0+4*x1+1*x2);
2780         cy3 = _1div6*(-  y0+3*y1-3*y2+y3);
2781         cy2 = _1div6*( 3*y0-6*y1+3*y2);
2782         cy1 = _1div6*(-3*y0     +3*y2);
2783         cy0 = _1div6*(   y0+4*y1+1*y2);
2784     }
2785     else // bezier
2786     {
2787         // [-1 +3 -3 +1]
2788         // [+3 -6 +3  0]
2789         // [-3 +3  0  0]
2790         // [+1  0  0  0]
2791         cx3 = -  x0+3*x1-3*x2+x3;
2792         cx2 =  3*x0-6*x1+3*x2;
2793         cx1 = -3*x0+3*x1;
2794         cx0 =    x0;
2795         cy3 = -  y0+3*y1-3*y2+y3;
2796         cy2 =  3*y0-6*y1+3*y2;
2797         cy1 = -3*y0+3*y1;
2798         cy0 =    y0;
2799     }
2800     //
2801     // This equation is from Graphics Gems I.
2802     //
2803     // The idea is that since we're approximating a cubic curve with lines,
2804     // any error we incur is due to the curvature of the line, which we can
2805     // estimate by calculating the maximum acceleration of the curve.  For
2806     // a cubic, the acceleration (second derivative) is a line, meaning that
2807     // the absolute maximum acceleration must occur at either the beginning
2808     // (|c2|) or the end (|c2+c3|).  Our bounds here are a little more
2809     // conservative than that, but that's okay.
2810     //
2811     // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2812     // that component of the curve is linear and does not incur any error.
2813     // If a=0 for both X and Y, the curve is a line segment and we can
2814     // use a step size of 1.
2815     double maxaccel1 = fabs(2*cy2) + fabs(6*cy3);
2816     double maxaccel2 = fabs(2*cx2) + fabs(6*cx3);
2817     double maxaccel = maxaccel1 > maxaccel2 ? maxaccel1 : maxaccel2;
2818     double h = 1.0;
2819     if(maxaccel > 8.0) h = sqrt(8.0 / maxaccel);
2820     if(!fFirstSet) {firstp.x = (LONG)cx0; firstp.y = (LONG)cy0; lastp = firstp; fFirstSet = true;}
2821     for(double t = 0; t < 1.0; t += h)
2822     {
2823         double x = cx0 + t*(cx1 + t*(cx2 + t*cx3));
2824         double y = cy0 + t*(cy1 + t*(cy2 + t*cy3));
2825         _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2826     }
2827     double x = cx0 + cx1 + cx2 + cx3;
2828     double y = cy0 + cy1 + cy2 + cy3;
2829     _EvaluateLine(lastp.x, lastp.y, (int)x, (int)y);
2830 }
2831
2832 void ScanLineData::_EvaluateLine(const PathData& path_data, int pt1idx, int pt2idx)
2833 {
2834     const POINT* pt1 = path_data.mpPathPoints + pt1idx;
2835     const POINT* pt2 = path_data.mpPathPoints + pt2idx;
2836     _EvaluateLine(pt1->x, pt1->y, pt2->x, pt2->y);
2837 }
2838
2839 void ScanLineData::_EvaluateLine(int x0, int y0, int x1, int y1)
2840 {
2841     if(lastp.x != x0 || lastp.y != y0)
2842     {
2843         _EvaluateLine(lastp.x, lastp.y, x0, y0);
2844     }
2845     if(!fFirstSet) {firstp.x = x0; firstp.y = y0; fFirstSet = true;}
2846     lastp.x = x1;
2847     lastp.y = y1;
2848     if(y1 > y0) // down
2849     {
2850         __int64 xacc = (__int64)x0 << 13;
2851         // prestep y0 down
2852         int dy = y1 - y0;
2853         int y = ((y0 + 3)&~7) + 4;
2854         int iy = y >> 3;
2855         y1 = (y1 - 5) >> 3;
2856         if(iy <= y1)
2857         {
2858             __int64 invslope = (__int64(x1 - x0) << 16) / dy;
2859             while(mEdgeNext + y1 + 1 - iy > mEdgeHeapSize)
2860                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2861             xacc += (invslope * (y - y0)) >> 3;
2862             while(iy <= y1)
2863             {
2864                 int ix = (int)((xacc + 32768) >> 16);
2865                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2866                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2 + 1;
2867                 mpScanBuffer[iy] = mEdgeNext++;
2868                 ++iy;
2869                 xacc += invslope;
2870             }
2871         }
2872     }
2873     else if(y1 < y0) // up
2874     {
2875         __int64 xacc = (__int64)x1 << 13;
2876         // prestep y1 down
2877         int dy = y0 - y1;
2878         int y = ((y1 + 3)&~7) + 4;
2879         int iy = y >> 3;
2880         y0 = (y0 - 5) >> 3;
2881         if(iy <= y0)
2882         {
2883             __int64 invslope = (__int64(x0 - x1) << 16) / dy;
2884             while(mEdgeNext + y0 + 1 - iy > mEdgeHeapSize)
2885                 _ReallocEdgeBuffer(mEdgeHeapSize*2);
2886             xacc += (invslope * (y - y1)) >> 3;
2887             while(iy <= y0)
2888             {
2889                 int ix = (int)((xacc + 32768) >> 16);
2890                 mpEdgeBuffer[mEdgeNext].next = mpScanBuffer[iy];
2891                 mpEdgeBuffer[mEdgeNext].posandflag = ix*2;
2892                 mpScanBuffer[iy] = mEdgeNext++;
2893                 ++iy;
2894                 xacc += invslope;
2895             }
2896         }
2897     }
2898 }
2899
2900 bool ScanLineData::ScanConvert(const PathData& path_data, const CSize& size)
2901 {
2902     int lastmoveto = -1;
2903     int i;
2904     // Drop any outlines we may have.
2905     mOutline.clear();
2906     // Determine bounding box
2907     if(!path_data.mPathPoints)
2908     {
2909         mWidth = mHeight = 0;
2910         return false;
2911     }
2912     mWidth = size.cx;
2913     mHeight = size.cy;
2914     // Initialize edge buffer.  We use edge 0 as a sentinel.
2915     mEdgeNext = 1;
2916     mEdgeHeapSize = 2048;
2917     mpEdgeBuffer = (Edge*)malloc(sizeof(Edge)*mEdgeHeapSize);
2918     // Initialize scanline list.
2919     mpScanBuffer = new unsigned int[mHeight];
2920     memset(mpScanBuffer, 0, mHeight*sizeof(unsigned int));
2921     // Scan convert the outline.  Yuck, Bezier curves....
2922     // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2923     // paths with all but the first figure left open, so we can't rely
2924     // on the PT_CLOSEFIGURE flag being used appropriately.
2925     fFirstSet = false;
2926     firstp.x = firstp.y = 0;
2927     lastp.x = lastp.y = 0;
2928     for(i=0; i<path_data.mPathPoints; ++i)
2929     {
2930         BYTE t = path_data.mpPathTypes[i] & ~PT_CLOSEFIGURE;
2931         switch(t)
2932         {
2933         case PT_MOVETO:
2934             if(lastmoveto >= 0 && firstp != lastp)
2935                 _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2936             lastmoveto = i;
2937             fFirstSet = false;
2938             lastp = path_data.mpPathPoints[i];
2939             break;
2940         case PT_MOVETONC:
2941             break;
2942         case PT_LINETO:
2943             if(path_data.mPathPoints - (i-1) >= 2) _EvaluateLine(path_data, i-1, i);
2944             break;
2945         case PT_BEZIERTO:
2946             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, false);
2947             i += 2;
2948             break;
2949         case PT_BSPLINETO:
2950             if(path_data.mPathPoints - (i-1) >= 4) _EvaluateBezier(path_data, i-1, true);
2951             i += 2;
2952             break;
2953         case PT_BSPLINEPATCHTO:
2954             if(path_data.mPathPoints - (i-3) >= 4) _EvaluateBezier(path_data, i-3, true);
2955             break;
2956         }
2957     }
2958     if(lastmoveto >= 0 && firstp != lastp)
2959         _EvaluateLine(lastp.x, lastp.y, firstp.x, firstp.y);
2960     // Convert the edges to spans.  We couldn't do this before because some of
2961     // the regions may have winding numbers >+1 and it would have been a pain
2962     // to try to adjust the spans on the fly.  We use one heap to detangle
2963     // a scanline's worth of edges from the singly-linked lists, and another
2964     // to collect the actual scans.
2965     std::vector<int> heap;
2966     mOutline.reserve(mEdgeNext / 2);
2967     __int64 y = 0;
2968     for(y=0; y<mHeight; ++y)
2969     {
2970         int count = 0;
2971         // Detangle scanline into edge heap.
2972         for(unsigned ptr = (unsigned)(mpScanBuffer[y]&0xffffffff); ptr; ptr = mpEdgeBuffer[ptr].next)
2973         {
2974             heap.push_back(mpEdgeBuffer[ptr].posandflag);
2975         }
2976         // Sort edge heap.  Note that we conveniently made the opening edges
2977         // one more than closing edges at the same spot, so we won't have any
2978         // problems with abutting spans.
2979         std::sort(heap.begin(), heap.end()/*begin() + heap.size()*/);
2980         // Process edges and add spans.  Since we only check for a non-zero
2981         // winding number, it doesn't matter which way the outlines go!
2982         std::vector<int>::iterator itX1 = heap.begin();
2983         std::vector<int>::iterator itX2 = heap.end(); // begin() + heap.size();
2984         int x1, x2;
2985         for(; itX1 != itX2; ++itX1)
2986         {
2987             int x = *itX1;
2988             if(!count)
2989                 x1 = (x>>1);
2990             if(x&1)
2991                 ++count;
2992             else
2993                 --count;
2994             if(!count)
2995             {
2996                 x2 = (x>>1);
2997                 if(x2>x1)
2998                     mOutline.push_back(std::pair<__int64,__int64>((y<<32)+x1+0x4000000040000000i64, (y<<32)+x2+0x4000000040000000i64)); // G: damn Avery, this is evil! :)
2999             }
3000         }
3001         heap.clear();
3002     }
3003     // Dump the edge and scan buffers, since we no longer need them.
3004     free(mpEdgeBuffer);
3005     delete [] mpScanBuffer;
3006     // All done!
3007     return true;
3008 }
3009
3010 void ScanLineData::DeleteOutlines()
3011 {
3012     mOutline.clear();
3013 }
3014
3015 bool ScanLineData2::CreateWidenedRegion(int rx, int ry)
3016 {
3017     if(rx < 0) rx = 0;
3018     if(ry < 0) ry = 0;
3019     mWideBorder = max(rx,ry);
3020     mWideOutline.clear();
3021
3022     const tSpanBuffer& out_line = m_scan_line_data->mOutline;
3023     if (ry > 0)
3024     {
3025         WidenRegionCreater *widen_region_creater = WidenRegionCreater::GetDefaultWidenRegionCreater();
3026         widen_region_creater->xy_overlap_region(&mWideOutline, out_line, rx, ry);
3027     }
3028     else if (ry == 0 && rx > 0)
3029     {
3030         // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
3031         OverlapRegion(mWideOutline, out_line, rx, 0);
3032         OverlapRegion(mWideOutline, out_line, rx, 0);
3033     }
3034     return true;
3035 }