[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
1 #include "stdafx.h"
2 #include "../dsutil/vd.h"
4 typedef const UINT8 CUINT8, *PCUINT8;
5 typedef const UINT CUINT, *PCUINT;
7 //
8 // ref: "Comparing floating point numbers" by Bruce Dawson
9 // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
11 bool AlmostEqual(float A, float B, int maxUlps=0)
13 // Make sure maxUlps is non-negative and small enough that the
14 // default NAN won't compare as equal to anything.
15 ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
16 int aInt = *(int*)&A;
17 // Make aInt lexicographically ordered as a twos-complement int
18 if (aInt < 0)
19 aInt = 0x80000000 - aInt;
20 // Make bInt lexicographically ordered as a twos-complement int
21 int bInt = *(int*)&B;
22 if (bInt < 0)
23 bInt = 0x80000000 - bInt;
25 int intDiff = abs(aInt - bInt);
26 if (intDiff <= maxUlps)
27 return true;
29 return false;
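// Illustrative example (not part of the original source): with maxUlps==0 only
// bit-identical floats compare equal, e.g. AlmostEqual(1.0f, 1.0f) is true while
// AlmostEqual(1.0f, 1.0f + FLT_EPSILON) is false (they differ by exactly 1 ULP);
// raising maxUlps tolerates that many representable floats of difference.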
32 /****
33 * See @xy_filter_c
34 **/
35 __forceinline void xy_filter_one_line_c(float *dst, int width, const float *filter, int filter_width)
37 const float *filter_start = filter;
38 int xx_fix = width > filter_width ? 0 : filter_width - width;
39 float *dst2 = dst - filter_width;
40 float *dst_endr = dst + width;
41 float *dst_end0 = dst_endr - filter_width;
42 float *dst_endl = dst - xx_fix;
43 ASSERT(xx_fix==0 || dst_end0==dst_endl);
45 ASSERT(filter_start == filter);
46 filter_start += filter_width;
47 const float *filter_end = filter_start;
48 for (;dst2<dst_endl;dst2++, filter_start--)//left margin
50 const float *src = dst;
51 float sum = 0;
52 for(const float* f=filter_start;f<filter_end;f++, src++)
54 sum += src[0] * f[0];
56 *dst2 = sum;
58 for (;dst2<dst;dst2++, filter_start--, filter_end--)//if width < filter_width
60 const float *src = dst;
61 float sum = 0;
62 for(const float* f=filter_start;f<filter_end;f++, src++)
64 sum += src[0] * f[0];
66 *dst2 = sum;
68 ASSERT(filter_start==filter);
69 for (;dst2<dst_end0;dst2++)
71 const float *src = dst2;
73 float sum = 0;
74 for(const float* f=filter_start;f<filter_end;f++, src++)
76 sum += src[0] * f[0];
78 *dst2 = sum;
80 for (;dst2<dst_endr;dst2++, filter_end--)//right margin
82 const float *src = dst2;
83 float sum = 0;
84 for(const float* f=filter;f<filter_end;f++, src++)
86 sum += src[0] * f[0];
88 *dst2 = sum;
92 /****
93 * dst memory layout:
94 * 1. Source content starts from @dst;
95 * 2. Output content starts from @dst-@mwidth;
97 * |- stride -|
98 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
99 * |-                       -|-                             -|
100 * |- margin                -|- src width*height items     -|
101 * |- mwidth*height items   -|-                             -|
102 * |- do NOT need to init   -|-                             -|
103 * |- when input            -|-                             -|
104 * |---------------------------------------------------------|
106 void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
108 ASSERT( stride>=4*(width+filter_width) );
109 BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
110 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
111 for( ; dst_byte<end; dst_byte+=stride )
113 xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
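/****
 * Usage sketch (illustrative only, not part of the original source; buffer and
 * variable names are made up):
 *   const int width = 8, height = 1, filter_width = 4;
 *   const int stride = sizeof(float) * (width + filter_width); // bytes per row
 *   float buff[width + filter_width];   // one row including the left margin
 *   float *data = buff + filter_width;  // source samples live at data[0..width)
 *   float filter[filter_width] = {0.25f, 0.5f, 0.25f, 0.0f};
 *   // ... fill data[0..width) with input ...
 *   xy_filter_c(data, width, height, stride, filter, filter_width);
 *   // the filtered row now occupies buff[0 .. width+filter_width)
 * The SSE variants below additionally require 16-byte alignment of the buffers.
 **/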
117 /****
118 * A macro is used here because the equivalent inline function sometimes generates poor code
120 * @src4, @src_5_8, @f4, @sum : __m128
121 * @src_5_8, @f4: const
122 * @sum : output
123 * @src4: undefined
124 **/
125 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
126 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
127 f4_1 = _mm_mul_ps(f4_1, src4);\
128 sum = _mm_add_ps(sum, f4_1);\
129 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
130 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
131 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
132 sum = _mm_add_ps(sum, f4_1);\
133 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
134 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
135 f4_1 = _mm_mul_ps(f4_1, src4);\
136 sum = _mm_add_ps(sum, f4_1);\
137 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
138 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
139 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
140 sum = _mm_add_ps(sum, f4_1)
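// Scalar equivalent (illustrative, not part of the original source): with
// src4 = {s0,s1,s2,s3}, src_5_8 = {s4,s5,s6,s7} and f4 = {f0,f1,f2,f3},
// XY_FILTER_4 adds to each lane i of @sum the value
//     s[i]*f0 + s[i+1]*f1 + s[i+2]*f2 + s[i+3]*f3      (i = 0..3)
// i.e. one 4-tap convolution step for 4 neighbouring output samples.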
142 /****
143 * @src4, @f4_1, @sum : __m128
144 * @f4_1: const
145 * @sum : output
146 * @src4: undefined
147 **/
148 #define XY_FILTER_4_1(src4, f4_1, sum) \
149 src4 = _mm_mul_ps(src4, f4_1);\
150 sum = _mm_add_ps(sum, src4);
152 __forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
154 int xx_fix = width > filter_width ? 0 : filter_width - width;
155 const float *filter_start = filter;
156 float *dst2 = dst - filter_width;
157 float *dst_endr = dst + width;
158 float *dst_end0 = dst_endr - filter_width - 4;
159 float *dst_endl = dst - xx_fix;
160 ASSERT(xx_fix==0 || dst_end0==dst_endl-4);
162 ASSERT(filter_start == filter);
163 filter_start += filter_width;
164 const float *filter_end = filter_start;
166 for (;dst2<dst_endl;dst2+=4)//left margin
168 const float *src = dst;
169 filter_start -= 4;
171 //filter 4
172 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
173 __m128 sum = _mm_setzero_ps();
174 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
176 __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
177 __m128 f4 = _mm_load_ps(f);
179 { XY_FILTER_4(src4, src_5_8, f4, sum); }
181 src4 = src_5_8;
183 //store result
184 _mm_store_ps(dst2, sum);
186 for (;dst2<dst;dst2+=4)//if width < filter_width
188 const float *src = dst;
189 filter_start-=4;
190 filter_end-=4;
192 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
193 __m128 sum = _mm_setzero_ps();
194 __m128 src_5_8, f4;
195 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
197 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
198 f4 = _mm_load_ps(f);
200 { XY_FILTER_4(src4, src_5_8, f4, sum); }
201 src4 = src_5_8;
203 src_5_8 = _mm_setzero_ps();
204 f4 = _mm_load_ps(filter_end);
205 { XY_FILTER_4(src4, src_5_8, f4, sum); }
206 //store result
207 _mm_store_ps(dst2, sum);
209 ASSERT(filter_start == filter);
210 for (;dst2<dst_end0;dst2+=8)
212 const float *src = dst2;
213 const float* f=filter_start;
215 //filter 8
216 __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
217 src+=4;
218 __m128 src_5_8;
219 __m128 sum = _mm_setzero_ps();
220 __m128 sum2 = _mm_setzero_ps();
221 __m128 f4 = _mm_load_ps(f);
222 f+=4;
223 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
224 src+=4;
225 { XY_FILTER_4(src4, src_5_8, f4, sum); }
226 for(;f<filter_end;f+=4,src+=4)
228 src4 = _mm_load_ps(src);/*1 2 3 4*/
229 __m128 tmp = src_5_8;//important!
230 { XY_FILTER_4(tmp, src4, f4, sum2); }
232 f4 = _mm_load_ps(f);
233 { XY_FILTER_4(src_5_8, src4, f4, sum); }
234 src_5_8 = src4;
236 src4 = _mm_load_ps(src);/*1 2 3 4*/
237 { XY_FILTER_4(src_5_8, src4, f4, sum2); }
239 //store result
240 _mm_store_ps(dst2, sum);
241 _mm_store_ps(dst2+4, sum2);
243 if (dst2==dst_end0)
245 const float *src = dst2;
246 //filter 4
247 __m128 src4 = _mm_load_ps(src);//1 2 3 4
248 src+=4;
249 __m128 sum = _mm_setzero_ps();
250 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
252 __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
253 __m128 f4 = _mm_load_ps(f);
255 { XY_FILTER_4(src4, src_5_8, f4, sum); }
256 src4 = src_5_8;
258 //store result
259 _mm_store_ps(dst2, sum);
260 dst2+=4;
262 for (;dst2<dst_endr;dst2+=4)//right margin
264 const float *src = dst2;
265 filter_end-=4;
267 //filter 4
268 __m128 src4 = _mm_load_ps(src);//1 2 3 4
269 __m128 sum = _mm_setzero_ps();
270 __m128 src_5_8, f4;
271 for(const float* f=filter_start;f<filter_end;f+=4)
273 src+=4;
274 src_5_8 = _mm_load_ps(src);//5 6 7 8
275 f4 = _mm_load_ps(f);
277 { XY_FILTER_4(src4, src_5_8, f4, sum); }
279 src4 = src_5_8;
280 //move new 4 in_n_out to old 4 in_n_out
282 src_5_8 = _mm_setzero_ps();
283 f4 = _mm_load_ps(filter_end);
284 { XY_FILTER_4(src4, src_5_8, f4, sum); }
285 //store result
286 _mm_store_ps(dst2, sum);
290 /****
291 * See @xy_filter_c
293 void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
295 ASSERT( stride>=4*(width+filter_width) );
296 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );
298 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
299 BYTE* end = dst_byte + height*stride;
300 for( ; dst_byte<end; dst_byte+=stride )
302 xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
306 /****
307 * Constraint:
308 * LENGTH%4 == 0 || LENGTH%4 == 1
310 template<int LENGTH>
311 struct M128s
313 __m128 x;
314 M128s<LENGTH - 4> next;
316 template<int Index> __forceinline __m128& GetAt()
318 return next.GetAt<Index - 4>();
320 template<> __forceinline __m128& GetAt<0>()
322 return x;
325 template<int Start, int Offset> __forceinline __m128& GetAt()
327 return GetAt<Start + Offset>();
330 __forceinline void Load(const float* src)
332 x = _mm_load_ps(src);
333 next.Load(src+4);
337 template<>
338 struct M128s<1>
340 __m128 x;
342 template<int Index> __forceinline __m128& GetAt()
344 return x;
346 __forceinline void Load(const float* src)
348 x = _mm_set1_ps(*src);
352 template<>
353 struct M128s<0>
355 void Load(const float* src)
360 template<int FILTER_LENGTH, int START, int LENGTH>
361 struct Filter4
363 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
365 __m128 src4_128 = _mm_load_ps(src4);
366 { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
367 Filter4<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
368 src0_128 = src4_128;
372 template<int FILTER_LENGTH, int START>
373 struct Filter4<FILTER_LENGTH, START, 1>
375 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
377 cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
379 template<int TAIL>
380 static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum);
381 template<>
382 static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
384 { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
388 template<int FILTER_LENGTH, int START>
389 struct Filter4<FILTER_LENGTH, START, 0>
391 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
396 template<int FILTER_LENGTH,int MARGIN_LENGTH>
397 struct FilterAllLeftMargin
399 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
401 do_cal<FILTER_LENGTH%4>(src, filter128s);
403 template<int FILTER_TAIL>
404 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
406 //filter 4
407 __m128 src0 = _mm_setzero_ps();
408 __m128 sum = _mm_setzero_ps();
409 Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
410 _mm_store_ps(src-MARGIN_LENGTH, sum);
411 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
413 template<>
414 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
416 //filter 4
417 __m128 sum = _mm_setzero_ps();
418 //Only one of the last 4 filter coefficients is non-zero
419 _mm_store_ps(src-MARGIN_LENGTH, sum);
420 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
424 template<int FILTER_LENGTH, int MARGIN_LENGTH>
425 struct FilterAllRightMargin
427 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
429 do_cal<FILTER_LENGTH%4>(src, filter128s);
431 template<int FILTER_TAIL>
432 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
434 //filter 4
436 __m128 src0 = _mm_load_ps(src);
437 __m128 sum = _mm_setzero_ps();
438 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
439 __m128 src4 = _mm_setzero_ps();
440 { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
441 //store result
442 _mm_store_ps(src, sum);
444 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
446 template<>
447 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
449 //filter 4
451 __m128 src0 = _mm_load_ps(src);
452 __m128 sum = _mm_setzero_ps();
453 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
454 //Only one of the last 4 filter coefficients is non-zero
455 { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
456 //store result
457 _mm_store_ps(src, sum);
459 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
463 template<int FILTER_LENGTH>
464 struct FilterAllLeftMargin<FILTER_LENGTH,0>
466 template<int FILTER_TAIL>
467 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
472 template<int FILTER_LENGTH>
473 struct FilterAllRightMargin<FILTER_LENGTH,0>
475 template<int FILTER_TAIL>
476 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
481 /****
482 * Equivalent:
483 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
484 * See @xy_filter_c
485 * Constraint:
486 * FILTER_LENGTH<=width && width%4==0
488 template<int FILTER_LENGTH>
489 void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter)
491 ASSERT( stride>=4*(width+FILTER_LENGTH) );
492 ASSERT( ((stride|(4*width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );
494 M128s<FILTER_LENGTH> filter128s;
495 filter128s.Load(filter);
497 const float *filter_start = filter;
498 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
499 BYTE* end = dst_byte + height*stride;
500 for( ; dst_byte<end; dst_byte+=stride )
502 float *dst2 = reinterpret_cast<float*>(dst_byte);
504 //left margin
505 FilterAllLeftMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
506 float *dst_end1 = dst2 + width;
507 float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
508 for (;dst2<dst_end0;dst2+=4)
510 const float *src = dst2;
512 //filter 4
513 __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
514 src += 4;
515 __m128 sum = _mm_setzero_ps();
516 Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
517 //store result
518 _mm_store_ps(dst2, sum);
520 FilterAllRightMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
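/****
 * Instantiation note (illustrative, not part of the original source):
 * e.g. xy_filter_sse_template<5> keeps the 5-tap kernel in M128s<5> (one __m128
 * with taps 0..3 plus a second one holding tap 4 splatted by _mm_set1_ps), and
 * Filter4 / FilterAllLeftMargin / FilterAllRightMargin unroll the margin and
 * body loops at compile time, avoiding the runtime margin branches of the
 * generic xy_filter_one_line_sse_v6 path.
 **/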
525 /****
526 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
527 * @src4, @src_5_8, @f3_1, @f3_2: const
528 * @sum: output
530 #define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
531 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
532 __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
533 sum = _mm_mul_ps(f3_1, src4);\
534 __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
535 __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
536 sum = _mm_add_ps(sum, mul2);\
537 sum = _mm_add_ps(sum, mul3);
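// Scalar equivalent (illustrative, not part of the original source): with
// src4 = {s0,s1,s2,s3}, src_5_8 = {s4,s5,s6,s7}, f3_1 = filter[0] and
// f3_2 = filter[1] broadcast, each lane i of @sum becomes
//     f3_1*s[i] + f3_2*s[i+1] + f3_1*s[i+2]            (i = 0..3)
// i.e. four outputs of the symmetric 3-tap filter in one step.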
539 /****
540 * Equivalent:
541 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
542 * See @xy_filter_c
543 * Constraint:
544 * filter[3] == 0 && filter[0] == filter[2] (symmetric) (&& sum(filter)==1)
546 void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
548 const int filter_width = 4;
549 ASSERT( stride>=4*(width+filter_width) );
550 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );
552 ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);
554 __m128 f3_1 = _mm_set1_ps(filter[0]);
555 __m128 f3_2 = _mm_set1_ps(filter[1]);
557 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
558 BYTE* end = dst_byte + height*stride;
559 for( ; dst_byte<end; dst_byte+=stride )
561 float *dst_f = reinterpret_cast<float*>(dst_byte);
562 float *dst2 = dst_f;
564 float *dst_end0 = dst_f + width - 4;
565 //filter 4
566 __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
568 __m128 sum;
569 __m128 src_4 = _mm_setzero_ps();
570 { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
571 _mm_store_ps(dst2-4, sum);
573 for (;dst2<dst_end0;dst2+=4)
575 __m128 sum;
576 __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
577 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
578 src4 = src_5_8;
579 //store result
580 _mm_store_ps(dst2, sum);
583 __m128 sum;
584 __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
585 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
586 src4 = src_5_8;
587 //store result
588 _mm_store_ps(dst2, sum);
594 /****
595 * See @xy_filter_c
597 void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
599 typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
600 const Filter filters[] = {
601 NULL,
602 xy_filter_sse_template<1>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, xy_filter_sse_template<4>,
603 xy_filter_sse_template<5>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, xy_filter_sse_template<8>,
604 xy_filter_sse_template<9>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, xy_filter_sse_template<12>,
605 xy_filter_sse_template<13>, xy_filter_sse_template<16>, xy_filter_sse_template<16>, xy_filter_sse_template<16>,
606 xy_filter_sse_template<17>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, xy_filter_sse_template<20>,
607 xy_filter_sse_template<21>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, xy_filter_sse_template<24>,
608 xy_filter_sse_template<25>, xy_filter_sse_template<28>, xy_filter_sse_template<28>, xy_filter_sse_template<28>
610 if (filter_width<=28 && filter_width<=width)
612 int tmp = filter_width;
613 // Skip trailing zeros, but we cannot (and don't have to) support more than 3 trailing zeros currently.
614 while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
615 tmp--;
616 if (tmp==3&&filter[0]==filter[2])
618 xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
620 else
622 filters[tmp](dst, width, height, stride, filter);
625 else
627 xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
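// Dispatch example (illustrative, not part of the original source): a kernel
// passed in with filter_width==12 whose last two taps are (almost) zero is
// trimmed to 10 effective taps and routed to xy_filter_sse_template<12>, while
// a symmetric 3-tap kernel {a, b, a, 0} takes the dedicated
// xy_3_tag_symmetric_filter_sse path instead.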
631 /****
632 * Copy and convert src to dst line by line.
633 * @dst_width MUST >= @width
634 * if @dst_width>@width, the extra elements will be filled with 0.
636 void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
637 PCUINT8 src, int width, int height, int stride)
639 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
641 PCUINT8 src_end = src + height*stride;
642 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
644 PCUINT8 src2 = src;
645 PCUINT8 src_end = src2 + width;
646 float *dst2 = reinterpret_cast<float*>(dst_byte);
648 for (;src2<src_end;src2++, dst2++)
650 *dst2 = *src2;
652 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
653 for (;dst2<dst2_end;dst2++)
655 *dst2=0;
660 /****
661 * See @xy_byte_2_float_c
663 void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
664 PCUINT8 src, int width, int height, int stride)
666 ASSERT( dst_width>=width );
667 ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
668 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
670 PCUINT8 src_end = src + height*stride;
671 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
673 PCUINT8 src2 = src;
674 PCUINT8 src2_end0 = src2 + (width&~15);
675 float *dst2 = reinterpret_cast<float*>(dst_byte);
677 for (;src2<src2_end0;src2+=16, dst2+=16)
679 //filter 4
680 __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
681 __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
682 __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
683 __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
684 __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
685 __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
686 __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
687 __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
688 __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
689 __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
690 __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
691 _mm_store_ps(dst2, dst_f1);
692 _mm_store_ps(dst2+4, dst_f2);
693 _mm_store_ps(dst2+8, dst_f3);
694 _mm_store_ps(dst2+12, dst_f4);
696 PCUINT8 src2_end = src + width;
697 for (;src2<src2_end;src2++,dst2++)
699 *dst2 = *src2;
701 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
702 for (;dst2<dst2_end;dst2++)
704 *dst2=0;
709 /****
710 * Copy the transposed matrix src to dst.
711 * @dst_width MUST >= @height.
712 * if @dst_width > @height, the extra elements will be filled with 0.
714 void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
715 const float *src, int width, int height, int src_stride)
717 ASSERT(dst_width >= height);
718 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
719 const float* src_end = src + width;
720 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
721 for( ; src<src_end; src++, dst_byte+=dst_stride )
723 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
725 float *dst2 = reinterpret_cast<float*>(dst_byte);
726 for (;src2<src2_end;src2+=src_stride,dst2++)
728 *dst2 = *reinterpret_cast<const float*>(src2);
730 float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
731 for (;dst2<dst2_end;dst2++)
733 *dst2 = 0;
738 /****
739 * see @xy_float_2_float_transpose_c
741 void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
742 const float *src, int width, int height, int src_stride)
744 typedef float DstT;
745 typedef const float SrcT;
747 ASSERT( (((int)dst|dst_stride)&15)==0 );
748 ASSERT(dst_width >= height);
749 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
750 SrcT* src_end = src + width;
751 PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
752 PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
753 for( ; src<src_end; src++, dst_byte+=dst_stride )
755 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
757 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
758 for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
760 __m128 m1 = _mm_set_ps(
761 *(SrcT*)(src2+3*src_stride),
762 *(SrcT*)(src2+2*src_stride),
763 *(SrcT*)(src2+src_stride),
764 *(SrcT*)(src2));
765 _mm_store_ps(dst2, m1);
767 for (;src2<src2_end2;src2+=src_stride,dst2++)
769 *dst2 = *reinterpret_cast<SrcT*>(src2);
771 float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
772 for (;dst2<dst2_end;dst2++)
774 *dst2 = 0;
779 /****
780 * Transpose and round matrix src, then copy to dst.
781 * @dst_width MUST >= @height.
782 * if @dst_width > @height, the extra elements will be filled with 0.
784 void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
785 const float *src, int width, int height, int src_stride)
787 ASSERT(dst_width >= height);
788 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
789 const float* src_end = src + width;
790 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
791 for( ; src<src_end; src++, dst_byte+=dst_stride )
793 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
795 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
796 for (;src2<src2_end;src2+=src_stride,dst2++)
798 *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
800 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
801 for (;dst2<dst2_end;dst2++)
803 *dst2 = 0;
808 void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
809 const float *src, int width, int height, int src_stride)
811 typedef UINT8 DstT;
812 typedef const float SrcT;
814 ASSERT(dst_width >= height);
815 ASSERT((((int)dst|dst_stride)&15)==0);
816 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
817 SrcT* src_end = src + width;
818 PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
819 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
820 for( ; src<src_end; src++, dst_byte+=dst_stride )
822 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
824 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
825 for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
827 __m128 m1 = _mm_set_ps(
828 *(SrcT*)(src2+3*src_stride),
829 *(SrcT*)(src2+2*src_stride),
830 *(SrcT*)(src2+src_stride),
831 *(SrcT*)(src2));
832 __m128 m2 = _mm_set_ps(
833 *(SrcT*)(src2+7*src_stride),
834 *(SrcT*)(src2+6*src_stride),
835 *(SrcT*)(src2+5*src_stride),
836 *(SrcT*)(src2+4*src_stride));
837 __m128 m3 = _mm_set_ps(
838 *(SrcT*)(src2+11*src_stride),
839 *(SrcT*)(src2+10*src_stride),
840 *(SrcT*)(src2+9*src_stride),
841 *(SrcT*)(src2+8*src_stride));
842 __m128 m4 = _mm_set_ps(
843 *(SrcT*)(src2+15*src_stride),
844 *(SrcT*)(src2+14*src_stride),
845 *(SrcT*)(src2+13*src_stride),
846 *(SrcT*)(src2+12*src_stride));
848 __m128i i1 = _mm_cvtps_epi32(m1);
849 __m128i i2 = _mm_cvtps_epi32(m2);
850 __m128i i3 = _mm_cvtps_epi32(m3);
851 __m128i i4 = _mm_cvtps_epi32(m4);
853 i1 = _mm_packs_epi32(i1,i2);
854 i3 = _mm_packs_epi32(i3,i4);
855 i1 = _mm_packus_epi16(i1,i3);
857 _mm_store_si128((__m128i*)dst2, i1);
859 for (;src2<src2_end;src2+=src_stride,dst2++)
861 *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
863 DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
864 for (;dst2<dst2_end;dst2++)
866 *dst2 = 0;
871 /****
872 * TODO: add a decent CPU capability check
874 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
875 PCUINT8 src, int width, int height, int stride,
876 const float *gt_x, int r_x, int gt_ex_width_x,
877 const float *gt_y, int r_y, int gt_ex_width_y)
879 ASSERT(width<=stride && width+2*r_x<=dst_stride);
880 int ex_mask_width_x = ((r_x*2+1)+3)&~3;
881 ASSERT(ex_mask_width_x<=gt_ex_width_x);
882 if (ex_mask_width_x>gt_ex_width_x)
884 int o=0;
885 o=o/o;
886 exit(-1);
889 int ex_mask_width_y = ((r_y*2+1)+3)&~3;
890 ASSERT(ex_mask_width_y<=gt_ex_width_y);
891 if (ex_mask_width_y>gt_ex_width_y)
893 int o=0;
894 o=o/o;
895 exit(-1);
899 int fwidth = (width+3)&~3;
900 int fstride = (fwidth + ex_mask_width_x)*sizeof(float);
901 int fheight = (height+3)&~3;
902 int fstride_ver = (fheight+ex_mask_width_y)*sizeof(float);
904 PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width_x)*fstride_ver));
906 float *hor_buff_base = reinterpret_cast<float*>(buff_base);
907 float *hor_buff = hor_buff_base + ex_mask_width_x;
909 // byte to float
910 ASSERT( ((width+15)&~15)<=stride );
911 xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);
913 // horizontal pass
914 xy_filter_sse(hor_buff, fwidth, height, fstride, gt_x, ex_mask_width_x);
917 // transpose
918 float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
919 float *ver_buff = ver_buff_base + ex_mask_width_y;
921 int true_width = width+r_x*2;
922 xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r_x*2, true_width, height, fstride);
924 // vertical pass
925 xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt_y, ex_mask_width_y);
927 // transpose
928 int true_height = height + 2*r_y;
929 xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r_y*2, true_height, true_width, fstride_ver);
931 xy_free(buff_base);
932 _mm_empty();
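/****
 * Call-site sketch (illustrative, not part of the original source; assumes the
 * caller prepares the buffers as follows):
 *   - gt_x/gt_y hold Gaussian kernels of 2*r_x+1 / 2*r_y+1 taps, zero-padded up
 *     to gt_ex_width_x / gt_ex_width_y (at least ((2*r+1)+3)&~3 elements);
 *   - dst receives a (width+2*r_x) x (height+2*r_y) blurred image, so
 *     dst_stride must be >= width + 2*r_x.
 **/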
936 enum RoundingPolicy
938 ROUND_DOWN
939 , ROUND_HALF_DOWN
940 , ROUND_HALF_UP
941 , ROUND_HALF_TO_EVEN
942 , COUNT_ROUND_POLICY
945 template<int ROUNDING_POLICY, int precision>
946 struct XyRounding
948 __forceinline void init_sse();
949 __forceinline __m128i round(__m128i in);
950 __forceinline int round(int in);
953 template<int precision>
954 struct XyRounding<ROUND_DOWN, precision>
956 __forceinline void init_sse()
960 __forceinline __m128i round(__m128i in)
962 return in;
965 __forceinline int round(unsigned in)
967 return in;
972 template<int precision>
973 struct XyRounding<ROUND_HALF_DOWN, precision>
975 __forceinline void init_sse()
977 m_rounding_patch = _mm_set1_epi16( (1<<(precision-1))-1 );
979 __forceinline __m128i round(__m128i in)
981 return _mm_adds_epu16(in, m_rounding_patch);
984 __forceinline int round(unsigned in)
986 return in + ((1<<(precision-1))-1);
988 __m128i m_rounding_patch;
992 template<int precision>
993 struct XyRounding<ROUND_HALF_UP, precision>
995 __forceinline void init_sse()
997 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
999 __forceinline __m128i round(__m128i in)
1001 return _mm_adds_epu16(in, m_rounding_patch);
1004 __forceinline int round(unsigned in)
1006 return in + (1<<(precision-1));
1008 __m128i m_rounding_patch;
1012 template<int precision>
1013 struct XyRounding<ROUND_HALF_TO_EVEN, precision>
1015 __forceinline void init_sse()
1017 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
1019 __forceinline __m128i round(__m128i in)
1021 in = _mm_adds_epu16(in, m_rounding_patch);
1022 __m128i tmp = _mm_slli_epi16(in, 15-precision);
1023 tmp = _mm_srli_epi16(tmp, 15);
1024 return _mm_adds_epu16(in, tmp);
1027 __forceinline int round(unsigned in)
1029 return in + (1<<(precision-1)) + ((in>>precision)&1);
1031 __m128i m_rounding_patch;
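// Illustrative note (not part of the original source): @precision is the number
// of fraction bits the caller shifts away after round(). E.g. ROUND_HALF_UP with
// precision==2 adds 2, so a fixed-point 6 (i.e. 1.5) becomes (6+2)>>2 == 2,
// while 5 (i.e. 1.25) becomes (5+2)>>2 == 1.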
1034 /****
1035 * filter with [1,2,1]
1036 * 1. It is an in-place horizontal filter
1037 * 2. Boundary Pixels are filtered by padding 0. E.g.
1038 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1040 template<int ROUNDING_POLICY>
1041 void xy_be_filter_c(PUINT8 dst, int width, int height, int stride)
1043 ASSERT(width>=1);
1044 if (width<=0)
1046 return;
1048 PUINT8 dst2 = NULL;
1049 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1050 for (int y=0;y<height;y++)
1052 dst2 = dst + y*stride;
1053 int old_sum = dst2[0];
1054 int tmp = 0;
1055 int x=0;
1056 for (x=0;x<width-1;x++)
1058 int new_sum = dst2[x]+dst2[x+1];
1059 tmp = old_sum + new_sum;//old_sum == src2[x-1]+src2[x];
1060 dst2[x] = (xy_rounding.round(tmp)>>2);
1061 old_sum = new_sum;
1063 tmp = old_sum + dst2[x];
1064 dst2[x] = (xy_rounding.round(tmp)>>2);
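// Illustrative note (not part of the original source): for an interior pixel x
// the loop above computes
//     dst[x] = round(src[x-1] + 2*src[x] + src[x+1]) >> 2
// reusing old_sum = src[x-1]+src[x] and new_sum = src[x]+src[x+1], so each
// source pixel is read only twice per row.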
1068 /****
1069 * 1. It is an in-place symmetric 3-tag horizontal filter
1070 * 2. Boundary Pixels are filtered by padding 0. E.g.
1071 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1072 * 3. sum(filter) == 256
1074 template<int ROUNDING_POLICY>
1075 void xy_be_filter2_c(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1077 ASSERT(width>=1);
1078 if (width<=0)
1080 return;
1083 const int VOLUME_BITS = 8;
1084 const int VOLUME = (1<<VOLUME_BITS);
1085 if (filter[0]==0)
1087 return;//nothing to do;
1089 else if (filter[0]== (VOLUME>>2))
1091 return xy_be_filter_c<ROUNDING_POLICY>(dst, width, height, stride);
1094 PUINT8 dst2 = NULL;
1095 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1096 for (int y=0;y<height;y++)
1098 dst2 = dst + y*stride;
1099 int old_pix = 0;
1100 int tmp = 0;
1101 int x=0;
1102 for (x=0;x<width-1;x++)
1104 tmp = (old_pix + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1105 old_pix = dst2[x];
1106 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1108 tmp = old_pix * filter[0] + dst2[x] * filter[1];
1109 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1113 /****
1114 * See @xy_be_filter_c
1115 * No alignment requirement.
1117 template<int ROUNDING_POLICY>
1118 void xy_be_filter_sse(PUINT8 dst, int width, int height, int stride)
1120 ASSERT(width>=1);
1121 if (width<=0)
1123 return;
1125 int width_mod8 = ((width-1)&~7);
1126 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1127 xy_rounding.init_sse();
1128 for (int y = 0; y < height; y++) {
1129 PUINT8 dst2=dst+y*stride;
1131 __m128i old_pix_128 = _mm_cvtsi32_si128(dst2[0]);
1132 __m128i old_sum_128 = old_pix_128;
1134 int x = 0;
1135 for (; x < width_mod8; x+=8) {
1136 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1137 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
1138 __m128i temp = _mm_slli_si128(new_pix,2);
1139 temp = _mm_add_epi16(temp, old_pix_128);
1140 temp = _mm_add_epi16(temp, new_pix);
1141 old_pix_128 = _mm_srli_si128(new_pix,14);
1143 new_pix = _mm_slli_si128(temp,2);
1144 new_pix = _mm_add_epi16(new_pix, old_sum_128);
1145 new_pix = _mm_add_epi16(new_pix, temp);
1146 old_sum_128 = _mm_srli_si128(temp, 14);
1148 new_pix = xy_rounding.round(new_pix);
1150 new_pix = _mm_srli_epi16(new_pix, 2);
1151 new_pix = _mm_packus_epi16(new_pix, new_pix);
1153 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), new_pix );
1155 int old_sum = _mm_cvtsi128_si32(old_sum_128);
1156 old_sum &= 0xffff;
1157 int tmp = 0;
1158 for ( ; x < width-1; x++) {
1159 int new_sum = dst2[x] + dst2[x+1];
1160 tmp = old_sum + new_sum;
1161 dst2[x] = (xy_rounding.round(tmp)>>2);
1162 old_sum = new_sum;
1164 tmp = old_sum + dst2[x];
1165 dst2[x] = (xy_rounding.round(tmp)>>2);
1169 /****
1170 * See @xy_be_filter2_c
1171 * No alignment requirement.
1173 template<int ROUNDING_POLICY>
1174 void xy_be_filter2_sse(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1176 const int VOLUME_BITS = 8;
1177 const int VOLUME = (1<<VOLUME_BITS);
1178 ASSERT(filter[0]==filter[2]);
1179 ASSERT(filter[0]+filter[1]+filter[2]==VOLUME);
1180 ASSERT(width>=1);
1181 if (width<=0)
1183 return;
1186 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1187 xy_rounding.init_sse();
1188 __m128i f3_1 = _mm_set1_epi16(filter[0]);
1189 __m128i f3_2 = _mm_set1_epi16(filter[1]);
1191 int width_mod8 = ((width-1)&~7);
1192 //__m128i round = _mm_set1_epi16(8);
1194 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1195 PUINT8 end = dst_byte + height*stride;
1196 for( ; dst_byte<end; dst_byte+=stride )
1198 PUINT8 dst2 = dst_byte;
1200 PUINT8 dst_end0 = dst_byte + width - 4;
1202 __m128i old_pix1_128 = _mm_setzero_si128();
1203 __m128i old_pix2_128 = _mm_cvtsi32_si128(dst2[0]);
1205 int x = 0;
1206 for (; x < width_mod8; x+=8) {
1207 __m128i pix2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1208 pix2 = _mm_unpacklo_epi8(pix2, _mm_setzero_si128());
1209 __m128i pix1 = _mm_slli_si128(pix2,2);
1210 pix1 = _mm_add_epi8(pix1, old_pix2_128);
1211 __m128i pix0 = _mm_slli_si128(pix1,2);
1212 pix0 = _mm_add_epi8(pix0, old_pix1_128);
1213 old_pix1_128 = _mm_srli_si128(pix1,14);
1214 old_pix2_128 = _mm_srli_si128(pix2,14);
1216 pix0 = _mm_add_epi16(pix0, pix2);
1217 pix0 = _mm_mullo_epi16(pix0, f3_1);
1218 pix1 = _mm_mullo_epi16(pix1, f3_2);
1220 pix1 = _mm_adds_epu16(pix1, pix0);
1222 pix1 = xy_rounding.round(pix1);
1224 pix1 = _mm_srli_epi16(pix1, VOLUME_BITS);
1225 pix1 = _mm_packus_epi16(pix1, pix1);
1227 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), pix1 );
1229 int old_pix1 = _mm_cvtsi128_si32(old_pix1_128);
1230 old_pix1 &= 0xff;
1231 int tmp = 0;
1232 for ( ; x < width-1; x++) {
1233 tmp = (old_pix1 + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1234 old_pix1 = dst2[x];
1236 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1238 tmp = old_pix1*filter[0] + dst2[x]*filter[1];
1239 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1243 /****
1244 * See @xy_be_blur
1245 * Construct the filter used in the final horizontal/vertical pass of @xy_be_blur when @pass is NOT an integer.
1246 * This filter is constructed to satisfy:
1247 * If @p is a pixel in the middle of the image, with @(p-1) and @(p+1) the pixels immediately to its
1248 * left and right, then the value of @p after all horizontal filtering equals
1249 * a*value_old(@(p-1)) + b*value_old(@p) + a*value_old(@(p+1)) + a weighted sum of the other pixels,
1250 * where
1251 * a/b = @pass/(@pass+1).
1252 * This choice is consistent because the same property holds when @pass is an integer.
1254 * @return
1255 * Let n = floor(pass);
1256 * filter = [ (pass-n)(n+2) / (2*(1+3pass-n)), 1-(pass-n)(n+2)/(1+3pass-n), (pass-n)(n+2)/ (2*(1+3pass-n)) ]
1258 void xy_calculate_filter(float pass, PUINT filter)
1260 const int VOLUME = (1<<8);
1261 int n = (int)pass;
1262 if (n==0)
1264 filter[0] = VOLUME * pass/(1+3*pass);
1266 else if (n==1)
1268 filter[0] = VOLUME * (pass-1)/(2*pass);
1270 else
1272 filter[0] = VOLUME * (pass-n)*(n+2)/ (2*(1+3*pass-n));
1274 filter[1] = VOLUME - 2*filter[0];
1275 filter[2] = filter[0];
1277 if (2*filter[0]>filter[1])
1279 //this should not happen
1280 ASSERT(0);
1281 filter[0] = VOLUME/4;
1282 filter[1] = VOLUME - 2*filter[0];
1283 filter[2] = filter[0];
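// Worked example (illustrative, not part of the original source): pass = 1.5
// gives n = 1, so
//   filter[0] = 256*(1.5-1)/(2*1.5) = 42   (truncated from 42.67)
//   filter[1] = 256 - 2*42          = 172
//   filter[2] = 42
// matching the general formula (pass-n)(n+2)/(2*(1+3*pass-n)) = 1.5/9.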
1287 /****
1288 * See @xy_float_2_byte_transpose_c
1290 void xy_byte_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
1291 PCUINT8 src, int width, int height, int src_stride)
1293 ASSERT(dst_width >= height);
1294 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1295 PCUINT8 src_end = src + width;
1296 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
1297 for( ; src<src_end; src++, dst_byte+=dst_stride )
1299 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
1301 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
1302 for (;src2<src2_end;src2+=src_stride,dst2++)
1304 *dst2 = *src2;
1306 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
1307 for (;dst2<dst2_end;dst2++)
1309 *dst2 = 0;
1314 /****
1315 * Repeat the [1,2,1] filter @pass_x times horizontally and @pass_y times vertically
1316 * Boundary Pixels are filtered by padding 0, see @xy_be_filter_c.
1318 * @pass_x:
1319 * When @pass_x is not an integer, the horizontal filter [1,2,1] is repeated (int)@pass_x times,
1320 * and then a 3-tag symmetric filter generated from @pass_x is applied.
1321 * That final 3-tag symmetric filter is constructed to satisfy the following property:
1322 * If @pass_xx > @pass_x, the filtered result of @pass_x should NOT be more blurred than
1323 * the result of @pass_xx. More specifically, the filtered result of @pass_x should NOT be more
1324 * blurred than that of (int)@pass_x+1 and should NOT be less blurred than that of (int)@pass_x;
1326 * Rounding:
1327 * The original VSFilter \be effect uses a simple round down, which has a bias of -7.5/16 per pass.
1328 * That rounding error is significant with large @pass values and has effectively become part of the \be effect.
1329 * We can simulate VSFilter's rounding bias by combining different rounding methods. However, a simple
1330 * test shows the result still differs visually from VSFilter's.
1332 * Known issue:
1333 * This separable filter implementation seems to be more sensitive to precision than
1334 * VSFilter's simple implementation. Vertical blocky artifacts can be observed with large pass values
1335 * (and @pass_x==@pass_y), so it is only used to filter the fractional part of the \be strength.
1337 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y)
1339 //ASSERT(pass_x>0 && pass_y>0);
1341 typedef void (*XyBeFilter)(PUINT8 src, int width, int height, int stride);
1342 typedef void (*XyFilter2)(PUINT8 src, int width, int height, int stride, PCUINT filter);
1344 XyBeFilter filter = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter_sse<ROUND_HALF_TO_EVEN> : xy_be_filter_c<ROUND_HALF_TO_EVEN>;
1345 XyFilter2 filter2 = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter2_sse<ROUND_HALF_TO_EVEN> : xy_be_filter2_c<ROUND_HALF_TO_EVEN>;
1347 int stride_ver = height;
1348 PUINT8 tmp = reinterpret_cast<PUINT8>(xy_malloc(width*height));
1349 ASSERT(tmp);
1350 // horizontal pass
1351 int pass_x_int = static_cast<int>(pass_x);
1352 for (int i=0; i<pass_x_int; i++)
1354 filter(src, width, height, stride);
1356 if (pass_x-pass_x_int>0)
1358 UINT f[3] = {0};
1359 xy_calculate_filter(pass_x, f);
1360 filter2(src, width, height, stride, f);
1363 // transpose
1364 xy_byte_2_byte_transpose_c(tmp, height, stride_ver, src, width, height, stride);
1366 // vertical pass
1367 int pass_y_int = static_cast<int>(pass_y);
1368 for (int i=0;i<pass_y_int;i++)
1370 filter(tmp, height, width, stride_ver);
1372 if (pass_y-pass_y_int>0)
1374 UINT f[3] = {0};
1375 xy_calculate_filter(pass_y, f);
1376 filter2(tmp, height, width, stride_ver, f);
1379 // transpose
1380 xy_byte_2_byte_transpose_c(src, width, stride, tmp, height, width, stride_ver);
1382 xy_free(tmp);
1383 return;
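// Usage sketch (illustrative, not part of the original source): to apply the
// fractional part of a \be strength, e.g. 0.7, to an alpha bitmap:
//   xy_be_blur(bitmap, w, h, pitch, 0.7f, 0.7f);
// with pass 0.7 the integer [1,2,1] passes are skipped and only the 3-tap
// filter produced by xy_calculate_filter(0.7f, f) is applied once per direction.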