2 * Copyright (C) 2003-2006 Gabest
3 * http://www.gabest.org
5 * This Program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2, or (at your option)
10 * This Program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GNU Make; see the file COPYING. If not, write to
17 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
18 * http://www.gnu.org/copyleft/gpl.html
27 #include "Rasterizer.h"
28 #include "SeparableFilter.h"
29 #include "xy_logger.h"
30 #include <boost/flyweight/key_value.hpp>
31 #include "xy_bitmap.h"
32 #include "xy_widen_regoin.h"
// Fallback definitions for the _MAX/_MIN helper macros, expressed via std::max/std::min.
// NOTE(review): the two _IMPL_MAX/_IMPL_MIN pairs below presumably belong to opposite
// branches of the conditional; the branch separator is not visible here — confirm.
34 #ifndef _MAX /* avoid collision with common (nonconforming) macros */
35 #define _MAX (std::max)
36 #define _MIN (std::min)
37 #define _IMPL_MAX std::max
38 #define _IMPL_MIN std::min
40 #define _IMPL_MAX _MAX
41 #define _IMPL_MIN _MIN
// Read-only byte aliases: CUINT8 is a const UINT8, PCUINT8 is a pointer to const UINT8.
44 typedef const UINT8 CUINT8
, *PCUINT8
;
// Packs four 8-bit channels into one 32-bit value laid out as 0xAAYYUUVV.
// NOTE: signed or unsigned affects the result seriously, so `a` is still cast to int
// exactly as before. Every argument is now parenthesized so that expressions with
// lower precedence than `|` (e.g. `cond ? x : y`) expand correctly.
#define COMBINE_AYUV(a, y, u, v) ((((((((int)(a))<<8)|(y))<<8)|(u))<<8)|(v))
// Unpacks 8-bit channels of a packed value into the pointed-to outputs: in the lines
// visible here, u comes from bits 8-15, y from bits 16-23 and a from bits 24-31 of `color`.
49 #define SPLIT_AYUV(color, a, y, u, v) do { \
51 *(u)=((color)>>8) &0xff; \
52 *(y)=((color)>>16)&0xff;\
53 *(a)=((color)>>24)&0xff;\
// Floating-point Gaussian kernel: a table of float weights (g_f) plus the kernel
// radius (g_r), width (g_w), padded width (g_w_ex) and the sigma it was built for.
56 class GaussianCoefficients
// Constructs the coefficients for the given sigma.
66 GaussianCoefficients(const double sigma
)
// Copy constructor: copies g_r, g_w and sigma and starts with g_f == NULL; when the
// padded width is positive and the source is a different object, a new buffer is
// obtained from xy_malloc and the source weights are memcpy'd into it.
77 GaussianCoefficients(const GaussianCoefficients
& priv
)
78 :g_r(priv
.g_r
),g_w(priv
.g_w
),sigma(priv
.sigma
),g_f(NULL
)
81 if (this->g_w_ex
> 0 && this != &priv
) {
82 this->g_f
= reinterpret_cast<float*>(xy_malloc(this->g_w_ex
* sizeof(float)));
84 memcpy(g_f
, priv
.g_f
, this->g_w_ex
* sizeof(g_f
[0]));
// Destructor: releases the weight buffer with xy_free and clears the pointer.
88 ~GaussianCoefficients()
90 xy_free(g_f
); g_f
=NULL
;
// Builds the kernel for `sigma`. The width is ceil(3*sigma) forced odd, the radius is
// half the width, and g_w_ex is the width rounded up to a multiple of 4.
94 int init(double sigma
)
// a is the exponent factor -1/(2*sigma^2); exp_a = e^a.
96 double a
= -1 / (sigma
* sigma
* 2);
97 double exp_a
= exp(a
);
// Same sigma as the one already stored (presumably an early-out; branch body not visible here).
101 if (this->sigma
== sigma
)
106 this->g_w
= (int)ceil(sigma
*3) | 1;
107 this->g_r
= this->g_w
/ 2;
108 this->g_w_ex
= (this->g_w
+ 3) & ~3;
110 if (this->g_w_ex
> 0) {
112 this->g_f
= reinterpret_cast<float*>(xy_malloc(this->g_w_ex
* sizeof(float)));
113 if (this->g_f
== NULL
) {
122 double exp_1
= exp_a
;
123 double exp_2
= exp_1
* exp_1
;
// The centre weight is stored first; p_left/p_right then walk outwards from the centre.
125 this->g_f
[this->g_r
] = exp_0
;
126 float* p_left
= this->g_f
+this->g_r
-1;
127 float* p_right
= this->g_f
+this->g_r
+1;
128 for(int i
=0; i
<this->g_r
;++i
,p_left
--,p_right
++)
140 // for (i = 0; i < this->g_w; ++i) {
141 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
142 // volume += this->g[i];
// Normalization: every weight is divided by `volume`.
145 for (int i
=0;i
<this->g_w
;i
++)
147 this->g_f
[i
] /= volume
;
// Visits the padding entries in [g_w, g_w_ex).
149 for (int i
=this->g_w
;i
<this->g_w_ex
;i
++)
// Declarations for the fixed-point Gaussian table holder (ass_synth_priv).
// VOLUME_BITS is the shift used for the fixed-point kernel volume: generate_tables
// targets a weight sum of 1<<VOLUME_BITS.
162 static const int VOLUME_BITS
= 22;//should not exceed 32-8, and better not exceed 31-8
// Constructs the tables for the given sigma.
164 ass_synth_priv(const double sigma
);
// Copy constructor.
165 ass_synth_priv(const ass_synth_priv
& priv
);
// (Re)builds the tables for `sigma`.
168 int generate_tables(double sigma
);
// Key-extractor functor: maps a filter object to a const double& key. It is used as
// the key extractor of the boost::flyweights key_value<double, ...> instances in this file.
// NOTE(review): presumably the key is the filter's sigma — the return statement is not visible here.
180 // GaussianFilter = GaussianCoefficients or ass_synth_priv
181 template<typename GaussianFilter
>
182 struct GaussianFilterKey
184 const double& operator()(const GaussianFilter
& x
)const
// Constructor declarations of ass_tmp_buf, a scratch buffer of `size` unsigned values.
193 ass_tmp_buf(size_t size
);
// Copy constructor.
194 ass_tmp_buf(const ass_tmp_buf
& buf
);
// Functor mapping an ass_tmp_buf to a const size_t& value.
// NOTE(review): presumably the buffer's element count — the return statement is not visible here.
200 struct ass_tmp_buf_get_size
202 const size_t& operator()(const ass_tmp_buf
& buf
)const
// File-local constants: maxcolor is 255 and base is 256.
208 static const unsigned int maxcolor
= 255;
209 static const unsigned base
= 256;
// Constructs the object and builds its tables via generate_tables(sigma).
211 ass_synth_priv::ass_synth_priv(const double sigma
)
220 generate_tables(sigma
);
// Copy constructor: copies g_r, g_w and sigma, then (for a positive width and a distinct
// source) sizes g to g_w entries and gt2 to 256*g_w entries and memcpy's both tables.
// NOTE(review): realloc is called on this->g / this->gt2, which the visible initializer
// list does not initialize — verify they hold NULL (or a valid pointer) at this point.
// NOTE(review): the allocation-failure check below is commented out.
223 ass_synth_priv::ass_synth_priv(const ass_synth_priv
& priv
):g_r(priv
.g_r
),g_w(priv
.g_w
),sigma(priv
.sigma
)
225 if (this->g_w
> 0 && this != &priv
) {
226 this->g
= (unsigned*)realloc(this->g
, this->g_w
* sizeof(unsigned));
227 this->gt2
= (unsigned*)realloc(this->gt2
, 256 * this->g_w
* sizeof(unsigned));
228 //if (this->g == null || this->gt2 == null) {
231 memcpy(g
, priv
.g
, this->g_w
* sizeof(unsigned));
232 memcpy(gt2
, priv
.gt2
, 256 * this->g_w
* sizeof(unsigned));
// Destructor of ass_synth_priv (body not visible in this view).
236 ass_synth_priv::~ass_synth_priv()
// Builds the fixed-point Gaussian tables for `sigma`: g holds g_w integer weights and
// gt2 holds 256*g_w entries. A scale factor is searched by bisection between
// volume_start and volume_end so that the integer weights sum to a value in
// [TARGET_VOLUME, TARGET_VOLUME + MAX_VOLUME_ERROR).
242 int ass_synth_priv::generate_tables(double sigma
)
244 const int TARGET_VOLUME
= 1<<VOLUME_BITS
;
245 const int MAX_VOLUME_ERROR
= VOLUME_BITS
>=22 ? 16 : 1;
// a is the exponent factor -1/(2*sigma^2); exp_a = e^a.
247 double a
= -1 / (sigma
* sigma
* 2);
248 double exp_a
= exp(a
);
250 double volume_factor
= 0;
251 double volume_start
= 0, volume_end
= 0;
// Same sigma as the one already stored (presumably an early-out; branch body not visible here).
254 if (this->sigma
== sigma
)
// Width is ceil(3*sigma) forced odd; radius is half of it.
259 this->g_w
= (int)ceil(sigma
*3) | 1;
260 this->g_r
= this->g_w
/ 2;
263 this->g
= (unsigned*)realloc(this->g
, this->g_w
* sizeof(unsigned));
264 this->gt2
= (unsigned*)realloc(this->gt2
, 256 * this->g_w
* sizeof(unsigned));
265 if (this->g
== NULL
|| this->gt2
== NULL
) {
// First pass: accumulate the unscaled kernel volume into volume_start.
274 double exp_1
= exp_a
;
275 double exp_2
= exp_1
* exp_1
;
276 volume_start
+= exp_0
;
277 for(int i
=0;i
<this->g_r
;++i
)
281 volume_start
+= exp_0
;
282 volume_start
+= exp_0
;
285 // for (i = 0; i < this->g_w; ++i) {
286 // volume_start += exp(a * (i - this->g_r) * (i - this->g_r));
// Initial bracket for the scale factor: (TARGET_VOLUME -/+ g_w) / unscaled volume.
289 volume_end
= (TARGET_VOLUME
+g_w
)/volume_start
;
290 volume_start
= (TARGET_VOLUME
-g_w
)/volume_start
;
// Bisection on the scale factor until the bracket is narrower than 1e-6.
293 while( volume_start
+0.000001<volume_end
)
295 volume_factor
= (volume_start
+volume_end
)*0.5;
298 exp_0
= volume_factor
;
300 exp_2
= exp_1
* exp_1
;
// Rounded centre weight; the side weights are then added twice (left and right).
302 volume
= static_cast<int>(exp_0
+.5);
303 this->g
[this->g_r
] = volume
;
305 unsigned* p_left
= this->g
+this->g_r
-1;
306 unsigned* p_right
= this->g
+this->g_r
+1;
307 for(int i
=0; i
<this->g_r
;++i
,p_left
--,p_right
++)
311 *p_left
= static_cast<int>(exp_0
+.5);
313 volume
+= (*p_left
<<1);
316 // for (i = 0; i < this->g_w; ++i) {
317 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
318 // volume += this->g[i];
321 // volume don't have to be equal to TARGET_VOLUME,
322 // even if volume=TARGET_VOLUME+MAX_VOLUME_ERROR,
323 // max error introducing in later blur operation,
324 // which is (dot_product(g_w, pixel))/TARGET_VOLUME with pixel<256,
325 // would not exceed (MAX_VOLUME_ERROR*256)/TARGET_VOLUME,
326 // as long as MAX_VOLUME_ERROR/TARGET_VOLUME is small enough, error introduced would be kept in safe range
328 // NOTE: when it comes to rounding, no matter how small the error is,
329 // it may result a different rounding output
330 if( volume
>=TARGET_VOLUME
&& volume
< (TARGET_VOLUME
+MAX_VOLUME_ERROR
) )
// Volume too small: raise the lower bound of the bracket.
332 else if(volume
< TARGET_VOLUME
)
334 volume_start
= volume_factor
;
// Volume too large: lower the upper bound of the bracket.
336 else if(volume
>= TARGET_VOLUME
+MAX_VOLUME_ERROR
)
338 volume_end
= volume_factor
;
// The weights are computed once more using volume_end as the scale factor.
343 volume_factor
= volume_end
;
345 exp_0
= volume_factor
;
347 exp_2
= exp_1
* exp_1
;
349 volume
= static_cast<int>(exp_0
+.5);
350 this->g
[this->g_r
] = volume
;
352 unsigned* p_left
= this->g
+this->g_r
-1;
353 unsigned* p_right
= this->g
+this->g_r
+1;
354 for(int i
=0; i
<this->g_r
;++i
,p_left
--,p_right
++)
358 *p_left
= static_cast<int>(exp_0
+.5);
360 volume
+= (*p_left
<<1);
363 // for (i = 0; i < this->g_w; ++i) {
364 // this->g[i] = (unsigned) ( exp(a * (i - this->g_r) * (i - this->g_r))* volume_factor + .5 );
365 // volume += this->g[i];
// Multiplication table: for each kernel tap mx, last_mul is advanced by g[mx] per step i,
// i.e. repeated addition in place of the multiplication shown in the comment below.
370 for (int mx
= 0; mx
< this->g_w
; mx
++) {
372 unsigned *p_gt2
= this->gt2
+ mx
;
374 for (int i
= 1; i
< 256; i
++) {
375 last_mul
= last_mul
+this->g
[mx
];
379 // this->gt2[this->g_w * i+ mx] = this->g[mx] * i;
// Allocates a scratch buffer of `size` unsigned values with malloc.
386 ass_tmp_buf::ass_tmp_buf(size_t size
)
388 tmp
= (unsigned *)malloc(size
* sizeof(unsigned));
// Copy constructor: allocates a fresh buffer of `size` unsigned values with malloc.
// In the lines visible here the contents of `buf` are not copied.
392 ass_tmp_buf::ass_tmp_buf(const ass_tmp_buf
& buf
)
395 tmp
= (unsigned *)malloc(size
* sizeof(unsigned));
// Destructor of ass_tmp_buf (body not visible in this view).
398 ass_tmp_buf::~ass_tmp_buf()
404 * \brief gaussian blur. a fast pure C implementation from libass.
// Two-pass (horizontal, then vertical) table-driven blur of an 8-bit plane, in place.
//   buffer             plane to blur; the result is written back into it
//   tmp2               scratch of unsigned values, addressed in rows of (width + 1) entries
//   width, height      plane size
//   stride             NOTE(review): presumably the byte distance between rows of buffer — confirm
//   g_t_x, g_t_y       multiplication tables, indexed as table + value * kernel width
//   g_r_x, g_r_y       kernel radii
//   g_width_x/_y       kernel widths
406 static void ass_gauss_blur(unsigned char *buffer
, unsigned *tmp2
,
407 int width
, int height
, int stride
,
408 const unsigned *g_t_x
, int g_r_x
, int g_width_x
,
409 const unsigned *g_t_y
, int g_r_y
, int g_width_y
)
// Horizontal pass: each source pixel scatters its table row into the scratch row.
414 unsigned char *s
= buffer
;
415 unsigned *t
= tmp2
+ 1;
416 for (y
= 0; y
< height
; y
++) {
// Clear the scratch row (including the entry just before t).
417 memset(t
- 1, 0, (width
+ 1) * sizeof(*t
));
419 if(x
< g_r_x
)//in case that r < 0
421 const int src
= s
[x
];
423 register unsigned *dstp
= t
+ x
- g_r_x
;
425 const unsigned *m3
= g_t_x
+ src
* g_width_x
;
427 for (mx
= g_width_x
-1; mx
>= g_r_x
- x
; mx
--) {
// Left edge: only taps from g_r_x - x onwards are visited.
434 for (x
= 1; x
< g_r_x
; x
++) {
435 const int src
= s
[x
];
437 register unsigned *dstp
= t
+ x
- g_r_x
;
439 const unsigned *m3
= g_t_x
+ src
* g_width_x
;
440 for (mx
= g_r_x
- x
; mx
< g_width_x
; mx
++) {
// Interior: the full kernel width is visited.
446 for (; x
< width
- g_r_x
; x
++) {
447 const int src
= s
[x
];
449 register unsigned *dstp
= t
+ x
- g_r_x
;
451 const unsigned *m3
= g_t_x
+ src
* g_width_x
;
452 for (mx
= 0; mx
< g_width_x
; mx
++) {
// Right edge: only the first x2 taps are visited.
458 for (; x
< width
-1; x
++) {
459 const int src
= s
[x
];
461 register unsigned *dstp
= t
+ x
- g_r_x
;
463 const int x2
= g_r_x
+ width
- x
;
464 const unsigned *m3
= g_t_x
+ src
* g_width_x
;
465 for (mx
= 0; mx
< x2
; mx
++) {
470 if(x
==width
-1) //important: x==width-1 failed, if r==0
472 const int src
= s
[x
];
474 register unsigned *dstp
= t
+ x
- g_r_x
;
476 const int x2
= g_r_x
+ width
- x
;
477 const unsigned *m3
= g_t_x
+ src
* g_width_x
;
479 for (mx
= 0; mx
< x2
; mx
++) {
// Vertical pass, column by column. Each scratch value is rounded and shifted down by
// VOLUME_BITS to index the y table, and the scratch entry is reset to the rounding bias.
491 for (x
= 0; x
< width
; x
++) {
493 if(y
< g_r_y
)//in case that r<0
495 unsigned *srcp
= t
+ y
* (width
+ 1) + 1;
498 register unsigned *dstp
= srcp
- 1 + (g_width_y
-g_r_y
+y
)*(width
+ 1);
499 const int src2
= (src
+ (1<<(ass_synth_priv::VOLUME_BITS
-1))) >> ass_synth_priv::VOLUME_BITS
;
500 const unsigned *m3
= g_t_y
+ src2
* g_width_y
;
503 *srcp
= (1<<(ass_synth_priv::VOLUME_BITS
-1));
504 for (mx
= g_width_y
-1; mx
>=g_r_y
- y
; mx
--) {
// Top edge rows.
511 for (y
= 1; y
< g_r_y
; y
++) {
512 unsigned *srcp
= t
+ y
* (width
+ 1) + 1;
515 register unsigned *dstp
= srcp
- 1 + width
+ 1;
516 const int src2
= (src
+ (1<<(ass_synth_priv::VOLUME_BITS
-1))) >> ass_synth_priv::VOLUME_BITS
;
517 const unsigned *m3
= g_t_y
+ src2
* g_width_y
;
520 *srcp
= (1<<(ass_synth_priv::VOLUME_BITS
-1));
521 for (mx
= g_r_y
- y
; mx
< g_width_y
; mx
++) {
// Interior rows: the full kernel height is visited.
527 for (; y
< height
- g_r_y
; y
++) {
528 unsigned *srcp
= t
+ y
* (width
+ 1) + 1;
531 register unsigned *dstp
= srcp
- 1 - g_r_y
* (width
+ 1);
532 const int src2
= (src
+ (1<<(ass_synth_priv::VOLUME_BITS
-1))) >> ass_synth_priv::VOLUME_BITS
;
533 const unsigned *m3
= g_t_y
+ src2
* g_width_y
;
536 *srcp
= (1<<(ass_synth_priv::VOLUME_BITS
-1));
537 for (mx
= 0; mx
< g_width_y
; mx
++) {
// Bottom edge rows: only the first y2 taps are visited.
543 for (; y
< height
-1; y
++) {
544 unsigned *srcp
= t
+ y
* (width
+ 1) + 1;
547 const int y2
= g_r_y
+ height
- y
;
548 register unsigned *dstp
= srcp
- 1 - g_r_y
* (width
+ 1);
549 const int src2
= (src
+ (1<<(ass_synth_priv::VOLUME_BITS
-1))) >> ass_synth_priv::VOLUME_BITS
;
550 const unsigned *m3
= g_t_y
+ src2
* g_width_y
;
553 *srcp
= (1<<(ass_synth_priv::VOLUME_BITS
-1));
554 for (mx
= 0; mx
< y2
; mx
++) {
560 if(y
== height
- 1)//important: y == height - 1 failed if r==0
562 unsigned *srcp
= t
+ y
* (width
+ 1) + 1;
565 const int y2
= g_r_y
+ height
- y
;
566 register unsigned *dstp
= srcp
- 1 - g_r_y
* (width
+ 1);
567 const int src2
= (src
+ (1<<(ass_synth_priv::VOLUME_BITS
-1))) >> ass_synth_priv::VOLUME_BITS
;
568 const unsigned *m3
= g_t_y
+ src2
* g_width_y
;
571 *srcp
= (1<<(ass_synth_priv::VOLUME_BITS
-1));
572 for (mx
= 0; mx
< y2
; mx
++) {
// Write-back: scale the accumulated sums down by VOLUME_BITS into the 8-bit plane.
584 for (y
= 0; y
< height
; y
++) {
585 for (x
= 0; x
< width
; x
++) {
586 s
[x
] = t
[x
] >> ass_synth_priv::VOLUME_BITS
;
// Prototype of the floating-point Gaussian blur (defined elsewhere). In this file it is
// called with a destination plane and pitch, a source plane with its width, height and
// pitch, and per-axis float coefficient tables with their radius and padded width.
593 void xy_gaussian_blur(PUINT8 dst
, int dst_stride
,
594 PCUINT8 src
, int width
, int height
, int stride
,
595 const float *gt_x
, int r_x
, int gt_ex_width_x
,
596 const float *gt_y
, int r_y
, int gt_ex_width_y
);
// Prototype of the fractional-strength edge blur (defined elsewhere). In this file it is
// called in place on a plane with the fractional remainder of the scaled \be strength
// passed as both pass_x and pass_y.
598 void xy_be_blur(PUINT8 src
, int width
, int height
, int stride
, float pass_x
, float pass_y
);
601 * \brief blur with [[1,2,1], [2,4,2], [1,2,1]] kernel.
// In-place edge blur of an 8-bit plane; rows from y = 2 onwards are processed 8 pixels
// at a time with SSE2 intrinsics, with a scalar loop for the remaining pixels.
//   buf       plane, w x h pixels, rows `stride` bytes apart; modified in place
//   tmp_base  not referenced in the lines visible here
// Two per-column WORD buffers hold the previous rows' horizontal sums; each output
// pixel is the combined sum shifted right by 4.
603 static void be_blur(unsigned char *buf
, unsigned *tmp_base
, int w
, int h
, int stride
)
605 WORD
*col_pix_buf_base
= reinterpret_cast<WORD
*>(xy_malloc(w
*sizeof(WORD
)));
606 WORD
*col_sum_buf_base
= reinterpret_cast<WORD
*>(xy_malloc(w
*sizeof(WORD
)));
607 if(!col_sum_buf_base
|| !col_pix_buf_base
)
609 //ToDo: error handling
612 memset(col_pix_buf_base
, 0, w
*sizeof(WORD
));
613 memset(col_sum_buf_base
, 0, w
*sizeof(WORD
));
// The working pointers are offset by -2 from the allocation base.
614 WORD
*col_pix_buf
= col_pix_buf_base
-2;//for alignment;
615 WORD
*col_sum_buf
= col_sum_buf_base
-2;//for alignment;
// Priming pass: horizontal sums are stored into col_pix_buf.
618 unsigned char *src
=buf
+y
*stride
;
621 int old_pix
= src
[x
-1];
622 int old_sum
= old_pix
+ src
[x
-2];
623 for ( ; x
< w
; x
++) {
625 int temp2
= old_pix
+ temp1
;
627 temp1
= old_sum
+ temp2
;
629 col_pix_buf
[x
] = temp1
;
// Priming pass: col_pix_buf and col_sum_buf are both updated; no pixel is written yet.
634 unsigned char *src
=buf
+y
*stride
;
638 int old_pix
= src
[x
-1];
639 int old_sum
= old_pix
+ src
[x
-2];
640 for ( ; x
< w
; x
++) {
642 int temp2
= old_pix
+ temp1
;
644 temp1
= old_sum
+ temp2
;
647 temp2
= col_pix_buf
[x
] + temp1
;
648 col_pix_buf
[x
] = temp1
;
649 //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
650 col_sum_buf
[x
] = temp2
;
// Main rows: reads row y and writes the blurred result to row y-1.
654 //__m128i round = _mm_set1_epi16(8);
655 for (int y
= 2; y
< h
; y
++) {
656 unsigned char *src
=buf
+y
*stride
;
657 unsigned char *dst
=buf
+(y
-1)*stride
;
// old_pix_128 / old_sum_128 carry the previous pixel and previous pair sum into the
// lowest 16-bit lane of the next 8-pixel group.
661 __m128i old_pix_128
= _mm_cvtsi32_si128(src
[1]);
662 __m128i old_sum_128
= _mm_cvtsi32_si128(src
[0]+src
[1]);
663 for ( ; x
< ((w
-2)&(~7)); x
+=8) {
// Load 8 pixels and widen them to 16-bit lanes.
664 __m128i new_pix
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(src
+x
));
665 new_pix
= _mm_unpacklo_epi8(new_pix
, _mm_setzero_si128());
// temp = pixel + left neighbour (shifted copy plus the carried pixel).
666 __m128i temp
= _mm_slli_si128(new_pix
,2);
667 temp
= _mm_add_epi16(temp
, old_pix_128
);
668 temp
= _mm_add_epi16(temp
, new_pix
);
669 old_pix_128
= _mm_srli_si128(new_pix
,14);
// new_pix = pair sum + previous pair sum.
671 new_pix
= _mm_slli_si128(temp
,2);
672 new_pix
= _mm_add_epi16(new_pix
, old_sum_128
);
673 new_pix
= _mm_add_epi16(new_pix
, temp
);
674 old_sum_128
= _mm_srli_si128(temp
, 14);
// Vertical part: combine with the stored sums of the two previous rows.
676 __m128i old_col_pix
= _mm_loadu_si128( reinterpret_cast<const __m128i
*>(col_pix_buf
+x
) );
677 __m128i old_col_sum
= _mm_loadu_si128( reinterpret_cast<const __m128i
*>(col_sum_buf
+x
) );
678 _mm_storeu_si128( reinterpret_cast<__m128i
*>(col_pix_buf
+x
), new_pix
);
679 temp
= _mm_add_epi16(new_pix
, old_col_pix
);
680 _mm_storeu_si128( reinterpret_cast<__m128i
*>(col_sum_buf
+x
), temp
);
// Shift right by 4, pack to bytes and store 8 output pixels at dst + x - 1.
682 old_col_sum
= _mm_add_epi16(old_col_sum
, temp
);
683 //old_col_sum = _mm_add_epi16(old_col_sum, round);
684 old_col_sum
= _mm_srli_epi16(old_col_sum
, 4);
685 old_col_sum
= _mm_packus_epi16(old_col_sum
, old_col_sum
);
686 _mm_storel_epi64( reinterpret_cast<__m128i
*>(dst
+x
-1), old_col_sum
);
// Scalar loop for the remaining pixels of the row.
688 int old_pix
= src
[x
-1];
689 int old_sum
= old_pix
+ src
[x
-2];
690 for ( ; x
< w
; x
++) {
692 int temp2
= old_pix
+ temp1
;
694 temp1
= old_sum
+ temp2
;
697 temp2
= col_pix_buf
[x
] + temp1
;
698 col_pix_buf
[x
] = temp1
;
699 dst
[x
-1] = (col_sum_buf
[x
] + temp2
) >> 4;
700 col_sum_buf
[x
] = temp2
;
// Release both column buffers.
704 xy_free(col_sum_buf_base
);
705 xy_free(col_pix_buf_base
);
// Plain C++ counterpart of be_blur: in-place edge blur of an 8-bit plane without intrinsics.
//   buf       plane, w x h pixels, rows `stride` bytes apart; modified in place
//   tmp_base  not referenced in the lines visible here
// Two per-column WORD buffers hold the previous rows' horizontal sums; each output
// pixel is the combined sum shifted right by 4.
711 static void be_blur_c(unsigned char *buf
, unsigned *tmp_base
, int w
, int h
, int stride
)
713 WORD
*col_pix_buf_base
= reinterpret_cast<WORD
*>(xy_malloc(w
*sizeof(WORD
)));
714 WORD
*col_sum_buf_base
= reinterpret_cast<WORD
*>(xy_malloc(w
*sizeof(WORD
)));
715 if(!col_sum_buf_base
|| !col_pix_buf_base
)
717 //ToDo: error handling
720 memset(col_pix_buf_base
, 0, w
*sizeof(WORD
));
721 memset(col_sum_buf_base
, 0, w
*sizeof(WORD
));
// The working pointers are offset by -2 from the allocation base.
722 WORD
*col_pix_buf
= col_pix_buf_base
-2;//for alignment;
723 WORD
*col_sum_buf
= col_sum_buf_base
-2;//for alignment;
// Priming pass: horizontal sums are stored into col_pix_buf.
726 unsigned char *src
=buf
+y
*stride
;
729 int old_pix
= src
[x
-1];
730 int old_sum
= old_pix
+ src
[x
-2];
731 for ( ; x
< w
; x
++) {
733 int temp2
= old_pix
+ temp1
;
735 temp1
= old_sum
+ temp2
;
737 col_pix_buf
[x
] = temp1
;
// Priming pass: col_pix_buf and col_sum_buf are both updated; no pixel is written yet.
742 unsigned char *src
=buf
+y
*stride
;
746 int old_pix
= src
[x
-1];
747 int old_sum
= old_pix
+ src
[x
-2];
748 for ( ; x
< w
; x
++) {
750 int temp2
= old_pix
+ temp1
;
752 temp1
= old_sum
+ temp2
;
755 temp2
= col_pix_buf
[x
] + temp1
;
756 col_pix_buf
[x
] = temp1
;
757 //dst[x-1] = (col_sum_buf[x] + temp2) >> 4;
758 col_sum_buf
[x
] = temp2
;
// Main rows: reads row y and writes the blurred result to row y-1.
762 for (int y
= 2; y
< h
; y
++) {
763 unsigned char *src
=buf
+y
*stride
;
764 unsigned char *dst
=buf
+(y
-1)*stride
;
767 int old_pix
= src
[x
-1];
768 int old_sum
= old_pix
+ src
[x
-2];
769 for ( ; x
< w
; x
++) {
771 int temp2
= old_pix
+ temp1
;
773 temp1
= old_sum
+ temp2
;
776 temp2
= col_pix_buf
[x
] + temp1
;
777 col_pix_buf
[x
] = temp1
;
778 dst
[x
-1] = (col_sum_buf
[x
] + temp2
) >> 4;
779 col_sum_buf
[x
] = temp2
;
// Release both column buffers.
783 xy_free(col_sum_buf_base
);
784 xy_free(col_pix_buf_base
);
// In-place filter over an 8-bit plane driven by integer x/y factors.
//   buf                 plane, w x h pixels, rows `stride` bytes apart; modified in place
//   x_factor, y_factor  integer weights applied horizontally and vertically
// A per-column WORD buffer carries the weighted contribution of the previous row; each
// output pixel is the accumulated value plus 32, shifted right by 6.
787 static void Bilinear(unsigned char *buf
, int w
, int h
, int stride
, int x_factor
, int y_factor
)
789 WORD
*col_pix_buf_base
= reinterpret_cast<WORD
*>(xy_malloc(w
*sizeof(WORD
)));
790 if(!col_pix_buf_base
)
792 //ToDo: error handling
795 memset(col_pix_buf_base
, 0, w
*sizeof(WORD
));
797 for (int y
= 0; y
< h
; y
++){
798 unsigned char *src
=buf
+y
*stride
;
800 WORD
*col_pix_buf
= col_pix_buf_base
;
802 for(int x
= 0; x
< w
; x
++)
// Horizontal weighting.
805 int temp2
= temp1
*x_factor
;
// Vertical weighting; temp2 is saved for the next row.
811 temp2
= temp1
*y_factor
;
// Add the previous row's contribution, round and store.
814 temp1
+= col_pix_buf
[x
];
815 src
[x
] = ((temp1
+32)>>6);
816 col_pix_buf
[x
] = temp2
;
819 xy_free(col_pix_buf_base
);
// Renders the span lists of `scan_line_data2` into 8-bit planes of `overlay`.
//   xsub, ysub  sub-pixel offsets added to the width/height and to every span coordinate
//   overlay     receives offsets, sizes and freshly allocated, zeroed planes: mBody always,
//               mBorder only when the wide outline is not empty
// Plane cells are addressed with coordinates shifted right by 3, i.e. one cell per
// 8 sub-pixel units in each direction.
822 bool Rasterizer::Rasterize(const ScanLineData2
& scan_line_data2
, int xsub
, int ysub
, SharedPtrOverlay overlay
)
824 using namespace ::boost::flyweights
;
831 const ScanLineData
& scan_line_data
= *scan_line_data2
.m_scan_line_data
;
// Checks for an empty extent (branch body not visible here).
832 if(!scan_line_data
.mWidth
|| !scan_line_data
.mHeight
)
839 int width
= scan_line_data
.mWidth
+ xsub
;
840 int height
= scan_line_data
.mHeight
+ ysub
;
841 overlay
->mfWideOutlineEmpty
= scan_line_data2
.mWideOutline
.empty();
842 if(!overlay
->mfWideOutlineEmpty
)
// Grow the extent by the wide border (rounded up to a multiple of 8) on every side.
844 int wide_border
= (scan_line_data2
.mWideBorder
+7)&~7;
846 width
+= 2*wide_border
;
847 height
+= 2*wide_border
;
848 xsub
+= wide_border
;
849 ysub
+= wide_border
;
851 overlay
->mOffsetX
= scan_line_data2
.mPathOffsetX
- xsub
;
852 overlay
->mOffsetY
= scan_line_data2
.mPathOffsetY
- ysub
;
854 overlay
->mWidth
= width
;
855 overlay
->mHeight
= height
;
// Plane size in cells: ceil(extent / 8) + 1; the pitch is the width rounded up to 16.
856 overlay
->mOverlayWidth
= ((width
+7)>>3) + 1;
857 overlay
->mOverlayHeight
= ((height
+7)>>3) + 1;
858 overlay
->mOverlayPitch
= (overlay
->mOverlayWidth
+15)&~15;
860 BYTE
* body
= reinterpret_cast<BYTE
*>(xy_malloc(overlay
->mOverlayPitch
* overlay
->mOverlayHeight
));
// Ownership passes to the overlay with xy_free as deleter; the plane starts zeroed.
865 overlay
->mBody
.reset(body
, xy_free
);
866 memset(body
, 0, overlay
->mOverlayPitch
* overlay
->mOverlayHeight
);
868 if (!overlay
->mfWideOutlineEmpty
)
870 border
= reinterpret_cast<BYTE
*>(xy_malloc(overlay
->mOverlayPitch
* overlay
->mOverlayHeight
));
875 overlay
->mBorder
.reset(border
, xy_free
);
876 memset(border
, 0, overlay
->mOverlayPitch
* overlay
->mOverlayHeight
);
879 // Are we doing a border?
// Index 0 is the regular outline (drawn into body), index 1 the wide outline (drawn into border).
880 const tSpanBuffer
* pOutline
[2] = {&(scan_line_data
.mOutline
), &(scan_line_data2
.mWideOutline
)};
881 for(int i
= countof(pOutline
)-1; i
>= 0; i
--)
883 tSpanBuffer::const_iterator it
= pOutline
[i
]->begin();
884 tSpanBuffer::const_iterator itEnd
= pOutline
[i
]->end();
885 byte
* plan_selected
= i
==0 ? body
: border
;
886 int pitch
= overlay
->mOverlayPitch
;
887 for(; it
!=itEnd
; ++it
)
// Span decoding: y is taken from the upper 32 bits and x from the lower 32 bits of the
// packed values, each with a 0x40000000 bias removed and the sub-pixel offset added.
889 int y
= (int)(((*it
).first
>> 32) - 0x40000000 + ysub
);
890 int x1
= (int)(((*it
).first
& 0xffffffff) - 0x40000000 + xsub
);
891 int x2
= (int)(((*it
).second
& 0xffffffff) - 0x40000000 + xsub
);
895 int last
= (x2
-1)>>3;
896 byte
* dst
= plan_selected
+ (pitch
*(y
>>3) + first
);
// First cell of the span: add the part of the cell to the right of x1.
901 *dst
+= ((first
+1)<<3) - x1
;
903 while(++first
< last
)
// Last cell of the span: add the part of the cell to the left of x2.
908 *dst
+= x2
- (last
<<3);
// Gaussian blur strengths at or below this value are treated as "no Gaussian blur"
// by IsItReallyBlur and the blur routines in this file.
917 const float Rasterizer::GAUSSIAN_BLUR_THREHOLD
= 0.333333f
;
// Tells whether the given strengths amount to an actual blur. The visible condition
// singles out the case where be_strength is not positive and the Gaussian strength does
// not exceed GAUSSIAN_BLUR_THREHOLD (presumably "not a blur"; the returns are not visible here).
919 bool Rasterizer::IsItReallyBlur( float be_strength
, double gaussian_blur_strength
)
921 if (be_strength
<=0 && gaussian_blur_strength
<=GAUSSIAN_BLUR_THREHOLD
)
// Fixed-point blur path: copies the input planes into an enlarged output overlay, then
// applies the table-driven Gaussian blur and/or the \be edge-blur passes in place.
//   input_overlay            source planes and geometry
//   be_strength              \be strength; its integer part (after scaling) is the pass count
//   gaussian_blur_strength   Gaussian sigma before target scaling
//   target_scale_x/_y        per-axis scale applied to the strengths
//   output_overlay           receives the enlarged, blurred planes
928 // @return: true if a blur operation was actually performed; otherwise false, and the output is left unset.
929 // To Do: rewrite it or delete it
930 bool Rasterizer::OldFixedPointBlur(const Overlay
& input_overlay
, float be_strength
, double gaussian_blur_strength
,
931 double target_scale_x
, double target_scale_y
, SharedPtrOverlay output_overlay
)
933 using namespace ::boost::flyweights
;
935 ASSERT(IsItReallyBlur(be_strength
, gaussian_blur_strength
));
940 output_overlay
->CleanUp();
// Start from the input geometry; it is enlarged below.
942 output_overlay
->mOffsetX
= input_overlay
.mOffsetX
;
943 output_overlay
->mOffsetY
= input_overlay
.mOffsetY
;
944 output_overlay
->mWidth
= input_overlay
.mWidth
;
945 output_overlay
->mHeight
= input_overlay
.mHeight
;
946 output_overlay
->mOverlayWidth
= input_overlay
.mOverlayWidth
;
947 output_overlay
->mOverlayHeight
= input_overlay
.mOverlayHeight
;
948 output_overlay
->mfWideOutlineEmpty
= input_overlay
.mfWideOutlineEmpty
;
950 double gaussian_blur_strength_x
= gaussian_blur_strength
*target_scale_x
;
951 double gaussian_blur_strength_y
= gaussian_blur_strength
*target_scale_y
;
// Radius = (ceil(3*sigma) forced odd) / 2, with a minimum of 1 above the threshold.
953 int gaussian_blur_radius_x
= (static_cast<int>( ceil(gaussian_blur_strength_x
*3) ) | 1)/2;//fix me: rounding err?
954 int gaussian_blur_radius_y
= (static_cast<int>( ceil(gaussian_blur_strength_y
*3) ) | 1)/2;//fix me: rounding err?
955 if( gaussian_blur_radius_x
< 1 && gaussian_blur_strength
>GAUSSIAN_BLUR_THREHOLD
)
956 gaussian_blur_radius_x
= 1;//make sure that it really do a blur
957 if( gaussian_blur_radius_y
< 1 && gaussian_blur_strength
>GAUSSIAN_BLUR_THREHOLD
)
958 gaussian_blur_radius_y
= 1;//make sure that it really do a blur
// bluradjust_* is the margin added on each side, in sub-pixel units (8 per plane cell).
960 int bluradjust_x
= 0, bluradjust_y
= 0;
961 if ( IsItReallyBlur(be_strength
, gaussian_blur_strength
) )
963 if (gaussian_blur_strength
> 0)
965 bluradjust_x
+= gaussian_blur_radius_x
* 8;
966 bluradjust_y
+= gaussian_blur_radius_y
* 8;
970 int be_adjust_x
= static_cast<int>( target_scale_x
*std::sqrt(be_strength
*0.25f
)+0.5 );//fix me: rounding err?
972 int be_adjust_y
= static_cast<int>(target_scale_y
*std::sqrt(be_strength
*0.25f
)+0.5);//fix me: rounding err?
975 bluradjust_x
+= be_adjust_x
;
976 bluradjust_y
+= be_adjust_y
;
978 // Expand the buffer a bit when we're blurring, since that can also widen the borders a bit
979 bluradjust_x
= (bluradjust_x
+7)&~7;
980 bluradjust_y
= (bluradjust_y
+7)&~7;
982 output_overlay
->mOffsetX
-= bluradjust_x
;
983 output_overlay
->mOffsetY
-= bluradjust_y
;
984 output_overlay
->mWidth
+= (bluradjust_x
<<1);
985 output_overlay
->mHeight
+= (bluradjust_y
<<1);
// In plane cells the margin is bluradjust>>3 per side, hence >>2 in total.
986 output_overlay
->mOverlayWidth
+= (bluradjust_x
>>2);
987 output_overlay
->mOverlayHeight
+= (bluradjust_y
>>2);
994 output_overlay
->mOverlayPitch
= (output_overlay
->mOverlayWidth
+15)&~15;
996 BYTE
* body
= reinterpret_cast<BYTE
*>(xy_malloc(output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
));
1001 output_overlay
->mBody
.reset(body
, xy_free
);
1002 memset(body
, 0, output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
);
1003 BYTE
* border
= NULL
;
1004 if (!output_overlay
->mfWideOutlineEmpty
)
1006 border
= reinterpret_cast<BYTE
*>(xy_malloc(output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
));
1011 output_overlay
->mBorder
.reset(border
, xy_free
);
1012 memset(border
, 0, output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
);
// Copy the input planes into the enlarged output planes: i == 1 is the border, i == 0 the body.
1016 for(int i
= 1; i
>= 0; i
--)
1018 byte
* plan_selected
= i
==0 ? body
: border
;
1019 const byte
* plan_input
= i
==0 ? input_overlay
.mBody
.get() : input_overlay
.mBorder
.get();
// NOTE(review): the offset is added before the NULL test below, so a NULL `border`
// no longer compares equal to NULL whenever the offset is non-zero — confirm that
// plan_input is always NULL in that case.
1021 plan_selected
+= (bluradjust_x
>>3) + (bluradjust_y
>>3)*output_overlay
->mOverlayPitch
;
1022 if ( plan_selected
!=NULL
&& plan_input
!=NULL
)
1024 for (int j
=0;j
<input_overlay
.mOverlayHeight
;j
++)
// NOTE(review): copies a full input pitch per row, whereas BeBlur copies only
// mOverlayWidth bytes — confirm the destination row always has room.
1026 memcpy(plan_selected
, plan_input
, input_overlay
.mOverlayPitch
);
1027 plan_selected
+= output_overlay
->mOverlayPitch
;
1028 plan_input
+= input_overlay
.mOverlayPitch
;
// Scratch buffer of (pitch + 1) * (height + 1) unsigned values.
1033 ass_tmp_buf
tmp_buf( max((output_overlay
->mOverlayPitch
+1)*(output_overlay
->mOverlayHeight
+1),0) );
1034 //flyweight<key_value<int, ass_tmp_buf, ass_tmp_buf_get_size>, no_locking> tmp_buf((overlay->mOverlayWidth+1)*(overlay->mOverlayPitch+1));
1035 // Do some gaussian blur magic
1036 if ( gaussian_blur_strength
> GAUSSIAN_BLUR_THREHOLD
)
// The blur is applied to the border plane when there is one, otherwise to the body.
1038 byte
* plan_selected
= output_overlay
->mfWideOutlineEmpty
? body
: border
;
// Per-axis tables obtained through boost flyweights keyed by the scaled sigma.
1040 flyweight
<key_value
<double, ass_synth_priv
, GaussianFilterKey
<ass_synth_priv
>>, no_locking
>
1041 fw_priv_blur_x(gaussian_blur_strength_x
);
1042 flyweight
<key_value
<double, ass_synth_priv
, GaussianFilterKey
<ass_synth_priv
>>, no_locking
>
1043 fw_priv_blur_y(gaussian_blur_strength_y
);
1045 const ass_synth_priv
& priv_blur_x
= fw_priv_blur_x
.get();
1046 const ass_synth_priv
& priv_blur_y
= fw_priv_blur_y
.get();
// Only blur when the plane is at least as large as the kernel in both directions.
1047 if (output_overlay
->mOverlayWidth
>=priv_blur_x
.g_w
&& output_overlay
->mOverlayHeight
>=priv_blur_y
.g_w
)
1049 ass_gauss_blur(plan_selected
, tmp_buf
.tmp
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, output_overlay
->mOverlayPitch
,
1050 priv_blur_x
.gt2
, priv_blur_x
.g_r
, priv_blur_x
.g_w
,
1051 priv_blur_y
.gt2
, priv_blur_y
.g_r
, priv_blur_y
.g_w
);
// \be passes: the integer part of the scaled strength gives the number of full passes,
// the fractional remainder goes to xy_be_blur.
1055 float scaled_be_strength
= be_strength
* 0.5f
* (target_scale_x
+target_scale_y
);
1056 int pass_num
= static_cast<int>(scaled_be_strength
);
1057 int pitch
= output_overlay
->mOverlayPitch
;
1058 byte
* blur_plan
= output_overlay
->mfWideOutlineEmpty
? body
: border
;
1060 for (int pass
= 0; pass
< pass_num
; pass
++)
1062 if(output_overlay
->mOverlayWidth
>= 3 && output_overlay
->mOverlayHeight
>= 3)
// SSE2 version when the CPU flag is set, otherwise the plain C++ version.
1064 if (g_cpuid
.m_flags
& CCpuID::sse2
)
1066 be_blur(blur_plan
, tmp_buf
.tmp
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, pitch
);
1070 be_blur_c(blur_plan
, tmp_buf
.tmp
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, pitch
);
1074 if (scaled_be_strength
>pass_num
)
1076 xy_be_blur(blur_plan
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, pitch
,
1077 scaled_be_strength
-pass_num
, scaled_be_strength
-pass_num
);
// Blur entry point: dispatches to the fixed-point path when SSE2 is unavailable,
// otherwise to GaussianBlur and/or BeBlur depending on which strengths are set.
//   input_overlay            source planes and geometry
//   be_strength              \be strength
//   gaussian_blur_strength   Gaussian sigma before target scaling
//   target_scale_x/_y        per-axis scale applied to the strengths
//   output_overlay           receives the blurred overlay
1083 // @return: true if a blur operation was actually performed; otherwise false, and the output is left unset.
1084 bool Rasterizer::Blur(const Overlay
& input_overlay
, float be_strength
,
1085 double gaussian_blur_strength
,
1086 double target_scale_x
, double target_scale_y
,
1087 SharedPtrOverlay output_overlay
)
1089 using namespace ::boost::flyweights
;
1091 ASSERT(IsItReallyBlur(be_strength
, gaussian_blur_strength
));
// Guard against a missing output overlay or a no-op blur request.
1092 if(!output_overlay
|| !IsItReallyBlur(be_strength
, gaussian_blur_strength
))
// Guard against an empty input plane.
1096 if (input_overlay
.mOverlayWidth
<=0 || input_overlay
.mOverlayHeight
<=0)
1101 if (!(g_cpuid
.m_flags
& CCpuID::sse2
))
1103 // C code path of floating point version is extremely slow,
1104 // so we fall back to fixed point version instead
1105 return Rasterizer::OldFixedPointBlur(input_overlay
, be_strength
,
1106 gaussian_blur_strength
, target_scale_x
, target_scale_y
, output_overlay
);//fix me: important!
1109 if (gaussian_blur_strength
>0)
// Both blurs requested: Gaussian into a temporary overlay, then \be from it into the output.
1111 if (be_strength
)//this insane thing should NEVER happen
1113 SharedPtrOverlay
tmp(new Overlay());
1115 bool rv
= GaussianBlur(input_overlay
, gaussian_blur_strength
, target_scale_x
, target_scale_y
, tmp
);
1117 rv
= BeBlur(*tmp
, be_strength
, target_scale_x
, target_scale_y
, output_overlay
);
// Gaussian blur only.
1122 bool rv
= GaussianBlur(input_overlay
, gaussian_blur_strength
, target_scale_x
, target_scale_y
, output_overlay
);
// \be blur only.
1126 else if (be_strength
)
1128 bool rv
= BeBlur(input_overlay
, be_strength
, target_scale_x
, target_scale_y
, output_overlay
);
// Floating-point Gaussian blur: blurs one input plane (the border when a wide outline
// exists, otherwise the body) into an enlarged output overlay via xy_gaussian_blur.
// When a border exists, the body is additionally copied unblurred with zero padding.
//   input_overlay            source planes and geometry
//   gaussian_blur_strength   sigma before target scaling; asserted to be > 0
//   target_scale_x/_y        per-axis scale applied to sigma
//   output_overlay           receives the result; asserted non-null and cleaned up first
1134 bool Rasterizer::GaussianBlur( const Overlay
& input_overlay
, double gaussian_blur_strength
,
1135 double target_scale_x
, double target_scale_y
,
1136 SharedPtrOverlay output_overlay
)
1138 using namespace ::boost::flyweights
;
1140 ASSERT(output_overlay
);
1141 output_overlay
->CleanUp();
1142 output_overlay
->mfWideOutlineEmpty
= input_overlay
.mfWideOutlineEmpty
;
1144 ASSERT(gaussian_blur_strength
> 0);
1146 double gaussian_blur_strength_x
= gaussian_blur_strength
*target_scale_x
;
1147 double gaussian_blur_strength_y
= gaussian_blur_strength
*target_scale_y
;
// Radius = (ceil(3*sigma) forced odd) / 2, with a minimum of 1 above the threshold.
1149 int gaussian_blur_radius_x
= (static_cast<int>( ceil(gaussian_blur_strength_x
*3) ) | 1)/2;//fix me: rounding err?
1150 int gaussian_blur_radius_y
= (static_cast<int>( ceil(gaussian_blur_strength_y
*3) ) | 1)/2;//fix me: rounding err?
1151 if( gaussian_blur_radius_x
< 1 && gaussian_blur_strength
>GAUSSIAN_BLUR_THREHOLD
)
1152 gaussian_blur_radius_x
= 1;//make sure that it really do a blur
1153 if( gaussian_blur_radius_y
< 1 && gaussian_blur_strength
>GAUSSIAN_BLUR_THREHOLD
)
1154 gaussian_blur_radius_y
= 1;//make sure that it really do a blur
// Per-axis coefficient tables obtained through boost flyweights keyed by the scaled sigma.
1156 flyweight
<key_value
<double, GaussianCoefficients
, GaussianFilterKey
<GaussianCoefficients
>>, no_locking
>
1157 fw_filter_x(gaussian_blur_strength_x
);
1158 flyweight
<key_value
<double, GaussianCoefficients
, GaussianFilterKey
<GaussianCoefficients
>>, no_locking
>
1159 fw_filter_y(gaussian_blur_strength_y
);
1161 const GaussianCoefficients
& filter_x
= fw_filter_x
.get();
1162 const GaussianCoefficients
& filter_y
= fw_filter_y
.get();
// Margin per side in sub-pixel units: kernel radius times 8 (one plane cell = 8 units).
1164 int bluradjust_x
= filter_x
.g_r
* 8;
1165 int bluradjust_y
= filter_y
.g_r
* 8;
1166 output_overlay
->mOffsetX
= input_overlay
.mOffsetX
- bluradjust_x
;
1167 output_overlay
->mOffsetY
= input_overlay
.mOffsetY
- bluradjust_y
;
1168 output_overlay
->mWidth
= input_overlay
.mWidth
+ (bluradjust_x
<<1);
1169 output_overlay
->mHeight
= input_overlay
.mHeight
+ (bluradjust_y
<<1);
1170 output_overlay
->mOverlayWidth
= input_overlay
.mOverlayWidth
+ (bluradjust_x
>>2);
1171 output_overlay
->mOverlayHeight
= input_overlay
.mOverlayHeight
+ (bluradjust_y
>>2);
1173 output_overlay
->mOverlayPitch
= (output_overlay
->mOverlayWidth
+15)&~15;
// The destination plane is not zeroed here (the memset below is commented out).
1175 BYTE
* blur_plan
= reinterpret_cast<BYTE
*>(xy_malloc(output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
));
1176 //memset(blur_plan, 0, output_overlay->mOverlayPitch * output_overlay->mOverlayHeight);
1178 const BYTE
* plan_input
= input_overlay
.mfWideOutlineEmpty
? input_overlay
.mBody
.get() : input_overlay
.mBorder
.get();
1179 ASSERT(output_overlay
->mOverlayWidth
>=filter_x
.g_w
&& output_overlay
->mOverlayHeight
>=filter_y
.g_w
);
1180 xy_gaussian_blur(blur_plan
, output_overlay
->mOverlayPitch
,
1181 plan_input
, input_overlay
.mOverlayWidth
, input_overlay
.mOverlayHeight
, input_overlay
.mOverlayPitch
,
1182 filter_x
.g_f
, filter_x
.g_r
, filter_x
.g_w_ex
,
1183 filter_y
.g_f
, filter_y
.g_r
, filter_y
.g_w_ex
);
// No wide outline: the blurred plane becomes the body.
1184 if (input_overlay
.mfWideOutlineEmpty
)
1186 output_overlay
->mBody
.reset(blur_plan
, xy_free
);
// Wide outline present: the blurred plane becomes the border, and the body is copied
// unblurred into a new plane with zero padding on all four sides.
1190 output_overlay
->mBorder
.reset(blur_plan
, xy_free
);
1192 BYTE
* body
= reinterpret_cast<BYTE
*>(xy_malloc(output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
));
1197 output_overlay
->mBody
.reset(body
, xy_free
);
// Top padding rows.
1198 memset(body
, 0, output_overlay
->mOverlayPitch
* (bluradjust_y
>>3));
1199 body
+= (bluradjust_y
>>3)*output_overlay
->mOverlayPitch
;
1200 plan_input
= input_overlay
.mBody
.get();
1202 for (int j
=0;j
<input_overlay
.mOverlayHeight
;j
++)
// Left padding, the input row, then right padding.
1204 memset(body
, 0, (bluradjust_x
>>3));
1205 memcpy(body
+(bluradjust_x
>>3), plan_input
, input_overlay
.mOverlayWidth
);
1206 memset(body
+(bluradjust_x
>>3)+input_overlay
.mOverlayWidth
, 0, (bluradjust_x
>>3));
1207 body
+= output_overlay
->mOverlayPitch
;
1208 plan_input
+= input_overlay
.mOverlayPitch
;
// Bottom padding rows.
1210 memset(body
, 0, output_overlay
->mOverlayPitch
* (bluradjust_y
>>3));
// \be edge blur: copies the input planes into an enlarged, zeroed output overlay and
// runs the edge-blur passes in place on the border plane when there is one, otherwise
// on the body.
//   input_overlay      source planes and geometry
//   be_strength        \be strength; asserted to be > 0
//   target_scale_x/_y  per-axis scale; asserted to be > 0
//   output_overlay     receives the result; asserted non-null and cleaned up first
1215 bool Rasterizer::BeBlur( const Overlay
& input_overlay
, float be_strength
,
1216 float target_scale_x
, float target_scale_y
, SharedPtrOverlay output_overlay
)
1218 ASSERT(output_overlay
);
1219 output_overlay
->CleanUp();
1220 output_overlay
->mfWideOutlineEmpty
= input_overlay
.mfWideOutlineEmpty
;
1222 ASSERT(be_strength
>0 && target_scale_x
>0 && target_scale_y
>0);
// Margin per side: scale * sqrt(be_strength / 4), rounded to nearest.
1223 int bluradjust_x
= static_cast<int>( target_scale_x
*std::sqrt(be_strength
*0.25f
)+0.5 );//fix me: rounding err?
1225 int bluradjust_y
= static_cast<int>(target_scale_y
*std::sqrt(be_strength
*0.25f
)+0.5);//fix me: rounding err?
1228 output_overlay
->mOffsetX
= input_overlay
.mOffsetX
- bluradjust_x
;
1229 output_overlay
->mOffsetY
= input_overlay
.mOffsetY
- bluradjust_y
;
1230 output_overlay
->mWidth
= input_overlay
.mWidth
+ (bluradjust_x
<<1);
1231 output_overlay
->mHeight
= input_overlay
.mHeight
+ (bluradjust_y
<<1);
1232 output_overlay
->mOverlayWidth
= input_overlay
.mOverlayWidth
+ (bluradjust_x
>>2);
1233 output_overlay
->mOverlayHeight
= input_overlay
.mOverlayHeight
+ (bluradjust_y
>>2);
1235 output_overlay
->mOverlayPitch
= (output_overlay
->mOverlayWidth
+15)&~15;
1237 BYTE
* body
= reinterpret_cast<BYTE
*>(xy_malloc(output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
));
1242 output_overlay
->mBody
.reset(body
, xy_free
);
1243 memset(body
, 0, output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
);
1244 BYTE
* border
= NULL
;
1245 if (!output_overlay
->mfWideOutlineEmpty
)
1247 border
= reinterpret_cast<BYTE
*>(xy_malloc(output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
));
1252 output_overlay
->mBorder
.reset(border
, xy_free
);
1253 memset(border
, 0, output_overlay
->mOverlayPitch
* output_overlay
->mOverlayHeight
);
// Copy the input planes into the enlarged output planes: i == 1 is the border, i == 0 the body.
1257 for(int i
= 1; i
>= 0; i
--)
1259 byte
* plan_selected
= i
==0 ? body
: border
;
1260 const byte
* plan_input
= i
==0 ? input_overlay
.mBody
.get() : input_overlay
.mBorder
.get();
// NOTE(review): the offset is added before the NULL test below, so a NULL `border`
// no longer compares equal to NULL whenever the offset is non-zero — confirm that
// plan_input is always NULL in that case.
1262 plan_selected
+= (bluradjust_x
>>3) + (bluradjust_y
>>3)*output_overlay
->mOverlayPitch
;
1263 if ( plan_selected
!=NULL
&& plan_input
!=NULL
)
1265 for (int j
=0;j
<input_overlay
.mOverlayHeight
;j
++)
// Copies mOverlayWidth bytes of each input row.
1267 memcpy(plan_selected
, plan_input
, input_overlay
.mOverlayWidth
*sizeof(plan_input
[0]));
1268 plan_selected
+= output_overlay
->mOverlayPitch
;
1269 plan_input
+= input_overlay
.mOverlayPitch
;
// The integer part of the scaled strength gives the number of full passes; the
// fractional remainder goes to xy_be_blur.
1278 float scaled_be_strength
= be_strength
* 0.5f
* (target_scale_x
+target_scale_y
);
1279 int pass_num
= static_cast<int>(scaled_be_strength
);
1280 int pitch
= output_overlay
->mOverlayPitch
;
1281 byte
* blur_plan
= output_overlay
->mfWideOutlineEmpty
? body
: border
;
// Scratch buffer of (pitch + 1) * (height + 1) unsigned values.
1282 ass_tmp_buf
tmp_buf( max((output_overlay
->mOverlayPitch
+1)*(output_overlay
->mOverlayHeight
+1),0) );
1283 for (int pass
= 0; pass
< pass_num
; pass
++)
1285 if(output_overlay
->mOverlayWidth
>= 3 && output_overlay
->mOverlayHeight
>= 3)
// SSE2 version when the CPU flag is set, otherwise the plain C++ version.
1287 if (g_cpuid
.m_flags
& CCpuID::sse2
)
1289 be_blur(blur_plan
, tmp_buf
.tmp
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, pitch
);
1293 be_blur_c(blur_plan
, tmp_buf
.tmp
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, pitch
);
1297 if (scaled_be_strength
>pass_num
)
1299 xy_be_blur(blur_plan
, output_overlay
->mOverlayWidth
, output_overlay
->mOverlayHeight
, pitch
,
1300 scaled_be_strength
-pass_num
, scaled_be_strength
-pass_num
);
1306 ///////////////////////////////////////////////////////////////////////////
// Blends `color` into the 32-bit pixel *dst. The two colour-channel pairs (0x00ff00ff and
// 0x0000ff00) are weighted by a for the colour and ia for the destination, then shifted
// down by 8; the top byte of *dst is scaled by ia alone.
// NOTE(review): the definitions of a and ia are not visible here — per the comment
// below they are expected to lie in 1..256.
1308 static __forceinline
void pixmix(DWORD
*dst
, DWORD color
, DWORD alpha
)
1311 // Make sure both a and ia are in range 1..256 for the >>8 operations below to be correct
1314 *dst
= ((((*dst
&0x00ff00ff)*ia
+ (color
&0x00ff00ff)*a
)&0xff00ff00)>>8)
1315 | ((((*dst
&0x0000ff00)*ia
+ (color
&0x0000ff00)*a
)&0x00ff0000)>>8)
1316 | ((((*dst
>>8)&0x00ff0000)*ia
)&0xff000000);
1319 static __forceinline
void pixmix2(DWORD
*dst
, DWORD color
, DWORD shapealpha
, DWORD clipalpha
)
1321 int a
= (((shapealpha
)*(clipalpha
)*(color
>>24))>>12)&0xff;
1324 *dst
= ((((*dst
&0x00ff00ff)*ia
+ (color
&0x00ff00ff)*a
)&0xff00ff00)>>8)
1325 | ((((*dst
&0x0000ff00)*ia
+ (color
&0x0000ff00)*a
)&0x00ff0000)>>8)
1326 | ((((*dst
>>8)&0x00ff0000)*ia
)&0xff000000);
1329 #include <xmmintrin.h>
1330 #include <emmintrin.h>
1332 static __forceinline
void pixmix_sse2(DWORD
* dst
, DWORD color
, DWORD alpha
)
1334 // alpha = (((alpha) * (color>>24)) >> 6) & 0xff;
1336 __m128i zero
= _mm_setzero_si128();
1337 __m128i a
= _mm_set1_epi32(((alpha
+1) << 16) | (0x100 - alpha
));
1338 __m128i d
= _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst
), zero
);
1339 __m128i s
= _mm_unpacklo_epi8(_mm_cvtsi32_si128(color
), zero
);
1340 __m128i r
= _mm_unpacklo_epi16(d
, s
);
1341 r
= _mm_madd_epi16(r
, a
);
1342 r
= _mm_srli_epi32(r
, 8);
1343 r
= _mm_packs_epi32(r
, r
);
1344 r
= _mm_packus_epi16(r
, r
);
1345 *dst
= (DWORD
)_mm_cvtsi128_si32(r
);
1348 static __forceinline
void pixmix2_sse2(DWORD
* dst
, DWORD color
, DWORD shapealpha
, DWORD clipalpha
)
1350 int alpha
= (((shapealpha
)*(clipalpha
)*(color
>>24))>>12)&0xff;
1352 __m128i zero
= _mm_setzero_si128();
1353 __m128i a
= _mm_set1_epi32(((alpha
+1) << 16) | (0x100 - alpha
));
1354 __m128i d
= _mm_unpacklo_epi8(_mm_cvtsi32_si128(*dst
), zero
);
1355 __m128i s
= _mm_unpacklo_epi8(_mm_cvtsi32_si128(color
), zero
);
1356 __m128i r
= _mm_unpacklo_epi16(d
, s
);
1357 r
= _mm_madd_epi16(r
, a
);
1358 r
= _mm_srli_epi32(r
, 8);
1359 r
= _mm_packs_epi32(r
, r
);
1360 r
= _mm_packus_epi16(r
, r
);
1361 *dst
= (DWORD
)_mm_cvtsi128_si32(r
);
1364 #include <mmintrin.h>
1366 // Calculate a - b clamping to 0 instead of underflowing
1367 static __forceinline DWORD
safe_subtract(DWORD a
, DWORD b
)
1369 __m64 ap
= _mm_cvtsi32_si64(a
);
1370 __m64 bp
= _mm_cvtsi32_si64(b
);
1371 __m64 rp
= _mm_subs_pu16(ap
, bp
);
1372 DWORD r
= (DWORD
)_mm_cvtsi64_si32(rp
);
1375 //return (b > a) ? 0 : a - b;
1379 * No aligned requirement
1382 void AlphaBlt(byte
* pY
,
1383 const byte
* pAlphaMask
,
1385 int h
, int w
, int src_stride
, int dst_stride
)
1387 __m128i zero
= _mm_setzero_si128();
1388 __m128i s
= _mm_set1_epi16(Y
); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
1390 if( w
>16 )//IMPORTANT! The result of the following code is undefined with w<15.
1392 for( ; h
>0; h
--, pAlphaMask
+= src_stride
, pY
+= dst_stride
)
1394 const BYTE
* sa
= pAlphaMask
;
1396 const BYTE
* dy_first_mod16
= reinterpret_cast<BYTE
*>((reinterpret_cast<int>(pY
)+15)&~15); //IMPORTANT! w must >= 15
1397 const BYTE
* dy_end_mod16
= reinterpret_cast<BYTE
*>(reinterpret_cast<int>(pY
+w
)&~15);
1398 const BYTE
* dy_end
= pY
+ w
;
1400 for(;dy
< dy_first_mod16
; sa
++, dy
++)
1402 *dy
= (*dy
* (256 - *sa
)+ Y
*(*sa
+1))>>8;
1404 for(; dy
< dy_end_mod16
; sa
+=8, dy
+=16)
1406 __m128i a
= _mm_loadl_epi64((__m128i
*)sa
);
1409 __m128i d
= _mm_load_si128((__m128i
*)dy
);
1411 //__m128i ones = _mm_cmpeq_epi32(zero,zero); //ones = ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
1412 //__m128i ia = _mm_xor_si128(a,ones); //ia = ~a
1413 //ia = _mm_unpacklo_epi8(ia,zero); //ia = ~a0 0 ~a1 0 ~a2 0 ~a3 0 ~a4 0 ~a5 0 ~a6 0 ~a7 0
1414 a
= _mm_unpacklo_epi8(a
,zero
); //a= a0 0 a1 0 a2 0 a3 0 a4 0 a5 0 a6 0 a7 0
1415 __m128i ones
= _mm_set1_epi16(256); //ones = 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1
1416 __m128i ia
= _mm_sub_epi16(ones
, a
); //ia = 256-a0 ... 256-a7
1417 ones
= _mm_srli_epi16(ones
, 8);
1418 a
= _mm_add_epi16(a
, ones
); //a= 1+a0 ... 1+a7
1420 __m128i dl
= _mm_unpacklo_epi8(d
,zero
); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
1421 __m128i sl
= _mm_mullo_epi16(s
,a
); //sl = c0*a0 c1*a1 ... c7*a7
1423 dl
= _mm_mullo_epi16(dl
,ia
); //d = b0*~a0 b1*~a1 ... b7*~a7
1425 dl
= _mm_add_epi16(dl
,sl
); //d = d + sl
1426 dl
= _mm_srli_epi16(dl
, 8); //d = d>>8
1429 a
= _mm_loadl_epi64((__m128i
*)sa
);
1431 a
= _mm_unpacklo_epi8(a
,zero
);
1432 ones
= _mm_slli_epi16(ones
, 8);
1433 ia
= _mm_sub_epi16(ones
, a
);
1434 ones
= _mm_srli_epi16(ones
, 8);
1435 a
= _mm_add_epi16(a
,ones
);
1437 d
= _mm_unpackhi_epi8(d
,zero
);
1438 sl
= _mm_mullo_epi16(s
,a
);
1439 d
= _mm_mullo_epi16(d
,ia
);
1440 d
= _mm_add_epi16(d
,sl
);
1441 d
= _mm_srli_epi16(d
, 8);
1443 dl
= _mm_packus_epi16(dl
,d
);
1445 _mm_store_si128((__m128i
*)dy
, dl
);
1447 for(;dy
< dy_end
; sa
++, dy
++)
1449 *dy
= (*dy
* (256 - *sa
)+ Y
*(*sa
+1))>>8;
1455 for( ; h
>0; h
--, pAlphaMask
+= src_stride
, pY
+= dst_stride
)
1457 const BYTE
* sa
= pAlphaMask
;
1459 const BYTE
* dy_end
= pY
+ w
;
1461 for(;dy
< dy_end
; sa
++, dy
++)
1463 *dy
= (*dy
* (256 - *sa
)+ Y
*(*sa
+1))>>8;
1471 * No aligned requirement
1474 void AlphaBlt(byte
* pY
,
1477 int h
, int w
, int dst_stride
)
1479 int yPremul
= Y
*(alpha
+1);
1480 int dstAlpha
= 0x100 - alpha
;
1481 if( w
>32 )//IMPORTANT! The result of the following code is undefined with w<15.
1483 __m128i zero
= _mm_setzero_si128();
1484 __m128i s
= _mm_set1_epi16(yPremul
); //s = c 0 c 0 c 0 c 0 c 0 c 0 c 0 c 0
1485 __m128i ia
= _mm_set1_epi16(dstAlpha
);
1486 for( ; h
>0; h
--, pY
+= dst_stride
)
1489 const BYTE
* dy_first_mod16
= reinterpret_cast<BYTE
*>((reinterpret_cast<int>(pY
)+15)&~15); //IMPORTANT! w must >= 15
1490 const BYTE
* dy_end_mod16
= reinterpret_cast<BYTE
*>(reinterpret_cast<int>(pY
+w
)&~15);
1491 const BYTE
* dy_end
= pY
+ w
;
1493 for(;dy
< dy_first_mod16
; dy
++)
1495 *dy
= (*dy
* dstAlpha
+ yPremul
)>>8;
1497 for(; dy
< dy_end_mod16
; dy
+=16)
1500 __m128i d
= _mm_load_si128(reinterpret_cast<const __m128i
*>(dy
));
1501 __m128i dl
= _mm_unpacklo_epi8(d
,zero
); //d = b0 0 b1 0 b2 0 b3 0 b4 0 b5 0 b6 0 b7 0
1503 dl
= _mm_mullo_epi16(dl
,ia
); //d = b0*~a0 b1*~a1 ... b7*~a7
1504 dl
= _mm_adds_epu16(dl
,s
); //d = d + s
1505 dl
= _mm_srli_epi16(dl
, 8); //d = d>>8
1507 d
= _mm_unpackhi_epi8(d
,zero
);
1508 d
= _mm_mullo_epi16(d
,ia
);
1509 d
= _mm_adds_epu16(d
,s
);
1510 d
= _mm_srli_epi16(d
, 8);
1512 dl
= _mm_packus_epi16(dl
,d
);
1514 _mm_store_si128(reinterpret_cast<__m128i
*>(dy
), dl
);
1516 for(;dy
< dy_end
; dy
++)
1518 *dy
= (*dy
* dstAlpha
+ yPremul
)>>8;
1524 for( ; h
>0; h
--, pY
+= dst_stride
)
1527 const BYTE
* dy_end
= pY
+ w
;
1529 for(;dy
< dy_end
; dy
++)
1531 *dy
= (*dy
* dstAlpha
+ yPremul
)>>8;
1539 * No aligned requirement
1542 void AlphaBltC(byte
* pY
,
1545 int h
, int w
, int dst_stride
)
1547 int yPremul
= Y
*(alpha
+1);
1548 int dstAlpha
= 0x100 - alpha
;
1550 for( ; h
>0; h
--, pY
+= dst_stride
)
1553 const BYTE
* dy_end
= pY
+ w
;
1555 for(;dy
< dy_end
; dy
++)
1557 *dy
= (*dy
* dstAlpha
+ yPremul
)>>8;
1562 // For CPUID usage in Rasterizer::Draw
1563 #include "../dsutil/vd.h"
1565 void OverlapRegion(tSpanBuffer
& dst
, const tSpanBuffer
& src
, int dx
, int dy
)
1568 temp
.reserve(dst
.size() + src
.size());
1570 tSpanBuffer::iterator itA
= temp
.begin();
1571 tSpanBuffer::iterator itAE
= temp
.end();
1572 tSpanBuffer::const_iterator itB
= src
.begin();
1573 tSpanBuffer::const_iterator itBE
= src
.end();
1574 // Don't worry -- even if dy<0 this will still work! // G: hehe, the evil twin :)
1575 unsigned __int64 offset1
= (((__int64
)dy
)<<32) - dx
;
1576 unsigned __int64 offset2
= (((__int64
)dy
)<<32) + dx
;
1577 while(itA
!= itAE
&& itB
!= itBE
)
1579 if((*itB
).first
+ offset1
< (*itA
).first
)
1581 // B span is earlier. Use it.
1582 unsigned __int64 x1
= (*itB
).first
+ offset1
;
1583 unsigned __int64 x2
= (*itB
).second
+ offset2
;
1585 // B spans don't overlap, so begin merge loop with A first.
1588 // If we run out of A spans or the A span doesn't overlap,
1589 // then the next B span can't either (because B spans don't
1590 // overlap) and we exit.
1591 if(itA
== itAE
|| (*itA
).first
> x2
)
1593 do {x2
= _MAX(x2
, (*itA
++).second
);}
1594 while(itA
!= itAE
&& (*itA
).first
<= x2
);
1595 // If we run out of B spans or the B span doesn't overlap,
1596 // then the next A span can't either (because A spans don't
1597 // overlap) and we exit.
1598 if(itB
== itBE
|| (*itB
).first
+ offset1
> x2
)
1600 do {x2
= _MAX(x2
, (*itB
++).second
+ offset2
);}
1601 while(itB
!= itBE
&& (*itB
).first
+ offset1
<= x2
);
1604 dst
.push_back(tSpan(x1
, x2
));
1608 // A span is earlier. Use it.
1609 unsigned __int64 x1
= (*itA
).first
;
1610 unsigned __int64 x2
= (*itA
).second
;
1612 // A spans don't overlap, so begin merge loop with B first.
1615 // If we run out of B spans or the B span doesn't overlap,
1616 // then the next A span can't either (because A spans don't
1617 // overlap) and we exit.
1618 if(itB
== itBE
|| (*itB
).first
+ offset1
> x2
)
1620 do {x2
= _MAX(x2
, (*itB
++).second
+ offset2
);}
1621 while(itB
!= itBE
&& (*itB
).first
+ offset1
<= x2
);
1622 // If we run out of A spans or the A span doesn't overlap,
1623 // then the next B span can't either (because B spans don't
1624 // overlap) and we exit.
1625 if(itA
== itAE
|| (*itA
).first
> x2
)
1627 do {x2
= _MAX(x2
, (*itA
++).second
);}
1628 while(itA
!= itAE
&& (*itA
).first
<= x2
);
1631 dst
.push_back(tSpan(x1
, x2
));
1634 // Copy over leftover spans.
1636 dst
.push_back(*itA
++);
1639 dst
.push_back(tSpan((*itB
).first
+ offset1
, (*itB
).second
+ offset2
));
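// Build the alpha mask used to render a subpicture (overlay) onto a surface.
// overlay is the rasterized subtitle to composite.
// clipRect is a rectangular clip region to render inside.
// alpha_mask is an optional alpha clipping mask (may be NULL); when present, the clip region is further limited to its rectangle.
// xsub and ysub are offsets in 1/8 pixel units that are added to the overlay's own mOffsetX / mOffsetY.
// switchpts is an array of fill colours interlaced with coordinates:
// switchpts[i*2] contains a colour and switchpts[i*2+1] contains the coordinate to use that colour from.
// fBody tells whether to render the body of the subs.
// fBorder tells whether to render the border of the subs.
// outputDirtyRect receives the rectangle that is actually covered (empty when there is nothing to render).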
1653 SharedPtrByte
Rasterizer::CompositeAlphaMask(const SharedPtrOverlay
& overlay
, const CRect
& clipRect
,
1654 const GrayImage2
* alpha_mask
,
1655 int xsub
, int ysub
, const DWORD
* switchpts
, bool fBody
, bool fBorder
,
1656 CRect
*outputDirtyRect
)
1658 //fix me: check and log error
1659 SharedPtrByte result
;
1660 *outputDirtyRect
= CRect(0, 0, 0, 0);
1661 if (!switchpts
|| !fBody
&& !fBorder
) return result
;
1662 if (fBorder
&& !overlay
->mBorder
) return result
;
1665 if (alpha_mask
!=NULL
)
1667 r
&= CRect(alpha_mask
->left_top
, alpha_mask
->size
);
1670 // Remember that all subtitle coordinates are specified in 1/8 pixels
1671 // (x+4)>>3 rounds to nearest whole pixel.
1672 // ??? What is xsub, ysub, mOffsetX and mOffsetY ?
1673 int x
= (xsub
+ overlay
->mOffsetX
+ 4)>>3;
1674 int y
= (ysub
+ overlay
->mOffsetY
+ 4)>>3;
1675 int w
= overlay
->mOverlayWidth
;
1676 int h
= overlay
->mOverlayHeight
;
1679 if(x
< r
.left
) {xo
= r
.left
-x
; w
-= r
.left
-x
; x
= r
.left
;}
1680 if(y
< r
.top
) {yo
= r
.top
-y
; h
-= r
.top
-y
; y
= r
.top
;}
1681 if(x
+w
> r
.right
) w
= r
.right
-x
;
1682 if(y
+h
> r
.bottom
) h
= r
.bottom
-y
;
1683 // Check if there's actually anything to render
1684 if(w
<= 0 || h
<= 0) return(result
);
1685 outputDirtyRect
->SetRect(x
, y
, x
+w
, y
+h
);
1687 bool fSingleColor
= (switchpts
[1]==0xffffffff);
1690 // Grab the first colour
1691 DWORD color
= switchpts
[0];
1692 byte
* s_base
= (byte
*)xy_malloc(overlay
->mOverlayPitch
* overlay
->mOverlayHeight
);
1693 const byte
* alpha_mask_data
= alpha_mask
!= NULL
? alpha_mask
->data
.get() : NULL
;
1694 const int alpha_mask_pitch
= alpha_mask
!= NULL
? alpha_mask
->pitch
: 0;
1695 if(alpha_mask_data
!=NULL
)
1696 alpha_mask_data
+= alpha_mask
->pitch
* y
+ x
- alpha_mask
->left_top
.y
*alpha_mask
->pitch
- alpha_mask
->left_top
.x
;
1700 overlay
->FillAlphaMash(s_base
, fBody
, fBorder
, xo
, yo
, w
, h
,
1701 alpha_mask_data
, alpha_mask_pitch
,
1707 const DWORD
*sw
= switchpts
;
1708 while( last_x
<w
+xo
)
1710 byte alpha
= sw
[0]>>24;
1711 while( sw
[3]<w
+xo
&& (sw
[2]>>24)==alpha
)
1715 int new_x
= sw
[3] < w
+xo
? sw
[3] : w
+xo
;
1716 overlay
->FillAlphaMash(s_base
, fBody
, fBorder
,
1717 last_x
, yo
, new_x
-last_x
, h
,
1718 alpha_mask_data
, alpha_mask_pitch
,
1724 result
.reset( s_base
, xy_free
);
1730 // draw overlay[clipRect] to bitmap[0,0,w,h]
1732 void Rasterizer::Draw(XyBitmap
* bitmap
, SharedPtrOverlay overlay
, const CRect
& clipRect
, byte
* s_base
,
1733 int xsub
, int ysub
, const DWORD
* switchpts
, bool fBody
, bool fBorder
)
1735 if (!switchpts
|| !fBody
&& !fBorder
) return;
1742 // Limit drawn area to rectangular clip area
1744 // Remember that all subtitle coordinates are specified in 1/8 pixels
1745 // (x+4)>>3 rounds to nearest whole pixel.
1746 int overlayPitch
= overlay
->mOverlayPitch
;
1747 int x
= (xsub
+ overlay
->mOffsetX
+ 4)>>3;
1748 int y
= (ysub
+ overlay
->mOffsetY
+ 4)>>3;
1749 int w
= overlay
->mOverlayWidth
;
1750 int h
= overlay
->mOverlayHeight
;
1753 if(x
< r
.left
) {xo
= r
.left
-x
; w
-= r
.left
-x
; x
= r
.left
;}
1754 if(y
< r
.top
) {yo
= r
.top
-y
; h
-= r
.top
-y
; y
= r
.top
;}
1755 if(x
+w
> r
.right
) w
= r
.right
-x
;
1756 if(y
+h
> r
.bottom
) h
= r
.bottom
-y
;
1757 // Check if there's actually anything to render
1758 if (w
<= 0 || h
<= 0) return;
1759 // must have enough space to draw into
1760 ASSERT(x
>= bitmap
->x
&& y
>= bitmap
->y
&& x
+w
<= bitmap
->x
+ bitmap
->w
&& y
+h
<= bitmap
->y
+ bitmap
->h
);
1763 bool fSSE2
= !!(g_cpuid
.m_flags
& CCpuID::sse2
);
1764 bool fSingleColor
= (switchpts
[1]==0xffffffff);
1765 bool PLANAR
= (bitmap
->type
==XyBitmap::PLANNA
);
1766 int draw_method
= 0;
1768 draw_method
|= DM::SINGLE_COLOR
;
1770 draw_method
|= DM::SSE2
;
1772 draw_method
|= DM::AYUV_PLANAR
;
1775 // Grab the first colour
1776 DWORD color
= switchpts
[0];
1777 const byte
* s
= s_base
+ overlay
->mOverlayPitch
*yo
+ xo
;
1780 if (bitmap
->type
==XyBitmap::PLANNA
)
1781 dst_offset
= bitmap
->pitch
*(y
-bitmap
->y
) + x
- bitmap
->x
;
1783 dst_offset
= bitmap
->pitch
*(y
-bitmap
->y
) + (x
- bitmap
->x
)*4;
1784 unsigned long* dst
= (unsigned long*)((BYTE
*)bitmap
->plans
[0] + dst_offset
);
1786 // Every remaining line in the bitmap to be rendered...
1789 case DM::SINGLE_COLOR
| DM::SSE2
| 0*DM::AYUV_PLANAR
:
1793 for(int wt
=0; wt
<w
; ++wt
)
1794 // The <<6 is due to pixmix expecting the alpha parameter to be
1795 // the multiplication of two 6-bit unsigned numbers but we
1796 // only have one here. (No alpha mask.)
1797 pixmix_sse2(&dst
[wt
], color
, s
[wt
]);
1799 dst
= (unsigned long *)((char *)dst
+ bitmap
->pitch
);
1803 case DM::SINGLE_COLOR
| 0*DM::SSE2
| 0*DM::AYUV_PLANAR
:
1807 for(int wt
=0; wt
<w
; ++wt
)
1808 pixmix(&dst
[wt
], color
, s
[wt
]);
1810 dst
= (unsigned long *)((char *)dst
+ bitmap
->pitch
);
1814 case 0*DM::SINGLE_COLOR
| DM::SSE2
| 0*DM::AYUV_PLANAR
:
1818 const DWORD
*sw
= switchpts
;
1819 for(int wt
=0; wt
<w
; ++wt
)
1821 // xo is the offset (usually negative) we have moved into the image
1822 // So if we have passed the switchpoint (?) switch to another colour
1823 // (So switchpts stores both colours *and* coordinates?)
1824 if(wt
+xo
>= sw
[1]) {while(wt
+xo
>= sw
[1]) sw
+= 2; color
= sw
[-2];}
1825 pixmix_sse2(&dst
[wt
], color
, s
[wt
]);
1828 dst
= (unsigned long *)((char *)dst
+ bitmap
->pitch
);
1832 case 0*DM::SINGLE_COLOR
| 0*DM::SSE2
| 0*DM::AYUV_PLANAR
:
1836 const DWORD
*sw
= switchpts
;
1837 for(int wt
=0; wt
<w
; ++wt
)
1839 if(wt
+xo
>= sw
[1]) {while(wt
+xo
>= sw
[1]) sw
+= 2; color
= sw
[-2];}
1840 pixmix(&dst
[wt
], color
, s
[wt
]);
1843 dst
= (unsigned long *)((char *)dst
+ bitmap
->pitch
);
1847 case DM::SINGLE_COLOR
| DM::SSE2
| DM::AYUV_PLANAR
:
1849 unsigned char* dst_A
= bitmap
->plans
[0] + dst_offset
;
1850 unsigned char* dst_Y
= bitmap
->plans
[1] + dst_offset
;
1851 unsigned char* dst_U
= bitmap
->plans
[2] + dst_offset
;
1852 unsigned char* dst_V
= bitmap
->plans
[3] + dst_offset
;
1854 AlphaBlt(dst_Y
, s
, ((color
)>>16)&0xff, h
, w
, overlayPitch
, bitmap
->pitch
);
1855 AlphaBlt(dst_U
, s
, ((color
)>>8)&0xff, h
, w
, overlayPitch
, bitmap
->pitch
);
1856 AlphaBlt(dst_V
, s
, ((color
))&0xff, h
, w
, overlayPitch
, bitmap
->pitch
);
1857 AlphaBlt(dst_A
, s
, 0, h
, w
, overlayPitch
, bitmap
->pitch
);
1860 case 0*DM::SINGLE_COLOR
| DM::SSE2
| DM::AYUV_PLANAR
:
1862 unsigned char* dst_A
= bitmap
->plans
[0] + dst_offset
;
1863 unsigned char* dst_Y
= bitmap
->plans
[1] + dst_offset
;
1864 unsigned char* dst_U
= bitmap
->plans
[2] + dst_offset
;
1865 unsigned char* dst_V
= bitmap
->plans
[3] + dst_offset
;
1867 const DWORD
*sw
= switchpts
;
1872 int new_x
= sw
[3] < w
+xo
? sw
[3] : w
+xo
;
1875 if( new_x
< last_x
)
1877 AlphaBlt(dst_Y
, s
+ last_x
- xo
, (color
>>16)&0xff, h
, new_x
-last_x
, overlayPitch
, bitmap
->pitch
);
1878 AlphaBlt(dst_U
, s
+ last_x
- xo
, (color
>>8)&0xff, h
, new_x
-last_x
, overlayPitch
, bitmap
->pitch
);
1879 AlphaBlt(dst_V
, s
+ last_x
- xo
, (color
)&0xff, h
, new_x
-last_x
, overlayPitch
, bitmap
->pitch
);
1880 AlphaBlt(dst_A
, s
+ last_x
- xo
, 0, h
, new_x
-last_x
, overlayPitch
, bitmap
->pitch
);
1882 dst_A
+= new_x
- last_x
;
1883 dst_Y
+= new_x
- last_x
;
1884 dst_U
+= new_x
- last_x
;
1885 dst_V
+= new_x
- last_x
;
1890 case DM::SINGLE_COLOR
| 0*DM::SSE2
| DM::AYUV_PLANAR
:
1892 // char * debug_dst=(char*)dst;int h2 = h;
1893 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", (char*)&color, sizeof(color)) );
1894 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1895 // debug_dst += spd.pitch*spd.h;
1896 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1897 // debug_dst += spd.pitch*spd.h;
1898 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1899 // debug_dst += spd.pitch*spd.h;
1900 // XY_DO_ONCE( xy_logger::write_file("G:\\b2_rt", debug_dst, (h2-1)*spd.pitch) );
1901 // debug_dst=(char*)dst;
1903 unsigned char* dst_A
= bitmap
->plans
[0] + dst_offset
;
1904 unsigned char* dst_Y
= bitmap
->plans
[1] + dst_offset
;
1905 unsigned char* dst_U
= bitmap
->plans
[2] + dst_offset
;
1906 unsigned char* dst_V
= bitmap
->plans
[3] + dst_offset
;
1909 for(int wt
=0; wt
<w
; ++wt
)
1911 DWORD temp
= COMBINE_AYUV(dst_A
[wt
], dst_Y
[wt
], dst_U
[wt
], dst_V
[wt
]);
1912 pixmix(&temp
, color
, s
[wt
]);
1913 SPLIT_AYUV(temp
, dst_A
+wt
, dst_Y
+wt
, dst_U
+wt
, dst_V
+wt
);
1916 dst_A
+= bitmap
->pitch
;
1917 dst_Y
+= bitmap
->pitch
;
1918 dst_U
+= bitmap
->pitch
;
1919 dst_V
+= bitmap
->pitch
;
1921 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1922 // debug_dst += spd.pitch*spd.h;
1923 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1924 // debug_dst += spd.pitch*spd.h;
1925 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1926 // debug_dst += spd.pitch*spd.h;
1927 // XY_DO_ONCE( xy_logger::write_file("G:\\a2_rt", debug_dst, (h2-1)*spd.pitch) );
1930 case 0*DM::SINGLE_COLOR
| 0*DM::SSE2
| DM::AYUV_PLANAR
:
1932 unsigned char* dst_A
= bitmap
->plans
[0] + dst_offset
;
1933 unsigned char* dst_Y
= bitmap
->plans
[1] + dst_offset
;
1934 unsigned char* dst_U
= bitmap
->plans
[2] + dst_offset
;
1935 unsigned char* dst_V
= bitmap
->plans
[3] + dst_offset
;
1938 const DWORD
*sw
= switchpts
;
1939 for(int wt
=0; wt
<w
; ++wt
)
1941 if(wt
+xo
>= sw
[1]) {while(wt
+xo
>= sw
[1]) sw
+= 2; color
= sw
[-2];}
1942 DWORD temp
= COMBINE_AYUV(dst_A
[wt
], dst_Y
[wt
], dst_U
[wt
], dst_V
[wt
]);
1943 pixmix(&temp
, color
, (s
[wt
]*(color
>>24))>>8);
1944 SPLIT_AYUV(temp
, dst_A
+wt
, dst_Y
+wt
, dst_U
+wt
, dst_V
+wt
);
1947 dst_A
+= bitmap
->pitch
;
1948 dst_Y
+= bitmap
->pitch
;
1949 dst_U
+= bitmap
->pitch
;
1950 dst_V
+= bitmap
->pitch
;
1955 // Remember to EMMS!
1956 // Rendering fails in funny ways if we don't do this.
1961 void Rasterizer::FillSolidRect(SubPicDesc
& spd
, int x
, int y
, int nWidth
, int nHeight
, DWORD argb
)
1963 bool fSSE2
= !!(g_cpuid
.m_flags
& CCpuID::sse2
);
1964 bool AYUV_PLANAR
= (spd
.type
==MSP_AYUV_PLANAR
);
1965 int draw_method
= 0;
1967 draw_method
|= DM::SSE2
;
1969 draw_method
|= DM::AYUV_PLANAR
;
1971 switch (draw_method
)
1973 case DM::SSE2
| 0*DM::AYUV_PLANAR
:
1975 for (int wy
=y
; wy
<y
+nHeight
; wy
++) {
1976 DWORD
* dst
= (DWORD
*)((BYTE
*)spd
.bits
+ spd
.pitch
* wy
) + x
;
1977 for(int wt
=0; wt
<nWidth
; ++wt
) {
1978 pixmix_sse2(&dst
[wt
], argb
, argb
>>24);
1983 case 0*DM::SSE2
| 0*DM::AYUV_PLANAR
:
1985 for (int wy
=y
; wy
<y
+nHeight
; wy
++) {
1986 DWORD
* dst
= (DWORD
*)((BYTE
*)spd
.bits
+ spd
.pitch
* wy
) + x
;
1987 for(int wt
=0; wt
<nWidth
; ++wt
) {
1988 pixmix(&dst
[wt
], argb
, argb
>>24);
1993 case DM::SSE2
| DM::AYUV_PLANAR
:
1995 BYTE
* dst
= reinterpret_cast<BYTE
*>(spd
.bits
) + spd
.pitch
* y
+ x
;
1997 BYTE
* dst_Y
= dst_A
+ spd
.pitch
*spd
.h
;
1998 BYTE
* dst_U
= dst_Y
+ spd
.pitch
*spd
.h
;
1999 BYTE
* dst_V
= dst_U
+ spd
.pitch
*spd
.h
;
2000 AlphaBlt(dst_Y
, argb
>>24, ((argb
)>>16)&0xff, nHeight
, nWidth
, spd
.pitch
);
2001 AlphaBlt(dst_U
, argb
>>24, ((argb
)>>8)&0xff, nHeight
, nWidth
, spd
.pitch
);
2002 AlphaBlt(dst_V
, argb
>>24, ((argb
))&0xff, nHeight
, nWidth
, spd
.pitch
);
2003 AlphaBlt(dst_A
, argb
>>24, 0, nHeight
, nWidth
, spd
.pitch
);
2006 case 0*DM::SSE2
| DM::AYUV_PLANAR
:
2008 BYTE
* dst
= reinterpret_cast<BYTE
*>(spd
.bits
) + spd
.pitch
* y
+ x
;
2010 BYTE
* dst_Y
= dst_A
+ spd
.pitch
*spd
.h
;
2011 BYTE
* dst_U
= dst_Y
+ spd
.pitch
*spd
.h
;
2012 BYTE
* dst_V
= dst_U
+ spd
.pitch
*spd
.h
;
2013 AlphaBltC(dst_Y
, argb
>>24, ((argb
)>>16)&0xff, nHeight
, nWidth
, spd
.pitch
);
2014 AlphaBltC(dst_U
, argb
>>24, ((argb
)>>8)&0xff, nHeight
, nWidth
, spd
.pitch
);
2015 AlphaBltC(dst_V
, argb
>>24, ((argb
))&0xff, nHeight
, nWidth
, spd
.pitch
);
2016 AlphaBltC(dst_A
, argb
>>24, 0, nHeight
, nWidth
, spd
.pitch
);
2024 ///////////////////////////////////////////////////////////////
2028 void Overlay::_DoFillAlphaMash(byte
* outputAlphaMask
, const byte
* pBody
, const byte
* pBorder
, int x
, int y
, int w
, int h
,
2029 const byte
* pAlphaMask
, int pitch
, DWORD color_alpha
)
2031 if (g_cpuid
.m_flags
& CCpuID::sse2
)
2033 pBody
= pBody
!=NULL
? pBody
+ y
*mOverlayPitch
+ x
: NULL
;
2034 pBorder
= pBorder
!=NULL
? pBorder
+ y
*mOverlayPitch
+ x
: NULL
;
2035 byte
* dst
= outputAlphaMask
+ y
*mOverlayPitch
+ x
;
2037 const int x0
= ((reinterpret_cast<int>(dst
)+3)&~3) - reinterpret_cast<int>(dst
) < w
?
2038 ((reinterpret_cast<int>(dst
)+3)&~3) - reinterpret_cast<int>(dst
) : w
; //IMPORTANT! Should not exceed w.
2039 const int x00
= ((reinterpret_cast<int>(dst
)+15)&~15) - reinterpret_cast<int>(dst
) < w
?
2040 ((reinterpret_cast<int>(dst
)+15)&~15) - reinterpret_cast<int>(dst
) : w
;//IMPORTANT! Should not exceed w.
2041 const int x_end00
= ((reinterpret_cast<int>(dst
)+w
)&~15) - reinterpret_cast<int>(dst
);
2042 const int x_end0
= ((reinterpret_cast<int>(dst
)+w
)&~3) - reinterpret_cast<int>(dst
);
2043 const int x_end
= w
;
2045 __m64 color_alpha_64
= _mm_set1_pi16(color_alpha
);
2046 __m128i color_alpha_128
= _mm_set1_epi16(color_alpha
);
2048 if(pAlphaMask
==NULL
&& pBody
!=NULL
&& pBorder
!=NULL
)
2053 mov eax, color_alpha
2055 punpcklwd XMM3, XMM3
2056 pshufd XMM3, XMM3, 0
2064 int temp
= pBorder
[j
]-pBody
[j
];
2065 temp
= temp
<0 ? 0 : temp
;
2066 dst
[j
] = (temp
* color_alpha
)>>6;
2070 __m64 border
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder
+j
));
2071 __m64 body
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody
+j
));
2072 border
= _mm_subs_pu8(border
, body
);
2073 __m64 zero
= _mm_setzero_si64();
2074 border
= _mm_unpacklo_pi8(border
, zero
);
2075 border
= _mm_mullo_pi16(border
, color_alpha_64
);
2076 border
= _mm_srli_pi16(border
, 6);
2077 border
= _mm_packs_pu16(border
,border
);
2078 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(border
);
2080 __m128i zero
= _mm_setzero_si128();
2081 for( ;j
<x_end00
;j
+=16)
2083 __m128i border
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(pBorder
+j
));
2084 __m128i body
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(pBody
+j
));
2085 border
= _mm_subs_epu8(border
,body
);
2086 __m128i srchi
= border
;
2087 border
= _mm_unpacklo_epi8(border
, zero
);
2088 srchi
= _mm_unpackhi_epi8(srchi
, zero
);
2089 border
= _mm_mullo_epi16(border
, color_alpha_128
);
2090 srchi
= _mm_mullo_epi16(srchi
, color_alpha_128
);
2091 border
= _mm_srli_epi16(border
, 6);
2092 srchi
= _mm_srli_epi16(srchi
, 6);
2093 border
= _mm_packus_epi16(border
, srchi
);
2094 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dst
+j
), border
);
2096 for( ;j
<x_end0
;j
+=4)
2098 __m64 border
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder
+j
));
2099 __m64 body
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody
+j
));
2100 border
= _mm_subs_pu8(border
, body
);
2101 __m64 zero
= _mm_setzero_si64();
2102 border
= _mm_unpacklo_pi8(border
, zero
);
2103 border
= _mm_mullo_pi16(border
, color_alpha_64
);
2104 border
= _mm_srli_pi16(border
, 6);
2105 border
= _mm_packs_pu16(border
,border
);
2106 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(border
);
2110 int temp
= pBorder
[j
]-pBody
[j
];
2111 temp
= temp
<0 ? 0 : temp
;
2112 dst
[j
] = (temp
* color_alpha
)>>6;
2114 pBody
+= mOverlayPitch
;
2115 pBorder
+= mOverlayPitch
;
2116 //pAlphaMask += pitch;
2117 dst
+= mOverlayPitch
;
2120 else if( ((pBody
==NULL
) + (pBorder
==NULL
))==1 && pAlphaMask
==NULL
)
2122 const BYTE
* src1
= pBody
!=NULL
? pBody
: pBorder
;
2128 dst
[j
] = (src1
[j
] * color_alpha
)>>6;
2132 __m64 src
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1
+j
));
2133 __m64 zero
= _mm_setzero_si64();
2134 src
= _mm_unpacklo_pi8(src
, zero
);
2135 src
= _mm_mullo_pi16(src
, color_alpha_64
);
2136 src
= _mm_srli_pi16(src
, 6);
2137 src
= _mm_packs_pu16(src
,src
);
2138 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(src
);
2140 __m128i zero
= _mm_setzero_si128();
2141 for( ;j
<x_end00
;j
+=16)
2143 __m128i src
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src1
+j
));
2144 __m128i srchi
= src
;
2145 src
= _mm_unpacklo_epi8(src
, zero
);
2146 srchi
= _mm_unpackhi_epi8(srchi
, zero
);
2147 src
= _mm_mullo_epi16(src
, color_alpha_128
);
2148 srchi
= _mm_mullo_epi16(srchi
, color_alpha_128
);
2149 src
= _mm_srli_epi16(src
, 6);
2150 srchi
= _mm_srli_epi16(srchi
, 6);
2151 src
= _mm_packus_epi16(src
, srchi
);
2152 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dst
+j
), src
);
2154 for( ;j
<x_end0
;j
+=4)
2156 __m64 src
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1
+j
));
2157 __m64 zero
= _mm_setzero_si64();
2158 src
= _mm_unpacklo_pi8(src
, zero
);
2159 src
= _mm_mullo_pi16(src
, color_alpha_64
);
2160 src
= _mm_srli_pi16(src
, 6);
2161 src
= _mm_packs_pu16(src
,src
);
2162 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(src
);
2166 dst
[j
] = (src1
[j
] * color_alpha
)>>6;
2168 src1
+= mOverlayPitch
;
2169 //pAlphaMask += pitch;
2170 dst
+= mOverlayPitch
;
2173 else if( ((pBody
==NULL
) + (pBorder
==NULL
))==1 && pAlphaMask
!=NULL
)
2175 const BYTE
* src1
= pBody
!=NULL
? pBody
: pBorder
;
2181 dst
[j
] = (src1
[j
] * pAlphaMask
[j
] * color_alpha
)>>12;
2185 __m64 src
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1
+j
));
2186 __m64 mask
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask
+j
));
2187 __m64 zero
= _mm_setzero_si64();
2188 src
= _mm_unpacklo_pi8(src
, zero
);
2189 src
= _mm_mullo_pi16(src
, color_alpha_64
);
2190 mask
= _mm_unpacklo_pi8(zero
, mask
); //important!
2191 src
= _mm_mulhi_pi16(src
, mask
); //important!
2192 src
= _mm_srli_pi16(src
, 12+8-16); //important!
2193 src
= _mm_packs_pu16(src
,src
);
2194 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(src
);
2196 __m128i zero
= _mm_setzero_si128();
2197 for( ;j
<x_end00
;j
+=16)
2199 __m128i src
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(src1
+j
));
2200 __m128i mask
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(pAlphaMask
+j
));
2201 __m128i srchi
= src
;
2202 __m128i maskhi
= mask
;
2203 src
= _mm_unpacklo_epi8(src
, zero
);
2204 srchi
= _mm_unpackhi_epi8(srchi
, zero
);
2205 mask
= _mm_unpacklo_epi8(zero
, mask
); //important!
2206 maskhi
= _mm_unpackhi_epi8(zero
, maskhi
);
2207 src
= _mm_mullo_epi16(src
, color_alpha_128
);
2208 srchi
= _mm_mullo_epi16(srchi
, color_alpha_128
);
2209 src
= _mm_mulhi_epu16(src
, mask
); //important!
2210 srchi
= _mm_mulhi_epu16(srchi
, maskhi
);
2211 src
= _mm_srli_epi16(src
, 12+8-16); //important!
2212 srchi
= _mm_srli_epi16(srchi
, 12+8-16);
2213 src
= _mm_packus_epi16(src
, srchi
);
2214 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dst
+j
), src
);
2216 for( ;j
<x_end0
;j
+=4)
2218 __m64 src
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(src1
+j
));
2219 __m64 mask
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask
+j
));
2220 __m64 zero
= _mm_setzero_si64();
2221 src
= _mm_unpacklo_pi8(src
, zero
);
2222 src
= _mm_mullo_pi16(src
, color_alpha_64
);
2223 mask
= _mm_unpacklo_pi8(zero
, mask
); //important!
2224 src
= _mm_mulhi_pi16(src
, mask
); //important!
2225 src
= _mm_srli_pi16(src
, 12+8-16); //important!
2226 src
= _mm_packs_pu16(src
,src
);
2227 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(src
);
2231 dst
[j
] = (src1
[j
] * pAlphaMask
[j
] * color_alpha
)>>12;
2233 src1
+= mOverlayPitch
;
2234 pAlphaMask
+= pitch
;
2235 dst
+= mOverlayPitch
;
2238 else if( pAlphaMask
!=NULL
&& pBody
!=NULL
&& pBorder
!=NULL
)
2245 int temp
= pBorder
[j
]-pBody
[j
];
2246 temp
= temp
<0 ? 0 : temp
;
2247 dst
[j
] = (temp
* pAlphaMask
[j
] * color_alpha
)>>12;
2251 __m64 border
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder
+j
));
2252 __m64 body
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody
+j
));
2253 border
= _mm_subs_pu8(border
, body
);
2254 __m64 mask
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask
+j
));
2255 __m64 zero
= _mm_setzero_si64();
2256 border
= _mm_unpacklo_pi8(border
, zero
);
2257 border
= _mm_mullo_pi16(border
, color_alpha_64
);
2258 mask
= _mm_unpacklo_pi8(zero
, mask
); //important!
2259 border
= _mm_mulhi_pi16(border
, mask
); //important!
2260 border
= _mm_srli_pi16(border
, 12+8-16); //important!
2261 border
= _mm_packs_pu16(border
,border
);
2262 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(border
);
2264 __m128i zero
= _mm_setzero_si128();
2265 for( ;j
<x_end00
;j
+=16)
2267 __m128i border
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(pBorder
+j
));
2268 __m128i body
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(pBody
+j
));
2269 border
= _mm_subs_epu8(border
,body
);
2271 __m128i mask
= _mm_loadu_si128(reinterpret_cast<const __m128i
*>(pAlphaMask
+j
));
2272 __m128i srchi
= border
;
2273 __m128i maskhi
= mask
;
2274 border
= _mm_unpacklo_epi8(border
, zero
);
2275 srchi
= _mm_unpackhi_epi8(srchi
, zero
);
2276 mask
= _mm_unpacklo_epi8(zero
, mask
); //important!
2277 maskhi
= _mm_unpackhi_epi8(zero
, maskhi
);
2278 border
= _mm_mullo_epi16(border
, color_alpha_128
);
2279 srchi
= _mm_mullo_epi16(srchi
, color_alpha_128
);
2280 border
= _mm_mulhi_epu16(border
, mask
); //important!
2281 srchi
= _mm_mulhi_epu16(srchi
, maskhi
);
2282 border
= _mm_srli_epi16(border
, 12+8-16); //important!
2283 srchi
= _mm_srli_epi16(srchi
, 12+8-16);
2284 border
= _mm_packus_epi16(border
, srchi
);
2285 _mm_storeu_si128(reinterpret_cast<__m128i
*>(dst
+j
), border
);
2287 for( ;j
<x_end0
;j
+=4)
2289 __m64 border
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBorder
+j
));
2290 __m64 body
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pBody
+j
));
2291 border
= _mm_subs_pu8(border
, body
);
2292 __m64 mask
= _mm_cvtsi32_si64(*reinterpret_cast<const int*>(pAlphaMask
+j
));
2293 __m64 zero
= _mm_setzero_si64();
2294 border
= _mm_unpacklo_pi8(border
, zero
);
2295 border
= _mm_mullo_pi16(border
, color_alpha_64
);
2296 mask
= _mm_unpacklo_pi8(zero
, mask
); //important!
2297 border
= _mm_mulhi_pi16(border
, mask
); //important!
2298 border
= _mm_srli_pi16(border
, 12+8-16); //important!
2299 border
= _mm_packs_pu16(border
,border
);
2300 *reinterpret_cast<int*>(dst
+j
) = _mm_cvtsi64_si32(border
);
2304 int temp
= pBorder
[j
]-pBody
[j
];
2305 temp
= temp
<0 ? 0 : temp
;
2306 dst
[j
] = (temp
* pAlphaMask
[j
] * color_alpha
)>>12;
2308 pBody
+= mOverlayPitch
;
2309 pBorder
+= mOverlayPitch
;
2310 pAlphaMask
+= pitch
;
2311 dst
+= mOverlayPitch
;
2316 //should NOT happen!
2320 for(int j
=0;j
<x_end
;j
++)
2324 dst
+= mOverlayPitch
;
2330 _DoFillAlphaMash_c(outputAlphaMask
, pBody
, pBorder
, x
, y
, w
, h
, pAlphaMask
, pitch
, color_alpha
);
// Scalar fill of outputAlphaMask from the optional body / border byte planes.
// All three overlay planes are addressed from (x, y) with rows mOverlayPitch bytes
// apart. pAlphaMask is optional, is NOT offset by (x, y), and advances by `pitch`
// per row. color_alpha scales every value that is written.
// NOTE(review): the row/column loop headers are not visible in this listing; the
// per-row pointer bumps imply one pass per row (presumably h rows of w bytes) - confirm.
2335 void Overlay::_DoFillAlphaMash_c(byte
* outputAlphaMask
, const byte
* pBody
, const byte
* pBorder
, int x
, int y
, int w
, int h
,
2336 const byte
* pAlphaMask
, int pitch
, DWORD color_alpha
)
// Move each non-NULL plane pointer to the first pixel of the target region.
2338 pBody
= pBody
!=NULL
? pBody
+ y
*mOverlayPitch
+ x
: NULL
;
2339 pBorder
= pBorder
!=NULL
? pBorder
+ y
*mOverlayPitch
+ x
: NULL
;
2340 byte
* dst
= outputAlphaMask
+ y
*mOverlayPitch
+ x
;
// Case 1: both planes, no mask -> (max(border - body, 0) * color_alpha) >> 6.
2342 if(pAlphaMask
==NULL
&& pBody
!=NULL
&& pBorder
!=NULL
)
2349 int temp
= pBorder
[j
]-pBody
[j
];
2350 temp
= temp
<0 ? 0 : temp
;
2351 dst
[j
] = (temp
* color_alpha
)>>6;
2353 pBody
+= mOverlayPitch
;
2354 pBorder
+= mOverlayPitch
;
2355 //pAlphaMask += pitch;
2356 dst
+= mOverlayPitch
;
// Case 2: exactly one plane, no mask -> (plane * color_alpha) >> 6.
2359 else if( ((pBody
==NULL
) + (pBorder
==NULL
))==1 && pAlphaMask
==NULL
)
// src1 is whichever of the two planes was supplied.
2361 const BYTE
* src1
= pBody
!=NULL
? pBody
: pBorder
;
2367 dst
[j
] = (src1
[j
] * color_alpha
)>>6;
2369 src1
+= mOverlayPitch
;
2370 //pAlphaMask += pitch;
2371 dst
+= mOverlayPitch
;
// Case 3: exactly one plane with a mask -> (plane * mask * color_alpha) >> 12.
2374 else if( ((pBody
==NULL
) + (pBorder
==NULL
))==1 && pAlphaMask
!=NULL
)
2376 const BYTE
* src1
= pBody
!=NULL
? pBody
: pBorder
;
2382 dst
[j
] = (src1
[j
] * pAlphaMask
[j
] * color_alpha
)>>12;
2384 src1
+= mOverlayPitch
;
2385 pAlphaMask
+= pitch
;
2386 dst
+= mOverlayPitch
;
// Case 4: both planes with a mask -> (max(border - body, 0) * mask * color_alpha) >> 12.
2389 else if( pAlphaMask
!=NULL
&& pBody
!=NULL
&& pBorder
!=NULL
)
2396 int temp
= pBorder
[j
]-pBody
[j
];
2397 temp
= temp
<0 ? 0 : temp
;
2398 dst
[j
] = (temp
* pAlphaMask
[j
] * color_alpha
)>>12;
2400 pBody
+= mOverlayPitch
;
2401 pBorder
+= mOverlayPitch
;
2402 pAlphaMask
+= pitch
;
2403 dst
+= mOverlayPitch
;
// Remaining combination (neither plane supplied). The per-pixel statement of this
// loop is not visible in this listing.
2408 //should NOT happen!
2412 for(int j
=0;j
<w
;j
++)
2416 dst
+= mOverlayPitch
;
// Picks which of this overlay's planes (mBody, mBorder) are handed to
// _DoFillAlphaMash, based on the fBody / fBorder flags and on whether a
// per-pixel alpha mask is supplied. x, y, w, h, pAlphaMask, pitch and
// color_alpha are forwarded unchanged in every case.
// NOTE(review): no visible branch handles fBody == false && fBorder == false.
2421 void Overlay::FillAlphaMash( byte
* outputAlphaMask
, bool fBody
, bool fBorder
, int x
, int y
, int w
, int h
, const byte
* pAlphaMask
, int pitch
, DWORD color_alpha
)
// Body only, no mask: pass the body plane alone.
2423 if(!fBorder
&& fBody
&& pAlphaMask
==NULL
)
2425 _DoFillAlphaMash(outputAlphaMask
, mBody
.get(), NULL
, x
, y
, w
, h
, pAlphaMask
, pitch
, color_alpha
);
// Body and border (fBorder is implied by the previous test failing), no mask:
// pass the border plane alone.
2427 else if(/*fBorder &&*/ fBody
&& pAlphaMask
==NULL
)
2429 _DoFillAlphaMash(outputAlphaMask
, NULL
, mBorder
.get(), x
, y
, w
, h
, pAlphaMask
, pitch
, color_alpha
);
// Border without body, with or without a mask: pass both planes.
2431 else if(!fBody
&& fBorder
/* pAlphaMask==NULL or not*/)
2433 _DoFillAlphaMash(outputAlphaMask
, mBody
.get(), mBorder
.get(), x
, y
, w
, h
, pAlphaMask
, pitch
, color_alpha
);
// Body only, with a mask: pass the body plane alone.
2435 else if(!fBorder
&& fBody
&& pAlphaMask
!=NULL
)
2437 _DoFillAlphaMash(outputAlphaMask
, mBody
.get(), NULL
, x
, y
, w
, h
, pAlphaMask
, pitch
, color_alpha
);
// Body and border, with a mask: pass the border plane alone.
2439 else if(fBorder
&& fBody
&& pAlphaMask
!=NULL
)
2441 _DoFillAlphaMash(outputAlphaMask
, NULL
, mBorder
.get(), x
, y
, w
, h
, pAlphaMask
, pitch
, color_alpha
);
// Creates a new Overlay whose planes are copies of this overlay's body (and border,
// unless mfWideOutlineEmpty is set), enlarged by the shift amounts, and then runs
// Bilinear() over each plane with (xshift, yshift).
// NOTE(review): the object is created with a raw `new`; the return statements and
// several short lines (early exits, braces, the declaration of dst) are not visible
// in this listing - confirm ownership and failure paths against the full source.
2450 Overlay
* Overlay::GetSubpixelVariance(unsigned int xshift
, unsigned int yshift
)
2452 Overlay
* overlay
= new Overlay();
// The copy starts (xshift, yshift) earlier and is larger by the same amounts.
2460 overlay
->mOffsetX
= mOffsetX
- xshift
;
2461 overlay
->mOffsetY
= mOffsetY
- yshift
;
2462 overlay
->mWidth
= mWidth
+ xshift
;
2463 overlay
->mHeight
= mHeight
+ yshift
;
// Plane size is the size divided by 8 (rounded up) plus one; the pitch is the
// plane width rounded up to a multiple of 16.
2465 overlay
->mOverlayWidth
= ((overlay
->mWidth
+7)>>3) + 1;
2466 overlay
->mOverlayHeight
= ((overlay
->mHeight
+ 7)>>3) + 1;
2467 overlay
->mOverlayPitch
= (overlay
->mOverlayWidth
+15)&~15;
2470 overlay
->mfWideOutlineEmpty
= mfWideOutlineEmpty
;
// Non-positive plane size check; the action taken is not visible in this listing.
2472 if (overlay
->mOverlayPitch
* overlay
->mOverlayHeight
<=0)
2477 BYTE
* body
= reinterpret_cast<BYTE
*>(xy_malloc(overlay
->mOverlayPitch
* overlay
->mOverlayHeight
));
// xy_free is installed as the deleter for the xy_malloc'ed plane.
2482 overlay
->mBody
.reset(body
, xy_free
);
// The border plane is only allocated when the wide outline is not empty.
2483 BYTE
* border
= NULL
;
2484 if (!overlay
->mfWideOutlineEmpty
)
2486 border
= reinterpret_cast<BYTE
*>(xy_malloc(overlay
->mOverlayPitch
* overlay
->mOverlayHeight
));
2491 overlay
->mBorder
.reset(border
, xy_free
);
// Same pitch and at least as many rows: copy the whole source plane with a single
// memcpy and zero the rows that follow it.
2494 if(overlay
->mOverlayPitch
==mOverlayPitch
&& overlay
->mOverlayHeight
>=mOverlayHeight
)
2498 memcpy(body
, mBody
.get(), mOverlayPitch
* mOverlayHeight
);
2499 memset(body
+mOverlayPitch
*mOverlayHeight
, 0, mOverlayPitch
* (overlay
->mOverlayHeight
-mOverlayHeight
));
// Exactly one of the two body buffers is NULL; the handling is not visible here.
2501 else if ( (!!body
)!=(!!mBody
)/*==NULL*/)
2506 if (border
&& mBorder
)
2508 memcpy(border
, mBorder
.get(), mOverlayPitch
* mOverlayHeight
);
2509 memset(border
+mOverlayPitch
*mOverlayHeight
, 0, mOverlayPitch
* (overlay
->mOverlayHeight
-mOverlayHeight
));
2511 else if ( (!!border
)!=(!!mBorder
)/*==NULL*/ )
// Row-by-row path: clear the new plane, then copy each source row to the new pitch.
// NOTE(review): presumably the branch taken when the pitches differ - confirm.
2518 memset(body
, 0, overlay
->mOverlayPitch
* overlay
->mOverlayHeight
);
2520 const byte
* src
= mBody
.get();
2521 for (int i
=0;i
<mOverlayHeight
;i
++)
2523 memcpy(dst
, src
, mOverlayPitch
);
2524 dst
+= overlay
->mOverlayPitch
;
2525 src
+= mOverlayPitch
;
2527 if (!overlay
->mfWideOutlineEmpty
)
2529 ASSERT(border
&& mBorder
);
2530 memset(border
, 0, overlay
->mOverlayPitch
* overlay
->mOverlayHeight
);
2532 src
= mBorder
.get();
2533 for (int i
=0;i
<mOverlayHeight
;i
++)
2535 memcpy(dst
, src
, mOverlayPitch
);
2536 dst
+= overlay
->mOverlayPitch
;
2537 src
+= mOverlayPitch
;
// Run Bilinear() over each plane with the requested shift.
2542 // Bilinear(overlay->mpOverlayBuffer.base, overlay->mOverlayWidth, 2*overlay->mOverlayHeight, overlay->mOverlayPitch, xshift, yshift);
2543 Bilinear(body
, overlay
->mOverlayWidth
, overlay
->mOverlayHeight
, overlay
->mOverlayPitch
, xshift
, yshift
);
2544 if (!overlay
->mfWideOutlineEmpty
)
2546 Bilinear(border
, overlay
->mOverlayWidth
, overlay
->mOverlayHeight
, overlay
->mOverlayPitch
, xshift
, yshift
);
2551 ///////////////////////////////////////////////////////////////
// Default constructor: starts with no type array, no point array and a count of 0.
2555 PathData::PathData():mpPathTypes(NULL
), mpPathPoints(NULL
), mPathPoints(0)
// Copy constructor: takes src's point count, allocates fresh type and point arrays
// with malloc, and copies src's contents into them with memcpy.
// NOTE(review): any guards around the allocation or the copies (count > 0, malloc
// failure) are not visible in this listing - verify against the full source.
2559 PathData::PathData( const PathData
& src
):mpPathTypes(NULL
), mpPathPoints(NULL
), mPathPoints(src
.mPathPoints
)
2561 //TODO: deal with the case that src.mPathPoints<0
2564 mpPathTypes
= static_cast<BYTE
*>(malloc(mPathPoints
* sizeof(BYTE
)));
2565 mpPathPoints
= static_cast<POINT
*>(malloc(mPathPoints
* sizeof(POINT
)));
2569 memcpy(mpPathTypes
, src
.mpPathTypes
, mPathPoints
*sizeof(BYTE
));
2570 memcpy(mpPathPoints
, src
.mpPathPoints
, mPathPoints
*sizeof(POINT
));
// Copy assignment. New arrays are allocated only when the point counts differ and
// src is non-empty; the contents are copied whenever src is non-empty.
// NOTE(review): the release of the previous arrays, any self-assignment guard, the
// src.mPathPoints == 0 case and the return statement are not visible in this
// listing - verify against the full source.
2574 const PathData
& PathData::operator=( const PathData
& src
)
2578 if(mPathPoints
!=src
.mPathPoints
&& src
.mPathPoints
>0)
2581 mPathPoints
= src
.mPathPoints
;
2582 mpPathTypes
= static_cast<BYTE
*>(malloc(mPathPoints
* sizeof(BYTE
)));
2583 mpPathPoints
= static_cast<POINT
*>(malloc(mPathPoints
* sizeof(POINT
)));//better than realloc
2585 if(src
.mPathPoints
>0)
2587 memcpy(mpPathTypes
, src
.mpPathTypes
, mPathPoints
*sizeof(BYTE
));
2588 memcpy(mpPathPoints
, src
.mpPathPoints
, mPathPoints
*sizeof(POINT
));
// Destructor. NOTE(review): the body is not visible in this listing.
2594 PathData::~PathData()
// Two paths are equal when they are the same object, or when they have the same
// point count and their type and point arrays compare byte-for-byte equal.
2599 bool PathData::operator==( const PathData
& rhs
) const
2601 return (this==&rhs
) || (
2602 mPathPoints
==rhs
.mPathPoints
2603 && !memcmp(mpPathTypes
, rhs
.mpPathTypes
, mPathPoints
* sizeof(BYTE
) )
2604 && !memcmp(mpPathPoints
, rhs
.mpPathPoints
, mPathPoints
* sizeof(POINT
) )
// Discards the stored path. Only the reset of mpPathPoints to NULL is visible in
// this listing; NOTE(review): confirm how the arrays are released and the other
// members reset in the full source.
2608 void PathData::_TrashPath()
2618 mpPathPoints
= NULL
;
// Opens a path bracket on hdc through the global ::BeginPath and converts its
// result to bool.
2623 bool PathData::BeginPath(HDC hdc
)
2626 return !!::BeginPath(hdc
);
// Retrieves the path recorded on hdc into this object's arrays.
// NOTE(review): the calls that close the path bracket, the failure handling and the
// return statements are not visible in this listing.
2629 bool PathData::EndPath(HDC hdc
)
// First GetPath call (NULL buffers, size 0) only asks for the number of points.
2634 mPathPoints
= GetPath(hdc
, NULL
, NULL
, 0);
2637 mpPathTypes
= (BYTE
*)malloc(sizeof(BYTE
) * mPathPoints
);
2638 mpPathPoints
= (POINT
*)malloc(sizeof(POINT
) * mPathPoints
);
// The second call fills the arrays; it must report the same count as the first.
2639 if(mPathPoints
== GetPath(hdc
, mpPathPoints
, mpPathTypes
, mPathPoints
))
// Opens a path bracket on hdc through the global ::BeginPath and converts its
// result to bool.
// NOTE(review): the use of bClearPath is not visible in this listing.
2646 bool PathData::PartialBeginPath(HDC hdc
, bool bClearPath
)
2650 return !!::BeginPath(hdc
);
// Appends the path recorded on hdc to the points already stored, translating every
// new point by (dx, dy).
// NOTE(review): the calls that close the path bracket, the release of the temporary
// pTypes / pPoints arrays and the return statements are not visible in this listing.
2653 bool PathData::PartialEndPath(HDC hdc
, long dx
, long dy
)
// Sizing call: NULL buffers and size 0 return the number of points in the path.
2661 nPoints
= GetPath(hdc
, NULL
, NULL
, 0);
// Grow both arrays to hold the existing points plus the new ones.
2664 pNewTypes
= (BYTE
*)realloc(mpPathTypes
, (mPathPoints
+ nPoints
) * sizeof(BYTE
));
2665 pNewPoints
= (POINT
*)realloc(mpPathPoints
, (mPathPoints
+ nPoints
) * sizeof(POINT
));
2667 mpPathTypes
= pNewTypes
;
2669 mpPathPoints
= pNewPoints
;
// The new points are fetched into temporary arrays so that they can be offset
// while being appended.
2670 BYTE
* pTypes
= new BYTE
[nPoints
];
2671 POINT
* pPoints
= new POINT
[nPoints
];
// Append only if both reallocs succeeded and GetPath delivered the expected count.
2672 if(pNewTypes
&& pNewPoints
&& nPoints
== GetPath(hdc
, pPoints
, pTypes
, nPoints
))
2674 for(int i
= 0; i
< nPoints
; ++i
)
2676 mpPathPoints
[mPathPoints
+ i
].x
= pPoints
[i
].x
+ dx
;
2677 mpPathPoints
[mPathPoints
+ i
].y
= pPoints
[i
].y
+ dy
;
2678 mpPathTypes
[mPathPoints
+ i
] = pTypes
[i
];
2680 mPathPoints
+= nPoints
;
// Computes the bounding box of the stored points, shifts all points so that the
// box's aligned left/top corner becomes the origin, and reports that corner and
// the box size through left_top and size.
// NOTE(review): the initial values of minx / maxx / miny / maxy are not visible in
// this listing.
2694 void PathData::AlignLeftTop(CPoint
*left_top
, CSize
*size
)
// Pass 1: bounding box over all points.
2700 for(int i
=0; i
<mPathPoints
; ++i
)
2702 int ix
= mpPathPoints
[i
].x
;
2703 int iy
= mpPathPoints
[i
].y
;
2704 if(ix
< minx
) minx
= ix
;
2705 if(ix
> maxx
) maxx
= ix
;
2706 if(iy
< miny
) miny
= iy
;
2707 if(iy
> maxy
) maxy
= iy
;
// Inverted box (min above max on either axis): report a zero origin and zero size.
2709 if(minx
> maxx
|| miny
> maxy
)
2712 *left_top
= CPoint(0, 0);
2713 *size
= CSize(0, 0);
// Minimum: divide by 8, then clear the low three bits (round down to a multiple
// of 8). Maximum: divide by 8 rounding up.
2716 minx
= (minx
>> 3) & ~7;
2717 miny
= (miny
>> 3) & ~7;
2718 maxx
= (maxx
+ 7) >> 3;
2719 maxy
= (maxy
+ 7) >> 3;
// Pass 2: translate the points; the origin is scaled back by 8 to match their units.
2720 for(int i
=0; i
<mPathPoints
; ++i
)
2722 mpPathPoints
[i
].x
-= minx
*8;
2723 mpPathPoints
[i
].y
-= miny
*8;
2725 *left_top
= CPoint(minx
, miny
);
2726 *size
= CSize(maxx
+1-minx
, maxy
+1-miny
);
2730 //////////////////////////////////////////////////////////////////////////
// Default constructor. NOTE(review): the body is not visible in this listing.
2734 ScanLineData::ScanLineData()
// Destructor. NOTE(review): the body is not visible in this listing.
2738 ScanLineData::~ScanLineData()
// Resizes the edge buffer to hold `edges` entries and records the new capacity.
// NOTE(review): the realloc result is stored straight into mpEdgeBuffer, so a failed
// realloc would overwrite the only pointer to the old buffer with NULL while
// mEdgeHeapSize already reports the larger size - confirm callers can tolerate this.
2742 void ScanLineData::_ReallocEdgeBuffer(int edges
)
2744 mEdgeHeapSize
= edges
;
2745 mpEdgeBuffer
= (Edge
*)realloc(mpEdgeBuffer
, sizeof(Edge
)*edges
);
// Flattens one cubic curve segment into straight lines. The four control points are
// path_data.mpPathPoints[ptbase .. ptbase+3]; the curve is rewritten as the
// polynomials cx0 + cx1*t + cx2*t^2 + cx3*t^3 (and likewise for y) and sampled for
// t in [0, 1], with each sample passed to _EvaluateLine.
// NOTE(review): the declarations of x0..x3 / y0..y3 and of h, the branch on fBSpline
// and the assignments of cx1, cx0, cy1, cy0 in the second coefficient set are not
// visible in this listing.
2748 void ScanLineData::_EvaluateBezier(const PathData
& path_data
, int ptbase
, bool fBSpline
)
2750 const POINT
* pt0
= path_data
.mpPathPoints
+ ptbase
;
2751 const POINT
* pt1
= path_data
.mpPathPoints
+ ptbase
+ 1;
2752 const POINT
* pt2
= path_data
.mpPathPoints
+ ptbase
+ 2;
2753 const POINT
* pt3
= path_data
.mpPathPoints
+ ptbase
+ 3;
2762 double cx3
, cx2
, cx1
, cx0
, cy3
, cy2
, cy1
, cy0
;
// First coefficient set, scaled by 1/6 - presumably the fBSpline branch (confirm).
2769 double _1div6
= 1.0/6.0;
2770 cx3
= _1div6
*(- x0
+3*x1
-3*x2
+x3
);
2771 cx2
= _1div6
*( 3*x0
-6*x1
+3*x2
);
2772 cx1
= _1div6
*(-3*x0
+3*x2
);
2773 cx0
= _1div6
*( x0
+4*x1
+1*x2
);
2774 cy3
= _1div6
*(- y0
+3*y1
-3*y2
+y3
);
2775 cy2
= _1div6
*( 3*y0
-6*y1
+3*y2
);
2776 cy1
= _1div6
*(-3*y0
+3*y2
);
2777 cy0
= _1div6
*( y0
+4*y1
+1*y2
);
// Second, unscaled coefficient set - presumably the plain Bezier branch (confirm).
2785 cx3
= - x0
+3*x1
-3*x2
+x3
;
2786 cx2
= 3*x0
-6*x1
+3*x2
;
2789 cy3
= - y0
+3*y1
-3*y2
+y3
;
2790 cy2
= 3*y0
-6*y1
+3*y2
;
2795 // This equation is from Graphics Gems I.
2797 // The idea is that since we're approximating a cubic curve with lines,
2798 // any error we incur is due to the curvature of the line, which we can
2799 // estimate by calculating the maximum acceleration of the curve. For
2800 // a cubic, the acceleration (second derivative) is a line, meaning that
2801 // the absolute maximum acceleration must occur at either the beginning
2802 // (|c2|) or the end (|c2+c3|). Our bounds here are a little more
2803 // conservative than that, but that's okay.
2805 // If the acceleration of the parametric formula is zero (c2 = c3 = 0),
2806 // that component of the curve is linear and does not incur any error.
2807 // If a=0 for both X and Y, the curve is a line segment and we can
2808 // use a step size of 1.
2809 double maxaccel1
= fabs(2*cy2
) + fabs(6*cy3
);
2810 double maxaccel2
= fabs(2*cx2
) + fabs(6*cx3
);
2811 double maxaccel
= maxaccel1
> maxaccel2
? maxaccel1
: maxaccel2
;
// Use a smaller parameter step h when the acceleration bound exceeds 8.
2813 if(maxaccel
> 8.0) h
= sqrt(8.0 / maxaccel
);
// If no figure start has been recorded yet, the curve's value at t = 0 becomes it.
2814 if(!fFirstSet
) {firstp
.x
= (LONG
)cx0
; firstp
.y
= (LONG
)cy0
; lastp
= firstp
; fFirstSet
= true;}
// Evaluate both polynomials at each step and draw a line from the last point.
2815 for(double t
= 0; t
< 1.0; t
+= h
)
2817 double x
= cx0
+ t
*(cx1
+ t
*(cx2
+ t
*cx3
));
2818 double y
= cy0
+ t
*(cy1
+ t
*(cy2
+ t
*cy3
));
2819 _EvaluateLine(lastp
.x
, lastp
.y
, (int)x
, (int)y
);
// Final segment to the curve's value at t = 1 (the sum of the coefficients).
2821 double x
= cx0
+ cx1
+ cx2
+ cx3
;
2822 double y
= cy0
+ cy1
+ cy2
+ cy3
;
2823 _EvaluateLine(lastp
.x
, lastp
.y
, (int)x
, (int)y
);
// Convenience overload: looks up the two path points by index and forwards their
// coordinates to the coordinate-based _EvaluateLine.
2826 void ScanLineData::_EvaluateLine(const PathData
& path_data
, int pt1idx
, int pt2idx
)
2828 const POINT
* pt1
= path_data
.mpPathPoints
+ pt1idx
;
2829 const POINT
* pt2
= path_data
.mpPathPoints
+ pt2idx
;
2830 _EvaluateLine(pt1
->x
, pt1
->y
, pt2
->x
, pt2
->y
);
// Adds the segment (x0, y0) -> (x1, y1) to the edge lists: for each scanline the
// segment crosses, one edge record is pushed onto that scanline's linked list in
// mpScanBuffer. posandflag stores the rounded x position times two, plus 1 in the
// first branch and plus 0 in the "up" branch.
// NOTE(review): the condition of the first branch, the declarations of dy and iy,
// the scanline loop headers and the update of lastp are not visible in this listing.
2833 void ScanLineData::_EvaluateLine(int x0
, int y0
, int x1
, int y1
)
// If this segment does not start where the previous one ended, bridge the gap first.
2835 if(lastp
.x
!= x0
|| lastp
.y
!= y0
)
2837 _EvaluateLine(lastp
.x
, lastp
.y
, x0
, y0
);
2839 if(!fFirstSet
) {firstp
.x
= x0
; firstp
.y
= y0
; fFirstSet
= true;}
// x accumulator: x0 << 13 is x0 / 8 expressed with 16 fractional bits.
2844 __int64 xacc
= (__int64
)x0
<< 13;
// y0 + 3 rounded down to a multiple of 8, plus 4.
2847 int y
= ((y0
+ 3)&~7) + 4;
// x change per unit of dy, with 16 fractional bits.
2852 __int64 invslope
= (__int64(x1
- x0
) << 16) / dy
;
// Double the edge buffer until the edges of this segment fit.
2853 while(mEdgeNext
+ y1
+ 1 - iy
> mEdgeHeapSize
)
2854 _ReallocEdgeBuffer(mEdgeHeapSize
*2);
// Advance x from y0 to the first sampled y.
2855 xacc
+= (invslope
* (y
- y0
)) >> 3;
// Round to the nearest integer x, then push the edge onto scanline iy's list.
2858 int ix
= (int)((xacc
+ 32768) >> 16);
2859 mpEdgeBuffer
[mEdgeNext
].next
= mpScanBuffer
[iy
];
2860 mpEdgeBuffer
[mEdgeNext
].posandflag
= ix
*2 + 1;
2861 mpScanBuffer
[iy
] = mEdgeNext
++;
// Same procedure with the end points swapped, so the walk again runs from the
// smaller y to the larger one; the flag bit is left clear.
2867 else if(y1
< y0
) // up
2869 __int64 xacc
= (__int64
)x1
<< 13;
2872 int y
= ((y1
+ 3)&~7) + 4;
2877 __int64 invslope
= (__int64(x0
- x1
) << 16) / dy
;
2878 while(mEdgeNext
+ y0
+ 1 - iy
> mEdgeHeapSize
)
2879 _ReallocEdgeBuffer(mEdgeHeapSize
*2);
2880 xacc
+= (invslope
* (y
- y1
)) >> 3;
2883 int ix
= (int)((xacc
+ 32768) >> 16);
2884 mpEdgeBuffer
[mEdgeNext
].next
= mpScanBuffer
[iy
];
2885 mpEdgeBuffer
[mEdgeNext
].posandflag
= ix
*2;
2886 mpScanBuffer
[iy
] = mEdgeNext
++;
// Scan-converts path_data into horizontal spans stored in mOutline.
// Phase 1 walks the path and records one edge per crossed scanline (through
// _EvaluateLine / _EvaluateBezier); phase 2 sorts each scanline's edges and turns
// them into spans.
// NOTE(review): the use of `size`, the return statements, the switch on the point
// type with most of its case labels, the span-building statements inside the inner
// loop and the release of the edge buffer are not visible in this listing.
2894 bool ScanLineData::ScanConvert(const PathData
& path_data
, const CSize
& size
)
2896 int lastmoveto
= -1;
2898 // Drop any outlines we may have.
2900 // Determine bounding box
2901 if(!path_data
.mPathPoints
)
2903 mWidth
= mHeight
= 0;
2908 // Initialize edge buffer. We use edge 0 as a sentinel.
2910 mEdgeHeapSize
= 2048;
2911 mpEdgeBuffer
= (Edge
*)malloc(sizeof(Edge
)*mEdgeHeapSize
);
2912 // Initialize scanline list.
2913 mpScanBuffer
= new unsigned int[mHeight
];
2914 memset(mpScanBuffer
, 0, mHeight
*sizeof(unsigned int));
2915 // Scan convert the outline. Yuck, Bezier curves....
2916 // Unfortunately, Windows 95/98 GDI has a bad habit of giving us text
2917 // paths with all but the first figure left open, so we can't rely
2918 // on the PT_CLOSEFIGURE flag being used appropriately.
2920 firstp
.x
= firstp
.y
= 0;
2921 lastp
.x
= lastp
.y
= 0;
2922 for(i
=0; i
<path_data
.mPathPoints
; ++i
)
// The close-figure flag is masked off; figures are closed explicitly below.
2924 BYTE t
= path_data
.mpPathTypes
[i
] & ~PT_CLOSEFIGURE
;
// Close the previous figure if it did not end where it started.
2928 if(lastmoveto
>= 0 && firstp
!= lastp
)
2929 _EvaluateLine(lastp
.x
, lastp
.y
, firstp
.x
, firstp
.y
);
2932 lastp
= path_data
.mpPathPoints
[i
];
// Each primitive is emitted only when enough points remain from its first control
// point (i-1, or i-3 for the patch case) to the end of the path.
2937 if(path_data
.mPathPoints
- (i
-1) >= 2) _EvaluateLine(path_data
, i
-1, i
);
2940 if(path_data
.mPathPoints
- (i
-1) >= 4) _EvaluateBezier(path_data
, i
-1, false);
2944 if(path_data
.mPathPoints
- (i
-1) >= 4) _EvaluateBezier(path_data
, i
-1, true);
2947 case PT_BSPLINEPATCHTO
:
2948 if(path_data
.mPathPoints
- (i
-3) >= 4) _EvaluateBezier(path_data
, i
-3, true);
// Close the last figure as well.
2952 if(lastmoveto
>= 0 && firstp
!= lastp
)
2953 _EvaluateLine(lastp
.x
, lastp
.y
, firstp
.x
, firstp
.y
);
2954 // Convert the edges to spans. We couldn't do this before because some of
2955 // the regions may have winding numbers >+1 and it would have been a pain
2956 // to try to adjust the spans on the fly. We use one heap to detangle
2957 // a scanline's worth of edges from the singly-linked lists, and another
2958 // to collect the actual scans.
2959 std::vector
<int> heap
;
2960 mOutline
.reserve(mEdgeNext
/ 2);
2962 for(y
=0; y
<mHeight
; ++y
)
2965 // Detangle scanline into edge heap.
2966 for(unsigned ptr
= (unsigned)(mpScanBuffer
[y
]&0xffffffff); ptr
; ptr
= mpEdgeBuffer
[ptr
].next
)
2968 heap
.push_back(mpEdgeBuffer
[ptr
].posandflag
);
2970 // Sort edge heap. Note that we conveniently made the opening edges
2971 // one more than closing edges at the same spot, so we won't have any
2972 // problems with abutting spans.
2973 std::sort(heap
.begin(), heap
.end()/*begin() + heap.size()*/);
2974 // Process edges and add spans. Since we only check for a non-zero
2975 // winding number, it doesn't matter which way the outlines go!
2976 std::vector
<int>::iterator itX1
= heap
.begin();
2977 std::vector
<int>::iterator itX2
= heap
.end(); // begin() + heap.size();
2979 for(; itX1
!= itX2
; ++itX1
)
// A span is a pair of 64-bit keys: y shifted into the upper 32 bits plus x, with a
// constant that adds 0x40000000 to both halves.
2992 mOutline
.push_back(std::pair
<__int64
,__int64
>((y
<<32)+x1
+0x4000000040000000i
64, (y
<<32)+x2
+0x4000000040000000i
64)); // G: damn Avery, this is evil! :)
2997 // Dump the edge and scan buffers, since we no longer need them.
2999 delete [] mpScanBuffer
;
// Discards stored outline data. NOTE(review): the body is not visible in this listing.
3004 void ScanLineData::DeleteOutlines()
3009 bool ScanLineData2::CreateWidenedRegion(int rx
, int ry
)
3013 mWideBorder
= max(rx
,ry
);
3014 mWideOutline
.clear();
3016 const tSpanBuffer
& out_line
= m_scan_line_data
->mOutline
;
3019 WidenRegionCreater
*widen_region_creater
= WidenRegionCreater::GetDefaultWidenRegionCreater();
3020 widen_region_creater
->xy_overlap_region(&mWideOutline
, out_line
, rx
, ry
);
3022 else if (ry
== 0 && rx
> 0)
3024 // There are artifacts if we don't make at least two overlaps of the line, even at same Y coord
3025 OverlapRegion(mWideOutline
, out_line
, rx
, 0);
3026 OverlapRegion(mWideOutline
, out_line
, rx
, 0);