#include "../dsutil/vd.h"
#include <emmintrin.h>

typedef const UINT8 CUINT8, *PCUINT8;
typedef const UINT CUINT, *PCUINT;

// ref: "Comparing floating point numbers" by Bruce Dawson
// http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
bool AlmostEqual(float A, float B, int maxUlps=0)
{
    // Make sure maxUlps is non-negative and small enough that the
    // default NAN won't compare as equal to anything.
    ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
    int aInt = *(int*)&A;
    // Make aInt lexicographically ordered as a twos-complement int
    if (aInt < 0)
        aInt = 0x80000000 - aInt;
    // Make bInt lexicographically ordered as a twos-complement int
    int bInt = *(int*)&B;
    if (bInt < 0)
        bInt = 0x80000000 - bInt;
    int intDiff = abs(aInt - bInt);
    if (intDiff <= maxUlps)
        return true;
    return false;
}
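// Note: with maxUlps==0 only bit-identical values (and +0.0f / -0.0f) compare equal;
// e.g. 1.0f and the next representable float (about 1.00000012f) are 1 ulp apart, so
// they only compare equal when maxUlps>=1.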
__forceinline void xy_filter_one_line_c(float *dst, int width, const float *filter, int filter_width)
{
    const float *filter_start = filter;
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;
    for (;dst2<dst_endl;dst2++, filter_start--)//left margin
    {
        const float *src = dst;
        float sum = 0;
        for(const float* f=filter_start;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
    for (;dst2<dst;dst2++, filter_start--, filter_end--)//if width < filter_width
    {
        const float *src = dst;
        float sum = 0;
        for(const float* f=filter_start;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
    ASSERT(filter_start==filter);
    for (;dst2<dst_end0;dst2++)
    {
        const float *src = dst2;
        float sum = 0;
        for(const float* f=filter_start;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
    for (;dst2<dst_endr;dst2++, filter_end--)//right margin
    {
        const float *src = dst2;
        float sum = 0;
        for(const float* f=filter;f<filter_end;f++, src++)
        {
            sum += src[0] * f[0];
        }
        *dst2 = sum;
    }
}
/****
 * dst memory layout:
 * 1. Source content starts from @dst;
 * 2. Output content starts from @dst-@mwidth;
 *
 * | <- @dst-@mwidth ---------| <- @dst+0 --------------------|
 * |- margin -----------------|- src width*height items ------|
 * |- mwidth*height items ----|-                             -|
 * |- do NOT need to init ----|-                             -|
 * |- when input -------------|-                             -|
 * |---------------------------------------------------------|
 ****/
void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
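// Usage sketch (illustrative only; the buffer/variable names below are hypothetical,
// mirroring how xy_gaussian_blur sets up its buffers):
//     int filter_width = (2*r+1+3)&~3;                    // rounded up to a multiple of 4
//     int fwidth       = (width+3)&~3;
//     int fstride      = (fwidth+filter_width)*sizeof(float);
//     float *buff      = (float*)xy_malloc(height*fstride);
//     float *dst       = buff + filter_width;             // source pixels start here
//     xy_filter_c(dst, fwidth, height, fstride, filter, filter_width);
// After the call, the output of each line occupies dst-filter_width .. dst+fwidth-1,
// as described in the layout diagram above.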
/****
 * inline function sometimes generates stupid code
 *
 * @src4, @src_5_8, @f4, @sum : __m128
 * @src_5_8, @f4: const
 **/
#define XY_FILTER_4(src4, src_5_8, f4, sum) \
    __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1);\
    src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
    f4_1 = _mm_mul_ps(f4_1, src4);\
    sum = _mm_add_ps(sum, f4_1);\
    src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
    f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
    f4_1 = _mm_mul_ps(f4_1, src_3_6);\
    sum = _mm_add_ps(sum, f4_1)
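// In effect XY_FILTER_4 accumulates, for i = 0..3:
//     sum[i] += f4[0]*src[i] + f4[1]*src[i+1] + f4[2]*src[i+2] + f4[3]*src[i+3]
// where src[0..3] is src4 and src[4..7] is src_5_8. Note that src4 is clobbered by the macro.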
/****
 * @src4, @f4_1, @sum : __m128
 * @f4_1: const
 **/
#define XY_FILTER_4_1(src4, f4_1, sum) \
    src4 = _mm_mul_ps(src4, f4_1);\
    sum = _mm_add_ps(sum, src4);
__forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
{
    int xx_fix = width > filter_width ? 0 : filter_width - width;
    const float *filter_start = filter;
    float *dst2 = dst - filter_width;
    float *dst_endr = dst + width;
    float *dst_end0 = dst_endr - filter_width - 4;
    float *dst_endl = dst - xx_fix;
    ASSERT(xx_fix==0 || dst_end0==dst_endl-4);

    ASSERT(filter_start == filter);
    filter_start += filter_width;
    const float *filter_end = filter_start;

    for (;dst2<dst_endl;dst2+=4)//left margin
    {
        const float *src = dst;
        filter_start -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
    }
    for (;dst2<dst;dst2+=4)//if width < filter_width
    {
        const float *src = dst;
        filter_start -= 4;
        filter_end -= 4;

        //filter 4
        __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
    ASSERT(filter_start == filter);
    for (;dst2<dst_end0;dst2+=8)
    {
        const float *src = dst2;
        const float* f=filter_start;

        //filter 8 outputs per iteration
        __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
        src += 4;
        __m128 src_5_8;
        __m128 sum = _mm_setzero_ps();
        __m128 sum2 = _mm_setzero_ps();
        __m128 f4 = _mm_load_ps(f);
        f += 4;
        src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
        src += 4;
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        for(;f<filter_end;f+=4,src+=4)
        {
            src4 = _mm_load_ps(src);/*1 2 3 4*/
            __m128 tmp = src_5_8;//important!
            { XY_FILTER_4(tmp, src4, f4, sum2); }

            f4 = _mm_load_ps(f);
            { XY_FILTER_4(src_5_8, src4, f4, sum); }
            src_5_8 = src4;
        }
        src4 = _mm_load_ps(src);/*1 2 3 4*/
        { XY_FILTER_4(src_5_8, src4, f4, sum2); }

        //store result
        _mm_store_ps(dst2, sum);
        _mm_store_ps(dst2+4, sum2);
    }
    if (dst2==dst_end0)//one leftover block of 4
    {
        const float *src = dst2;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        src += 4;
        __m128 sum = _mm_setzero_ps();
        for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
        {
            __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
            __m128 f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }
            src4 = src_5_8;
        }
        //store result
        _mm_store_ps(dst2, sum);
        dst2 += 4;
    }
    for (;dst2<dst_endr;dst2+=4)//right margin
    {
        const float *src = dst2;
        filter_end -= 4;

        //filter 4
        __m128 src4 = _mm_load_ps(src);//1 2 3 4
        __m128 sum = _mm_setzero_ps();
        __m128 src_5_8, f4;
        for(const float* f=filter_start;f<filter_end;f+=4)
        {
            src += 4;
            src_5_8 = _mm_load_ps(src);//5 6 7 8
            f4 = _mm_load_ps(f);

            { XY_FILTER_4(src4, src_5_8, f4, sum); }

            //move new 4 in_n_out to old 4 in_n_out
            src4 = src_5_8;
        }
        src_5_8 = _mm_setzero_ps();
        f4 = _mm_load_ps(filter_end);
        { XY_FILTER_4(src4, src_5_8, f4, sum); }
        //store result
        _mm_store_ps(dst2, sum);
    }
}
void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
    }
}
/****
 * @CONSTRAINTS: LENGTH%4 == 0 || LENGTH%4 == 1
 **/
template<int LENGTH>
struct M128s
{
    __m128 x;
    M128s<LENGTH - 4> next;

    template<int Index> __forceinline __m128& GetAt()
    {
        return next.GetAt<Index - 4>();
    }
    template<> __forceinline __m128& GetAt<0>()
    {
        return x;
    }

    template<int Start, int Offset> __forceinline __m128& GetAt()
    {
        return GetAt<Start + Offset>();
    }

    __forceinline void Load(const float* src)
    {
        x = _mm_load_ps(src);
        next.Load(src+4);
    }
};

template<>
struct M128s<1>
{
    __m128 x;

    template<int Index> __forceinline __m128& GetAt()
    {
        return x;
    }
    __forceinline void Load(const float* src)
    {
        x = _mm_set1_ps(*src);
    }
};

template<>
struct M128s<0>
{
    void Load(const float* src)
    {
    }
};
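// M128s<LENGTH> is a compile-time chain of __m128 members holding LENGTH filter
// coefficients: Load() fills the chain once (broadcasting the last coefficient when
// LENGTH%4==1), and GetAt<Index>() resolves to a fixed member at compile time, so the
// whole filter can stay in registers inside the unrolled loops below.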
template<int FILTER_LENGTH, int START, int LENGTH>
struct Filter4
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        __m128 src4_128 = _mm_load_ps(src4);
        { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
        Filter4<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
        src0_128 = src4_128;
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4<FILTER_LENGTH, START, 1>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
    }
    template<int TAIL>
    static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum);
    template<>
    static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
        { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
    }
};

template<int FILTER_LENGTH, int START>
struct Filter4<FILTER_LENGTH, START, 0>
{
    static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
    {
    }
};
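// Filter4<FILTER_LENGTH, START, LENGTH> expands recursively into a fully unrolled sequence
// of XY_FILTER_4 steps over the coefficient range [START, START+LENGTH); the LENGTH==1
// specialization handles a FILTER_LENGTH%4==1 tail via XY_FILTER_4_1, and the LENGTH==0
// specialization terminates the recursion.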
template<int FILTER_LENGTH,int MARGIN_LENGTH>
struct FilterAllLeftMargin
{
    static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        do_cal<FILTER_LENGTH%4>(src, filter128s);
    }
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 src0 = _mm_setzero_ps();
        __m128 sum = _mm_setzero_ps();
        Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
    }
    template<>
    static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        __m128 sum = _mm_setzero_ps();
        //Only one of the last 4 filter coefficients is non-zero
        _mm_store_ps(src-MARGIN_LENGTH, sum);
        FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
    }
};
template<int FILTER_LENGTH, int MARGIN_LENGTH>
struct FilterAllRightMargin
{
    static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        do_cal<FILTER_LENGTH%4>(src, filter128s);
    }
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            __m128 src4 = _mm_setzero_ps();
            { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
    }
    template<>
    static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
        //filter 4
        {
            __m128 src0 = _mm_load_ps(src);
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
            //Only one of the last 4 filter coefficients is non-zero
            { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
            //store result
            _mm_store_ps(src, sum);
        }
        FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
    }
};
template<int FILTER_LENGTH>
struct FilterAllLeftMargin<FILTER_LENGTH,0>
{
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
};

template<int FILTER_LENGTH>
struct FilterAllRightMargin<FILTER_LENGTH,0>
{
    template<int FILTER_TAIL>
    static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
    {
    }
};
/****
 * Equivalent to:
 *   xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
 * @CONSTRAINTS:
 *   FILTER_LENGTH<=width && width%4==0
 **/
template<int FILTER_LENGTH>
void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter)
{
    ASSERT( stride>=4*(width+FILTER_LENGTH) );
    ASSERT( ((stride|(4*width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    M128s<FILTER_LENGTH> filter128s;
    filter128s.Load(filter);

    const float *filter_start = filter;
    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        //left margin
        FilterAllLeftMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
        float *dst_end1 = dst2 + width;
        float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
        for (;dst2<dst_end0;dst2+=4)
        {
            const float *src = dst2;

            //filter 4
            __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
            src += 4;
            __m128 sum = _mm_setzero_ps();
            Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
            //store result
            _mm_store_ps(dst2, sum);
        }
        //right margin
        FilterAllRightMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
    }
}
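// For a FILTER_LENGTH that is not a multiple of 4 the margins still advance in blocks of
// ((FILTER_LENGTH+3)&~3) floats, which is why the function is documented as equivalent to
// xy_filter_c(..., (FILTER_LENGTH+3)&~3).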
/****
 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
 * @src4, @src_5_8, @f3_1, @f3_2: const
 **/
#define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
    __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
    __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
    sum = _mm_mul_ps(f3_1, src4);\
    __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
    __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
    sum = _mm_add_ps(sum, mul2);\
    sum = _mm_add_ps(sum, mul3);
/****
 * Equivalent to:
 *   xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
 * @CONSTRAINTS:
 *   filter[3] == 0 && filter[0] == filter[2] (symmetric) (&& sum(filter)==1)
 **/
void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
{
    const int filter_width = 4;
    ASSERT( stride>=4*(width+filter_width) );
    ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );

    ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);

    __m128 f3_1 = _mm_set1_ps(filter[0]);
    __m128 f3_2 = _mm_set1_ps(filter[1]);

    BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
    BYTE* end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        float *dst_f = reinterpret_cast<float*>(dst_byte);
        float *dst2 = dst_f;

        float *dst_end0 = dst_f + width - 4;
        //filter 4
        __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
        __m128 sum;
        {
            __m128 src_4 = _mm_setzero_ps();
            { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
            _mm_store_ps(dst2-4, sum);
        }
        for (;dst2<dst_end0;dst2+=4)
        {
            __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
            { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
            src4 = src_5_8;
            //store result
            _mm_store_ps(dst2, sum);
        }
        {
            __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
            { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
            //store result
            _mm_store_ps(dst2, sum);
        }
    }
}
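// XY_3_TAG_SYMMETRIC_FILTER_4 computes, for i = 0..3:
//     sum[i] = f3_1[0]*src[i] + f3_2[0]*src[i+1] + f3_1[0]*src[i+2]
// with src[0..3] = src4 and src[4..7] = src_5_8, i.e. a symmetric [a, b, a] kernel.
// The output row is written 4 floats behind the reads (the first store goes to dst2-4),
// matching the dst-filter_width output layout of xy_filter_c.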
void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
{
    typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
    const Filter filters[] = {
        NULL,
        xy_filter_sse_template<1>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, xy_filter_sse_template<4>,
        xy_filter_sse_template<5>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, xy_filter_sse_template<8>,
        xy_filter_sse_template<9>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, xy_filter_sse_template<12>,
        xy_filter_sse_template<13>, xy_filter_sse_template<16>, xy_filter_sse_template<16>, xy_filter_sse_template<16>,
        xy_filter_sse_template<17>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, xy_filter_sse_template<20>,
        xy_filter_sse_template<21>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, xy_filter_sse_template<24>,
        xy_filter_sse_template<25>, xy_filter_sse_template<28>, xy_filter_sse_template<28>, xy_filter_sse_template<28>
    };
    if (filter_width<=28 && filter_width<=width)
    {
        int tmp = filter_width;
        // Skip tail zeros; we cannot (and don't need to) support more than 3 tail zeros currently.
        while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
            tmp--;
        if (tmp==3 && filter[0]==filter[2])
        {
            xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
        }
        else
        {
            filters[tmp](dst, width, height, stride, filter);
        }
    }
    else
    {
        xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
    }
}
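// Dispatch note: after trailing near-zero coefficients are stripped, filters[tmp] maps the
// effective width onto the next template instantiation of the form 4n or 4n+1
// (1, 4, 4, 4, 5, 8, 8, 8, ...), which satisfies the LENGTH%4==0 || LENGTH%4==1 constraint
// of M128s; anything wider than 28 taps, or wider than the row, falls back to xy_filter_sse_v6.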
/****
 * Copy and convert src to dst line by line.
 * @dst_width MUST >= @width
 * If @dst_width > @width, the extra elements will be filled with 0.
 **/
void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end = src2 + width;
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end;src2++, dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_byte_2_float_c
 **/
void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int stride)
{
    ASSERT( dst_width>=width );
    ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);

    PCUINT8 src_end = src + height*stride;
    for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = src;
        PCUINT8 src2_end0 = src2 + (width&~15);
        float *dst2 = reinterpret_cast<float*>(dst_byte);

        for (;src2<src2_end0;src2+=16, dst2+=16)
        {
            //expand 16 bytes to 16 floats
            __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
            __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
            __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
            __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
            __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
            __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
            __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
            __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
            __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
            __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
            _mm_store_ps(dst2, dst_f1);
            _mm_store_ps(dst2+4, dst_f2);
            _mm_store_ps(dst2+8, dst_f3);
            _mm_store_ps(dst2+12, dst_f4);
        }
        PCUINT8 src2_end = src + width;
        for (;src2<src2_end;src2++,dst2++)
        {
            *dst2 = *src2;
        }
        float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * Copy transposed matrix @src to @dst.
 * @dst_width MUST >= @height.
 * If @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        float *dst2 = reinterpret_cast<float*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<const float*>(src2);
        }
        float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_float_2_float_transpose_c
 **/
void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef float DstT;
    typedef const float SrcT;

    ASSERT( (((int)dst|dst_stride)&15)==0 );
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
    PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
        {
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            _mm_store_ps(dst2, m1);
        }
        for (;src2<src2_end2;src2+=src_stride,dst2++)
        {
            *dst2 = *reinterpret_cast<SrcT*>(src2);
        }
        float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * Transpose and round matrix @src, then copy to @dst.
 * @dst_width MUST >= @height.
 * If @dst_width > @height, the extra elements will be filled with 0.
 **/
void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    const float* src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * See @xy_float_2_byte_transpose_c
 **/
void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
    const float *src, int width, int height, int src_stride)
{
    typedef UINT8 DstT;
    typedef const float SrcT;

    ASSERT(dst_width >= height);
    ASSERT((((int)dst|dst_stride)&15)==0);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    SrcT* src_end = src + width;
    PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
        for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
        {
            __m128 m1 = _mm_set_ps(
                *(SrcT*)(src2+3*src_stride),
                *(SrcT*)(src2+2*src_stride),
                *(SrcT*)(src2+src_stride),
                *(SrcT*)(src2));
            __m128 m2 = _mm_set_ps(
                *(SrcT*)(src2+7*src_stride),
                *(SrcT*)(src2+6*src_stride),
                *(SrcT*)(src2+5*src_stride),
                *(SrcT*)(src2+4*src_stride));
            __m128 m3 = _mm_set_ps(
                *(SrcT*)(src2+11*src_stride),
                *(SrcT*)(src2+10*src_stride),
                *(SrcT*)(src2+9*src_stride),
                *(SrcT*)(src2+8*src_stride));
            __m128 m4 = _mm_set_ps(
                *(SrcT*)(src2+15*src_stride),
                *(SrcT*)(src2+14*src_stride),
                *(SrcT*)(src2+13*src_stride),
                *(SrcT*)(src2+12*src_stride));

            __m128i i1 = _mm_cvtps_epi32(m1);
            __m128i i2 = _mm_cvtps_epi32(m2);
            __m128i i3 = _mm_cvtps_epi32(m3);
            __m128i i4 = _mm_cvtps_epi32(m4);

            i1 = _mm_packs_epi32(i1,i2);
            i3 = _mm_packs_epi32(i3,i4);
            i1 = _mm_packus_epi16(i1,i3);

            _mm_store_si128((__m128i*)dst2, i1);
        }
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
        }
        DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * To Do: decent CPU capability check
 **/
void xy_gaussian_blur(PUINT8 dst, int dst_stride,
    PCUINT8 src, int width, int height, int stride,
    const float *gt_x, int r_x, int gt_ex_width_x,
    const float *gt_y, int r_y, int gt_ex_width_y)
{
    ASSERT(width<=stride && width+2*r_x<=dst_stride);

    int ex_mask_width_x = ((r_x*2+1)+3)&~3;
    ASSERT(ex_mask_width_x<=gt_ex_width_x);
    if (ex_mask_width_x>gt_ex_width_x)
    {
        //the Gaussian table is too small; force a crash
        int o=0;
        o=o/o;
    }

    int ex_mask_width_y = ((r_y*2+1)+3)&~3;
    ASSERT(ex_mask_width_y<=gt_ex_width_y);
    if (ex_mask_width_y>gt_ex_width_y)
    {
        //the Gaussian table is too small; force a crash
        int o=0;
        o=o/o;
    }

    int fwidth = (width+3)&~3;
    int fstride = (fwidth + ex_mask_width_x)*sizeof(float);
    int fheight = (height+3)&~3;
    int fstride_ver = (fheight+ex_mask_width_y)*sizeof(float);

    PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width_x)*fstride_ver));

    float *hor_buff_base = reinterpret_cast<float*>(buff_base);
    float *hor_buff = hor_buff_base + ex_mask_width_x;

    // byte to float
    ASSERT( ((width+15)&~15)<=stride );
    xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);

    // horizontal pass
    xy_filter_sse(hor_buff, fwidth, height, fstride, gt_x, ex_mask_width_x);

    // transpose
    float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
    float *ver_buff = ver_buff_base + ex_mask_width_y;

    int true_width = width+r_x*2;
    xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r_x*2, true_width, height, fstride);

    // vertical pass
    xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt_y, ex_mask_width_y);

    // transpose back, round and convert to bytes
    int true_height = height + 2*r_y;
    xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r_y*2, true_height, true_width, fstride_ver);

    xy_free(buff_base);
}
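// Pipeline summary: the Gaussian blur is separable, so the image is expanded to float,
// filtered horizontally, transposed, filtered again (i.e. vertically with respect to the
// original image), and transposed back while being rounded to bytes. The two transposes let
// the same row filter serve both directions; dst must therefore provide
// (width+2*r_x) x (height+2*r_y) bytes.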
// Rounding policies used by XyRounding (values referenced as template arguments below).
enum RoundingPolicy
{
    ROUND_DOWN,
    ROUND_HALF_DOWN,
    ROUND_HALF_UP,
    ROUND_HALF_TO_EVEN
};

template<int ROUNDING_POLICY, int precision>
struct XyRounding
{
    __forceinline void init_sse();
    __forceinline __m128i round(__m128i in);
    __forceinline int round(int in);
};

template<int precision>
struct XyRounding<ROUND_DOWN, precision>
{
    __forceinline void init_sse()
    {
    }
    __forceinline __m128i round(__m128i in)
    {
        return in;
    }
    __forceinline int round(unsigned in)
    {
        return in;
    }
};

template<int precision>
struct XyRounding<ROUND_HALF_DOWN, precision>
{
    __forceinline void init_sse()
    {
        m_rounding_patch = _mm_set1_epi16( (1<<(precision-1))-1 );
    }
    __forceinline __m128i round(__m128i in)
    {
        return _mm_adds_epu16(in, m_rounding_patch);
    }
    __forceinline int round(unsigned in)
    {
        return in + ((1<<(precision-1))-1);
    }
    __m128i m_rounding_patch;
};

template<int precision>
struct XyRounding<ROUND_HALF_UP, precision>
{
    __forceinline void init_sse()
    {
        m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
    }
    __forceinline __m128i round(__m128i in)
    {
        return _mm_adds_epu16(in, m_rounding_patch);
    }
    __forceinline int round(unsigned in)
    {
        return in + (1<<(precision-1));
    }
    __m128i m_rounding_patch;
};

template<int precision>
struct XyRounding<ROUND_HALF_TO_EVEN, precision>
{
    __forceinline void init_sse()
    {
        m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
    }
    __forceinline __m128i round(__m128i in)
    {
        in = _mm_adds_epu16(in, m_rounding_patch);
        __m128i tmp = _mm_slli_epi16(in, 15-precision);
        tmp = _mm_srli_epi16(tmp, 15);
        return _mm_adds_epu16(in, tmp);
    }
    __forceinline int round(unsigned in)
    {
        return in + (1<<(precision-1)) + ((in>>precision)&1);
    }
    __m128i m_rounding_patch;
};
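// Example with precision==2 (i.e. a final >>2): for in==6 (a true value of 1.5),
// ROUND_HALF_DOWN gives (6+1)>>2 == 1 while ROUND_HALF_UP gives (6+2)>>2 == 2; for
// in==5 (1.25) both give 1. ROUND_DOWN adds nothing and simply truncates.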
/****
 * filter with [1,2,1]
 * 1. It is an in-place horizontal filter
 * 2. Boundary pixels are filtered by padding 0. E.g.
 *      dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
 **/
template<int ROUNDING_POLICY>
void xy_be_filter_c(PUINT8 dst, int width, int height, int stride)
{
    PUINT8 dst2 = NULL;

    XyRounding<ROUNDING_POLICY, 2> xy_rounding;
    for (int y=0;y<height;y++)
    {
        dst2 = dst + y*stride;
        int old_sum = dst2[0];
        int x = 0;
        int tmp = 0;
        for (x=0;x<width-1;x++)
        {
            int new_sum = dst2[x]+dst2[x+1];
            tmp = old_sum + new_sum;//old_sum == src2[x-1]+src2[x];
            dst2[x] = (xy_rounding.round(tmp)>>2);
            old_sum = new_sum;
        }
        tmp = old_sum + dst2[x];
        dst2[x] = (xy_rounding.round(tmp)>>2);
    }
}
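// Worked example (ROUND_DOWN, one row {4, 8, 4}): dst[0]=(0+2*4+8)>>2=4,
// dst[1]=(4+2*8+4)>>2=6, dst[2]=(8+2*4+0)>>2=4. The running old_sum keeps the original
// (pre-overwrite) left neighbour available even though the filtering is in-place.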
/****
 * 1. It is an in-place symmetric 3-tag horizontal filter
 * 2. Boundary pixels are filtered by padding 0. E.g.
 *      dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
 * 3. sum(filter) == 256
 **/
template<int ROUNDING_POLICY>
void xy_be_filter2_c(PUINT8 dst, int width, int height, int stride, PCUINT filter)
{
    PUINT8 dst2 = NULL;

    const int VOLUME_BITS = 8;
    const int VOLUME = (1<<VOLUME_BITS);
    if (filter[0]==0)
    {
        return;//nothing to do;
    }
    else if (filter[0]== (VOLUME>>2))
    {
        return xy_be_filter_c<ROUNDING_POLICY>(dst, width, height, stride);
    }

    XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
    for (int y=0;y<height;y++)
    {
        dst2 = dst + y*stride;
        int old_pix = 0;
        int x = 0;
        int tmp = 0;
        for (x=0;x<width-1;x++)
        {
            tmp = (old_pix + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
            old_pix = dst2[x];
            dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
        }
        tmp = old_pix * filter[0] + dst2[x] * filter[1];
        dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
    }
}
/****
 * See @xy_be_filter_c
 * No alignment requirement.
 **/
template<int ROUNDING_POLICY>
void xy_be_filter_sse(PUINT8 dst, int width, int height, int stride)
{
    int width_mod8 = ((width-1)&~7);
    XyRounding<ROUNDING_POLICY, 2> xy_rounding;
    xy_rounding.init_sse();
    for (int y = 0; y < height; y++) {
        PUINT8 dst2=dst+y*stride;

        __m128i old_pix_128 = _mm_cvtsi32_si128(dst2[0]);
        __m128i old_sum_128 = old_pix_128;

        int x = 0;
        for (; x < width_mod8; x+=8) {
            __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
            new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
            __m128i temp = _mm_slli_si128(new_pix,2);
            temp = _mm_add_epi16(temp, old_pix_128);
            temp = _mm_add_epi16(temp, new_pix);
            old_pix_128 = _mm_srli_si128(new_pix,14);

            new_pix = _mm_slli_si128(temp,2);
            new_pix = _mm_add_epi16(new_pix, old_sum_128);
            new_pix = _mm_add_epi16(new_pix, temp);
            old_sum_128 = _mm_srli_si128(temp, 14);

            new_pix = xy_rounding.round(new_pix);

            new_pix = _mm_srli_epi16(new_pix, 2);
            new_pix = _mm_packus_epi16(new_pix, new_pix);

            _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), new_pix );
        }
        int old_sum = _mm_cvtsi128_si32(old_sum_128);
        int tmp = 0;
        for ( ; x < width-1; x++) {
            int new_sum = dst2[x] + dst2[x+1];
            tmp = old_sum + new_sum;
            dst2[x] = (xy_rounding.round(tmp)>>2);
            old_sum = new_sum;
        }
        tmp = old_sum + dst2[x];
        dst2[x] = (xy_rounding.round(tmp)>>2);
    }
}
/****
 * See @xy_be_filter2_c
 * No alignment requirement.
 **/
template<int ROUNDING_POLICY>
void xy_be_filter2_sse(PUINT8 dst, int width, int height, int stride, PCUINT filter)
{
    const int VOLUME_BITS = 8;
    const int VOLUME = (1<<VOLUME_BITS);
    ASSERT(filter[0]==filter[2]);
    ASSERT(filter[0]+filter[1]+filter[2]==VOLUME);

    XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
    xy_rounding.init_sse();
    __m128i f3_1 = _mm_set1_epi16(filter[0]);
    __m128i f3_2 = _mm_set1_epi16(filter[1]);

    int width_mod8 = ((width-1)&~7);
    //__m128i round = _mm_set1_epi16(8);

    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    PUINT8 end = dst_byte + height*stride;
    for( ; dst_byte<end; dst_byte+=stride )
    {
        PUINT8 dst2 = dst_byte;

        PUINT8 dst_end0 = dst_byte + width - 4;

        __m128i old_pix1_128 = _mm_setzero_si128();
        __m128i old_pix2_128 = _mm_cvtsi32_si128(dst2[0]);
        int x = 0;

        for (; x < width_mod8; x+=8) {
            __m128i pix2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
            pix2 = _mm_unpacklo_epi8(pix2, _mm_setzero_si128());
            __m128i pix1 = _mm_slli_si128(pix2,2);
            pix1 = _mm_add_epi8(pix1, old_pix2_128);
            __m128i pix0 = _mm_slli_si128(pix1,2);
            pix0 = _mm_add_epi8(pix0, old_pix1_128);
            old_pix1_128 = _mm_srli_si128(pix1,14);
            old_pix2_128 = _mm_srli_si128(pix2,14);

            pix0 = _mm_add_epi16(pix0, pix2);
            pix0 = _mm_mullo_epi16(pix0, f3_1);
            pix1 = _mm_mullo_epi16(pix1, f3_2);

            pix1 = _mm_adds_epu16(pix1, pix0);

            pix1 = xy_rounding.round(pix1);

            pix1 = _mm_srli_epi16(pix1, VOLUME_BITS);
            pix1 = _mm_packus_epi16(pix1, pix1);

            _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), pix1 );
        }
        int old_pix1 = _mm_cvtsi128_si32(old_pix1_128);
        int tmp = 0;
        for ( ; x < width-1; x++) {
            tmp = (old_pix1 + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
            old_pix1 = dst2[x];
            dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
        }
        tmp = old_pix1*filter[0] + dst2[x]*filter[1];
        dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
    }
}
/****
 * Construct the filter used in the final horizontal/vertical pass of @xy_be_blur when @pass is NOT an integer.
 * The filter is constructed to satisfy:
 *   If @p is a pixel in the middle of the image, pixels @(p-1) and @(p+1) lie just at the left and the right
 *   of @p respectively. The value of @p after all horizontal filtering equals
 *     a*value_old(@(p-1)) + b*value_old(@p) + a*value_old(@(p+1)) + other pixels' weighted sum,
 *   where
 *     a/b = @pass/(@pass+1).
 * This makes sense because the above property also holds when @pass is an integer.
 *
 * Let n = floor(pass);
 *   filter = [ (pass-n)(n+2) / (2*(1+3pass-n)), 1-(pass-n)(n+2)/(1+3pass-n), (pass-n)(n+2) / (2*(1+3pass-n)) ]
 **/
void xy_calculate_filter(float pass, PUINT filter)
{
    const int VOLUME = (1<<8);
    int n = static_cast<int>(pass);
    if (pass < 1)
    {
        filter[0] = VOLUME * pass /(1+3*pass);
    }
    else if (pass < 2)
    {
        filter[0] = VOLUME * (pass-1)/(2*pass);
    }
    else
    {
        filter[0] = VOLUME * (pass-n)*(n+2)/ (2*(1+3*pass-n));
    }
    filter[1] = VOLUME - 2*filter[0];
    filter[2] = filter[0];

    if (2*filter[0]>filter[1])
    {
        //this should not happen
        filter[0] = VOLUME/4;
        filter[1] = VOLUME - 2*filter[0];
        filter[2] = filter[0];
    }
}
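// Example: pass = 2.5 gives n = 2, so filter[0] = 256*(0.5*4)/(2*(1+7.5-2)) = 512/13,
// truncated to 39, filter[1] = 256-78 = 178 and filter[2] = 39, i.e. roughly
// [0.15, 0.70, 0.15] of the total volume.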
/****
 * See @xy_float_2_byte_transpose_c
 **/
void xy_byte_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
    PCUINT8 src, int width, int height, int src_stride)
{
    ASSERT(dst_width >= height);
    PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
    PCUINT8 src_end = src + width;
    PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
    for( ; src<src_end; src++, dst_byte+=dst_stride )
    {
        PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);

        UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
        for (;src2<src2_end;src2+=src_stride,dst2++)
        {
            *dst2 = *src2;
        }
        UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
        for (;dst2<dst2_end;dst2++)
        {
            *dst2 = 0;
        }
    }
}
/****
 * Repeat the filter [1,2,1] @pass_x times horizontally and @pass_y times vertically.
 * Boundary pixels are filtered by padding 0, see @xy_be_filter_c.
 *
 * When @pass_x is not an integer, the horizontal filter [1,2,1] is repeated (int)@pass_x times,
 * and then a 3-tag symmetric filter generated according to @pass_x is applied.
 * That final 3-tag symmetric filter is constructed to satisfy the following property:
 *   if @pass_xx > @pass_x, the filtered result of @pass_x should NOT be blurrier than
 *   the result of @pass_xx. More specifically, the filtered result of @pass_x should NOT be
 *   blurrier than (int)@pass_x+1 passes and should NOT be less blurred than (int)@pass_x passes.
 *
 * The original VSFilter \be effect uses a simple round down, which has a bias of -7.5/16 per pass.
 * That rounding error is really huge with a big @pass value, and it has become part of the \be effect.
 * We can simulate VSFilter's rounding bias by combining different rounding methods, but a simple
 * test shows that the result still has some visual difference from VSFilter's.
 *
 * It seems that this separated filter implementation is more sensitive to precision than
 * VSFilter's simple implementation. A vertical blocky artifact can be observed with a big pass value
 * (and @pass_x==@pass_y). So it is only used for filtering the fractional part of the \be strength.
 **/
void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y)
{
    //ASSERT(pass_x>0 && pass_y>0);

    typedef void (*XyBeFilter)(PUINT8 src, int width, int height, int stride);
    typedef void (*XyFilter2)(PUINT8 src, int width, int height, int stride, PCUINT filter);

    XyBeFilter filter = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter_sse<ROUND_HALF_TO_EVEN> : xy_be_filter_c<ROUND_HALF_TO_EVEN>;
    XyFilter2 filter2 = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter2_sse<ROUND_HALF_TO_EVEN> : xy_be_filter2_c<ROUND_HALF_TO_EVEN>;

    int stride_ver = height;
    PUINT8 tmp = reinterpret_cast<PUINT8>(xy_malloc(width*height));

    // horizontal passes
    int pass_x_int = static_cast<int>(pass_x);
    for (int i=0; i<pass_x_int; i++)
    {
        filter(src, width, height, stride);
    }
    if (pass_x-pass_x_int>0)
    {
        UINT f[3] = {0, 0, 0};
        xy_calculate_filter(pass_x, f);
        filter2(src, width, height, stride, f);
    }

    // transpose
    xy_byte_2_byte_transpose_c(tmp, height, stride_ver, src, width, height, stride);

    // vertical passes (horizontal passes on the transposed image)
    int pass_y_int = static_cast<int>(pass_y);
    for (int i=0;i<pass_y_int;i++)
    {
        filter(tmp, height, width, stride_ver);
    }
    if (pass_y-pass_y_int>0)
    {
        UINT f[3] = {0, 0, 0};
        xy_calculate_filter(pass_y, f);
        filter2(tmp, height, width, stride_ver, f);
    }

    // transpose back
    xy_byte_2_byte_transpose_c(src, width, stride, tmp, height, width, stride_ver);

    xy_free(tmp);
}