[xy_vsfilter.git] / src / subtitles / xy_filter.cpp
1 #include "stdafx.h"
2 #include "../dsutil/vd.h"
4 typedef const UINT8 CUINT8, *PCUINT8;
5 typedef const UINT CUINT, *PCUINT;
7 //
8 // ref: "Comparing floating point numbers" by Bruce Dawson
9 // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
11 bool AlmostEqual(float A, float B, int maxUlps=0)
13 // Make sure maxUlps is non-negative and small enough that the
14 // default NAN won't compare as equal to anything.
15 ASSERT(maxUlps >= 0 && maxUlps < 4 * 1024 * 1024);
16 int aInt = *(int*)&A;
17 // Make aInt lexicographically ordered as a twos-complement int
18 if (aInt < 0)
19 aInt = 0x80000000 - aInt;
20 // Make bInt lexicographically ordered as a twos-complement int
21 int bInt = *(int*)&B;
22 if (bInt < 0)
23 bInt = 0x80000000 - bInt;
25 int intDiff = abs(aInt - bInt);
26 if (intDiff <= maxUlps)
27 return true;
29 return false;
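// Illustrative example (not part of the original source): with maxUlps==0 only
// bit-identical floats compare equal, e.g. AlmostEqual(1.0f, 1.0f) is true while
// AlmostEqual(1.0f, 1.0f + FLT_EPSILON) is false (they differ by exactly 1 ULP);
// raising maxUlps tolerates that many representable floats of difference.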
32 /****
33 * See @xy_filter_c
34 **/
35 __forceinline void xy_filter_one_line_c(float *dst, int width, const float *filter, int filter_width)
37 const float *filter_start = filter;
38 int xx_fix = width > filter_width ? 0 : filter_width - width;
39 float *dst2 = dst - filter_width;
40 float *dst_endr = dst + width;
41 float *dst_end0 = dst_endr - filter_width;
42 float *dst_endl = dst - xx_fix;
43 ASSERT(xx_fix==0 || dst_end0==dst_endl);
45 ASSERT(filter_start == filter);
46 filter_start += filter_width;
47 const float *filter_end = filter_start;
48 for (;dst2<dst_endl;dst2++, filter_start--)//left margin
50 const float *src = dst;
51 float sum = 0;
52 for(const float* f=filter_start;f<filter_end;f++, src++)
54 sum += src[0] * f[0];
56 *dst2 = sum;
58 for (;dst2<dst;dst2++, filter_start--, filter_end--)//if width < filter_width
60 const float *src = dst;
61 float sum = 0;
62 for(const float* f=filter_start;f<filter_end;f++, src++)
64 sum += src[0] * f[0];
66 *dst2 = sum;
68 ASSERT(filter_start==filter);
69 for (;dst2<dst_end0;dst2++)
71 const float *src = dst2;
73 float sum = 0;
74 for(const float* f=filter_start;f<filter_end;f++, src++)
76 sum += src[0] * f[0];
78 *dst2 = sum;
80 for (;dst2<dst_endr;dst2++, filter_end--)//right margin
82 const float *src = dst2;
83 float sum = 0;
84 for(const float* f=filter;f<filter_end;f++, src++)
86 sum += src[0] * f[0];
88 *dst2 = sum;
92 /****
93 * dst memory layout:
94 * 1. Source content starts from @dst;
95 * 2. Output content starts from @dst-@mwidth;
97 * |- stride -|
98 * | <- @dst-@mwidth --------| <- @dst+0 --------------------|
99 * |-                       -|-                             -|
100 * |- margin                -|- src width*height items     -|
101 * |- mwidth*height items   -|-                             -|
102 * |- do NOT need to init   -|-                             -|
103 * |- when input            -|-                             -|
104 * |---------------------------------------------------------|
106 void xy_filter_c(float *dst, int width, int height, int stride, const float *filter, int filter_width)
108 ASSERT( stride>=4*(width+filter_width) );
109 BYTE* end = reinterpret_cast<BYTE*>(dst) + height*stride;
110 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
111 for( ; dst_byte<end; dst_byte+=stride )
113 xy_filter_one_line_c(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
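/****
 * Usage sketch (illustrative only, not part of the original source; buffer and
 * variable names are made up):
 *   const int width = 8, height = 1, filter_width = 4;
 *   const int stride = sizeof(float) * (width + filter_width); // bytes per row
 *   float buff[width + filter_width];   // one row including the left margin
 *   float *data = buff + filter_width;  // source samples live at data[0..width)
 *   float filter[filter_width] = {0.25f, 0.5f, 0.25f, 0.0f};
 *   // ... fill data[0..width) with input ...
 *   xy_filter_c(data, width, height, stride, filter, filter_width);
 *   // the filtered row now occupies buff[0 .. width+filter_width)
 * The SSE variants below additionally require 16-byte alignment of the buffers.
 **/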
117 /****
118 * A macro is used here because the equivalent inline function sometimes generates poor code
120 * @src4, @src_5_8, @f4, @sum : __m128
121 * @src_5_8, @f4: const
122 * @sum : output
123 * @src4: undefined
124 **/
125 #define XY_FILTER_4(src4, src_5_8, f4, sum) \
126 __m128 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(0,0,0,0));\
127 f4_1 = _mm_mul_ps(f4_1, src4);\
128 sum = _mm_add_ps(sum, f4_1);\
129 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
130 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(2,2,2,2));\
131 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
132 sum = _mm_add_ps(sum, f4_1);\
133 src4 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
134 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(1,1,1,1));\
135 f4_1 = _mm_mul_ps(f4_1, src4);\
136 sum = _mm_add_ps(sum, f4_1);\
137 src_3_6 = _mm_shuffle_ps(src_3_6, src_5_8, _MM_SHUFFLE(2,1,2,1));/*4 5 6 7*/\
138 f4_1 = _mm_shuffle_ps(f4, f4, _MM_SHUFFLE(3,3,3,3));\
139 f4_1 = _mm_mul_ps(f4_1, src_3_6);\
140 sum = _mm_add_ps(sum, f4_1)
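// Scalar equivalent (illustrative, not part of the original source): with
// src4 = {s0,s1,s2,s3}, src_5_8 = {s4,s5,s6,s7} and f4 = {f0,f1,f2,f3},
// XY_FILTER_4 adds to each lane i of @sum the value
//     s[i]*f0 + s[i+1]*f1 + s[i+2]*f2 + s[i+3]*f3      (i = 0..3)
// i.e. one 4-tap convolution step for 4 neighbouring output samples.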
142 /****
143 * @src4, @f4_1, @sum : __m128
144 * @f4_1: const
145 * @sum : output
146 * @src4: undefined
147 **/
148 #define XY_FILTER_4_1(src4, f4_1, sum) \
149 src4 = _mm_mul_ps(src4, f4_1);\
150 sum = _mm_add_ps(sum, src4);
152 __forceinline void xy_filter_one_line_sse_v6(float *dst, int width, const float *filter, int filter_width)
154 int xx_fix = width > filter_width ? 0 : filter_width - width;
155 const float *filter_start = filter;
156 float *dst2 = dst - filter_width;
157 float *dst_endr = dst + width;
158 float *dst_end0 = dst_endr - filter_width - 4;
159 float *dst_endl = dst - xx_fix;
160 ASSERT(xx_fix==0 || dst_end0==dst_endl-4);
162 ASSERT(filter_start == filter);
163 filter_start += filter_width;
164 const float *filter_end = filter_start;
166 for (;dst2<dst_endl;dst2+=4)//left margin
168 const float *src = dst;
169 filter_start -= 4;
171 //filter 4
172 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
173 __m128 sum = _mm_setzero_ps();
174 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
176 __m128 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
177 __m128 f4 = _mm_load_ps(f);
179 { XY_FILTER_4(src4, src_5_8, f4, sum); }
181 src4 = src_5_8;
183 //store result
184 _mm_store_ps(dst2, sum);
186 for (;dst2<dst;dst2+=4)//if width < filter_width
188 const float *src = dst;
189 filter_start-=4;
190 filter_end-=4;
192 __m128 src4 = _mm_setzero_ps();/*1 2 3 4*/
193 __m128 sum = _mm_setzero_ps();
194 __m128 src_5_8, f4;
195 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
197 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
198 f4 = _mm_load_ps(f);
200 { XY_FILTER_4(src4, src_5_8, f4, sum); }
201 src4 = src_5_8;
203 src_5_8 = _mm_setzero_ps();
204 f4 = _mm_load_ps(filter_end);
205 { XY_FILTER_4(src4, src_5_8, f4, sum); }
206 //store result
207 _mm_store_ps(dst2, sum);
209 ASSERT(filter_start == filter);
210 for (;dst2<dst_end0;dst2+=8)
212 const float *src = dst2;
213 const float* f=filter_start;
215 //filter 8
216 __m128 src4 = _mm_load_ps(src);/*1 2 3 4*/
217 src+=4;
218 __m128 src_5_8;
219 __m128 sum = _mm_setzero_ps();
220 __m128 sum2 = _mm_setzero_ps();
221 __m128 f4 = _mm_load_ps(f);
222 f+=4;
223 src_5_8 = _mm_load_ps(src);/*5 6 7 8*/
224 src+=4;
225 { XY_FILTER_4(src4, src_5_8, f4, sum); }
226 for(;f<filter_end;f+=4,src+=4)
228 src4 = _mm_load_ps(src);/*1 2 3 4*/
229 __m128 tmp = src_5_8;//important!
230 { XY_FILTER_4(tmp, src4, f4, sum2); }
232 f4 = _mm_load_ps(f);
233 { XY_FILTER_4(src_5_8, src4, f4, sum); }
234 src_5_8 = src4;
236 src4 = _mm_load_ps(src);/*1 2 3 4*/
237 { XY_FILTER_4(src_5_8, src4, f4, sum2); }
239 //store result
240 _mm_store_ps(dst2, sum);
241 _mm_store_ps(dst2+4, sum2);
243 if (dst2==dst_end0)
245 const float *src = dst2;
246 //filter 4
247 __m128 src4 = _mm_load_ps(src);//1 2 3 4
248 src+=4;
249 __m128 sum = _mm_setzero_ps();
250 for(const float* f=filter_start;f<filter_end;f+=4,src+=4)
252 __m128 src_5_8 = _mm_load_ps(src);//5 6 7 8
253 __m128 f4 = _mm_load_ps(f);
255 { XY_FILTER_4(src4, src_5_8, f4, sum); }
256 src4 = src_5_8;
258 //store result
259 _mm_store_ps(dst2, sum);
260 dst2+=4;
262 for (;dst2<dst_endr;dst2+=4)//right margin
264 const float *src = dst2;
265 filter_end-=4;
267 //filter 4
268 __m128 src4 = _mm_load_ps(src);//1 2 3 4
269 __m128 sum = _mm_setzero_ps();
270 __m128 src_5_8, f4;
271 for(const float* f=filter_start;f<filter_end;f+=4)
273 src+=4;
274 src_5_8 = _mm_load_ps(src);//5 6 7 8
275 f4 = _mm_load_ps(f);
277 { XY_FILTER_4(src4, src_5_8, f4, sum); }
279 src4 = src_5_8;
280 //move new 4 in_n_out to old 4 in_n_out
282 src_5_8 = _mm_setzero_ps();
283 f4 = _mm_load_ps(filter_end);
284 { XY_FILTER_4(src4, src_5_8, f4, sum); }
285 //store result
286 _mm_store_ps(dst2, sum);
290 /****
291 * See @xy_filter_c
293 void xy_filter_sse_v6(float *dst, int width, int height, int stride, const float *filter, int filter_width)
295 ASSERT( stride>=4*(width+filter_width) );
296 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );
298 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
299 BYTE* end = dst_byte + height*stride;
300 for( ; dst_byte<end; dst_byte+=stride )
302 xy_filter_one_line_sse_v6(reinterpret_cast<float*>(dst_byte), width, filter, filter_width);
306 /****
307 * Constraint:
308 * LENGTH%4 == 0 || LENGTH%4 == 1
310 template<int LENGTH>
311 struct M128s
313 __m128 x;
314 M128s<LENGTH - 4> next;
316 template<int Index> __forceinline __m128& GetAt()
318 return next.GetAt<Index - 4>();
320 template<> __forceinline __m128& GetAt<0>()
322 return x;
325 template<int Start, int Offset> __forceinline __m128& GetAt()
327 return GetAt<Start + Offset>();
330 __forceinline void Load(const float* src)
332 x = _mm_load_ps(src);
333 next.Load(src+4);
337 template<>
338 struct M128s<1>
340 __m128 x;
342 template<int Index> __forceinline __m128& GetAt()
344 return x;
346 __forceinline void Load(const float* src)
348 x = _mm_set1_ps(*src);
352 template<>
353 struct M128s<0>
355 void Load(const float* src)
360 template<int FILTER_LENGTH, int START, int LENGTH>
361 struct Filter4
363 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
365 __m128 src4_128 = _mm_load_ps(src4);
366 { XY_FILTER_4(src0_128, src4_128, filter128s.GetAt<START>(), sum); }
367 Filter4<FILTER_LENGTH,START+4,LENGTH-4>::do_cal(src4_128, src4+4, filter128s, sum);
368 src0_128 = src4_128;
372 template<int FILTER_LENGTH, int START>
373 struct Filter4<FILTER_LENGTH, START, 1>
375 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
377 cal_tail<FILTER_LENGTH-START>(src0_128, src4, filter128s, sum);
379 template<int TAIL>
380 static __forceinline void cal_tail(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum);
381 template<>
382 static __forceinline void cal_tail<1>(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
384 { XY_FILTER_4_1(src0_128, filter128s.GetAt<FILTER_LENGTH-1>(), sum); }
388 template<int FILTER_LENGTH, int START>
389 struct Filter4<FILTER_LENGTH, START, 0>
391 static __forceinline void do_cal(__m128& src0_128, const float * src4, M128s<FILTER_LENGTH>& filter128s, __m128& sum)
396 template<int FILTER_LENGTH,int MARGIN_LENGTH>
397 struct FilterAllLeftMargin
399 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
401 do_cal<FILTER_LENGTH%4>(src, filter128s);
403 template<int FILTER_TAIL>
404 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
406 //filter 4
407 __m128 src0 = _mm_setzero_ps();
408 __m128 sum = _mm_setzero_ps();
409 Filter4<FILTER_LENGTH,MARGIN_LENGTH-4,FILTER_LENGTH-MARGIN_LENGTH+4>::do_cal(src0, src, filter128s, sum);
410 _mm_store_ps(src-MARGIN_LENGTH, sum);
411 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
413 template<>
414 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
416 //filter 4
417 __m128 sum = _mm_setzero_ps();
418 //Only one of the last 4 filter coefficients is non-zero
419 _mm_store_ps(src-MARGIN_LENGTH, sum);
420 FilterAllLeftMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src, filter128s);
424 template<int FILTER_LENGTH, int MARGIN_LENGTH>
425 struct FilterAllRightMargin
427 static __forceinline void cal(float * src, M128s<FILTER_LENGTH>& filter128s)
429 do_cal<FILTER_LENGTH%4>(src, filter128s);
431 template<int FILTER_TAIL>
432 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
434 //filter 4
436 __m128 src0 = _mm_load_ps(src);
437 __m128 sum = _mm_setzero_ps();
438 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
439 __m128 src4 = _mm_setzero_ps();
440 { XY_FILTER_4(src0, src4, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
441 //store result
442 _mm_store_ps(src, sum);
444 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
446 template<>
447 static __forceinline void do_cal<1>(float * src, M128s<FILTER_LENGTH>& filter128s)
449 //filter 4
451 __m128 src0 = _mm_load_ps(src);
452 __m128 sum = _mm_setzero_ps();
453 Filter4<FILTER_LENGTH,0,MARGIN_LENGTH-4>::do_cal(src0, src+4, filter128s, sum);
454 //Only one of the last 4 filter coefficients is non-zero
455 { XY_FILTER_4_1(src0, filter128s.GetAt<MARGIN_LENGTH-4>(), sum); }
456 //store result
457 _mm_store_ps(src, sum);
459 FilterAllRightMargin<FILTER_LENGTH,MARGIN_LENGTH-4>::do_cal<0>(src+4, filter128s);
463 template<int FILTER_LENGTH>
464 struct FilterAllLeftMargin<FILTER_LENGTH,0>
466 template<int FILTER_TAIL>
467 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
472 template<int FILTER_LENGTH>
473 struct FilterAllRightMargin<FILTER_LENGTH,0>
475 template<int FILTER_TAIL>
476 static __forceinline void do_cal(float * src, M128s<FILTER_LENGTH>& filter128s)
481 /****
482 * Equivalent:
483 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, (FILTER_LENGTH+3)&~3 );
484 * See @xy_filter_c
485 * Constraint:
486 * FILTER_LENGTH<=width && width%4==0
488 template<int FILTER_LENGTH>
489 void xy_filter_sse_template(float *dst, int width, int height, int stride, const float *filter)
491 ASSERT( stride>=4*(width+FILTER_LENGTH) );
492 ASSERT( ((stride|(4*width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );
494 M128s<FILTER_LENGTH> filter128s;
495 filter128s.Load(filter);
497 const float *filter_start = filter;
498 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
499 BYTE* end = dst_byte + height*stride;
500 for( ; dst_byte<end; dst_byte+=stride )
502 float *dst2 = reinterpret_cast<float*>(dst_byte);
504 //left margin
505 FilterAllLeftMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
506 float *dst_end1 = dst2 + width;
507 float *dst_end0 = dst_end1 - ((FILTER_LENGTH+3)&~3);
508 for (;dst2<dst_end0;dst2+=4)
510 const float *src = dst2;
512 //filter 4
513 __m128 src0 = _mm_load_ps(src);/*1 2 3 4*/
514 src += 4;
515 __m128 sum = _mm_setzero_ps();
516 Filter4<FILTER_LENGTH,0,FILTER_LENGTH>::do_cal(src0, src, filter128s, sum);
517 //store result
518 _mm_store_ps(dst2, sum);
520 FilterAllRightMargin<FILTER_LENGTH,((FILTER_LENGTH+3)&~3)>::cal(dst2, filter128s);
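/****
 * Instantiation note (illustrative, not part of the original source):
 * e.g. xy_filter_sse_template<5> keeps the 5-tap kernel in M128s<5> (one __m128
 * with taps 0..3 plus a second one holding tap 4 splatted by _mm_set1_ps), and
 * Filter4 / FilterAllLeftMargin / FilterAllRightMargin unroll the margin and
 * body loops at compile time, avoiding the runtime margin branches of the
 * generic xy_filter_one_line_sse_v6 path.
 **/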
525 /****
526 * @src4, @src_5_8, @f3_1, @f3_2, @sum: __m128
527 * @src4, @src_5_8, @f3_1, @f3_2: const
528 * @sum: output
530 #define XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum) \
531 __m128 src_3_6 = _mm_shuffle_ps(src4, src_5_8, _MM_SHUFFLE(1,0,3,2));/*3 4 5 6*/\
532 __m128 src_2_5 = _mm_shuffle_ps(src4, src_3_6, _MM_SHUFFLE(2,1,2,1));/*2 3 4 5*/\
533 sum = _mm_mul_ps(f3_1, src4);\
534 __m128 mul2 = _mm_mul_ps(f3_2, src_2_5);\
535 __m128 mul3 = _mm_mul_ps(f3_1, src_3_6);\
536 sum = _mm_add_ps(sum, mul2);\
537 sum = _mm_add_ps(sum, mul3);
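// Scalar equivalent (illustrative, not part of the original source): with
// src4 = {s0,s1,s2,s3}, src_5_8 = {s4,s5,s6,s7}, f3_1 = filter[0] and
// f3_2 = filter[1] broadcast, each lane i of @sum becomes
//     f3_1*s[i] + f3_2*s[i+1] + f3_1*s[i+2]            (i = 0..3)
// i.e. four outputs of the symmetric 3-tap filter in one step.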
539 /****
540 * Equivalent:
541 * xy_filter_c(float *dst, int width, int height, int stride, const float *filter, 4 );
542 * See @xy_filter_c
543 * Constraint:
544 * filter[3] == 0 && filter[0] == filter[2] (symmetric) (&& sum(filter)==1)
546 void xy_3_tag_symmetric_filter_sse(float *dst, int width, int height, int stride, const float *filter)
548 const int filter_width = 4;
549 ASSERT( stride>=4*(width+filter_width) );
550 ASSERT( ((stride|(4*width)|(4*filter_width)|reinterpret_cast<int>(dst)|reinterpret_cast<int>(filter))&15)==0 );
552 ASSERT(filter_width==4 && filter[3]==0 && filter[2]==filter[0]);
554 __m128 f3_1 = _mm_set1_ps(filter[0]);
555 __m128 f3_2 = _mm_set1_ps(filter[1]);
557 BYTE* dst_byte = reinterpret_cast<BYTE*>(dst);
558 BYTE* end = dst_byte + height*stride;
559 for( ; dst_byte<end; dst_byte+=stride )
561 float *dst_f = reinterpret_cast<float*>(dst_byte);
562 float *dst2 = dst_f;
564 float *dst_end0 = dst_f + width - 4;
565 //filter 4
566 __m128 src4 = _mm_load_ps(dst2);/*1 2 3 4*/
568 __m128 sum;
569 __m128 src_4 = _mm_setzero_ps();
570 { XY_3_TAG_SYMMETRIC_FILTER_4(src_4, src4, f3_1, f3_2, sum); }
571 _mm_store_ps(dst2-4, sum);
573 for (;dst2<dst_end0;dst2+=4)
575 __m128 sum;
576 __m128 src_5_8 = _mm_load_ps(dst2+4);/*5 6 7 8*/
577 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
578 src4 = src_5_8;
579 //store result
580 _mm_store_ps(dst2, sum);
583 __m128 sum;
584 __m128 src_5_8 = _mm_setzero_ps();/*5 6 7 8*/
585 { XY_3_TAG_SYMMETRIC_FILTER_4(src4, src_5_8, f3_1, f3_2, sum); }
586 src4 = src_5_8;
587 //store result
588 _mm_store_ps(dst2, sum);
594 /****
595 * See @xy_filter_c
597 void xy_filter_sse(float *dst, int width, int height, int stride, const float *filter, int filter_width)
599 typedef void (*Filter)(float *dst, int width, int height, int stride, const float *filter);
600 const Filter filters[] = {
601 NULL,
602 xy_filter_sse_template<1>, xy_filter_sse_template<4>, xy_filter_sse_template<4>, xy_filter_sse_template<4>,
603 xy_filter_sse_template<5>, xy_filter_sse_template<8>, xy_filter_sse_template<8>, xy_filter_sse_template<8>,
604 xy_filter_sse_template<9>, xy_filter_sse_template<12>, xy_filter_sse_template<12>, xy_filter_sse_template<12>,
605 xy_filter_sse_template<13>, xy_filter_sse_template<16>, xy_filter_sse_template<16>, xy_filter_sse_template<16>,
606 xy_filter_sse_template<17>, xy_filter_sse_template<20>, xy_filter_sse_template<20>, xy_filter_sse_template<20>,
607 xy_filter_sse_template<21>, xy_filter_sse_template<24>, xy_filter_sse_template<24>, xy_filter_sse_template<24>,
608 xy_filter_sse_template<25>, xy_filter_sse_template<28>, xy_filter_sse_template<28>, xy_filter_sse_template<28>
610 if (filter_width<=28 && filter_width<=width)
612 int tmp = filter_width;
613 // Skip trailing zeros, but we cannot (and don't have to) support more than 3 trailing zeros currently.
614 while( AlmostEqual(filter[tmp-1],0.0f) && filter_width-tmp<3 )
615 tmp--;
616 if (tmp==3&&filter[0]==filter[2])
618 xy_3_tag_symmetric_filter_sse(dst, width, height, stride, filter);
620 else
622 filters[tmp](dst, width, height, stride, filter);
625 else
627 xy_filter_sse_v6(dst, width, height, stride, filter, filter_width);
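// Dispatch example (illustrative, not part of the original source): a kernel
// passed in with filter_width==12 whose last two taps are (almost) zero is
// trimmed to 10 effective taps and routed to xy_filter_sse_template<12>, while
// a symmetric 3-tap kernel {a, b, a, 0} takes the dedicated
// xy_3_tag_symmetric_filter_sse path instead.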
631 /****
632 * Copy and convert src to dst line by line.
633 * @dst_width MUST >= @width
634 * if @dst_width>@width, the extra elements will be filled with 0.
636 void xy_byte_2_float_c(float *dst, int dst_width, int dst_stride,
637 PCUINT8 src, int width, int height, int stride)
639 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
641 PCUINT8 src_end = src + height*stride;
642 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
644 PCUINT8 src2 = src;
645 PCUINT8 src_end = src2 + width;
646 float *dst2 = reinterpret_cast<float*>(dst_byte);
648 for (;src2<src_end;src2++, dst2++)
650 *dst2 = *src2;
652 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
653 for (;dst2<dst2_end;dst2++)
655 *dst2=0;
660 /****
661 * See @xy_byte_2_float_c
663 void xy_byte_2_float_sse(float *dst, int dst_width, int dst_stride,
664 PCUINT8 src, int width, int height, int stride)
666 ASSERT( dst_width>=width );
667 ASSERT( ((reinterpret_cast<int>(dst)|dst_stride)&15)==0 );
668 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
670 PCUINT8 src_end = src + height*stride;
671 for( ; src<src_end; src+=stride, dst_byte+=dst_stride )
673 PCUINT8 src2 = src;
674 PCUINT8 src2_end0 = src2 + (width&~15);
675 float *dst2 = reinterpret_cast<float*>(dst_byte);
677 for (;src2<src2_end0;src2+=16, dst2+=16)
679 //filter 4
680 __m128i src16 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src2));
681 __m128i src16_lo = _mm_unpacklo_epi8(src16, _mm_setzero_si128());
682 __m128i src16_hi = _mm_unpackhi_epi8(src16, _mm_setzero_si128());
683 __m128i src16_lo_lo = _mm_unpacklo_epi8(src16_lo, _mm_setzero_si128());
684 __m128i src16_lo_hi = _mm_unpackhi_epi8(src16_lo, _mm_setzero_si128());
685 __m128i src16_hi_lo = _mm_unpacklo_epi8(src16_hi, _mm_setzero_si128());
686 __m128i src16_hi_hi = _mm_unpackhi_epi8(src16_hi, _mm_setzero_si128());
687 __m128 dst_f1 = _mm_cvtepi32_ps(src16_lo_lo);
688 __m128 dst_f2 = _mm_cvtepi32_ps(src16_lo_hi);
689 __m128 dst_f3 = _mm_cvtepi32_ps(src16_hi_lo);
690 __m128 dst_f4 = _mm_cvtepi32_ps(src16_hi_hi);
691 _mm_store_ps(dst2, dst_f1);
692 _mm_store_ps(dst2+4, dst_f2);
693 _mm_store_ps(dst2+8, dst_f3);
694 _mm_store_ps(dst2+12, dst_f4);
696 PCUINT8 src2_end = src + width;
697 for (;src2<src2_end;src2++,dst2++)
699 *dst2 = *src2;
701 float *dst2_end=reinterpret_cast<float*>(dst_byte)+dst_width;
702 for (;dst2<dst2_end;dst2++)
704 *dst2=0;
709 /****
710 * Copy the transposed matrix src to dst.
711 * @dst_width MUST >= @height.
712 * if @dst_width > @height, the extra elements will be filled with 0.
714 void xy_float_2_float_transpose_c(float *dst, int dst_width, int dst_stride,
715 const float *src, int width, int height, int src_stride)
717 ASSERT(dst_width >= height);
718 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
719 const float* src_end = src + width;
720 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
721 for( ; src<src_end; src++, dst_byte+=dst_stride )
723 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
725 float *dst2 = reinterpret_cast<float*>(dst_byte);
726 for (;src2<src2_end;src2+=src_stride,dst2++)
728 *dst2 = *reinterpret_cast<const float*>(src2);
730 float *dst2_end = reinterpret_cast<float*>(dst_byte) + dst_width;
731 for (;dst2<dst2_end;dst2++)
733 *dst2 = 0;
738 /****
739 * see @xy_float_2_float_transpose_c
741 void xy_float_2_float_transpose_sse(float *dst, int dst_width, int dst_stride,
742 const float *src, int width, int height, int src_stride)
744 typedef float DstT;
745 typedef const float SrcT;
747 ASSERT( (((int)dst|dst_stride)&15)==0 );
748 ASSERT(dst_width >= height);
749 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
750 SrcT* src_end = src + width;
751 PCUINT8 src2_end1 = reinterpret_cast<PCUINT8>(src) + (height&~3)*src_stride;
752 PCUINT8 src2_end2 = reinterpret_cast<PCUINT8>(src) + height*src_stride;
753 for( ; src<src_end; src++, dst_byte+=dst_stride )
755 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
757 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
758 for (;src2<src2_end1;src2+=4*src_stride,dst2+=4)
760 __m128 m1 = _mm_set_ps(
761 *(SrcT*)(src2+3*src_stride),
762 *(SrcT*)(src2+2*src_stride),
763 *(SrcT*)(src2+src_stride),
764 *(SrcT*)(src2));
765 _mm_store_ps(dst2, m1);
767 for (;src2<src2_end2;src2+=src_stride,dst2++)
769 *dst2 = *reinterpret_cast<SrcT*>(src2);
771 float *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
772 for (;dst2<dst2_end;dst2++)
774 *dst2 = 0;
779 /****
780 * Transpose and round matrix src, then copy to dst.
781 * @dst_width MUST >= @height.
782 * if @dst_width > @height, the extra elements will be filled with 0.
784 void xy_float_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
785 const float *src, int width, int height, int src_stride)
787 ASSERT(dst_width >= height);
788 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
789 const float* src_end = src + width;
790 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
791 for( ; src<src_end; src++, dst_byte+=dst_stride )
793 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
795 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
796 for (;src2<src2_end;src2+=src_stride,dst2++)
798 *dst2 = static_cast<UINT8>(*reinterpret_cast<const float*>(src2)+0.5);
800 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
801 for (;dst2<dst2_end;dst2++)
803 *dst2 = 0;
808 void xy_float_2_byte_transpose_sse(UINT8 *dst, int dst_width, int dst_stride,
809 const float *src, int width, int height, int src_stride)
811 typedef UINT8 DstT;
812 typedef const float SrcT;
814 ASSERT(dst_width >= height);
815 ASSERT((((int)dst|dst_stride)&15)==0);
816 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
817 SrcT* src_end = src + width;
818 PCUINT8 src2_end00 = reinterpret_cast<PCUINT8>(src) + (height&~15)*src_stride;
819 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
820 for( ; src<src_end; src++, dst_byte+=dst_stride )
822 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
824 DstT *dst2 = reinterpret_cast<DstT*>(dst_byte);
825 for (;src2<src2_end00;src2+=16*src_stride,dst2+=16)
827 __m128 m1 = _mm_set_ps(
828 *(SrcT*)(src2+3*src_stride),
829 *(SrcT*)(src2+2*src_stride),
830 *(SrcT*)(src2+src_stride),
831 *(SrcT*)(src2));
832 __m128 m2 = _mm_set_ps(
833 *(SrcT*)(src2+7*src_stride),
834 *(SrcT*)(src2+6*src_stride),
835 *(SrcT*)(src2+5*src_stride),
836 *(SrcT*)(src2+4*src_stride));
837 __m128 m3 = _mm_set_ps(
838 *(SrcT*)(src2+11*src_stride),
839 *(SrcT*)(src2+10*src_stride),
840 *(SrcT*)(src2+9*src_stride),
841 *(SrcT*)(src2+8*src_stride));
842 __m128 m4 = _mm_set_ps(
843 *(SrcT*)(src2+15*src_stride),
844 *(SrcT*)(src2+14*src_stride),
845 *(SrcT*)(src2+13*src_stride),
846 *(SrcT*)(src2+12*src_stride));
848 __m128i i1 = _mm_cvtps_epi32(m1);
849 __m128i i2 = _mm_cvtps_epi32(m2);
850 __m128i i3 = _mm_cvtps_epi32(m3);
851 __m128i i4 = _mm_cvtps_epi32(m4);
853 i1 = _mm_packs_epi32(i1,i2);
854 i3 = _mm_packs_epi32(i3,i4);
855 i1 = _mm_packus_epi16(i1,i3);
857 _mm_store_si128((__m128i*)dst2, i1);
859 for (;src2<src2_end;src2+=src_stride,dst2++)
861 *dst2 = static_cast<DstT>(*reinterpret_cast<SrcT*>(src2)+0.5);
863 DstT *dst2_end = reinterpret_cast<DstT*>(dst_byte) + dst_width;
864 for (;dst2<dst2_end;dst2++)
866 *dst2 = 0;
871 /****
872 * TODO: add a decent CPU capability check
874 void xy_gaussian_blur(PUINT8 dst, int dst_stride,
875 PCUINT8 src, int width, int height, int stride,
876 const float *gt_x, int r_x, int gt_ex_width_x,
877 const float *gt_y, int r_y, int gt_ex_width_y)
879 ASSERT(width<=stride && width+2*r_x<=dst_stride);
880 int ex_mask_width_x = ((r_x*2+1)+3)&~3;
881 ASSERT(ex_mask_width_x<=gt_ex_width_x);
882 if (ex_mask_width_x>gt_ex_width_x)
884 int o=0;
885 o=o/o;
886 exit(-1);
889 int ex_mask_width_y = ((r_y*2+1)+3)&~3;
890 ASSERT(ex_mask_width_y<=gt_ex_width_y);
891 if (ex_mask_width_y>gt_ex_width_y)
893 int o=0;
894 o=o/o;
895 exit(-1);
899 int fwidth = (width+3)&~3;
900 int fstride = (fwidth + ex_mask_width_x)*sizeof(float);
901 int fheight = (height+3)&~3;
902 int fstride_ver = (fheight+ex_mask_width_y)*sizeof(float);
904 PUINT8 buff_base = reinterpret_cast<PUINT8>(xy_malloc(height*fstride + (fwidth + ex_mask_width_x)*fstride_ver));
906 float *hor_buff_base = reinterpret_cast<float*>(buff_base);
907 float *hor_buff = hor_buff_base + ex_mask_width_x;
909 // byte to float
910 ASSERT( ((width+15)&~15)<=stride );
911 xy_byte_2_float_sse(hor_buff, fwidth, fstride, src, width, height, stride);
913 // horizontal pass
914 xy_filter_sse(hor_buff, fwidth, height, fstride, gt_x, ex_mask_width_x);
917 // transpose
918 float *ver_buff_base = reinterpret_cast<float*>(buff_base + height*fstride);
919 float *ver_buff = ver_buff_base + ex_mask_width_y;
921 int true_width = width+r_x*2;
922 xy_float_2_float_transpose_sse(ver_buff, fheight, fstride_ver, hor_buff-r_x*2, true_width, height, fstride);
924 // vertical pass
925 xy_filter_sse(ver_buff, fheight, true_width, fstride_ver, gt_y, ex_mask_width_y);
927 // transpose
928 int true_height = height + 2*r_y;
929 xy_float_2_byte_transpose_sse(dst, true_width, dst_stride, ver_buff-r_y*2, true_height, true_width, fstride_ver);
931 xy_free(buff_base);
932 _mm_empty();
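/****
 * Call-site sketch (illustrative, not part of the original source; assumes the
 * caller prepares the buffers as follows):
 *   - gt_x/gt_y hold Gaussian kernels of 2*r_x+1 / 2*r_y+1 taps, zero-padded up
 *     to gt_ex_width_x / gt_ex_width_y (at least ((2*r+1)+3)&~3 elements);
 *   - dst receives a (width+2*r_x) x (height+2*r_y) blurred image, so
 *     dst_stride must be >= width + 2*r_x.
 **/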
936 enum RoundingPolicy
938 ROUND_DOWN
939 , ROUND_HALF_DOWN
940 , ROUND_HALF_UP
941 , ROUND_HALF_TO_EVEN
942 , COUNT_ROUND_POLICY
945 template<int ROUNDING_POLICY, int precision>
946 struct XyRounding
948 __forceinline void init_sse();
949 __forceinline __m128i round(__m128i in);
950 __forceinline int round(int in);
953 template<int precision>
954 struct XyRounding<ROUND_DOWN, precision>
956 __forceinline void init_sse()
960 __forceinline __m128i round(__m128i in)
962 return in;
965 __forceinline int round(unsigned in)
967 return in;
972 template<int precision>
973 struct XyRounding<ROUND_HALF_DOWN, precision>
975 __forceinline void init_sse()
977 m_rounding_patch = _mm_set1_epi16( (1<<(precision-1))-1 );
979 __forceinline __m128i round(__m128i in)
981 return _mm_adds_epu16(in, m_rounding_patch);
984 __forceinline int round(unsigned in)
986 return in + ((1<<(precision-1))-1);
988 __m128i m_rounding_patch;
992 template<int precision>
993 struct XyRounding<ROUND_HALF_UP, precision>
995 __forceinline void init_sse()
997 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
999 __forceinline __m128i round(__m128i in)
1001 return _mm_adds_epu16(in, m_rounding_patch);
1004 __forceinline int round(unsigned in)
1006 return in + (1<<(precision-1));
1008 __m128i m_rounding_patch;
1012 template<int precision>
1013 struct XyRounding<ROUND_HALF_TO_EVEN, precision>
1015 __forceinline void init_sse()
1017 m_rounding_patch = _mm_set1_epi16( 1<<(precision-1) );
1019 __forceinline __m128i round(__m128i in)
1021 in = _mm_adds_epu16(in, m_rounding_patch);
1022 __m128i tmp = _mm_slli_epi16(in, 15-precision);
1023 tmp = _mm_srli_epi16(tmp, 15);
1024 return _mm_adds_epu16(in, tmp);
1027 __forceinline int round(unsigned in)
1029 return in + (1<<(precision-1)) + ((in>>precision)&1);
1031 __m128i m_rounding_patch;
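// Illustrative note (not part of the original source): @precision is the number
// of fraction bits the caller shifts away after round(). E.g. ROUND_HALF_UP with
// precision==2 adds 2, so a fixed-point 6 (i.e. 1.5) becomes (6+2)>>2 == 2,
// while 5 (i.e. 1.25) becomes (5+2)>>2 == 1.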
1034 /****
1035 * filter with [1,2,1]
1036 * 1. It is an in-place horizontal filter
1037 * 2. Boundary Pixels are filtered by padding 0. E.g.
1038 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1040 template<int ROUNDING_POLICY>
1041 void xy_be_filter_c(PUINT8 dst, int width, int height, int stride)
1043 ASSERT(width>=1);
1044 if (width<=0)
1046 return;
1048 PUINT8 dst2 = NULL;
1049 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1050 for (int y=0;y<height;y++)
1052 dst2 = dst + y*stride;
1053 int old_sum = dst2[0];
1054 int tmp = 0;
1055 int x=0;
1056 for (x=0;x<width-1;x++)
1058 int new_sum = dst2[x]+dst2[x+1];
1059 tmp = old_sum + new_sum;//old_sum == src2[x-1]+src2[x];
1060 dst2[x] = (xy_rounding.round(tmp)>>2);
1061 old_sum = new_sum;
1063 tmp = old_sum + dst2[x];
1064 dst2[x] = (xy_rounding.round(tmp)>>2);
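// Illustrative note (not part of the original source): for an interior pixel x
// the loop above computes
//     dst[x] = round(src[x-1] + 2*src[x] + src[x+1]) >> 2
// reusing old_sum = src[x-1]+src[x] and new_sum = src[x]+src[x+1], so each
// source pixel is read only twice per row.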
1068 /****
1069 * 1. It is an in-place symmetric 3-tag horizontal filter
1070 * 2. Boundary Pixels are filtered by padding 0. E.g.
1071 * dst[0] = (0*1 + dst[0]*2 + dst[1]*1)/4;
1072 * 3. sum(filter) == 256
1074 template<int ROUNDING_POLICY>
1075 void xy_be_filter2_c(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1077 ASSERT(width>=1);
1078 if (width<=0)
1080 return;
1083 const int VOLUME_BITS = 8;
1084 const int VOLUME = (1<<VOLUME_BITS);
1085 if (filter[0]==0)
1087 return;//nothing to do;
1089 else if (filter[0]== (VOLUME>>2))
1091 return xy_be_filter_c<ROUNDING_POLICY>(dst, width, height, stride);
1094 PUINT8 dst2 = NULL;
1095 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1096 for (int y=0;y<height;y++)
1098 dst2 = dst + y*stride;
1099 int old_pix = 0;
1100 int tmp = 0;
1101 int x=0;
1102 for (x=0;x<width-1;x++)
1104 tmp = (old_pix + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1105 old_pix = dst2[x];
1106 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1108 tmp = old_pix * filter[0] + dst2[x] * filter[1];
1109 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1113 /****
1114 * See @xy_be_filter_c
1115 * No alignment requirement.
1117 template<int ROUNDING_POLICY>
1118 void xy_be_filter_sse(PUINT8 dst, int width, int height, int stride)
1120 ASSERT(width>=1);
1121 if (width<=0)
1123 return;
1125 int width_mod8 = ((width-1)&~7);
1126 XyRounding<ROUNDING_POLICY, 2> xy_rounding;
1127 xy_rounding.init_sse();
1128 for (int y = 0; y < height; y++) {
1129 PUINT8 dst2=dst+y*stride;
1131 __m128i old_pix_128 = _mm_cvtsi32_si128(dst2[0]);
1132 __m128i old_sum_128 = old_pix_128;
1134 int x = 0;
1135 for (; x < width_mod8; x+=8) {
1136 __m128i new_pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1137 new_pix = _mm_unpacklo_epi8(new_pix, _mm_setzero_si128());
1138 __m128i temp = _mm_slli_si128(new_pix,2);
1139 temp = _mm_add_epi16(temp, old_pix_128);
1140 temp = _mm_add_epi16(temp, new_pix);
1141 old_pix_128 = _mm_srli_si128(new_pix,14);
1143 new_pix = _mm_slli_si128(temp,2);
1144 new_pix = _mm_add_epi16(new_pix, old_sum_128);
1145 new_pix = _mm_add_epi16(new_pix, temp);
1146 old_sum_128 = _mm_srli_si128(temp, 14);
1148 new_pix = xy_rounding.round(new_pix);
1150 new_pix = _mm_srli_epi16(new_pix, 2);
1151 new_pix = _mm_packus_epi16(new_pix, new_pix);
1153 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), new_pix );
1155 int old_sum = _mm_cvtsi128_si32(old_sum_128);
1156 old_sum &= 0xffff;
1157 int tmp = 0;
1158 for ( ; x < width-1; x++) {
1159 int new_sum = dst2[x] + dst2[x+1];
1160 tmp = old_sum + new_sum;
1161 dst2[x] = (xy_rounding.round(tmp)>>2);
1162 old_sum = new_sum;
1164 tmp = old_sum + dst2[x];
1165 dst2[x] = (xy_rounding.round(tmp)>>2);
1169 /****
1170 * See @xy_be_filter2_c
1171 * No alignment requirement.
1173 template<int ROUNDING_POLICY>
1174 void xy_be_filter2_sse(PUINT8 dst, int width, int height, int stride, PCUINT filter)
1176 const int VOLUME_BITS = 8;
1177 const int VOLUME = (1<<VOLUME_BITS);
1178 ASSERT(filter[0]==filter[2]);
1179 ASSERT(filter[0]+filter[1]+filter[2]==VOLUME);
1180 ASSERT(width>=1);
1181 if (width<=0)
1183 return;
1186 XyRounding<ROUNDING_POLICY, VOLUME_BITS> xy_rounding;
1187 xy_rounding.init_sse();
1188 __m128i f3_1 = _mm_set1_epi16(filter[0]);
1189 __m128i f3_2 = _mm_set1_epi16(filter[1]);
1191 int width_mod8 = ((width-1)&~7);
1192 //__m128i round = _mm_set1_epi16(8);
1194 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1195 PUINT8 end = dst_byte + height*stride;
1196 for( ; dst_byte<end; dst_byte+=stride )
1198 PUINT8 dst2 = dst_byte;
1200 PUINT8 dst_end0 = dst_byte + width - 4;
1202 __m128i old_pix1_128 = _mm_setzero_si128();
1203 __m128i old_pix2_128 = _mm_cvtsi32_si128(dst2[0]);
1205 int x = 0;
1206 for (; x < width_mod8; x+=8) {
1207 __m128i pix2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(dst2+x+1));
1208 pix2 = _mm_unpacklo_epi8(pix2, _mm_setzero_si128());
1209 __m128i pix1 = _mm_slli_si128(pix2,2);
1210 pix1 = _mm_add_epi8(pix1, old_pix2_128);
1211 __m128i pix0 = _mm_slli_si128(pix1,2);
1212 pix0 = _mm_add_epi8(pix0, old_pix1_128);
1213 old_pix1_128 = _mm_srli_si128(pix1,14);
1214 old_pix2_128 = _mm_srli_si128(pix2,14);
1216 pix0 = _mm_add_epi16(pix0, pix2);
1217 pix0 = _mm_mullo_epi16(pix0, f3_1);
1218 pix1 = _mm_mullo_epi16(pix1, f3_2);
1220 pix1 = _mm_adds_epu16(pix1, pix0);
1222 pix1 = xy_rounding.round(pix1);
1224 pix1 = _mm_srli_epi16(pix1, VOLUME_BITS);
1225 pix1 = _mm_packus_epi16(pix1, pix1);
1227 _mm_storel_epi64( reinterpret_cast<__m128i*>(dst2+x), pix1 );
1229 int old_pix1 = _mm_cvtsi128_si32(old_pix1_128);
1230 old_pix1 &= 0xff;
1231 int tmp = 0;
1232 for ( ; x < width-1; x++) {
1233 tmp = (old_pix1 + dst2[x+1]) * filter[0] + dst2[x] * filter[1];
1234 old_pix1 = dst2[x];
1236 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1238 tmp = old_pix1*filter[0] + dst2[x]*filter[1];
1239 dst2[x] = (xy_rounding.round(tmp)>>VOLUME_BITS);
1243 /****
1244 * See @xy_be_blur
1245 * Construct the filter used in the final horizontal/vertical pass of @xy_be_blur when @pass is NOT an integer.
1246 * This filter is constructed to satisfy:
1247 * If @p is a pixel in the middle of the image, with @(p-1) and @(p+1) the pixels immediately to its
1248 * left and right, then the value of @p after all horizontal filtering equals
1249 * a*value_old(@(p-1)) + b*value_old(@p) + a*value_old(@(p+1)) + a weighted sum of the other pixels,
1250 * where
1251 * a/b = @pass/(@pass+1).
1252 * This choice is consistent because the same property holds when @pass is an integer.
1254 * @return
1255 * Let n = floor(pass);
1256 * filter = [ (pass-n)(n+2) / (2*(1+3pass-n)), 1-(pass-n)(n+2)/(1+3pass-n), (pass-n)(n+2)/ (2*(1+3pass-n)) ]
1258 void xy_calculate_filter(float pass, PUINT filter)
1260 const int VOLUME = (1<<8);
1261 int n = (int)pass;
1262 if (n==0)
1264 filter[0] = VOLUME * pass/(1+3*pass);
1266 else if (n==1)
1268 filter[0] = VOLUME * (pass-1)/(2*pass);
1270 else
1272 filter[0] = VOLUME * (pass-n)*(n+2)/ (2*(1+3*pass-n));
1274 filter[1] = VOLUME - 2*filter[0];
1275 filter[2] = filter[0];
1277 if (2*filter[0]>filter[1])
1279 //this should not happen
1280 ASSERT(0);
1281 filter[0] = VOLUME/4;
1282 filter[1] = VOLUME - 2*filter[0];
1283 filter[2] = filter[0];
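// Worked example (illustrative, not part of the original source): pass = 1.5
// gives n = 1, so
//   filter[0] = 256*(1.5-1)/(2*1.5) = 42   (truncated from 42.67)
//   filter[1] = 256 - 2*42          = 172
//   filter[2] = 42
// matching the general formula (pass-n)(n+2)/(2*(1+3*pass-n)) = 1.5/9.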
1287 /****
1288 * See @xy_float_2_byte_transpose_c
1290 void xy_byte_2_byte_transpose_c(UINT8 *dst, int dst_width, int dst_stride,
1291 PCUINT8 src, int width, int height, int src_stride)
1293 ASSERT(dst_width >= height);
1294 PUINT8 dst_byte = reinterpret_cast<PUINT8>(dst);
1295 PCUINT8 src_end = src + width;
1296 PCUINT8 src2_end = reinterpret_cast<PCUINT8>(src) + height*src_stride;
1297 for( ; src<src_end; src++, dst_byte+=dst_stride )
1299 PCUINT8 src2 = reinterpret_cast<PCUINT8>(src);
1301 UINT8 *dst2 = reinterpret_cast<UINT8*>(dst_byte);
1302 for (;src2<src2_end;src2+=src_stride,dst2++)
1304 *dst2 = *src2;
1306 UINT8 *dst2_end = reinterpret_cast<UINT8*>(dst_byte) + dst_width;
1307 for (;dst2<dst2_end;dst2++)
1309 *dst2 = 0;
1314 /****
1315 * Repeat the [1,2,1] filter @pass_x times horizontally and @pass_y times vertically
1316 * Boundary Pixels are filtered by padding 0, see @xy_be_filter_c.
1318 * @pass_x:
1319 * When @pass_x is not an integer, the horizontal filter [1,2,1] is repeated (int)@pass_x times,
1320 * and then a 3-tag symmetric filter generated from @pass_x is applied.
1321 * That final 3-tag symmetric filter is constructed to satisfy the following property:
1322 * If @pass_xx > @pass_x, the filtered result of @pass_x should NOT be more blurred than
1323 * the result of @pass_xx. More specifically, the filtered result of @pass_x should NOT be more
1324 * blurred than that of (int)@pass_x+1 and should NOT be less blurred than that of (int)@pass_x;
1326 * Rounding:
1327 * The original VSFilter \be effect uses a simple round down, which has a bias of -7.5/16 per pass.
1328 * That rounding error is significant with large @pass values and has effectively become part of the \be effect.
1329 * We can simulate VSFilter's rounding bias by combining different rounding methods. However, a simple
1330 * test shows the result still differs visually from VSFilter's.
1332 * Known issue:
1333 * This separable filter implementation seems to be more sensitive to precision than
1334 * VSFilter's simple implementation. Vertical blocky artifacts can be observed with large pass values
1335 * (and @pass_x==@pass_y), so it is only used to filter the fractional part of the \be strength.
1337 void xy_be_blur(PUINT8 src, int width, int height, int stride, float pass_x, float pass_y)
1339 //ASSERT(pass_x>0 && pass_y>0);
1341 typedef void (*XyBeFilter)(PUINT8 src, int width, int height, int stride);
1342 typedef void (*XyFilter2)(PUINT8 src, int width, int height, int stride, PCUINT filter);
1344 XyBeFilter filter = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter_sse<ROUND_HALF_TO_EVEN> : xy_be_filter_c<ROUND_HALF_TO_EVEN>;
1345 XyFilter2 filter2 = (g_cpuid.m_flags & CCpuID::sse2) ? xy_be_filter2_sse<ROUND_HALF_TO_EVEN> : xy_be_filter2_c<ROUND_HALF_TO_EVEN>;
1347 int stride_ver = height;
1348 PUINT8 tmp = reinterpret_cast<PUINT8>(xy_malloc(width*height));
1349 ASSERT(tmp);
1350 // horizontal pass
1351 int pass_x_int = static_cast<int>(pass_x);
1352 for (int i=0; i<pass_x_int; i++)
1354 filter(src, width, height, stride);
1356 if (pass_x-pass_x_int>0)
1358 UINT f[3] = {0};
1359 xy_calculate_filter(pass_x, f);
1360 filter2(src, width, height, stride, f);
1363 // transpose
1364 xy_byte_2_byte_transpose_c(tmp, height, stride_ver, src, width, height, stride);
1366 // vertical pass
1367 int pass_y_int = static_cast<int>(pass_y);
1368 for (int i=0;i<pass_y_int;i++)
1370 filter(tmp, height, width, stride_ver);
1372 if (pass_y-pass_y_int>0)
1374 UINT f[3] = {0};
1375 xy_calculate_filter(pass_y, f);
1376 filter2(tmp, height, width, stride_ver, f);
1379 // transpose
1380 xy_byte_2_byte_transpose_c(src, width, stride, tmp, height, width, stride_ver);
1382 xy_free(tmp);
1383 return;
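// Usage sketch (illustrative, not part of the original source): to apply the
// fractional part of a \be strength, e.g. 0.7, to an alpha bitmap:
//   xy_be_blur(bitmap, w, h, pitch, 0.7f, 0.7f);
// with pass 0.7 the integer [1,2,1] passes are skipped and only the 3-tap
// filter produced by xy_calculate_filter(0.7f, f) is applied once per direction.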