/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/**
 * @file imgresample.c
 * High quality image resampling with polyphase filters.
 */
#include "avcodec.h"
#include "fastmemcpy.h"

#define NB_COMPONENTS 3

#define PHASE_BITS 4
#define NB_PHASES  (1 << PHASE_BITS)
#define NB_TAPS    4
#define FCENTER    1  /* index of the center of the filter */
//#define TEST    1  /* Test it */

#define POS_FRAC_BITS 16
#define POS_FRAC      (1 << POS_FRAC_BITS)
/* 6 bits precision is needed for MMX */
#define FILTER_BITS   8

#define LINE_BUF_HEIGHT (NB_TAPS * 4)
struct ImgReSampleContext {
    int iwidth, iheight, owidth, oheight;
    int topBand, bottomBand, leftBand, rightBand;
    int padtop, padbottom, padleft, padright;
    int pad_owidth, pad_oheight;

    int h_incr, v_incr;

    int16_t h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
    int16_t v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */

    uint8_t *line_buf;
};
void av_build_filter(int16_t *filter, double factor, int tap_count,
                     int phase_count, int scale, int type);
static inline int get_phase(int pos)
{
    return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
}
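/*
 * Worked example: source positions are 16.16 fixed point
 * (POS_FRAC_BITS = 16). With PHASE_BITS = 4, a source position of
 * 2.5 pixels is src_pos = 5 << 15 = 0x28000: the integer pixel is
 * src_pos >> POS_FRAC_BITS = 2, and get_phase() yields
 * (0x28000 >> 12) & 0xf = 8, i.e. the filter bank built for a
 * half-pixel (8/16) fractional offset.
 */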
/* This function must be optimized */
static void h_resample_fast(uint8_t *dst, int dst_width, const uint8_t *src,
                            int src_width, int src_start, int src_incr,
                            int16_t *filters)
{
    int src_pos, phase, sum, i;
    const uint8_t *s;
    int16_t *filter;

    src_pos = src_start;
    for(i=0;i<dst_width;i++) {
#ifdef TEST
        /* check that the filter window stays inside the source line */
        if ((src_pos >> POS_FRAC_BITS) < 0 ||
            (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
            av_abort();
#endif
        s = src + (src_pos >> POS_FRAC_BITS);
        phase = get_phase(src_pos);
        filter = filters + phase * NB_TAPS;
#if NB_TAPS == 4
        sum = s[0] * filter[0] +
              s[1] * filter[1] +
              s[2] * filter[2] +
              s[3] * filter[3];
#else
        {
            int j;
            sum = 0;
            for(j=0;j<NB_TAPS;j++)
                sum += s[j] * filter[j];
        }
#endif
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        src_pos += src_incr;
        dst++;
    }
}
/* This function must be optimized */
static void v_resample(uint8_t *dst, int dst_width, const uint8_t *src,
                       int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;

    s = src;
    for(i=0;i<dst_width;i++) {
#if NB_TAPS == 4
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
#else
        {
            int j;
            const uint8_t *s1 = s;

            sum = 0;
            for(j=0;j<NB_TAPS;j++) {
                sum += s1[0] * filter[j];
                s1 += wrap;
            }
        }
#endif
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        dst++;
        s++;
    }
}
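/*
 * Note: v_resample() applies the same NB_TAPS dot product as the
 * horizontal pass, but reads its taps from NB_TAPS consecutive
 * already-horizontally-filtered lines spaced `wrap` bytes apart;
 * only the stride differs (one byte vs. one line).
 */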
#include "i386/mmx.h"

#define FILTER4(reg) \
{\
        s = src + (src_pos >> POS_FRAC_BITS);\
        phase = get_phase(src_pos);\
        filter = filters + phase * NB_TAPS;\
        movq_m2r(*(mmx_t*)s, reg);\
        punpcklbw_r2r(mm7, reg);\
        movq_m2r(*filter, mm6);\
        pmaddwd_r2r(reg, mm6);\
        movq_r2r(mm6, reg);\
        psrlq_i2r(32, reg);\
        paddd_r2r(mm6, reg);\
        psrad_i2r(FILTER_BITS, reg);\
        src_pos += src_incr;\
}

#define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
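/*
 * What FILTER4 computes: pmaddwd multiplies four 16-bit pixels by
 * four 16-bit coefficients and adds the products pairwise, leaving
 * two 32-bit partial sums; the movq/psrlq(32)/paddd sequence folds
 * those halves together, and psrad(FILTER_BITS) rescales, so the low
 * 32 bits of `reg` end up holding
 * (s[0]*f[0] + s[1]*f[1] + s[2]*f[2] + s[3]*f[3]) >> FILTER_BITS.
 */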
/* XXX: do four pixels at a time */
static void h_resample_fast4_mmx(uint8_t *dst, int dst_width,
                                 const uint8_t *src, int src_width,
                                 int src_start, int src_incr, int16_t *filters)
{
    int src_pos, phase;
    const uint8_t *s;
    int16_t *filter;
    mmx_t tmp;

    src_pos = src_start;
    pxor_r2r(mm7, mm7);

    while (dst_width >= 4) {
        FILTER4(mm0);
        FILTER4(mm1);
        FILTER4(mm2);
        FILTER4(mm3);

        packuswb_r2r(mm7, mm0);
        packuswb_r2r(mm7, mm1);
        packuswb_r2r(mm7, mm3);
        packuswb_r2r(mm7, mm2);
        movq_r2m(mm0, tmp);
        dst[0] = tmp.ub[0];
        movq_r2m(mm1, tmp);
        dst[1] = tmp.ub[0];
        movq_r2m(mm2, tmp);
        dst[2] = tmp.ub[0];
        movq_r2m(mm3, tmp);
        dst[3] = tmp.ub[0];
        dst += 4;
        dst_width -= 4;
    }
    while (dst_width > 0) {
        FILTER4(mm0);
        packuswb_r2r(mm7, mm0);
        movq_r2m(mm0, tmp);
        dst[0] = tmp.ub[0];
        dst++;
        dst_width--;
    }
    emms();
}
static void v_resample4_mmx(uint8_t *dst, int dst_width, const uint8_t *src,
                            int wrap, int16_t *filter)
{
    int sum, i, v;
    const uint8_t *s;
    mmx_t tmp;
    mmx_t coefs[4];

    /* broadcast each 16-bit filter coefficient into a 4x16-bit vector */
    for(i=0;i<4;i++) {
        v = filter[i];
        coefs[i].uw[0] = v;
        coefs[i].uw[1] = v;
        coefs[i].uw[2] = v;
        coefs[i].uw[3] = v;
    }

    pxor_r2r(mm7, mm7);
    s = src;
    while (dst_width >= 4) {
        movq_m2r(s[0 * wrap], mm0);
        punpcklbw_r2r(mm7, mm0);
        movq_m2r(s[1 * wrap], mm1);
        punpcklbw_r2r(mm7, mm1);
        movq_m2r(s[2 * wrap], mm2);
        punpcklbw_r2r(mm7, mm2);
        movq_m2r(s[3 * wrap], mm3);
        punpcklbw_r2r(mm7, mm3);

        pmullw_m2r(coefs[0], mm0);
        pmullw_m2r(coefs[1], mm1);
        pmullw_m2r(coefs[2], mm2);
        pmullw_m2r(coefs[3], mm3);

        paddw_r2r(mm1, mm0);
        paddw_r2r(mm3, mm2);
        paddw_r2r(mm2, mm0);
        psraw_i2r(FILTER_BITS, mm0);

        packuswb_r2r(mm7, mm0);
        movq_r2m(mm0, tmp);

        *(uint32_t *)dst = tmp.ud[0];
        dst += 4;
        s += 4;
        dst_width -= 4;
    }
    /* leftover pixels: scalar path */
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
    emms();
}
typedef union {
    vector unsigned char v;
    unsigned char c[16];
} vec_uc_t;

typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss_t;

void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
                          int wrap, int16_t *filter)
{
    int sum, i;
    const uint8_t *s;
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss_t srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;

    s = src;
    /*
       The vec_madds later on does an implicit >>15 on the result.
       Since FILTER_BITS is 8, and we have 15 bits of magnitude in
       a signed short, we have just enough bits to pre-shift our
       filter constants <<7 to compensate for vec_madds.
    */
    for (i = 0; i < 4; i++) {
        fv[i].s[0] = filter[i] << (15 - FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);
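    /*
     * Arithmetic check of the pre-shift: vec_madds computes
     * saturate((a * b + 0x4000) >> 15) + c per element, so a pixel p
     * against a coefficient stored as f << 7 contributes roughly
     * (p * (f << 7)) >> 15 == (p * f) >> 8 == (p * f) >> FILTER_BITS,
     * matching the scalar code's final shift (applied here per
     * product rather than once on the full sum).
     */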
    /*
       When we're resampling, we'd ideally like both our input and
       output buffers to be 16-byte aligned, so we can do both aligned
       reads and writes. Sadly we can't always have this at the moment,
       so we opt for aligned writes, as unaligned writes have a huge
       overhead. To do this, do enough scalar resamples to get dst
       16-byte aligned.
    */
    i = (-(int)dst) & 0xf;
    while (i > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }
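    /*
     * Example of the alignment computation: if dst == 0x1009,
     * (-(int)dst) & 0xf == 7, so seven scalar pixels are produced
     * and dst advances to 0x1010, which is 16-byte aligned; the
     * vector loop below can then use aligned stores only.
     */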
    /* Do our altivec resampling on 16 pixels at once. */
    while (dst_width >= 16) {
        /*
           Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3 cycle latency
           the vec_madds have.
        */
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap]));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);

        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /*
           Pack the results into our destination vector,
           and do an aligned write of that back to memory.
        */
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);

        dst += 16;
        s += 16;
        dst_width -= 16;
    }
    /*
       If there are any leftover pixels, resample them
       with the slow scalar method.
    */
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}
/* slow version to handle limit cases. Does not need optimisation */
static void h_resample_slow(uint8_t *dst, int dst_width,
                            const uint8_t *src, int src_width,
                            int src_start, int src_incr, int16_t *filters)
{
    int src_pos, phase, sum, j, v, i;
    const uint8_t *s, *src_end;
    int16_t *filter;

    src_end = src + src_width;
    src_pos = src_start;
    for(i=0;i<dst_width;i++) {
        s = src + (src_pos >> POS_FRAC_BITS);
        phase = get_phase(src_pos);
        filter = filters + phase * NB_TAPS;
        sum = 0;
        for(j=0;j<NB_TAPS;j++) {
            if (s < src)
                v = src[0];          /* replicate the left edge pixel */
            else if (s >= src_end)
                v = src_end[-1];     /* replicate the right edge pixel */
            else
                v = s[0];
            sum += v * filter[j];
            s++;
        }
        sum = sum >> FILTER_BITS;
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        src_pos += src_incr;
        dst++;
    }
}
static void h_resample(uint8_t *dst, int dst_width, const uint8_t *src,
                       int src_width, int src_start, int src_incr,
                       int16_t *filters)
{
    int n, src_end;

    if (src_start < 0) {
        /* pixels whose filter window starts left of the image: slow path */
        n = (0 - src_start + src_incr - 1) / src_incr;
        h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
        dst += n;
        dst_width -= n;
        src_start += n * src_incr;
    }
    src_end = src_start + dst_width * src_incr;
    if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
        n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
            src_incr;
    } else {
        n = dst_width;
    }
#ifdef HAVE_MMX
    if ((mm_flags & MM_MMX) && NB_TAPS == 4)
        h_resample_fast4_mmx(dst, n,
                             src, src_width, src_start, src_incr, filters);
    else
#endif
        h_resample_fast(dst, n,
                        src, src_width, src_start, src_incr, filters);
    if (n < dst_width) {
        /* remaining pixels whose window would read past the right edge */
        dst += n;
        dst_width -= n;
        src_start += n * src_incr;
        h_resample_slow(dst, dst_width,
                        src, src_width, src_start, src_incr, filters);
    }
}
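/*
 * Worked example for the split above: with src_start = -0x8000 (the
 * window starts half a source pixel left of the image) and
 * src_incr = 0x10000 (1:1 scale),
 * n = (0x8000 + 0x10000 - 1) / 0x10000 = 1, so exactly one leading
 * pixel takes the clamped slow path before the fast path resumes at
 * src_start = 0x8000.
 */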
static void component_resample(ImgReSampleContext *s,
                               uint8_t *output, int owrap, int owidth, int oheight,
                               uint8_t *input, int iwrap, int iwidth, int iheight)
{
    int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
    uint8_t *new_line, *src_line;

    last_src_y = - FCENTER - 1;
    /* position of the bottom of the filter in the source image */
    src_y = (last_src_y + NB_TAPS) * POS_FRAC;
    ring_y = NB_TAPS; /* position in ring buffer */
    for(y=0;y<oheight;y++) {
        /* apply horizontal filter on new lines from input if needed */
        src_y1 = src_y >> POS_FRAC_BITS;
        while (last_src_y < src_y1) {
            if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
                ring_y = NB_TAPS;
            last_src_y++;
            /* handle limit conditions: replicate line (slightly
               inefficient because we filter multiple times) */
            y1 = last_src_y;
            if (y1 < 0) {
                y1 = 0;
            } else if (y1 >= iheight) {
                y1 = iheight - 1;
            }
            src_line = input + y1 * iwrap;
            new_line = s->line_buf + ring_y * owidth;
            /* apply filter and handle limit cases correctly */
            h_resample(new_line, owidth,
                       src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
                       &s->h_filters[0][0]);
            /* handle ring buffer wrapping */
            if (ring_y >= LINE_BUF_HEIGHT) {
                memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
                       new_line, owidth);
            }
        }
        /* apply vertical filter */
        phase_y = get_phase(src_y);
#ifdef HAVE_MMX
        /* MMX path deactivated because of its loss of precision */
        if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
            v_resample4_mmx(output, owidth,
                            s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                            &s->v_filters[phase_y][0]);
        else
#endif
#ifdef HAVE_ALTIVEC
        if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
            v_resample16_altivec(output, owidth,
                                 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                                 &s->v_filters[phase_y][0]);
        else
#endif
            v_resample(output, owidth,
                       s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                       &s->v_filters[phase_y][0]);

        src_y += s->v_incr;
        output += owrap;
    }
}
ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                      int iwidth, int iheight)
{
    return img_resample_full_init(owidth, oheight, iwidth, iheight,
                                  0, 0, 0, 0, 0, 0, 0, 0);
}
ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
                                           int iwidth, int iheight,
                                           int topBand, int bottomBand,
                                           int leftBand, int rightBand,
                                           int padtop, int padbottom,
                                           int padleft, int padright)
{
    ImgReSampleContext *s;

    s = av_mallocz(sizeof(ImgReSampleContext));
    if (!s)
        return NULL;
    /* guard the line buffer size computation against overflow */
    if ((unsigned)owidth >= UINT_MAX / (LINE_BUF_HEIGHT + NB_TAPS))
        goto fail;
    s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
    if (!s->line_buf)
        goto fail;

    s->owidth = owidth;
    s->oheight = oheight;
    s->iwidth = iwidth;
    s->iheight = iheight;

    s->topBand = topBand;
    s->bottomBand = bottomBand;
    s->leftBand = leftBand;
    s->rightBand = rightBand;

    s->padtop = padtop;
    s->padbottom = padbottom;
    s->padleft = padleft;
    s->padright = padright;

    s->pad_owidth = owidth - (padleft + padright);
    s->pad_oheight = oheight - (padtop + padbottom);

    s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / s->pad_owidth;
    s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / s->pad_oheight;

    av_build_filter(&s->h_filters[0][0], (float) s->pad_owidth /
                    (float) (iwidth - leftBand - rightBand),
                    NB_TAPS, NB_PHASES, 1 << FILTER_BITS, 0);
    av_build_filter(&s->v_filters[0][0], (float) s->pad_oheight /
                    (float) (iheight - topBand - bottomBand),
                    NB_TAPS, NB_PHASES, 1 << FILTER_BITS, 0);

    return s;
fail:
    av_free(s);
    return NULL;
}
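/*
 * Worked example of the increment computation above: scaling a
 * 640-pixel-wide source to 320 output pixels with no cropping gives
 * h_incr = (640 * POS_FRAC) / 320 = 2 * POS_FRAC, i.e. the source
 * position advances by two pixels per output pixel, and
 * av_build_filter() receives the factor 320/640 = 0.5, presumably
 * lowering the filter cutoff for the downscale.
 */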
void img_resample(ImgReSampleContext *s,
                  AVPicture *output, const AVPicture *input)
{
    int i, shift;
    uint8_t *optr;

    for (i = 0; i < 3; i++) {
        shift = (i == 0) ? 0 : 1;

        optr = output->data[i] + (((output->linesize[i] *
                                    s->padtop) + s->padleft) >> shift);

        component_resample(s, optr, output->linesize[i],
                           s->pad_owidth >> shift, s->pad_oheight >> shift,
                           input->data[i] + (input->linesize[i] *
                               (s->topBand >> shift)) + (s->leftBand >> shift),
                           input->linesize[i], ((s->iwidth - s->leftBand -
                                   s->rightBand) >> shift),
                           (s->iheight - s->topBand - s->bottomBand) >> shift);
    }
}
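/*
 * Note on `shift` above: plane 0 is luma at full resolution; planes 1
 * and 2 are chroma, assumed here to be subsampled by two in each
 * direction (as in YUV 4:2:0), so every width, height, band and pad
 * offset is halved via >> 1 for the chroma planes.
 */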
void img_resample_close(ImgReSampleContext *s)
{
    av_free(s->line_buf);
    av_free(s);
}
#ifdef TEST

#include <stdio.h>

/* input */
#define XSIZE 256
#define YSIZE 256
uint8_t img[XSIZE * YSIZE];

/* output */
#define XSIZE1 512
#define YSIZE1 512
uint8_t img1[XSIZE1 * YSIZE1];
uint8_t img2[XSIZE1 * YSIZE1];
void save_pgm(const char *filename, uint8_t *img, int xsize, int ysize)
{
    FILE *f;

    f = fopen(filename, "wb");
    fprintf(f, "P5\n%d %d\n%d\n", xsize, ysize, 255);
    fwrite(img, 1, xsize * ysize, f);
    fclose(f);
}
static void dump_filter(int16_t *filter)
{
    int i, ph;

    for(ph=0;ph<NB_PHASES;ph++) {
        av_log(NULL, AV_LOG_INFO, "%2d: ", ph);
        for(i=0;i<NB_TAPS;i++) {
            av_log(NULL, AV_LOG_INFO, " %5.2f", filter[ph * NB_TAPS + i] / 256.0);
        }
        av_log(NULL, AV_LOG_INFO, "\n");
    }
}
int main(int argc, char **argv)
{
    int x, y, v, i, xsize, ysize;
    ImgReSampleContext *s;
    float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
    char buf[256];

    /* build test image */
    for(y=0;y<YSIZE;y++) {
        for(x=0;x<XSIZE;x++) {
            if (x < XSIZE/2 && y < YSIZE/2) {
                if (x < XSIZE/4 && y < YSIZE/4) {
                    if ((x % 10) <= 6 &&
                        (y % 10) <= 6)
                        v = 0xff;
                    else
                        v = 0x00;
                } else if (x < XSIZE/4) {
                    if (x & 1)
                        v = 0xff;
                    else
                        v = 0x00;
                } else if (y < YSIZE/4) {
                    if (y & 1)
                        v = 0xff;
                    else
                        v = 0x00;
                } else {
                    if (((x+3) % 4) <= 1 &&
                        ((y+3) % 4) <= 1)
                        v = 0xff;
                    else
                        v = 0x00;
                }
            } else if (x < XSIZE/2) {
                v = ((y - (YSIZE/2)) * 255) / (YSIZE/2);
            } else if (y < YSIZE/2) {
                v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
            } else {
                v = ((x + y - XSIZE) * 255) / XSIZE;
            }
            img[(YSIZE - 1 - y) * XSIZE + (XSIZE - 1 - x)] = v;
        }
    }
    save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
    for(i=0;i<sizeof(factors)/sizeof(float);i++) {
        fact = factors[i];
        xsize = (int)(XSIZE * fact);
        ysize = (int)((YSIZE - 100) * fact);
        s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE,
                                   50, 50, 0, 0, 0, 0, 0, 0);
        av_log(NULL, AV_LOG_INFO, "Factor=%0.2f\n", fact);
        dump_filter(&s->h_filters[0][0]);
        component_resample(s, img1, xsize, xsize, ysize,
                           img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
        img_resample_close(s);

        snprintf(buf, sizeof(buf), "/tmp/out%d.pgm", i);
        save_pgm(buf, img1, xsize, ysize);
    }

    /* check that the MMX and C paths give the same result */
#ifdef HAVE_MMX
    av_log(NULL, AV_LOG_INFO, "MMX test\n");
    fact = 0.72;
    xsize = (int)(XSIZE * fact);
    ysize = (int)(YSIZE * fact);
    mm_flags = MM_MMX;
    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
    component_resample(s, img1, xsize, xsize, ysize,
                       img, XSIZE, XSIZE, YSIZE);
    img_resample_close(s);

    mm_flags = 0;
    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
    component_resample(s, img2, xsize, xsize, ysize,
                       img, XSIZE, XSIZE, YSIZE);
    img_resample_close(s);
    if (memcmp(img1, img2, xsize * ysize) != 0) {
        av_log(NULL, AV_LOG_ERROR, "mmx error\n");
        exit(1);
    }
    av_log(NULL, AV_LOG_INFO, "MMX OK\n");
#endif
    return 0;
}
#endif /* TEST */