/*
 * High quality image resampling with polyphase filters
 * Copyright (c) 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
#include "avcodec.h"
#include "fastmemcpy.h"
#define NB_COMPONENTS 3

#define PHASE_BITS 4
#define NB_PHASES  (1 << PHASE_BITS)
#define NB_TAPS    4
#define FCENTER    1  /* index of the center of the filter */
//#define TEST     1  /* Test it */

#define POS_FRAC_BITS 16
#define POS_FRAC      (1 << POS_FRAC_BITS)
/* 6 bits precision is needed for MMX */
#define FILTER_BITS   8

#define LINE_BUF_HEIGHT (NB_TAPS * 4)
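/* Worked example of the fixed point layout: source positions are kept in
   16.16 fixed point, so a position of 2.75 is stored as
   2.75 * POS_FRAC = 0x2C000. "pos >> POS_FRAC_BITS" recovers the integer
   pixel 2, and the top PHASE_BITS of the fractional part select one of
   the NB_PHASES filter phases. */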
struct ImgReSampleContext {
    int iwidth, iheight, owidth, oheight;
    int topBand, bottomBand, leftBand, rightBand;
    int h_incr, v_incr;
    INT16 h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
    INT16 v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
    UINT8 *line_buf;
};
static inline int get_phase(int pos)
{
    return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
}
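/* Worked example: with POS_FRAC_BITS = 16 and PHASE_BITS = 4,
   pos = 0x2C000 (i.e. 2.75) gives get_phase(pos) == 12, the phase
   closest to the fractional offset 0.75 = 12/16. */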
/* This function must be optimized */
static void h_resample_fast(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
                            int src_start, int src_incr, INT16 *filters)
{
    int src_pos, phase, sum, i;
    UINT8 *s;
    INT16 *filter;

    src_pos = src_start;
    for(i=0;i<dst_width;i++) {
#ifdef TEST
        /* test */
        if ((src_pos >> POS_FRAC_BITS) < 0 ||
            (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
            av_abort();
#endif
        s = src + (src_pos >> POS_FRAC_BITS);
        phase = get_phase(src_pos);
        filter = filters + phase * NB_TAPS;
#if NB_TAPS == 4
        sum = s[0] * filter[0] +
              s[1] * filter[1] +
              s[2] * filter[2] +
              s[3] * filter[3];
#else
        {
            int j;
            sum = 0;
            for(j=0;j<NB_TAPS;j++)
                sum += s[j] * filter[j];
        }
#endif
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0;
        else if (sum > 255) sum = 255;
        dst[0] = sum;
        src_pos += src_incr;
        dst++;
    }
}
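/* Note that the fast path reads NB_TAPS source bytes unconditionally, so
   it is only safe while 0 <= (src_pos >> POS_FRAC_BITS) <= src_width -
   NB_TAPS; positions outside that window are handled by h_resample_slow()
   below. */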
/* This function must be optimized */
static void v_resample(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
                       INT16 *filter)
{
    int sum, i;
    UINT8 *s;

    s = src;
    for(i=0;i<dst_width;i++) {
#if NB_TAPS == 4
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
#else
        {
            int j;
            UINT8 *s1 = s;

            sum = 0;
            for(j=0;j<NB_TAPS;j++) {
                sum += s1[0] * filter[j];
                s1 += wrap;
            }
        }
#endif
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0;
        else if (sum > 255) sum = 255;
        dst[0] = sum;
        dst++;
        s++;
    }
}
#ifdef HAVE_MMX

#include "i386/mmx.h"

#define FILTER4(reg) \
{\
        s = src + (src_pos >> POS_FRAC_BITS);\
        phase = get_phase(src_pos);\
        filter = filters + phase * NB_TAPS;\
        movq_m2r(s[0], reg);\
        punpcklbw_r2r(mm7, reg);\
        movq_m2r(*filter, mm6);\
        pmaddwd_r2r(reg, mm6);\
        movq_r2r(mm6, reg);\
        psrlq_i2r(32, reg);\
        paddd_r2r(mm6, reg);\
        psrad_i2r(FILTER_BITS, reg);\
        src_pos += src_incr;\
}
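/* FILTER4 computes one output pixel: the source bytes are widened to 16
   bits, pmaddwd forms the two 32-bit partial sums s[0]*f[0]+s[1]*f[1]
   and s[2]*f[2]+s[3]*f[3], the psrlq/paddd pair folds them together into
   the low dword, and psrad applies the >> FILTER_BITS scaling. */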
#define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
/* XXX: do four pixels at a time */
static void h_resample_fast4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
                                 int src_start, int src_incr, INT16 *filters)
{
    int src_pos, phase;
    UINT8 *s;
    INT16 *filter;
    mmx_t tmp;

    src_pos = src_start;
    pxor_r2r(mm7, mm7);

    while (dst_width >= 4) {
        FILTER4(mm0);
        FILTER4(mm1);
        FILTER4(mm2);
        FILTER4(mm3);

        packuswb_r2r(mm7, mm0);
        packuswb_r2r(mm7, mm1);
        packuswb_r2r(mm7, mm3);
        packuswb_r2r(mm7, mm2);
        movq_r2m(mm0, tmp);
        dst[0] = tmp.ub[0];
        movq_r2m(mm1, tmp);
        dst[1] = tmp.ub[0];
        movq_r2m(mm2, tmp);
        dst[2] = tmp.ub[0];
        movq_r2m(mm3, tmp);
        dst[3] = tmp.ub[0];
        dst += 4;
        dst_width -= 4;
    }
    while (dst_width > 0) {
        FILTER4(mm0);
        packuswb_r2r(mm7, mm0);
        movq_r2m(mm0, tmp);
        dst[0] = tmp.ub[0];
        dst++;
        dst_width--;
    }
    emms();
}
static void v_resample4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
                            INT16 *filter)
{
    int sum, i, v;
    UINT8 *s;
    mmx_t tmp;
    mmx_t coefs[4];

    for(i=0;i<4;i++) {
        v = filter[i];
        coefs[i].uw[0] = v;
        coefs[i].uw[1] = v;
        coefs[i].uw[2] = v;
        coefs[i].uw[3] = v;
    }

    pxor_r2r(mm7, mm7);
    s = src;
    while (dst_width >= 4) {
        movq_m2r(s[0 * wrap], mm0);
        punpcklbw_r2r(mm7, mm0);
        movq_m2r(s[1 * wrap], mm1);
        punpcklbw_r2r(mm7, mm1);
        movq_m2r(s[2 * wrap], mm2);
        punpcklbw_r2r(mm7, mm2);
        movq_m2r(s[3 * wrap], mm3);
        punpcklbw_r2r(mm7, mm3);

        pmullw_m2r(coefs[0], mm0);
        pmullw_m2r(coefs[1], mm1);
        pmullw_m2r(coefs[2], mm2);
        pmullw_m2r(coefs[3], mm3);

        paddw_r2r(mm1, mm0);
        paddw_r2r(mm3, mm2);
        paddw_r2r(mm2, mm0);
        psraw_i2r(FILTER_BITS, mm0);

        packuswb_r2r(mm7, mm0);
        movq_r2m(mm0, tmp);

        *(UINT32 *)dst = tmp.ud[0];
        dst += 4;
        s += 4;
        dst_width -= 4;
    }
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0;
        else if (sum > 255) sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
    emms();
}

#endif /* HAVE_MMX */
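/* v_resample4_mmx() uses pmullw, which keeps only the low 16 bits of each
   coefficient * pixel product, so it is less precise than the 32-bit
   scalar accumulation; this is presumably why component_resample() below
   leaves the MMX vertical path disabled ("loss of precision"). */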
#ifdef HAVE_ALTIVEC
typedef union {
    vector unsigned char v;
    unsigned char c[16];
} vec_uc_t;

typedef union {
    vector signed short v;
    signed short s[8];
} vec_ss_t;

void v_resample16_altivec(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
                          INT16 *filter)
{
    int sum, i;
    UINT8 *s;
    vector unsigned char *tv, tmp, dstv, zero;
    vec_ss_t srchv[4], srclv[4], fv[4];
    vector signed short zeros, sumhv, sumlv;

    s = src;
    for(i=0;i<4;i++) {
        /*
           The vec_madds later on does an implicit >>15 on the result.
           Since FILTER_BITS is 8, and we have 15 bits of magnitude in
           a signed short, we have just enough bits to pre-shift our
           filter constants <<7 to compensate for vec_madds.
        */
        fv[i].s[0] = filter[i] << (15-FILTER_BITS);
        fv[i].v = vec_splat(fv[i].v, 0);
    }

    zero = vec_splat_u8(0);
    zeros = vec_splat_s16(0);
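    /* Worked example of the pre-shift: with FILTER_BITS = 8, a tap of 192
       applied to a sample of 100 should give (192 * 100) >> 8 = 75; after
       the << 7, vec_madds computes ((192 << 7) * 100) >> 15 = 75 as well. */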
    /*
       When we're resampling, we'd ideally like both our input and output
       buffers to be 16-byte aligned, so we can do both aligned reads and
       writes. Sadly we can't always have this at the moment, so we opt
       for aligned writes, as unaligned writes have a huge overhead.
       To do this, do enough scalar resamples to get dst 16-byte aligned.
    */
    i = (-(int)dst) & 0xf;
    while (i > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0; else if (sum > 255) sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
        i--;
    }
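    /* Worked example: if dst == 0x1009, then (-(int)dst) & 0xf == 7, so 7
       pixels are produced by the scalar code above and the vector loop
       below starts writing at the 16-byte boundary 0x1010. */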
    /* Do our altivec resampling on 16 pixels at once. */
    while (dst_width >= 16) {
        /*
           Read 16 (potentially unaligned) bytes from each of
           4 lines into 4 vectors, and split them into shorts.
           Interleave the multiply/accumulate for the resample
           filter with the loads to hide the 3 cycle latency
           the vec_madds have.
        */
        tv = (vector unsigned char *) &s[0 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[0 * wrap]));
        srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
        sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);

        tv = (vector unsigned char *) &s[1 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
        srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
        sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);

        tv = (vector unsigned char *) &s[2 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
        srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
        sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);

        tv = (vector unsigned char *) &s[3 * wrap];
        tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
        srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
        srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
        sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
        sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);

        /*
           Pack the results into our destination vector,
           and do an aligned write of that back to memory.
        */
        dstv = vec_packsu(sumhv, sumlv);
        vec_st(dstv, 0, (vector unsigned char *) dst);

        dst += 16;
        s += 16;
        dst_width -= 16;
    }
    /*
       If there are any leftover pixels, resample them
       with the slow scalar method.
    */
    while (dst_width > 0) {
        sum = s[0 * wrap] * filter[0] +
              s[1 * wrap] * filter[1] +
              s[2 * wrap] * filter[2] +
              s[3 * wrap] * filter[3];
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0; else if (sum > 255) sum = 255;
        dst[0] = sum;
        dst++;
        s++;
        dst_width--;
    }
}
#endif /* HAVE_ALTIVEC */
/* slow version to handle limit cases. Does not need optimisation */
static void h_resample_slow(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
                            int src_start, int src_incr, INT16 *filters)
{
    int src_pos, phase, sum, j, v, i;
    UINT8 *s, *src_end;
    INT16 *filter;

    src_end = src + src_width;
    src_pos = src_start;
    for(i=0;i<dst_width;i++) {
        s = src + (src_pos >> POS_FRAC_BITS);
        phase = get_phase(src_pos);
        filter = filters + phase * NB_TAPS;
        sum = 0;
        for(j=0;j<NB_TAPS;j++) {
            if (s < src)
                v = src[0];
            else if (s >= src_end)
                v = src_end[-1];
            else
                v = s[0];
            sum += v * filter[j];
            s++;
        }
        sum = sum >> FILTER_BITS;
        if (sum < 0) sum = 0;
        else if (sum > 255) sum = 255;
        dst[0] = sum;
        src_pos += src_incr;
        dst++;
    }
}
static void h_resample(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
                       int src_start, int src_incr, INT16 *filters)
{
    int n, src_end;

    if (src_start < 0) {
        n = (0 - src_start + src_incr - 1) / src_incr;
        h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
        dst += n;
        dst_width -= n;
        src_start += n * src_incr;
    }
    src_end = src_start + dst_width * src_incr;
    if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
        n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
            src_incr;
    } else {
        n = dst_width;
    }
#ifdef HAVE_MMX
    if ((mm_flags & MM_MMX) && NB_TAPS == 4)
        h_resample_fast4_mmx(dst, n,
                             src, src_width, src_start, src_incr, filters);
    else
#endif
        h_resample_fast(dst, n,
                        src, src_width, src_start, src_incr, filters);
    if (n < dst_width) {
        dst += n;
        dst_width -= n;
        src_start += n * src_incr;
        h_resample_slow(dst, dst_width,
                        src, src_width, src_start, src_incr, filters);
    }
}
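/* h_resample() above splits the output row into up to three segments: a
   slow head while src_start < 0 (n = ceil(-src_start / src_incr) pixels),
   a fast middle where all NB_TAPS source pixels are in range, and a slow
   tail once the filter window would run past the right edge. */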
static void component_resample(ImgReSampleContext *s,
                               UINT8 *output, int owrap, int owidth, int oheight,
                               UINT8 *input, int iwrap, int iwidth, int iheight)
{
    int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
    UINT8 *new_line, *src_line;

    last_src_y = - FCENTER - 1;
    /* position of the bottom of the filter in the source image */
    src_y = (last_src_y + NB_TAPS) * POS_FRAC;
    ring_y = NB_TAPS; /* position in ring buffer */
    for(y=0;y<oheight;y++) {
        /* apply horizontal filter on new lines from input if needed */
        src_y1 = src_y >> POS_FRAC_BITS;
        while (last_src_y < src_y1) {
            if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
                ring_y = NB_TAPS;
            last_src_y++;
            /* handle limit conditions: replicate line (slightly
               inefficient because we filter multiple times) */
            y1 = last_src_y;
            if (y1 < 0) {
                y1 = 0;
            } else if (y1 >= iheight) {
                y1 = iheight - 1;
            }
            src_line = input + y1 * iwrap;
            new_line = s->line_buf + ring_y * owidth;
            /* apply filter and handle limit cases correctly */
            h_resample(new_line, owidth,
                       src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
                       &s->h_filters[0][0]);
            /* handle ring buffer wrapping */
            if (ring_y >= LINE_BUF_HEIGHT) {
                memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
                       new_line, owidth);
            }
        }
        /* apply vertical filter */
        phase_y = get_phase(src_y);
#ifdef HAVE_MMX
        /* MMX version deactivated because of its loss of precision */
        if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
            v_resample4_mmx(output, owidth,
                            s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                            &s->v_filters[phase_y][0]);
        else
#endif
#ifdef HAVE_ALTIVEC
        if ((mm_flags & MM_ALTIVEC) && NB_TAPS == 4 && FILTER_BITS <= 6)
            v_resample16_altivec(output, owidth,
                                 s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                                 &s->v_filters[phase_y][0]);
        else
#endif
            v_resample(output, owidth,
                       s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
                       &s->v_filters[phase_y][0]);

        src_y += s->v_incr;
        output += owrap;
    }
}
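/* The line buffer holds LINE_BUF_HEIGHT + NB_TAPS horizontally resampled
   lines; lines written into the last NB_TAPS slots are mirrored into the
   first NB_TAPS slots (the memcpy above), so the vertical filter can
   always read NB_TAPS consecutive lines starting at ring_y - NB_TAPS + 1
   without wrapping in mid-filter. */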
/* XXX: the following filter is quite naive, but it seems to suffice
   for 4 taps */
static void build_filter(INT16 *filter, float factor)
{
    int ph, i, v;
    float x, y, tab[NB_TAPS], norm, mult;

    /* if upsampling, only need to interpolate, no filter */
    if (factor > 1.0)
        factor = 1.0;

    for(ph=0;ph<NB_PHASES;ph++) {
        norm = 0;
        for(i=0;i<NB_TAPS;i++) {
            /* sinc filter, truncated to NB_TAPS samples */
            x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
            if (x == 0)
                y = 1.0;
            else
                y = sin(x) / x;
            tab[i] = y;
            norm += y;
        }

        /* normalize so that a uniform color remains the same */
        mult = (float)(1 << FILTER_BITS) / norm;
        for(i=0;i<NB_TAPS;i++) {
            v = (int)(tab[i] * mult);
            filter[ph * NB_TAPS + i] = v;
        }
    }
}
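/* Worked example: with factor = 1.0, NB_TAPS = 4 and FILTER_BITS = 8,
   phase 0 degenerates to the identity {0, 256, 0, 0}, and the half-pel
   phase ph = NB_PHASES/2 gives {-64, 192, 192, -64}; each phase sums to
   ~(1 << FILTER_BITS), so flat areas keep their value. */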
ImgReSampleContext *img_resample_init(int owidth, int oheight,
                                      int iwidth, int iheight)
{
    return img_resample_full_init(owidth, oheight, iwidth, iheight, 0, 0, 0, 0);
}
ImgReSampleContext *img_resample_full_init(int owidth, int oheight,
                                           int iwidth, int iheight,
                                           int topBand, int bottomBand,
                                           int leftBand, int rightBand)
{
    ImgReSampleContext *s;

    s = av_mallocz(sizeof(ImgReSampleContext));
    if (!s)
        return NULL;
    s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
    if (!s->line_buf)
        goto fail;

    s->owidth = owidth;
    s->oheight = oheight;
    s->iwidth = iwidth;
    s->iheight = iheight;
    s->topBand = topBand;
    s->bottomBand = bottomBand;
    s->leftBand = leftBand;
    s->rightBand = rightBand;

    s->h_incr = ((iwidth - leftBand - rightBand) * POS_FRAC) / owidth;
    s->v_incr = ((iheight - topBand - bottomBand) * POS_FRAC) / oheight;
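    /* Worked example: shrinking a 704-pixel-wide line (no bands) to 352
       pixels gives h_incr = (704 * POS_FRAC) / 352 = 2 << POS_FRAC_BITS,
       i.e. the source position advances 2.0 pixels per output pixel. */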
    build_filter(&s->h_filters[0][0], (float) owidth /
                 (float) (iwidth - leftBand - rightBand));
    build_filter(&s->v_filters[0][0], (float) oheight /
                 (float) (iheight - topBand - bottomBand));

    return s;
fail:
    av_free(s);
    return NULL;
}
void img_resample(ImgReSampleContext *s,
                  AVPicture *output, AVPicture *input)
{
    int i, shift;

    for(i=0;i<NB_COMPONENTS;i++) {
        shift = (i == 0) ? 0 : 1;
        component_resample(s, output->data[i], output->linesize[i],
                           s->owidth >> shift, s->oheight >> shift,
                           input->data[i] + (input->linesize[i] * (s->topBand >> shift)) + (s->leftBand >> shift),
                           input->linesize[i], ((s->iwidth - s->leftBand - s->rightBand) >> shift),
                           (s->iheight - s->topBand - s->bottomBand) >> shift);
    }
}
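/* Plane 0 is luma at full resolution; planes 1 and 2 use shift = 1,
   which assumes 4:2:0-style chroma subsampling (half width and height)
   in the AVPicture data. */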
void img_resample_close(ImgReSampleContext *s)
{
    av_free(s->line_buf);
    av_free(s);
}
#ifdef TEST

void *av_mallocz(int size)
{
    void *ptr;
    ptr = malloc(size);
    memset(ptr, 0, size);
    return ptr;
}
void av_free(void *ptr)
{
    /* XXX: this test should not be needed on most libcs */
    if (ptr)
        free(ptr);
}
#define XSIZE 256
#define YSIZE 256
UINT8 img[XSIZE * YSIZE];

#define XSIZE1 512
#define YSIZE1 512
UINT8 img1[XSIZE1 * YSIZE1];
UINT8 img2[XSIZE1 * YSIZE1];
void save_pgm(const char *filename, UINT8 *img, int xsize, int ysize)
{
    FILE *f;
    f = fopen(filename, "w");
    fprintf(f, "P5\n%d %d\n%d\n", xsize, ysize, 255);
    fwrite(img, 1, xsize * ysize, f);
    fclose(f);
}
static void dump_filter(INT16 *filter)
{
    int i, ph;

    for(ph=0;ph<NB_PHASES;ph++) {
        printf("%2d: ", ph);
        for(i=0;i<NB_TAPS;i++) {
            printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
        }
        printf("\n");
    }
}
int main(int argc, char **argv)
{
    int x, y, v, i, xsize, ysize;
    ImgReSampleContext *s;
    float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
    char buf[256];

    /* build test image */
    for(y=0;y<YSIZE;y++) {
        for(x=0;x<XSIZE;x++) {
            if (x < XSIZE/2 && y < YSIZE/2) {
                if (x < XSIZE/4 && y < YSIZE/4) {
                    if ((x % 10) <= 6 &&
                        (y % 10) <= 6)
                        v = 0xff;
                    else
                        v = 0x00;
                } else if (x < XSIZE/4) {
                    if (x & 1)
                        v = 0xff;
                    else
                        v = 0x00;
                } else if (y < XSIZE/4) {
                    if (y & 1)
                        v = 0xff;
                    else
                        v = 0x00;
                } else {
                    if (((x+3) % 4) <= 1 &&
                        ((y+3) % 4) <= 1)
                        v = 0xff;
                    else
                        v = 0x00;
                }
            } else if (x < XSIZE/2) {
                v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
            } else if (y < XSIZE/2) {
                v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
            } else {
                v = ((x + y - XSIZE) * 255) / XSIZE;
            }
            img[(YSIZE - 1 - y) * XSIZE + (XSIZE - 1 - x)] = v;
        }
    }
    save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
    for(i=0;i<sizeof(factors)/sizeof(float);i++) {
        fact = factors[i];
        xsize = (int)(XSIZE * fact);
        ysize = (int)((YSIZE - 100) * fact);
        s = img_resample_full_init(xsize, ysize, XSIZE, YSIZE, 50, 50, 0, 0);
        printf("Factor=%0.2f\n", fact);
        dump_filter(&s->h_filters[0][0]);
        component_resample(s, img1, xsize, xsize, ysize,
                           img + 50 * XSIZE, XSIZE, XSIZE, YSIZE - 100);
        img_resample_close(s);

        sprintf(buf, "/tmp/out%d.pgm", i);
        save_pgm(buf, img1, xsize, ysize);
    }
    /* mmx test */
#ifdef HAVE_MMX
    printf("MMX test\n");
    fact = 0.72;
    xsize = (int)(XSIZE * fact);
    ysize = (int)(YSIZE * fact);
    mm_flags = MM_MMX;
    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
    component_resample(s, img1, xsize, xsize, ysize,
                       img, XSIZE, XSIZE, YSIZE);
    img_resample_close(s);

    mm_flags = 0;
    s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
    component_resample(s, img2, xsize, xsize, ysize,
                       img, XSIZE, XSIZE, YSIZE);
    img_resample_close(s);

    if (memcmp(img1, img2, xsize * ysize) != 0) {
        fprintf(stderr, "mmx error\n");
        exit(1);
    }
    printf("MMX OK\n");
#endif
    return 0;
}

#endif /* TEST */