2 * High quality image resampling with polyphase filters
3 * Copyright (c) 2001 Fabrice Bellard.
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * @file imgresample_altivec.c
24 * High quality image resampling with polyphase filters - AltiVec bits
27 #include "gcc_fixes.h"
30 vector
unsigned char v
;
35 vector
signed short v
;
39 void v_resample16_altivec(uint8_t *dst
, int dst_width
, const uint8_t *src
,
40 int wrap
, int16_t *filter
)
44 vector
unsigned char *tv
, tmp
, dstv
, zero
;
45 vec_ss_t srchv
[4], srclv
[4], fv
[4];
46 vector
signed short zeros
, sumhv
, sumlv
;
52 The vec_madds later on does an implicit >>15 on the result.
53 Since FILTER_BITS is 8, and we have 15 bits of magnitude in
54 a signed short, we have just enough bits to pre-shift our
55 filter constants <<7 to compensate for vec_madds.
57 fv
[i
].s
[0] = filter
[i
] << (15-FILTER_BITS
);
58 fv
[i
].v
= vec_splat(fv
[i
].v
, 0);
61 zero
= vec_splat_u8(0);
62 zeros
= vec_splat_s16(0);
66 When we're resampling, we'd ideally like both our input and output
67 buffers to be 16-byte aligned, so we can do both aligned
68 reads and writes. Sadly we can't always have this at the moment, so
69 we opt for aligned writes, as unaligned writes have a huge overhead.
70 To do this, do enough scalar resamples to get dst 16-byte aligned.
72 i
= (-(int)dst
) & 0xf;
74 sum
= s
[0 * wrap
] * filter
[0] +
75 s
[1 * wrap
] * filter
[1] +
76 s
[2 * wrap
] * filter
[2] +
77 s
[3 * wrap
] * filter
[3];
78 sum
= sum
>> FILTER_BITS
;
79 if (sum
<0) sum
= 0; else if (sum
>255) sum
=255;
87 /* Do our altivec resampling on 16 pixels at once. */
88 while(dst_width
>=16) {
90 Read 16 (potentially unaligned) bytes from each of
91 4 lines into 4 vectors, and split them into shorts.
92 Interleave the multiply/accumulate for the resample
93 filter with the loads to hide the 3 cycle latency
96 tv
= (vector
unsigned char *) &s
[0 * wrap
];
97 tmp
= vec_perm(tv
[0], tv
[1], vec_lvsl(0, &s
[i
* wrap
]));
98 srchv
[0].v
= (vector
signed short) vec_mergeh(zero
, tmp
);
99 srclv
[0].v
= (vector
signed short) vec_mergel(zero
, tmp
);
100 sumhv
= vec_madds(srchv
[0].v
, fv
[0].v
, zeros
);
101 sumlv
= vec_madds(srclv
[0].v
, fv
[0].v
, zeros
);
103 tv
= (vector
unsigned char *) &s
[1 * wrap
];
104 tmp
= vec_perm(tv
[0], tv
[1], vec_lvsl(0, &s
[1 * wrap
]));
105 srchv
[1].v
= (vector
signed short) vec_mergeh(zero
, tmp
);
106 srclv
[1].v
= (vector
signed short) vec_mergel(zero
, tmp
);
107 sumhv
= vec_madds(srchv
[1].v
, fv
[1].v
, sumhv
);
108 sumlv
= vec_madds(srclv
[1].v
, fv
[1].v
, sumlv
);
110 tv
= (vector
unsigned char *) &s
[2 * wrap
];
111 tmp
= vec_perm(tv
[0], tv
[1], vec_lvsl(0, &s
[2 * wrap
]));
112 srchv
[2].v
= (vector
signed short) vec_mergeh(zero
, tmp
);
113 srclv
[2].v
= (vector
signed short) vec_mergel(zero
, tmp
);
114 sumhv
= vec_madds(srchv
[2].v
, fv
[2].v
, sumhv
);
115 sumlv
= vec_madds(srclv
[2].v
, fv
[2].v
, sumlv
);
117 tv
= (vector
unsigned char *) &s
[3 * wrap
];
118 tmp
= vec_perm(tv
[0], tv
[1], vec_lvsl(0, &s
[3 * wrap
]));
119 srchv
[3].v
= (vector
signed short) vec_mergeh(zero
, tmp
);
120 srclv
[3].v
= (vector
signed short) vec_mergel(zero
, tmp
);
121 sumhv
= vec_madds(srchv
[3].v
, fv
[3].v
, sumhv
);
122 sumlv
= vec_madds(srclv
[3].v
, fv
[3].v
, sumlv
);
125 Pack the results into our destination vector,
126 and do an aligned write of that back to memory.
128 dstv
= vec_packsu(sumhv
, sumlv
) ;
129 vec_st(dstv
, 0, (vector
unsigned char *) dst
);
137 If there are any leftover pixels, resample them
138 with the slow scalar method.
141 sum
= s
[0 * wrap
] * filter
[0] +
142 s
[1 * wrap
] * filter
[1] +
143 s
[2 * wrap
] * filter
[2] +
144 s
[3 * wrap
] * filter
[3];
145 sum
= sum
>> FILTER_BITS
;
146 if (sum
<0) sum
= 0; else if (sum
>255) sum
=255;