/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
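/* For example: with a 32-bit unsigned long, ~0UL/255 == 0x01010101, so
 * pb_7f expands to 0x7f7f7f7f and pb_80 to 0x80808080; with a 64-bit
 * unsigned long the same expressions give the 8-byte equivalents. Such
 * replicated-byte constants are the usual building blocks for
 * SIMD-within-a-register byte arithmetic. */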
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U, 2147483648U, 1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
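/* Worked example of the identity above: for b == 3, ff_inverse[3] ==
 * 1431655766 == ceil(2^32 / 3), so
 * (uint32_t)((100 * 1431655766ULL) >> 32) == 33 == 100 / 3. */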
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
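/* Typical use (an illustrative sketch, not code from this file): build a
 * scan table for the default zigzag order using the IDCT permutation that
 * dsputil_init() filled into a DSPContext `c`:
 *
 *     ScanTable st;
 *     ff_init_scantable(c->idct_permutation, &st, ff_zigzag_direct);
 *
 * st.permutated[] then maps scan position to the coefficient index in the
 * order the selected IDCT expects. */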
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
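/* Usage sketch (the context names follow the MPEG decoder, for
 * illustration): when a motion vector makes a block read outside the
 * picture, fetch it through a scratch buffer with replicated edges:
 *
 *     ff_emulated_edge_mc(s->edge_emu_buffer, src_ptr, s->linesize,
 *                         17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
 *     src_ptr = s->edge_emu_buffer;
 */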
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
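/* Note on the op_avg / rnd_avg32 definitions above: they use the SWAR
 * identity avg(a,b) == (a|b) - ((a^b)>>1) applied per byte.  Masking a^b
 * with 0xFEFEFEFE before the shift keeps bits from leaking between byte
 * lanes, and using OR makes the average round up (the no_rnd variants use
 * AND plus '+' and round down).  e.g. bytes 1 and 2 average to 2 here. */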
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
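/* Both round to nearest: avg2(1,2) == (1+2+1)>>1 == 2, and
 * avg4(0,1,1,1) == (0+1+1+1+2)>>2 == 1. */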
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
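/* The four bilinear weights always sum to 256:
 * (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16 == 16*16,
 * so with e.g. rounder == 128 the >>8 above is a round-to-nearest
 * division by 256. */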
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
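/* Sampling model sketch: (vx,vy) step by (dxx,dyx) per output column and
 * the row origin (ox,oy) steps by (dxy,dyy) per row, i.e. an affine warp
 * in fixed point with s = 1<<shift subpel positions.  Positions fully
 * inside the picture get a bilinear filter; positions past an edge fall
 * back to the clamped row/column, and past a corner to the corner pixel. */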
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
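/* The constant 683 is round(2^11/3), so (683*(2*a + b + 1)) >> 11 is a
 * fixed-point (2*a + b)/3, i.e. interpolation one third of the way from a
 * to b.  The two-dimensional variants below use 2731 == round(2^15/12) in
 * the same way, since their four weights sum to 12. */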
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
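/* The avg_ variants above fold the new third-pel prediction into what is
 * already in dst with a rounded halving: dst = (dst + pred + 1) >> 1. */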
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
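/* In these chroma MC kernels the bilinear weights satisfy A+B+C+D == 64,
 * so op_put's "(b + 32) >> 6" is a round-to-nearest division by 64; at
 * x == y == 4 all four weights are 16 and the result is the rounded
 * average of the four neighbouring samples. */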
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
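/* The MPEG-4 quarter-pel templates below apply an 8-tap lowpass with
 * coefficients (-1, 3, -6, 20, 20, -6, 3, -1).  At the block borders the
 * outer taps are mirrored back inside rather than reading past the last
 * sample (visible as the repeated src[8]/src[16] terms in the final rows);
 * rounding and clipping are supplied by the RND/OP macro parameters. */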
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1810 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1811 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1812 dst++;\
1813 src++;\
1817 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1818 OPNAME ## pixels8_c(dst, src, stride, 8);\
1821 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1822 uint8_t half[64];\
1823 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1824 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1827 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1828 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1831 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1832 uint8_t half[64];\
1833 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1834 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1837 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1838 uint8_t full[16*9];\
1839 uint8_t half[64];\
1840 copy_block9(full, src, 16, stride, 9);\
1841 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1842 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1845 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[16*9];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1851 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 uint8_t half[64];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1856 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1858 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1859 uint8_t full[16*9];\
1860 uint8_t halfH[72];\
1861 uint8_t halfV[64];\
1862 uint8_t halfHV[64];\
1863 copy_block9(full, src, 16, stride, 9);\
1864 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1865 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1866 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1867 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1869 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1870 uint8_t full[16*9];\
1871 uint8_t halfH[72];\
1872 uint8_t halfHV[64];\
1873 copy_block9(full, src, 16, stride, 9);\
1874 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1875 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1876 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1877 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1879 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1880 uint8_t full[16*9];\
1881 uint8_t halfH[72];\
1882 uint8_t halfV[64];\
1883 uint8_t halfHV[64];\
1884 copy_block9(full, src, 16, stride, 9);\
1885 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1886 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1887 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1888 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1890 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1891 uint8_t full[16*9];\
1892 uint8_t halfH[72];\
1893 uint8_t halfHV[64];\
1894 copy_block9(full, src, 16, stride, 9);\
1895 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1896 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1897 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1898 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1900 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1901 uint8_t full[16*9];\
1902 uint8_t halfH[72];\
1903 uint8_t halfV[64];\
1904 uint8_t halfHV[64];\
1905 copy_block9(full, src, 16, stride, 9);\
1906 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1907 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1908 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1909 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1911 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1912 uint8_t full[16*9];\
1913 uint8_t halfH[72];\
1914 uint8_t halfHV[64];\
1915 copy_block9(full, src, 16, stride, 9);\
1916 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1917 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1918 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1919 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1921 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1922 uint8_t full[16*9];\
1923 uint8_t halfH[72];\
1924 uint8_t halfV[64];\
1925 uint8_t halfHV[64];\
1926 copy_block9(full, src, 16, stride, 9);\
1927 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1928 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1929 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1930 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1932 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1933 uint8_t full[16*9];\
1934 uint8_t halfH[72];\
1935 uint8_t halfHV[64];\
1936 copy_block9(full, src, 16, stride, 9);\
1937 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1938 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1939 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1940 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1942 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1943 uint8_t halfH[72];\
1944 uint8_t halfHV[64];\
1945 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1947 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1949 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t halfH[72];\
1951 uint8_t halfHV[64];\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1956 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t full[16*9];\
1958 uint8_t halfH[72];\
1959 uint8_t halfV[64];\
1960 uint8_t halfHV[64];\
1961 copy_block9(full, src, 16, stride, 9);\
1962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1967 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[16*9];\
1969 uint8_t halfH[72];\
1970 copy_block9(full, src, 16, stride, 9);\
1971 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1972 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1973 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1975 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[16*9];\
1977 uint8_t halfH[72];\
1978 uint8_t halfV[64];\
1979 uint8_t halfHV[64];\
1980 copy_block9(full, src, 16, stride, 9);\
1981 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1982 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1983 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1984 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1986 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1987 uint8_t full[16*9];\
1988 uint8_t halfH[72];\
1989 copy_block9(full, src, 16, stride, 9);\
1990 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1991 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1992 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1994 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t halfH[72];\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1997 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1999 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2000 OPNAME ## pixels16_c(dst, src, stride, 16);\
2003 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2004 uint8_t half[256];\
2005 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2006 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2009 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2010 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2013 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2014 uint8_t half[256];\
2015 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2016 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2019 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2020 uint8_t full[24*17];\
2021 uint8_t half[256];\
2022 copy_block17(full, src, 24, stride, 17);\
2023 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2024 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2027 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2028 uint8_t full[24*17];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2033 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 uint8_t half[256];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2038 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2040 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2041 uint8_t full[24*17];\
2042 uint8_t halfH[272];\
2043 uint8_t halfV[256];\
2044 uint8_t halfHV[256];\
2045 copy_block17(full, src, 24, stride, 17);\
2046 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2047 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2048 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2049 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2051 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2052 uint8_t full[24*17];\
2053 uint8_t halfH[272];\
2054 uint8_t halfHV[256];\
2055 copy_block17(full, src, 24, stride, 17);\
2056 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2057 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2058 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2059 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2061 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2062 uint8_t full[24*17];\
2063 uint8_t halfH[272];\
2064 uint8_t halfV[256];\
2065 uint8_t halfHV[256];\
2066 copy_block17(full, src, 24, stride, 17);\
2067 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2068 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2069 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2070 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2072 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2073 uint8_t full[24*17];\
2074 uint8_t halfH[272];\
2075 uint8_t halfHV[256];\
2076 copy_block17(full, src, 24, stride, 17);\
2077 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2078 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2079 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2080 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2082 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2083 uint8_t full[24*17];\
2084 uint8_t halfH[272];\
2085 uint8_t halfV[256];\
2086 uint8_t halfHV[256];\
2087 copy_block17(full, src, 24, stride, 17);\
2088 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2089 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2090 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2091 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2093 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2094 uint8_t full[24*17];\
2095 uint8_t halfH[272];\
2096 uint8_t halfHV[256];\
2097 copy_block17(full, src, 24, stride, 17);\
2098 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2099 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2100 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2101 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2103 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2104 uint8_t full[24*17];\
2105 uint8_t halfH[272];\
2106 uint8_t halfV[256];\
2107 uint8_t halfHV[256];\
2108 copy_block17(full, src, 24, stride, 17);\
2109 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2110 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2111 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2112 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2114 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2115 uint8_t full[24*17];\
2116 uint8_t halfH[272];\
2117 uint8_t halfHV[256];\
2118 copy_block17(full, src, 24, stride, 17);\
2119 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2120 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2121 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2122 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2124 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2125 uint8_t halfH[272];\
2126 uint8_t halfHV[256];\
2127 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2129 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2131 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2132 uint8_t halfH[272];\
2133 uint8_t halfHV[256];\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2138 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2139 uint8_t full[24*17];\
2140 uint8_t halfH[272];\
2141 uint8_t halfV[256];\
2142 uint8_t halfHV[256];\
2143 copy_block17(full, src, 24, stride, 17);\
2144 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2146 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2149 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2150 uint8_t full[24*17];\
2151 uint8_t halfH[272];\
2152 copy_block17(full, src, 24, stride, 17);\
2153 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2154 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2155 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2157 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2158 uint8_t full[24*17];\
2159 uint8_t halfH[272];\
2160 uint8_t halfV[256];\
2161 uint8_t halfHV[256];\
2162 copy_block17(full, src, 24, stride, 17);\
2163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2166 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2168 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2169 uint8_t full[24*17];\
2170 uint8_t halfH[272];\
2171 copy_block17(full, src, 24, stride, 17);\
2172 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2173 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2174 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2176 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2177 uint8_t halfH[272];\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2179 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2182 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2183 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2184 #define op_put(a, b) a = cm[((b) + 16)>>5]
2185 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
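/* The MPEG-4 qpel taps (20,-6,3,-1) are applied to symmetric sample
 * pairs, so they sum to 2*(20-6+3-1) = 32 and the result is brought
 * back to pixel range with ">>5".  op_put/op_avg add 16 before the
 * shift (round to nearest); the _no_rnd variants add only 15, i.e.
 * they round down, matching MPEG-4's "no rounding" mode. */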
2187 QPEL_MC(0, put_ , _ , op_put)
2188 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2189 QPEL_MC(0, avg_ , _ , op_avg)
2190 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2191 #undef op_avg
2192 #undef op_avg_no_rnd
2193 #undef op_put
2194 #undef op_put_no_rnd
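/* QPEL_MC generates one function per quarter-pel position, named
 * <op>qpel<size>_mc<x><y>_c with x,y in 0..3.  As an illustration,
 * the mc10 case above expands (for OPNAME=put_, RND=_) to roughly:
 *
 *     uint8_t half[64];
 *     put_mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);
 *     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
 *
 * i.e. filter at the half-pel position, then average with the integer
 * position to obtain the quarter-pel sample.  The 2-D cases first copy
 * the block plus a one-pixel border into full[] (copy_block9: 9 rows
 * at stride 16) so the vertical filter can read one row past the end.
 * These functions are wired into the qpel function-pointer tables in
 * dsputil_init() further down in this file. */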
2196 #if 1
2197 #define H264_LOWPASS(OPNAME, OP, OP2) \
2198 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2199 const int h=2;\
2200 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2201 int i;\
2202 for(i=0; i<h; i++)\
2204 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2205 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2206 dst+=dstStride;\
2207 src+=srcStride;\
2211 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2212 const int w=2;\
2213 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2214 int i;\
2215 for(i=0; i<w; i++)\
2217 const int srcB= src[-2*srcStride];\
2218 const int srcA= src[-1*srcStride];\
2219 const int src0= src[0 *srcStride];\
2220 const int src1= src[1 *srcStride];\
2221 const int src2= src[2 *srcStride];\
2222 const int src3= src[3 *srcStride];\
2223 const int src4= src[4 *srcStride];\
2224 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2225 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2226 dst++;\
2227 src++;\
2231 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2232 const int h=2;\
2233 const int w=2;\
2234 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2235 int i;\
2236 src -= 2*srcStride;\
2237 for(i=0; i<h+5; i++)\
2239 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2240 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2241 tmp+=tmpStride;\
2242 src+=srcStride;\
2244 tmp -= tmpStride*(h+5-2);\
2245 for(i=0; i<w; i++)\
2247 const int tmpB= tmp[-2*tmpStride];\
2248 const int tmpA= tmp[-1*tmpStride];\
2249 const int tmp0= tmp[0 *tmpStride];\
2250 const int tmp1= tmp[1 *tmpStride];\
2251 const int tmp2= tmp[2 *tmpStride];\
2252 const int tmp3= tmp[3 *tmpStride];\
2253 const int tmp4= tmp[4 *tmpStride];\
2254 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2255 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2256 dst++;\
2257 tmp++;\
2260 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261 const int h=4;\
2262 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2263 int i;\
2264 for(i=0; i<h; i++)\
2266 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2267 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2268 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2269 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2270 dst+=dstStride;\
2271 src+=srcStride;\
2275 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2276 const int w=4;\
2277 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2278 int i;\
2279 for(i=0; i<w; i++)\
2281 const int srcB= src[-2*srcStride];\
2282 const int srcA= src[-1*srcStride];\
2283 const int src0= src[0 *srcStride];\
2284 const int src1= src[1 *srcStride];\
2285 const int src2= src[2 *srcStride];\
2286 const int src3= src[3 *srcStride];\
2287 const int src4= src[4 *srcStride];\
2288 const int src5= src[5 *srcStride];\
2289 const int src6= src[6 *srcStride];\
2290 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2291 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2292 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2293 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2294 dst++;\
2295 src++;\
2299 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2300 const int h=4;\
2301 const int w=4;\
2302 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2303 int i;\
2304 src -= 2*srcStride;\
2305 for(i=0; i<h+5; i++)\
2307 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2308 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2309 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2310 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2311 tmp+=tmpStride;\
2312 src+=srcStride;\
2314 tmp -= tmpStride*(h+5-2);\
2315 for(i=0; i<w; i++)\
2317 const int tmpB= tmp[-2*tmpStride];\
2318 const int tmpA= tmp[-1*tmpStride];\
2319 const int tmp0= tmp[0 *tmpStride];\
2320 const int tmp1= tmp[1 *tmpStride];\
2321 const int tmp2= tmp[2 *tmpStride];\
2322 const int tmp3= tmp[3 *tmpStride];\
2323 const int tmp4= tmp[4 *tmpStride];\
2324 const int tmp5= tmp[5 *tmpStride];\
2325 const int tmp6= tmp[6 *tmpStride];\
2326 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2327 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2328 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2329 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2330 dst++;\
2331 tmp++;\
2335 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2336 const int h=8;\
2337 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2338 int i;\
2339 for(i=0; i<h; i++)\
2341 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2342 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2343 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2344 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2345 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2346 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2347 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2348 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2349 dst+=dstStride;\
2350 src+=srcStride;\
2354 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2355 const int w=8;\
2356 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2357 int i;\
2358 for(i=0; i<w; i++)\
2360 const int srcB= src[-2*srcStride];\
2361 const int srcA= src[-1*srcStride];\
2362 const int src0= src[0 *srcStride];\
2363 const int src1= src[1 *srcStride];\
2364 const int src2= src[2 *srcStride];\
2365 const int src3= src[3 *srcStride];\
2366 const int src4= src[4 *srcStride];\
2367 const int src5= src[5 *srcStride];\
2368 const int src6= src[6 *srcStride];\
2369 const int src7= src[7 *srcStride];\
2370 const int src8= src[8 *srcStride];\
2371 const int src9= src[9 *srcStride];\
2372 const int src10=src[10*srcStride];\
2373 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2374 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2375 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2376 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2377 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2378 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2379 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2380 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2381 dst++;\
2382 src++;\
2386 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2387 const int h=8;\
2388 const int w=8;\
2389 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2390 int i;\
2391 src -= 2*srcStride;\
2392 for(i=0; i<h+5; i++)\
2394 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2395 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2396 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2397 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2398 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2399 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2400 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2401 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2402 tmp+=tmpStride;\
2403 src+=srcStride;\
2405 tmp -= tmpStride*(h+5-2);\
2406 for(i=0; i<w; i++)\
2408 const int tmpB= tmp[-2*tmpStride];\
2409 const int tmpA= tmp[-1*tmpStride];\
2410 const int tmp0= tmp[0 *tmpStride];\
2411 const int tmp1= tmp[1 *tmpStride];\
2412 const int tmp2= tmp[2 *tmpStride];\
2413 const int tmp3= tmp[3 *tmpStride];\
2414 const int tmp4= tmp[4 *tmpStride];\
2415 const int tmp5= tmp[5 *tmpStride];\
2416 const int tmp6= tmp[6 *tmpStride];\
2417 const int tmp7= tmp[7 *tmpStride];\
2418 const int tmp8= tmp[8 *tmpStride];\
2419 const int tmp9= tmp[9 *tmpStride];\
2420 const int tmp10=tmp[10*tmpStride];\
2421 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2422 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2423 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2424 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2425 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2426 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2427 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2428 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2429 dst++;\
2430 tmp++;\
2434 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2435 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2436 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2437 src += 8*srcStride;\
2438 dst += 8*dstStride;\
2439 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2440 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2444 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2445 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2446 src += 8*srcStride;\
2447 dst += 8*dstStride;\
2448 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2449 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2453 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2454 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2455 src += 8*srcStride;\
2456 dst += 8*dstStride;\
2457 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2458 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2459 }
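/* The H.264 half-pel filter is the 6-tap (1,-5,20,20,-5,1), again with
 * a gain of 32.  The h and v passes clip immediately through OP; the
 * hv pass keeps unclipped first-pass sums in the 16-bit tmp[] buffer
 * and normalizes only in the second pass through OP2, preserving the
 * intermediate precision the spec requires. */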
2461 #define H264_MC(OPNAME, SIZE) \
2462 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2463 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2466 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2467 uint8_t half[SIZE*SIZE];\
2468 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2469 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2473 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2476 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2477 uint8_t half[SIZE*SIZE];\
2478 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2479 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2482 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2483 uint8_t full[SIZE*(SIZE+5)];\
2484 uint8_t * const full_mid= full + SIZE*2;\
2485 uint8_t half[SIZE*SIZE];\
2486 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2487 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2488 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2491 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2492 uint8_t full[SIZE*(SIZE+5)];\
2493 uint8_t * const full_mid= full + SIZE*2;\
2494 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2495 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2499 uint8_t full[SIZE*(SIZE+5)];\
2500 uint8_t * const full_mid= full + SIZE*2;\
2501 uint8_t half[SIZE*SIZE];\
2502 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2503 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2504 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2507 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2508 uint8_t full[SIZE*(SIZE+5)];\
2509 uint8_t * const full_mid= full + SIZE*2;\
2510 uint8_t halfH[SIZE*SIZE];\
2511 uint8_t halfV[SIZE*SIZE];\
2512 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2513 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2514 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2515 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2518 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2519 uint8_t full[SIZE*(SIZE+5)];\
2520 uint8_t * const full_mid= full + SIZE*2;\
2521 uint8_t halfH[SIZE*SIZE];\
2522 uint8_t halfV[SIZE*SIZE];\
2523 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2524 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2525 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2526 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2529 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2530 uint8_t full[SIZE*(SIZE+5)];\
2531 uint8_t * const full_mid= full + SIZE*2;\
2532 uint8_t halfH[SIZE*SIZE];\
2533 uint8_t halfV[SIZE*SIZE];\
2534 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2535 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2536 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2537 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2540 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2541 uint8_t full[SIZE*(SIZE+5)];\
2542 uint8_t * const full_mid= full + SIZE*2;\
2543 uint8_t halfH[SIZE*SIZE];\
2544 uint8_t halfV[SIZE*SIZE];\
2545 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2546 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2547 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2548 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2551 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2552 int16_t tmp[SIZE*(SIZE+5)];\
2553 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2556 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2557 int16_t tmp[SIZE*(SIZE+5)];\
2558 uint8_t halfH[SIZE*SIZE];\
2559 uint8_t halfHV[SIZE*SIZE];\
2560 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2561 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2562 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2565 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2566 int16_t tmp[SIZE*(SIZE+5)];\
2567 uint8_t halfH[SIZE*SIZE];\
2568 uint8_t halfHV[SIZE*SIZE];\
2569 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2570 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2571 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2574 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2575 uint8_t full[SIZE*(SIZE+5)];\
2576 uint8_t * const full_mid= full + SIZE*2;\
2577 int16_t tmp[SIZE*(SIZE+5)];\
2578 uint8_t halfV[SIZE*SIZE];\
2579 uint8_t halfHV[SIZE*SIZE];\
2580 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2581 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2582 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2583 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2586 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2587 uint8_t full[SIZE*(SIZE+5)];\
2588 uint8_t * const full_mid= full + SIZE*2;\
2589 int16_t tmp[SIZE*(SIZE+5)];\
2590 uint8_t halfV[SIZE*SIZE];\
2591 uint8_t halfHV[SIZE*SIZE];\
2592 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2593 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2594 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2595 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2598 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2599 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2600 #define op_put(a, b) a = cm[((b) + 16)>>5]
2601 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2602 #define op2_put(a, b) a = cm[((b) + 512)>>10]
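/* A single pass has gain 32, hence "+16 >> 5"; the hv path applies two
 * passes to unclipped data for a gain of 32*32 = 1024, hence the
 * "+512 >> 10" in the op2_* variants. */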
2604 H264_LOWPASS(put_ , op_put, op2_put)
2605 H264_LOWPASS(avg_ , op_avg, op2_avg)
2606 H264_MC(put_, 2)
2607 H264_MC(put_, 4)
2608 H264_MC(put_, 8)
2609 H264_MC(put_, 16)
2610 H264_MC(avg_, 4)
2611 H264_MC(avg_, 8)
2612 H264_MC(avg_, 16)
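/* Each H264_MC(OPNAME, SIZE) expands to the 16 quarter-pel variants
 * <op>h264_qpel<SIZE>_mc<x><y>_c; callers typically index them by
 * (mx&3) + 4*(my&3).  The vertical cases copy SIZE+5 source rows
 * (copy_blockSIZE, starting at src - 2*stride) since the 6-tap filter
 * needs two rows above and three below the block.  Only the put_
 * flavour is instantiated for SIZE 2. */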
2614 #undef op_avg
2615 #undef op_put
2616 #undef op2_avg
2617 #undef op2_put
2618 #endif
2620 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2621 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
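/* Explicit weighted prediction for H.264 P/B macroblocks: op_scale1
 * computes clip((pix*weight + rnd) >> log2_denom) with the user offset
 * and the rounding constant folded into rnd (see the offset pre-scaling
 * in weight_h264_pixels*_c below); op_scale2 blends two predictions as
 * clip((src*weights + dst*weightd + rnd) >> (log2_denom+1)). */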
2622 #define H264_WEIGHT(W,H) \
2623 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2624 int y; \
2625 offset <<= log2_denom; \
2626 if(log2_denom) offset += 1<<(log2_denom-1); \
2627 for(y=0; y<H; y++, block += stride){ \
2628 op_scale1(0); \
2629 op_scale1(1); \
2630 if(W==2) continue; \
2631 op_scale1(2); \
2632 op_scale1(3); \
2633 if(W==4) continue; \
2634 op_scale1(4); \
2635 op_scale1(5); \
2636 op_scale1(6); \
2637 op_scale1(7); \
2638 if(W==8) continue; \
2639 op_scale1(8); \
2640 op_scale1(9); \
2641 op_scale1(10); \
2642 op_scale1(11); \
2643 op_scale1(12); \
2644 op_scale1(13); \
2645 op_scale1(14); \
2646 op_scale1(15); \
2649 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2650 int y; \
2651 offset = ((offset + 1) | 1) << log2_denom; \
2652 for(y=0; y<H; y++, dst += stride, src += stride){ \
2653 op_scale2(0); \
2654 op_scale2(1); \
2655 if(W==2) continue; \
2656 op_scale2(2); \
2657 op_scale2(3); \
2658 if(W==4) continue; \
2659 op_scale2(4); \
2660 op_scale2(5); \
2661 op_scale2(6); \
2662 op_scale2(7); \
2663 if(W==8) continue; \
2664 op_scale2(8); \
2665 op_scale2(9); \
2666 op_scale2(10); \
2667 op_scale2(11); \
2668 op_scale2(12); \
2669 op_scale2(13); \
2670 op_scale2(14); \
2671 op_scale2(15); \
2675 H264_WEIGHT(16,16)
2676 H264_WEIGHT(16,8)
2677 H264_WEIGHT(8,16)
2678 H264_WEIGHT(8,8)
2679 H264_WEIGHT(8,4)
2680 H264_WEIGHT(4,8)
2681 H264_WEIGHT(4,4)
2682 H264_WEIGHT(4,2)
2683 H264_WEIGHT(2,4)
2684 H264_WEIGHT(2,2)
2686 #undef op_scale1
2687 #undef op_scale2
2688 #undef H264_WEIGHT
2690 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2691 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2692 int i;
2694 for(i=0; i<h; i++){
2695 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2696 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2697 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2698 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2699 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2700 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2701 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2702 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2703 dst+=dstStride;
2704 src+=srcStride;
2705 }
2706 }
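/* WMV2 ("mspel") half-pel interpolation: a 4-tap (-1,9,9,-1)/16 filter,
 * rounded with +8 before the shift -- shorter than the 6-tap H.264 and
 * the MPEG-4 qpel filters above. */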
2708 #ifdef CONFIG_CAVS_DECODER
2709 /* AVS specific */
2710 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2712 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2713 put_pixels8_c(dst, src, stride, 8);
2714 }
2715 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2716 avg_pixels8_c(dst, src, stride, 8);
2717 }
2718 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2719 put_pixels16_c(dst, src, stride, 16);
2720 }
2721 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2722 avg_pixels16_c(dst, src, stride, 16);
2723 }
2724 #endif /* CONFIG_CAVS_DECODER */
2726 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2727 /* VC-1 specific */
2728 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2730 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2731 put_pixels8_c(dst, src, stride, 8);
2732 }
2733 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2735 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2737 /* H264 specific */
2738 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2740 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2741 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2742 int i;
2744 for(i=0; i<w; i++){
2745 const int src_1= src[ -srcStride];
2746 const int src0 = src[0 ];
2747 const int src1 = src[ srcStride];
2748 const int src2 = src[2*srcStride];
2749 const int src3 = src[3*srcStride];
2750 const int src4 = src[4*srcStride];
2751 const int src5 = src[5*srcStride];
2752 const int src6 = src[6*srcStride];
2753 const int src7 = src[7*srcStride];
2754 const int src8 = src[8*srcStride];
2755 const int src9 = src[9*srcStride];
2756 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2757 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2758 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2759 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2760 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2761 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2762 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2763 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2764 src++;
2765 dst++;
2766 }
2767 }
2769 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2770 put_pixels8_c(dst, src, stride, 8);
2771 }
2773 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2774 uint8_t half[64];
2775 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2776 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2777 }
2779 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2780 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2781 }
2783 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2784 uint8_t half[64];
2785 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2786 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2787 }
2789 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2790 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2791 }
2793 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2794 uint8_t halfH[88];
2795 uint8_t halfV[64];
2796 uint8_t halfHV[64];
2797 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2798 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2799 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2800 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2801 }
2802 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2803 uint8_t halfH[88];
2804 uint8_t halfV[64];
2805 uint8_t halfHV[64];
2806 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2807 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2808 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2809 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2810 }
2811 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2812 uint8_t halfH[88];
2813 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2814 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2815 }
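/* For the 2-D mspel positions the horizontal pass above is run on 11
 * rows starting at src-stride, so the vertical pass (which reads one
 * row above and two rows below its 8 output rows) stays inside
 * halfH[]; halfH+8 is the block's row 0 within that buffer. */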
2817 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2818 if(ENABLE_ANY_H263) {
2819 int x;
2820 const int strength= ff_h263_loop_filter_strength[qscale];
2822 for(x=0; x<8; x++){
2823 int d1, d2, ad1;
2824 int p0= src[x-2*stride];
2825 int p1= src[x-1*stride];
2826 int p2= src[x+0*stride];
2827 int p3= src[x+1*stride];
2828 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2830 if (d<-2*strength) d1= 0;
2831 else if(d<- strength) d1=-2*strength - d;
2832 else if(d< strength) d1= d;
2833 else if(d< 2*strength) d1= 2*strength - d;
2834 else d1= 0;
2836 p1 += d1;
2837 p2 -= d1;
2838 if(p1&256) p1= ~(p1>>31);
2839 if(p2&256) p2= ~(p2>>31);
2841 src[x-1*stride] = p1;
2842 src[x+0*stride] = p2;
2844 ad1= FFABS(d1)>>1;
2846 d2= av_clip((p0-p3)/4, -ad1, ad1);
2848 src[x-2*stride] = p0 - d2;
2849 src[x+ stride] = p3 + d2;
2850 }
2851 }
2852 }
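/* h263_v/h263_h_loop_filter_c implement the H.263 Annex J deblocking
 * filter: d measures the step across the block edge; the piecewise
 * ramp d->d1 applies full correction for small |d| and fades to zero
 * for |d| >= 2*strength so genuine edges are preserved.  p1/p2 are
 * clamped to 0..255 branchlessly (the "p&256" sign trick), and the
 * outer pixels are pulled together by at most |d1|/2. */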
2854 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2855 if(ENABLE_ANY_H263) {
2856 int y;
2857 const int strength= ff_h263_loop_filter_strength[qscale];
2859 for(y=0; y<8; y++){
2860 int d1, d2, ad1;
2861 int p0= src[y*stride-2];
2862 int p1= src[y*stride-1];
2863 int p2= src[y*stride+0];
2864 int p3= src[y*stride+1];
2865 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2867 if (d<-2*strength) d1= 0;
2868 else if(d<- strength) d1=-2*strength - d;
2869 else if(d< strength) d1= d;
2870 else if(d< 2*strength) d1= 2*strength - d;
2871 else d1= 0;
2873 p1 += d1;
2874 p2 -= d1;
2875 if(p1&256) p1= ~(p1>>31);
2876 if(p2&256) p2= ~(p2>>31);
2878 src[y*stride-1] = p1;
2879 src[y*stride+0] = p2;
2881 ad1= FFABS(d1)>>1;
2883 d2= av_clip((p0-p3)/4, -ad1, ad1);
2885 src[y*stride-2] = p0 - d2;
2886 src[y*stride+1] = p3 + d2;
2887 }
2888 }
2889 }
2891 static void h261_loop_filter_c(uint8_t *src, int stride){
2892 int x,y,xy,yz;
2893 int temp[64];
2895 for(x=0; x<8; x++){
2896 temp[x ] = 4*src[x ];
2897 temp[x + 7*8] = 4*src[x + 7*stride];
2898 }
2899 for(y=1; y<7; y++){
2900 for(x=0; x<8; x++){
2901 xy = y * stride + x;
2902 yz = y * 8 + x;
2903 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2904 }
2905 }
2907 for(y=0; y<8; y++){
2908 src[ y*stride] = (temp[ y*8] + 2)>>2;
2909 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2910 for(x=1; x<7; x++){
2911 xy = y * stride + x;
2912 yz = y * 8 + x;
2913 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2914 }
2915 }
2916 }
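/* h261_loop_filter_c is a separable (1,2,1)/4 smoother: the vertical
 * pass stores 4x-scaled sums in temp[] (border rows are only scaled),
 * then the horizontal pass applies (1,2,1) again and renormalizes with
 * "+8 >> 4"; border columns just undo the scaling with "+2 >> 2". */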
2918 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2919 {
2920 int i, d;
2921 for( i = 0; i < 4; i++ ) {
2922 if( tc0[i] < 0 ) {
2923 pix += 4*ystride;
2924 continue;
2925 }
2926 for( d = 0; d < 4; d++ ) {
2927 const int p0 = pix[-1*xstride];
2928 const int p1 = pix[-2*xstride];
2929 const int p2 = pix[-3*xstride];
2930 const int q0 = pix[0];
2931 const int q1 = pix[1*xstride];
2932 const int q2 = pix[2*xstride];
2934 if( FFABS( p0 - q0 ) < alpha &&
2935 FFABS( p1 - p0 ) < beta &&
2936 FFABS( q1 - q0 ) < beta ) {
2938 int tc = tc0[i];
2939 int i_delta;
2941 if( FFABS( p2 - p0 ) < beta ) {
2942 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2943 tc++;
2944 }
2945 if( FFABS( q2 - q0 ) < beta ) {
2946 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2947 tc++;
2948 }
2950 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2951 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2952 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2953 }
2954 pix += ystride;
2955 }
2956 }
2957 }
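/* Normative H.264 in-loop deblocking, bS < 4 case: a line of samples
 * is filtered only if |p0-q0| < alpha and both inner gradients are
 * below beta.  The luma delta is clipped to +-tc, where tc starts at
 * tc0[i] and is incremented once for each side whose p1/q1 sample is
 * also corrected (the |p2-p0| / |q2-q0| < beta tests above). */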
2958 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2959 {
2960 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2961 }
2962 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2963 {
2964 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2965 }
2967 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2968 {
2969 int i, d;
2970 for( i = 0; i < 4; i++ ) {
2971 const int tc = tc0[i];
2972 if( tc <= 0 ) {
2973 pix += 2*ystride;
2974 continue;
2975 }
2976 for( d = 0; d < 2; d++ ) {
2977 const int p0 = pix[-1*xstride];
2978 const int p1 = pix[-2*xstride];
2979 const int q0 = pix[0];
2980 const int q1 = pix[1*xstride];
2982 if( FFABS( p0 - q0 ) < alpha &&
2983 FFABS( p1 - p0 ) < beta &&
2984 FFABS( q1 - q0 ) < beta ) {
2986 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2988 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2989 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
2990 }
2991 pix += ystride;
2992 }
2993 }
2994 }
2995 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2996 {
2997 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2998 }
2999 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3000 {
3001 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3002 }
3004 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3005 {
3006 int d;
3007 for( d = 0; d < 8; d++ ) {
3008 const int p0 = pix[-1*xstride];
3009 const int p1 = pix[-2*xstride];
3010 const int q0 = pix[0];
3011 const int q1 = pix[1*xstride];
3013 if( FFABS( p0 - q0 ) < alpha &&
3014 FFABS( p1 - p0 ) < beta &&
3015 FFABS( q1 - q0 ) < beta ) {
3017 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3018 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3019 }
3020 pix += ystride;
3021 }
3022 }
3023 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3024 {
3025 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3026 }
3027 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3028 {
3029 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3030 }
3032 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3033 {
3034 int s, i;
3036 s = 0;
3037 for(i=0;i<h;i++) {
3038 s += abs(pix1[0] - pix2[0]);
3039 s += abs(pix1[1] - pix2[1]);
3040 s += abs(pix1[2] - pix2[2]);
3041 s += abs(pix1[3] - pix2[3]);
3042 s += abs(pix1[4] - pix2[4]);
3043 s += abs(pix1[5] - pix2[5]);
3044 s += abs(pix1[6] - pix2[6]);
3045 s += abs(pix1[7] - pix2[7]);
3046 s += abs(pix1[8] - pix2[8]);
3047 s += abs(pix1[9] - pix2[9]);
3048 s += abs(pix1[10] - pix2[10]);
3049 s += abs(pix1[11] - pix2[11]);
3050 s += abs(pix1[12] - pix2[12]);
3051 s += abs(pix1[13] - pix2[13]);
3052 s += abs(pix1[14] - pix2[14]);
3053 s += abs(pix1[15] - pix2[15]);
3054 pix1 += line_size;
3055 pix2 += line_size;
3056 }
3057 return s;
3058 }
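/* pix_abs*_c are the C SAD (sum of absolute differences) comparators
 * used by motion estimation.  The _x2/_y2/_xy2 variants below compare
 * against a half-pel reference interpolated on the fly with
 * avg2()/avg4(), the rounding averages defined earlier in this file. */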
3060 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3061 {
3062 int s, i;
3064 s = 0;
3065 for(i=0;i<h;i++) {
3066 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3067 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3068 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3069 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3070 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3071 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3072 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3073 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3074 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3075 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3076 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3077 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3078 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3079 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3080 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3081 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3082 pix1 += line_size;
3083 pix2 += line_size;
3084 }
3085 return s;
3086 }
3088 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3089 {
3090 int s, i;
3091 uint8_t *pix3 = pix2 + line_size;
3093 s = 0;
3094 for(i=0;i<h;i++) {
3095 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3096 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3097 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3098 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3099 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3100 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3101 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3102 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3103 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3104 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3105 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3106 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3107 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3108 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3109 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3110 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3111 pix1 += line_size;
3112 pix2 += line_size;
3113 pix3 += line_size;
3114 }
3115 return s;
3116 }
3118 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3119 {
3120 int s, i;
3121 uint8_t *pix3 = pix2 + line_size;
3123 s = 0;
3124 for(i=0;i<h;i++) {
3125 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3126 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3127 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3128 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3129 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3130 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3131 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3132 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3133 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3134 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3135 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3136 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3137 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3138 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3139 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3140 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3141 pix1 += line_size;
3142 pix2 += line_size;
3143 pix3 += line_size;
3144 }
3145 return s;
3146 }
3148 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3149 {
3150 int s, i;
3152 s = 0;
3153 for(i=0;i<h;i++) {
3154 s += abs(pix1[0] - pix2[0]);
3155 s += abs(pix1[1] - pix2[1]);
3156 s += abs(pix1[2] - pix2[2]);
3157 s += abs(pix1[3] - pix2[3]);
3158 s += abs(pix1[4] - pix2[4]);
3159 s += abs(pix1[5] - pix2[5]);
3160 s += abs(pix1[6] - pix2[6]);
3161 s += abs(pix1[7] - pix2[7]);
3162 pix1 += line_size;
3163 pix2 += line_size;
3164 }
3165 return s;
3166 }
3168 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3169 {
3170 int s, i;
3172 s = 0;
3173 for(i=0;i<h;i++) {
3174 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3175 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3176 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3177 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3178 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3179 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3180 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3181 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3182 pix1 += line_size;
3183 pix2 += line_size;
3184 }
3185 return s;
3186 }
3188 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3189 {
3190 int s, i;
3191 uint8_t *pix3 = pix2 + line_size;
3193 s = 0;
3194 for(i=0;i<h;i++) {
3195 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3196 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3197 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3198 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3199 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3200 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3201 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3202 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3203 pix1 += line_size;
3204 pix2 += line_size;
3205 pix3 += line_size;
3207 return s;
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
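
/* Note on the *_x2/_y2/_xy2 SAD variants above: they compare pix1 against a
 * half-pel interpolation of pix2, built with the rounding averages
 * avg2(a,b) = (a+b+1)>>1 and avg4(a,b,c,d) = (a+b+c+d+2)>>2. A minimal
 * sketch of the idea on one pixel (illustrative only, not part of the build):
 *
 *     int hpel = (pix2[x] + pix2[x+1] + 1) >> 1;   // horizontal half-pel, avg2
 *     int sad  = abs(pix1[x] - hpel);
 *
 * e.g. avg2(10, 13) = (10+13+1)>>1 = 12, so the +1 rounds .5 upward.
 */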
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
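
/* The nsse* functions above implement a "noise preserving" SSE metric: on
 * top of the plain sum of squared errors (score1) they accumulate, per 2x2
 * neighbourhood, the difference between the amounts of local detail in the
 * two blocks (score2, a second-order gradient term), then weight it with
 * avctx->nsse_weight. Roughly (a sketch, not exact notation):
 *
 *     nsse ~= sum (s1-s2)^2  +  weight * | sum (|grad s1| - |grad s2|) |
 *
 * so a candidate that smooths texture away is penalized even when its
 * per-pixel error is small; 8 is the fallback weight when no context is
 * available.
 */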
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
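
/* try_8x8basis/add_8x8basis scale a basis vector from BASIS_SHIFT
 * fixed-point precision down to RECON_SHIFT precision using the usual
 * round-to-nearest pattern, shown here on small numbers (illustrative):
 *
 *     // round x>>n to nearest instead of toward -infinity:
 *     //   (x + (1<<(n-1))) >> n
 *     // e.g. n=4: (23 + 8) >> 4 = 1,  (25 + 8) >> 4 = 2
 */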
/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
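
/* Usage sketch for ff_block_permute() (hypothetical call, not from this
 * file): after quantization, coefficients can be moved into the order a
 * particular IDCT expects, touching only the nonzero part of the block:
 *
 *     // s is an MpegEncContext; block holds coefficients up to index 'last'
 *     ff_block_permute(block, s->dsp.idct_permutation,
 *                      s->intra_scantable.scantable, last);
 */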
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
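
/* Usage sketch (hypothetical values): ff_set_cmp() fills an array of five
 * size-indexed slots with the comparison family chosen by the low byte of
 * 'type', e.g.
 *
 *     me_cmp_func mecf[5];
 *     ff_set_cmp(&s->dsp, mecf, FF_CMP_SATD);     // Hadamard-based metric
 *     score = mecf[0](s, ref, cur, stride, 16);   // slot 0: 16-wide blocks
 */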
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ // (long) cast keeps the comparison signed when w < sizeof(long)
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
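
/* The expression ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80) above adds all the
 * bytes of two machine words at once without letting carries cross byte
 * boundaries (a SWAR trick): the low 7 bits of each byte are added normally,
 * and each byte's top bit is recomputed as the XOR of the operands' top bits
 * with the carry out of bit 6. One-byte worked example (illustrative):
 *
 *     a=0xFF, b=0x01:
 *       (a&0x7f)+(b&0x7f) = 0x7f+0x01 = 0x80
 *       (a^b)&0x80        = 0xFE&0x80 = 0x80
 *       0x80 ^ 0x80       = 0x00      == (0xFF+0x01)&0xFF
 */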
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ // signed comparison, see add_bytes_c
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ // signed comparison, see add_bytes_c
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
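
/* sub_hfyu_median_prediction_c() computes the HuffYUV-style median
 * predictor: each output is the new sample minus
 * mid_pred(left, top, left+top-topleft), i.e. the median of the three.
 * Small worked example (illustrative): left=10, top=14, topleft=2 gives the
 * gradient guess 10+14-2=22, and median(10,14,22)=14, so a sample of 15 is
 * coded as the residual 15-14=1.
 */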
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
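
/* BUTTERFLY2/BUTTERFLY1 are the 2-point transform at the heart of the 8x8
 * Hadamard transforms below: (x,y) -> (x+y, x-y). Applying it twice scales
 * by 2 rather than returning the input, which is why the final BUTTERFLYA
 * stage just sums magnitudes instead of undoing the scale.
 * Tiny example (illustrative): x=3, y=5 -> (8,-2) -> (6,10) = 2*(3,5).
 */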
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#ifdef CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
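
/* Sanity check for DCT8_1D (illustrative): for a constant row SRC(0..7)=k
 * all the d* terms vanish and s07=s16=s25=s34=2k, so a0=a1=4k and the macro
 * yields DST(0)=8k with every AC output zero -- the expected DC-only
 * response of this H.264-style integer 8x8 transform.
 */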
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
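
/* The return value above is a classic rate-distortion cost D + lambda*R:
 * 'distortion' is the SSE against the reconstruction and the bit count is
 * scaled by qscale^2*109/128, i.e. lambda is approximated as roughly
 * 0.85*qscale^2 (the +64 makes the >>7 round to nearest).
 */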
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
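
/* Usage sketch for the float vector ops (hypothetical buffers): a typical
 * audio overlap-add windowing step can be written as
 *
 *     // out[i] = cur[i]*win[i] + saved[i] + bias, written with stride 1
 *     dsp.vector_fmul_add_add(out, cur, win, saved, bias, len, 1);
 *
 * while vector_fmul_reverse() applies the same window back to front for the
 * second half of a frame.
 */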
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((const int32_t*)src)[i];
        if(tmp & 0xf0000){
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
//          if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//          else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
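
/* ff_float_to_int16_c() works on the raw IEEE-754 bit pattern: it appears
 * to assume the decoder produced samples biased into [384.0, 386.0) (bias
 * 385.0, scale 1/32768), where the low 16 mantissa bits directly hold the
 * unsigned 16-bit sample. If bits 16-19 are set the value is out of range
 * and (0x43c0ffff - tmp)>>31 clamps it to 0x0000 or 0xffff before the
 * -0x8000 re-centering. A plain-float reference under that assumption
 * (illustrative only):
 *
 *     // int16_t ref ~= av_clip((int)((f - 385.0f) * 32768.0f), -32768, 32767);
 */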
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
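
/* The W* constants above are cos(k*pi/16) scaled by 2048*sqrt(2) and rounded
 * to integers, e.g. W1 = 2048*1.414214*0.980785 ~= 2840.8 -> 2841; the >>8
 * and >>14 shifts in the row and column passes below remove this fixed-point
 * scaling again.
 */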
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these wrappers should be removed as soon as all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
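
/* ff_cropTbl is laid out as [0...0 | 0..255 | 255...255] so that
 * cm = ff_cropTbl + MAX_NEG_CROP gives a branch-free clamp-to-uint8 lookup,
 * as used by the idct1 wrappers above. Sketch (illustrative):
 *
 *     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 *     // cm[-5] == 0,  cm[300] == 255,  cm[77] == 77
 */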
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct    = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* pix_abs[0] handles 16-pixel-wide blocks, pix_abs[1] 8-pixel-wide ones */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
    c->draw_edges = draw_edges_c;

#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#ifdef CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif
    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#ifdef CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");