Remove table that was forgotten in the split.
[FFMpeg-mirror/ordered_chapters.git] / libavcodec / dsputil.c
blobe1f2eda76ae68bf4ddc31231d6e8f6296256b824
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 /**
26 * @file libavcodec/dsputil.c
27 * DSP utils
30 #include "avcodec.h"
31 #include "dsputil.h"
32 #include "simple_idct.h"
33 #include "faandct.h"
34 #include "faanidct.h"
35 #include "mathops.h"
36 #include "h263.h"
37 #include "snow.h"
39 /* snow.c */
40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42 /* vorbis.c */
43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45 /* ac3dec.c */
46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
48 /* flacenc.c */
49 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
51 /* pngdec.c */
52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
54 /* eaidct.c */
55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
57 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
58 uint32_t ff_squareTbl[512] = {0, };
60 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
61 #define pb_7f (~0UL/255 * 0x7f)
62 #define pb_80 (~0UL/255 * 0x80)
/* Zigzag scan order table: a permutation of 0..63, eight entries per row. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/*
 * Zigzag scan used with the 248 IDCT.
 * NOTE: unlike the specification, the fields are interleaved here.
 * The table is a permutation of 0..63.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
88 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
89 DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);
/* Alternate horizontal scan order table: a permutation of 0..63. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order table: a permutation of 0..63. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/*
 * Fixed-point reciprocals:
 *   (a * ff_inverse[b]) >> 32 == a / b   for all 0 <= a <= 65536, 2 <= b <= 255
 * (the product has to be formed in 64 bits).
 */
const uint32_t ff_inverse[256] = {
             0, 4294967295U, 2147483648U,  1431655766,  1073741824,   858993460,   715827883,   613566757,
     536870912,   477218589,   429496730,   390451573,   357913942,   330382100,   306783379,   286331154,
     268435456,   252645136,   238609295,   226050911,   214748365,   204522253,   195225787,   186737709,
     178956971,   171798692,   165191050,   159072863,   153391690,   148102321,   143165577,   138547333,
     134217728,   130150525,   126322568,   122713352,   119304648,   116080198,   113025456,   110127367,
     107374183,   104755300,   102261127,    99882961,    97612894,    95443718,    93368855,    91382283,
      89478486,    87652394,    85899346,    84215046,    82595525,    81037119,    79536432,    78090315,
      76695845,    75350304,    74051161,    72796056,    71582789,    70409300,    69273667,    68174085,
      67108864,    66076420,    65075263,    64103990,    63161284,    62245903,    61356676,    60492498,
      59652324,    58835169,    58040099,    57266231,    56512728,    55778797,    55063684,    54366675,
      53687092,    53024288,    52377650,    51746594,    51130564,    50529028,    49941481,    49367441,
      48806447,    48258060,    47721859,    47197443,    46684428,    46182445,    45691142,    45210183,
      44739243,    44278014,    43826197,    43383509,    42949673,    42524429,    42107523,    41698712,
      41297763,    40904451,    40518560,    40139882,    39768216,    39403370,    39045158,    38693400,
      38347923,    38008561,    37675152,    37347542,    37025581,    36709123,    36398028,    36092163,
      35791395,    35495598,    35204650,    34918434,    34636834,    34359739,    34087043,    33818641,
      33554432,    33294321,    33038210,    32786010,    32537632,    32292988,    32051995,    31814573,
      31580642,    31350127,    31122952,    30899046,    30678338,    30460761,    30246249,    30034737,
      29826162,    29620465,    29417585,    29217465,    29020050,    28825284,    28633116,    28443493,
      28256364,    28071682,    27889399,    27709467,    27531842,    27356480,    27183338,    27012373,
      26843546,    26676816,    26512144,    26349493,    26188825,    26030105,    25873297,    25718368,
      25565282,    25414008,    25264514,    25116768,    24970741,    24826401,    24683721,    24542671,
      24403224,    24265352,    24129030,    23994231,    23860930,    23729102,    23598722,    23469767,
      23342214,    23216040,    23091223,    22967740,    22845571,    22724695,    22605092,    22486740,
      22369622,    22253717,    22139007,    22025474,    21913099,    21801865,    21691755,    21582751,
      21474837,    21367997,    21262215,    21157475,    21053762,    20951060,    20849356,    20748635,
      20648882,    20550083,    20452226,    20355296,    20259280,    20164166,    20069941,    19976593,
      19884108,    19792477,    19701685,    19611723,    19522579,    19434242,    19346700,    19259944,
      19173962,    19088744,    19004281,    18920561,    18837576,    18755316,    18673771,    18592933,
      18512791,    18433337,    18354562,    18276457,    18199014,    18122225,    18046082,    17970575,
      17895698,    17821442,    17747799,    17674763,    17602325,    17530479,    17459217,    17388532,
      17318417,    17248865,    17179870,    17111424,    17043522,    16976156,    16909321,    16843010,
};
/* Input permutation for the simple_idct_mmx (a permutation of 0x00..0x3F). */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Eight-entry row permutation (0..7, each once); named for the SSE2 IDCT. */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5, 2, 6, 3, 7
};
163 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
164 int i;
165 int end;
167 st->scantable= src_scantable;
169 for(i=0; i<64; i++){
170 int j;
171 j = src_scantable[i];
172 st->permutated[i] = permutation[j];
173 #if ARCH_PPC
174 st->inverse[j] = i;
175 #endif
178 end=-1;
179 for(i=0; i<64; i++){
180 int j;
181 j = st->permutated[i];
182 if(j>end) end=j;
183 st->raster_end[i]= end;
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
209 static int pix_norm1_c(uint8_t * pix, int line_size)
211 int s, i, j;
212 uint32_t *sq = ff_squareTbl + 256;
214 s = 0;
215 for (i = 0; i < 16; i++) {
216 for (j = 0; j < 16; j += 8) {
217 #if 0
218 s += sq[pix[0]];
219 s += sq[pix[1]];
220 s += sq[pix[2]];
221 s += sq[pix[3]];
222 s += sq[pix[4]];
223 s += sq[pix[5]];
224 s += sq[pix[6]];
225 s += sq[pix[7]];
226 #else
227 #if LONG_MAX > 2147483647
228 register uint64_t x=*(uint64_t*)pix;
229 s += sq[x&0xff];
230 s += sq[(x>>8)&0xff];
231 s += sq[(x>>16)&0xff];
232 s += sq[(x>>24)&0xff];
233 s += sq[(x>>32)&0xff];
234 s += sq[(x>>40)&0xff];
235 s += sq[(x>>48)&0xff];
236 s += sq[(x>>56)&0xff];
237 #else
238 register uint32_t x=*(uint32_t*)pix;
239 s += sq[x&0xff];
240 s += sq[(x>>8)&0xff];
241 s += sq[(x>>16)&0xff];
242 s += sq[(x>>24)&0xff];
243 x=*(uint32_t*)(pix+4);
244 s += sq[x&0xff];
245 s += sq[(x>>8)&0xff];
246 s += sq[(x>>16)&0xff];
247 s += sq[(x>>24)&0xff];
248 #endif
249 #endif
250 pix += 8;
252 pix += line_size - 16;
254 return s;
/**
 * Apply bswap_32() to w 32-bit words, in ascending order.
 *
 * @param dst destination words
 * @param src source words
 * @param w   number of 32-bit words; nothing is written when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int n;

    for (n = 0; n < w; n++)
        dst[n] = bswap_32(src[n]);
}
275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
277 int s, i;
278 uint32_t *sq = ff_squareTbl + 256;
280 s = 0;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
286 pix1 += line_size;
287 pix2 += line_size;
289 return s;
292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
294 int s, i;
295 uint32_t *sq = ff_squareTbl + 256;
297 s = 0;
298 for (i = 0; i < h; i++) {
299 s += sq[pix1[0] - pix2[0]];
300 s += sq[pix1[1] - pix2[1]];
301 s += sq[pix1[2] - pix2[2]];
302 s += sq[pix1[3] - pix2[3]];
303 s += sq[pix1[4] - pix2[4]];
304 s += sq[pix1[5] - pix2[5]];
305 s += sq[pix1[6] - pix2[6]];
306 s += sq[pix1[7] - pix2[7]];
307 pix1 += line_size;
308 pix2 += line_size;
310 return s;
313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
315 int s, i;
316 uint32_t *sq = ff_squareTbl + 256;
318 s = 0;
319 for (i = 0; i < h; i++) {
320 s += sq[pix1[ 0] - pix2[ 0]];
321 s += sq[pix1[ 1] - pix2[ 1]];
322 s += sq[pix1[ 2] - pix2[ 2]];
323 s += sq[pix1[ 3] - pix2[ 3]];
324 s += sq[pix1[ 4] - pix2[ 4]];
325 s += sq[pix1[ 5] - pix2[ 5]];
326 s += sq[pix1[ 6] - pix2[ 6]];
327 s += sq[pix1[ 7] - pix2[ 7]];
328 s += sq[pix1[ 8] - pix2[ 8]];
329 s += sq[pix1[ 9] - pix2[ 9]];
330 s += sq[pix1[10] - pix2[10]];
331 s += sq[pix1[11] - pix2[11]];
332 s += sq[pix1[12] - pix2[12]];
333 s += sq[pix1[13] - pix2[13]];
334 s += sq[pix1[14] - pix2[14]];
335 s += sq[pix1[15] - pix2[15]];
337 pix1 += line_size;
338 pix2 += line_size;
340 return s;
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain comparison of two square blocks.
 *
 * The per-sample difference, scaled by 16, is written to a 32-wide scratch
 * buffer and transformed by ff_spatial_dwt(). The absolute values of the
 * coefficients are then summed, each weighted by a per-subband factor from
 * the scale table.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size row stride in bytes, shared by both blocks
 * @param w         block width; 8 selects 3 decomposition levels, anything
 *                  else 4. Must equal h and be at most 32 (scratch size).
 * @param h         block height
 * @param type      wavelet selector, passed to ff_spatial_dwt() and used as
 *                  first index of the scale table (0: 9/7, 1: 5/3)
 * @return weighted sum of absolute coefficients, shifted right by 9
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            /* Multiply instead of "<<4": the difference can be negative and
               left-shifting a negative int is undefined behaviour. */
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])*16;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])*16;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])*16;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])*16;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        /* orientation 0 is only visited at level 0 */
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    /* named "coef" so it no longer shadows parameter v */
                    int coef= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(coef);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

/* 8x8 comparison, type 1 (5/3 wavelet) */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

/* 8x8 comparison, type 0 (9/7 wavelet) */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

/* 16x16 comparison, type 1 (5/3 wavelet) */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

/* 16x16 comparison, type 0 (9/7 wavelet) */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

/* 32x32 comparison, type 1 (5/3 wavelet); has external linkage */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

/* 32x32 comparison, type 0 (9/7 wavelet); has external linkage */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
/**
 * Draw the edges of width 'w' around an image of size width x height by
 * replicating its outermost samples.
 *
 * FIXME check that this is ok for mpeg4 interlaced
 *
 * @param buf    top-left sample of the image; w rows above and below it and
 *               w bytes left and right of every row are written
 * @param wrap   distance in bytes between the starts of two rows
 * @param width  image width in samples
 * @param height image height in rows
 * @param w      edge width
 */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *const bottom = buf + (height - 1) * wrap;
    uint8_t *above, *below, *row;
    int n;

    /* top and bottom: repeat the first row upwards, the last row downwards */
    above = buf;
    below = bottom;
    for (n = 0; n < w; n++) {
        above -= wrap;
        below += wrap;
        memcpy(above, buf,    width);
        memcpy(below, bottom, width);
    }

    /* left and right: repeat the first / last sample of every row */
    row = buf;
    for (n = 0; n < height; n++) {
        memset(row - w,     row[0],         w);
        memset(row + width, row[width - 1], w);
        row += wrap;
    }

    /* corners: filled with the image's four corner samples */
    above = buf;
    below = bottom;
    for (n = 0; n < w; n++) {
        above -= wrap;
        below += wrap;
        memset(above - w,     buf[0],            w); /* top left */
        memset(above + width, buf[width - 1],    w); /* top right */
        memset(below - w,     bottom[0],         w); /* bottom left */
        memset(below + width, bottom[width - 1], w); /* bottom right */
    }
}
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int row, col;
    int first_row, first_col, last_row, last_col;

    /* A block lying completely outside the picture is moved so that it
       overlaps the picture by exactly one row / column. */
    if (src_y >= h) {
        src  += (h - 1 - src_y) * linesize;
        src_y = h - 1;
    } else if (src_y <= -block_h) {
        src  += (1 - block_h - src_y) * linesize;
        src_y = 1 - block_h;
    }
    if (src_x >= w) {
        src  += (w - 1 - src_x);
        src_x = w - 1;
    } else if (src_x <= -block_w) {
        src  += (1 - block_w - src_x);
        src_x = 1 - block_w;
    }

    /* part of the block that lies inside the picture, in block coordinates */
    first_row = src_y < 0 ? -src_y : 0;
    first_col = src_x < 0 ? -src_x : 0;
    last_row  = h - src_y < block_h ? h - src_y : block_h;
    last_col  = w - src_x < block_w ? w - src_x : block_w;

    // copy existing part
    for (row = first_row; row < last_row; row++) {
        uint8_t       *d = buf + row * linesize;
        const uint8_t *s = src + row * linesize;
        for (col = first_col; col < last_col; col++)
            d[col] = s[col];
    }

    //top
    for (row = 0; row < first_row; row++) {
        uint8_t       *d = buf + row * linesize;
        const uint8_t *s = buf + first_row * linesize;
        for (col = first_col; col < last_col; col++)
            d[col] = s[col];
    }

    //bottom
    for (row = last_row; row < block_h; row++) {
        uint8_t       *d = buf + row * linesize;
        const uint8_t *s = buf + (last_row - 1) * linesize;
        for (col = first_col; col < last_col; col++)
            d[col] = s[col];
    }

    for (row = 0; row < block_h; row++) {
        uint8_t *d = buf + row * linesize;

        //left
        for (col = 0; col < first_col; col++)
            d[col] = d[first_col];

        //right
        for (col = last_col; col < block_w; col++)
            d[col] = d[last_col - 1];
    }
}
539 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
541 int i;
543 /* read the pixels */
544 for(i=0;i<8;i++) {
545 block[0] = pixels[0];
546 block[1] = pixels[1];
547 block[2] = pixels[2];
548 block[3] = pixels[3];
549 block[4] = pixels[4];
550 block[5] = pixels[5];
551 block[6] = pixels[6];
552 block[7] = pixels[7];
553 pixels += line_size;
554 block += 8;
558 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
559 const uint8_t *s2, int stride){
560 int i;
562 /* read the pixels */
563 for(i=0;i<8;i++) {
564 block[0] = s1[0] - s2[0];
565 block[1] = s1[1] - s2[1];
566 block[2] = s1[2] - s2[2];
567 block[3] = s1[3] - s2[3];
568 block[4] = s1[4] - s2[4];
569 block[5] = s1[5] - s2[5];
570 block[6] = s1[6] - s2[6];
571 block[7] = s1[7] - s2[7];
572 s1 += stride;
573 s2 += stride;
574 block += 8;
579 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
580 int line_size)
582 int i;
583 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
585 /* read the pixels */
586 for(i=0;i<8;i++) {
587 pixels[0] = cm[block[0]];
588 pixels[1] = cm[block[1]];
589 pixels[2] = cm[block[2]];
590 pixels[3] = cm[block[3]];
591 pixels[4] = cm[block[4]];
592 pixels[5] = cm[block[5]];
593 pixels[6] = cm[block[6]];
594 pixels[7] = cm[block[7]];
596 pixels += line_size;
597 block += 8;
601 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
602 int line_size)
604 int i;
605 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
607 /* read the pixels */
608 for(i=0;i<4;i++) {
609 pixels[0] = cm[block[0]];
610 pixels[1] = cm[block[1]];
611 pixels[2] = cm[block[2]];
612 pixels[3] = cm[block[3]];
614 pixels += line_size;
615 block += 8;
619 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
620 int line_size)
622 int i;
623 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
625 /* read the pixels */
626 for(i=0;i<2;i++) {
627 pixels[0] = cm[block[0]];
628 pixels[1] = cm[block[1]];
630 pixels += line_size;
631 block += 8;
635 static void put_signed_pixels_clamped_c(const DCTELEM *block,
636 uint8_t *restrict pixels,
637 int line_size)
639 int i, j;
641 for (i = 0; i < 8; i++) {
642 for (j = 0; j < 8; j++) {
643 if (*block < -128)
644 *pixels = 0;
645 else if (*block > 127)
646 *pixels = 255;
647 else
648 *pixels = (uint8_t)(*block + 128);
649 block++;
650 pixels++;
652 pixels += (line_size - 8);
656 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
657 int line_size)
659 int i;
660 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
662 /* read the pixels */
663 for(i=0;i<8;i++) {
664 pixels[0] = cm[pixels[0] + block[0]];
665 pixels[1] = cm[pixels[1] + block[1]];
666 pixels[2] = cm[pixels[2] + block[2]];
667 pixels[3] = cm[pixels[3] + block[3]];
668 pixels[4] = cm[pixels[4] + block[4]];
669 pixels[5] = cm[pixels[5] + block[5]];
670 pixels[6] = cm[pixels[6] + block[6]];
671 pixels[7] = cm[pixels[7] + block[7]];
672 pixels += line_size;
673 block += 8;
677 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
678 int line_size)
680 int i;
681 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
683 /* read the pixels */
684 for(i=0;i<4;i++) {
685 pixels[0] = cm[pixels[0] + block[0]];
686 pixels[1] = cm[pixels[1] + block[1]];
687 pixels[2] = cm[pixels[2] + block[2]];
688 pixels[3] = cm[pixels[3] + block[3]];
689 pixels += line_size;
690 block += 8;
694 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
695 int line_size)
697 int i;
698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
700 /* read the pixels */
701 for(i=0;i<2;i++) {
702 pixels[0] = cm[pixels[0] + block[0]];
703 pixels[1] = cm[pixels[1] + block[1]];
704 pixels += line_size;
705 block += 8;
709 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
711 int i;
712 for(i=0;i<8;i++) {
713 pixels[0] += block[0];
714 pixels[1] += block[1];
715 pixels[2] += block[2];
716 pixels[3] += block[3];
717 pixels[4] += block[4];
718 pixels[5] += block[5];
719 pixels[6] += block[6];
720 pixels[7] += block[7];
721 pixels += line_size;
722 block += 8;
726 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
728 int i;
729 for(i=0;i<4;i++) {
730 pixels[0] += block[0];
731 pixels[1] += block[1];
732 pixels[2] += block[2];
733 pixels[3] += block[3];
734 pixels += line_size;
735 block += 4;
739 static int sum_abs_dctelem_c(DCTELEM *block)
741 int sum=0, i;
742 for(i=0; i<64; i++)
743 sum+= FFABS(block[i]);
744 return sum;
747 #if 0
749 #define PIXOP2(OPNAME, OP) \
750 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
752 int i;\
753 for(i=0; i<h; i++){\
754 OP(*((uint64_t*)block), AV_RN64(pixels));\
755 pixels+=line_size;\
756 block +=line_size;\
760 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
762 int i;\
763 for(i=0; i<h; i++){\
764 const uint64_t a= AV_RN64(pixels );\
765 const uint64_t b= AV_RN64(pixels+1);\
766 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
767 pixels+=line_size;\
768 block +=line_size;\
772 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
774 int i;\
775 for(i=0; i<h; i++){\
776 const uint64_t a= AV_RN64(pixels );\
777 const uint64_t b= AV_RN64(pixels+1);\
778 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
779 pixels+=line_size;\
780 block +=line_size;\
784 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
786 int i;\
787 for(i=0; i<h; i++){\
788 const uint64_t a= AV_RN64(pixels );\
789 const uint64_t b= AV_RN64(pixels+line_size);\
790 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
791 pixels+=line_size;\
792 block +=line_size;\
796 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
798 int i;\
799 for(i=0; i<h; i++){\
800 const uint64_t a= AV_RN64(pixels );\
801 const uint64_t b= AV_RN64(pixels+line_size);\
802 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
803 pixels+=line_size;\
804 block +=line_size;\
808 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
810 int i;\
811 const uint64_t a= AV_RN64(pixels );\
812 const uint64_t b= AV_RN64(pixels+1);\
813 uint64_t l0= (a&0x0303030303030303ULL)\
814 + (b&0x0303030303030303ULL)\
815 + 0x0202020202020202ULL;\
816 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
817 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
818 uint64_t l1,h1;\
820 pixels+=line_size;\
821 for(i=0; i<h; i+=2){\
822 uint64_t a= AV_RN64(pixels );\
823 uint64_t b= AV_RN64(pixels+1);\
824 l1= (a&0x0303030303030303ULL)\
825 + (b&0x0303030303030303ULL);\
826 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
827 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
828 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
829 pixels+=line_size;\
830 block +=line_size;\
831 a= AV_RN64(pixels );\
832 b= AV_RN64(pixels+1);\
833 l0= (a&0x0303030303030303ULL)\
834 + (b&0x0303030303030303ULL)\
835 + 0x0202020202020202ULL;\
836 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
837 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
838 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
839 pixels+=line_size;\
840 block +=line_size;\
844 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
846 int i;\
847 const uint64_t a= AV_RN64(pixels );\
848 const uint64_t b= AV_RN64(pixels+1);\
849 uint64_t l0= (a&0x0303030303030303ULL)\
850 + (b&0x0303030303030303ULL)\
851 + 0x0101010101010101ULL;\
852 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
853 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
854 uint64_t l1,h1;\
856 pixels+=line_size;\
857 for(i=0; i<h; i+=2){\
858 uint64_t a= AV_RN64(pixels );\
859 uint64_t b= AV_RN64(pixels+1);\
860 l1= (a&0x0303030303030303ULL)\
861 + (b&0x0303030303030303ULL);\
862 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
863 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
864 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
865 pixels+=line_size;\
866 block +=line_size;\
867 a= AV_RN64(pixels );\
868 b= AV_RN64(pixels+1);\
869 l0= (a&0x0303030303030303ULL)\
870 + (b&0x0303030303030303ULL)\
871 + 0x0101010101010101ULL;\
872 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
873 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
874 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
875 pixels+=line_size;\
876 block +=line_size;\
880 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
881 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
882 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
883 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
885 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
886 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
888 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
889 #else // 64 bit variant
891 #define PIXOP2(OPNAME, OP) \
892 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
893 int i;\
894 for(i=0; i<h; i++){\
895 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
896 pixels+=line_size;\
897 block +=line_size;\
900 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
901 int i;\
902 for(i=0; i<h; i++){\
903 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
904 pixels+=line_size;\
905 block +=line_size;\
908 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
909 int i;\
910 for(i=0; i<h; i++){\
911 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
912 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
913 pixels+=line_size;\
914 block +=line_size;\
917 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
918 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
921 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
922 int src_stride1, int src_stride2, int h){\
923 int i;\
924 for(i=0; i<h; i++){\
925 uint32_t a,b;\
926 a= AV_RN32(&src1[i*src_stride1 ]);\
927 b= AV_RN32(&src2[i*src_stride2 ]);\
928 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
929 a= AV_RN32(&src1[i*src_stride1+4]);\
930 b= AV_RN32(&src2[i*src_stride2+4]);\
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
935 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
936 int src_stride1, int src_stride2, int h){\
937 int i;\
938 for(i=0; i<h; i++){\
939 uint32_t a,b;\
940 a= AV_RN32(&src1[i*src_stride1 ]);\
941 b= AV_RN32(&src2[i*src_stride2 ]);\
942 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
943 a= AV_RN32(&src1[i*src_stride1+4]);\
944 b= AV_RN32(&src2[i*src_stride2+4]);\
945 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
949 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
950 int src_stride1, int src_stride2, int h){\
951 int i;\
952 for(i=0; i<h; i++){\
953 uint32_t a,b;\
954 a= AV_RN32(&src1[i*src_stride1 ]);\
955 b= AV_RN32(&src2[i*src_stride2 ]);\
956 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
960 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
961 int src_stride1, int src_stride2, int h){\
962 int i;\
963 for(i=0; i<h; i++){\
964 uint32_t a,b;\
965 a= AV_RN16(&src1[i*src_stride1 ]);\
966 b= AV_RN16(&src2[i*src_stride2 ]);\
967 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
971 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
972 int src_stride1, int src_stride2, int h){\
973 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
974 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
977 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
978 int src_stride1, int src_stride2, int h){\
979 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
980 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
983 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
984 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
987 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
988 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
991 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
992 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
995 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
996 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
999 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1000 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1001 int i;\
1002 for(i=0; i<h; i++){\
1003 uint32_t a, b, c, d, l0, l1, h0, h1;\
1004 a= AV_RN32(&src1[i*src_stride1]);\
1005 b= AV_RN32(&src2[i*src_stride2]);\
1006 c= AV_RN32(&src3[i*src_stride3]);\
1007 d= AV_RN32(&src4[i*src_stride4]);\
1008 l0= (a&0x03030303UL)\
1009 + (b&0x03030303UL)\
1010 + 0x02020202UL;\
1011 h0= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 l1= (c&0x03030303UL)\
1014 + (d&0x03030303UL);\
1015 h1= ((c&0xFCFCFCFCUL)>>2)\
1016 + ((d&0xFCFCFCFCUL)>>2);\
1017 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018 a= AV_RN32(&src1[i*src_stride1+4]);\
1019 b= AV_RN32(&src2[i*src_stride2+4]);\
1020 c= AV_RN32(&src3[i*src_stride3+4]);\
1021 d= AV_RN32(&src4[i*src_stride4+4]);\
1022 l0= (a&0x03030303UL)\
1023 + (b&0x03030303UL)\
1024 + 0x02020202UL;\
1025 h0= ((a&0xFCFCFCFCUL)>>2)\
1026 + ((b&0xFCFCFCFCUL)>>2);\
1027 l1= (c&0x03030303UL)\
1028 + (d&0x03030303UL);\
1029 h1= ((c&0xFCFCFCFCUL)>>2)\
1030 + ((d&0xFCFCFCFCUL)>>2);\
1031 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1036 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1039 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1040 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1043 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1044 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1047 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1048 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1051 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1052 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1053 int i;\
1054 for(i=0; i<h; i++){\
1055 uint32_t a, b, c, d, l0, l1, h0, h1;\
1056 a= AV_RN32(&src1[i*src_stride1]);\
1057 b= AV_RN32(&src2[i*src_stride2]);\
1058 c= AV_RN32(&src3[i*src_stride3]);\
1059 d= AV_RN32(&src4[i*src_stride4]);\
1060 l0= (a&0x03030303UL)\
1061 + (b&0x03030303UL)\
1062 + 0x01010101UL;\
1063 h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1065 l1= (c&0x03030303UL)\
1066 + (d&0x03030303UL);\
1067 h1= ((c&0xFCFCFCFCUL)>>2)\
1068 + ((d&0xFCFCFCFCUL)>>2);\
1069 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1070 a= AV_RN32(&src1[i*src_stride1+4]);\
1071 b= AV_RN32(&src2[i*src_stride2+4]);\
1072 c= AV_RN32(&src3[i*src_stride3+4]);\
1073 d= AV_RN32(&src4[i*src_stride4+4]);\
1074 l0= (a&0x03030303UL)\
1075 + (b&0x03030303UL)\
1076 + 0x01010101UL;\
1077 h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1079 l1= (c&0x03030303UL)\
1080 + (d&0x03030303UL);\
1081 h1= ((c&0xFCFCFCFCUL)>>2)\
1082 + ((d&0xFCFCFCFCUL)>>2);\
1083 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1087 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1088 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1089 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1091 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1092 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1093 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1097 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1099 int i, a0, b0, a1, b1;\
1100 a0= pixels[0];\
1101 b0= pixels[1] + 2;\
1102 a0 += b0;\
1103 b0 += pixels[2];\
1105 pixels+=line_size;\
1106 for(i=0; i<h; i+=2){\
1107 a1= pixels[0];\
1108 b1= pixels[1];\
1109 a1 += b1;\
1110 b1 += pixels[2];\
1112 block[0]= (a1+a0)>>2; /* FIXME non put */\
1113 block[1]= (b1+b0)>>2;\
1115 pixels+=line_size;\
1116 block +=line_size;\
1118 a0= pixels[0];\
1119 b0= pixels[1] + 2;\
1120 a0 += b0;\
1121 b0 += pixels[2];\
1123 block[0]= (a1+a0)>>2;\
1124 block[1]= (b1+b0)>>2;\
1125 pixels+=line_size;\
1126 block +=line_size;\
1130 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1132 int i;\
1133 const uint32_t a= AV_RN32(pixels );\
1134 const uint32_t b= AV_RN32(pixels+1);\
1135 uint32_t l0= (a&0x03030303UL)\
1136 + (b&0x03030303UL)\
1137 + 0x02020202UL;\
1138 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1139 + ((b&0xFCFCFCFCUL)>>2);\
1140 uint32_t l1,h1;\
1142 pixels+=line_size;\
1143 for(i=0; i<h; i+=2){\
1144 uint32_t a= AV_RN32(pixels );\
1145 uint32_t b= AV_RN32(pixels+1);\
1146 l1= (a&0x03030303UL)\
1147 + (b&0x03030303UL);\
1148 h1= ((a&0xFCFCFCFCUL)>>2)\
1149 + ((b&0xFCFCFCFCUL)>>2);\
1150 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151 pixels+=line_size;\
1152 block +=line_size;\
1153 a= AV_RN32(pixels );\
1154 b= AV_RN32(pixels+1);\
1155 l0= (a&0x03030303UL)\
1156 + (b&0x03030303UL)\
1157 + 0x02020202UL;\
1158 h0= ((a&0xFCFCFCFCUL)>>2)\
1159 + ((b&0xFCFCFCFCUL)>>2);\
1160 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161 pixels+=line_size;\
1162 block +=line_size;\
1166 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1168 int j;\
1169 for(j=0; j<2; j++){\
1170 int i;\
1171 const uint32_t a= AV_RN32(pixels );\
1172 const uint32_t b= AV_RN32(pixels+1);\
1173 uint32_t l0= (a&0x03030303UL)\
1174 + (b&0x03030303UL)\
1175 + 0x02020202UL;\
1176 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1177 + ((b&0xFCFCFCFCUL)>>2);\
1178 uint32_t l1,h1;\
1180 pixels+=line_size;\
1181 for(i=0; i<h; i+=2){\
1182 uint32_t a= AV_RN32(pixels );\
1183 uint32_t b= AV_RN32(pixels+1);\
1184 l1= (a&0x03030303UL)\
1185 + (b&0x03030303UL);\
1186 h1= ((a&0xFCFCFCFCUL)>>2)\
1187 + ((b&0xFCFCFCFCUL)>>2);\
1188 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1189 pixels+=line_size;\
1190 block +=line_size;\
1191 a= AV_RN32(pixels );\
1192 b= AV_RN32(pixels+1);\
1193 l0= (a&0x03030303UL)\
1194 + (b&0x03030303UL)\
1195 + 0x02020202UL;\
1196 h0= ((a&0xFCFCFCFCUL)>>2)\
1197 + ((b&0xFCFCFCFCUL)>>2);\
1198 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1199 pixels+=line_size;\
1200 block +=line_size;\
1202 pixels+=4-line_size*(h+1);\
1203 block +=4-line_size*h;\
1207 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1209 int j;\
1210 for(j=0; j<2; j++){\
1211 int i;\
1212 const uint32_t a= AV_RN32(pixels );\
1213 const uint32_t b= AV_RN32(pixels+1);\
1214 uint32_t l0= (a&0x03030303UL)\
1215 + (b&0x03030303UL)\
1216 + 0x01010101UL;\
1217 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1218 + ((b&0xFCFCFCFCUL)>>2);\
1219 uint32_t l1,h1;\
1221 pixels+=line_size;\
1222 for(i=0; i<h; i+=2){\
1223 uint32_t a= AV_RN32(pixels );\
1224 uint32_t b= AV_RN32(pixels+1);\
1225 l1= (a&0x03030303UL)\
1226 + (b&0x03030303UL);\
1227 h1= ((a&0xFCFCFCFCUL)>>2)\
1228 + ((b&0xFCFCFCFCUL)>>2);\
1229 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1230 pixels+=line_size;\
1231 block +=line_size;\
1232 a= AV_RN32(pixels );\
1233 b= AV_RN32(pixels+1);\
1234 l0= (a&0x03030303UL)\
1235 + (b&0x03030303UL)\
1236 + 0x01010101UL;\
1237 h0= ((a&0xFCFCFCFCUL)>>2)\
1238 + ((b&0xFCFCFCFCUL)>>2);\
1239 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1240 pixels+=line_size;\
1241 block +=line_size;\
1243 pixels+=4-line_size*(h+1);\
1244 block +=4-line_size*h;\
1248 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1249 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1250 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1252 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1257 #define op_avg(a, b) a = rnd_avg32(a, b)
1258 #endif
1259 #define op_put(a, b) a = b
1261 PIXOP2(avg, op_avg)
1262 PIXOP2(put, op_put)
1263 #undef op_avg
1264 #undef op_put
/* Rounded integer average of two / four values.
 * Arguments are parenthesized so that expressions containing operators of
 * lower precedence than '+' (e.g. '|', '&', '?:') expand correctly. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * Fixed-stride front ends for the two-source "no rounding" blends:
 * destination and both sources share the same line stride.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * Bilinear blend of an 8-pixel-wide block with 1/16-pel weights.
 *
 * Each output pixel mixes the source pixel, its right neighbour, the pixel
 * below and the pixel below-right with weights A, B, C, D. The weights sum
 * to 256, hence the final >>8.
 *
 * @param dst     destination, 8 pixels written per line
 * @param src     source; columns 0..8 of lines 0..h are read
 * @param stride  line stride shared by dst and src
 * @param h       number of lines to produce
 * @param x16     horizontal fraction, 0..16 (in 1/16 pixel)
 * @param y16     vertical fraction, 0..16 (in 1/16 pixel)
 * @param rounder value added before the shift (128 gives round-to-nearest)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, x;

    for(i=0; i<h; i++)
    {
        for(x=0; x<8; x++)
            dst[x]= (A*src[x] + B*src[x+1] + C*src[stride+x] + D*src[stride+x+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Affine motion compensation of an 8-pixel-wide block with bilinear
 * interpolation and edge clamping.
 *
 * For every destination pixel a source position (vx, vy) is tracked in fixed
 * point: after >>16 the value is a position in units of 1/(1<<shift) pixel.
 * Positions inside the picture are interpolated from the 2x2 neighbourhood;
 * positions outside are clamped per axis, which degrades the filter to 1-D
 * (one axis outside) or to a plain copy (both outside).
 *
 * @param dst            destination; pixel (x, y) is written at dst[y*stride + x]
 * @param src            source picture origin
 * @param stride         line stride shared by dst and src
 * @param h              number of lines to produce
 * @param ox, oy         fixed-point source position of the first pixel
 * @param dxx, dyx       increment of (vx, vy) per destination pixel
 * @param dxy, dyy       increment of (ox, oy) per destination line
 * @param shift          log2 of the sub-pixel precision
 * @param r              rounding constant added before the >>(2*shift)
 * @param width, height  source picture size used for clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* last position that still has a right / lower neighbour */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* the unsigned compare rejects negative coordinates as well */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/**
 * Full-pel case of the third-pel copy: dispatch on the block width to the
 * matching fixed-width put_pixelsN_c routine.
 * Widths other than 2, 4, 8 and 16 are ignored.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing is written */
    }
}
/**
 * Third-pel interpolation, horizontal weights 2:1 (current : right).
 * 683/2048 approximates 1/3, so each pixel is (2*cur + right + 1) / 3.
 * Reads one column beyond width.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation, horizontal weights 1:2 (current : right).
 * 683/2048 approximates 1/3, so each pixel is (cur + 2*right + 1) / 3.
 * Reads one column beyond width.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation, vertical weights 2:1 (current : below).
 * 683/2048 approximates 1/3, so each pixel is (2*cur + below + 1) / 3.
 * Reads one line beyond height.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation from the 2x2 neighbourhood with weights
 * 4 (current), 3 (right), 3 (below), 2 (below-right).
 * The weights sum to 12 and 2731/32768 approximates 1/12.
 * Reads one column beyond width and one line beyond height.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation from the 2x2 neighbourhood with weights
 * 3 (current), 2 (right), 4 (below), 3 (below-right).
 * The weights sum to 12 and 2731/32768 approximates 1/12.
 * Reads one column beyond width and one line beyond height.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation, vertical weights 1:2 (current : below).
 * 683/2048 approximates 1/3, so each pixel is (cur + 2*below + 1) / 3.
 * Reads one line beyond height.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation from the 2x2 neighbourhood with weights
 * 3 (current), 4 (right), 2 (below), 3 (below-right).
 * The weights sum to 12 and 2731/32768 approximates 1/12.
 * Reads one column beyond width and one line beyond height.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation from the 2x2 neighbourhood with weights
 * 2 (current), 3 (right), 3 (below), 4 (below-right).
 * The weights sum to 12 and 2731/32768 approximates 1/12.
 * Reads one column beyond width and one line beyond height.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Full-pel case of the third-pel average: dispatch on the block width to the
 * matching fixed-width avg_pixelsN_c routine.
 * Widths other than 2, 4, 8 and 16 are ignored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: dst is left untouched */
    }
}
/**
 * Averaging variant of the 2:1 horizontal third-pel filter: the interpolated
 * value (2*cur + right + 1) / 3 is averaged, rounding up, with the pixel
 * already present in dst. Reads one column beyond width.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 1:2 horizontal third-pel filter: the interpolated
 * value (cur + 2*right + 1) / 3 is averaged, rounding up, with the pixel
 * already present in dst. Reads one column beyond width.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 2:1 vertical third-pel filter: the interpolated
 * value (2*cur + below + 1) / 3 is averaged, rounding up, with the pixel
 * already present in dst. Reads one line beyond height.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 4/3/3/2 third-pel filter (current, right, below,
 * below-right; weights sum to 12, 2731/32768 approximates 1/12). The
 * interpolated value is averaged, rounding up, with the pixel already in dst.
 * Reads one column beyond width and one line beyond height.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 3/2/4/3 third-pel filter (current, right, below,
 * below-right; weights sum to 12, 2731/32768 approximates 1/12). The
 * interpolated value is averaged, rounding up, with the pixel already in dst.
 * Reads one column beyond width and one line beyond height.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 1:2 vertical third-pel filter: the interpolated
 * value (cur + 2*below + 1) / 3 is averaged, rounding up, with the pixel
 * already present in dst. Reads one line beyond height.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 3/4/2/3 third-pel filter (current, right, below,
 * below-right; weights sum to 12, 2731/32768 approximates 1/12). The
 * interpolated value is averaged, rounding up, with the pixel already in dst.
 * Reads one column beyond width and one line beyond height.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 2/3/3/4 third-pel filter (current, right, below,
 * below-right; weights sum to 12, 2731/32768 approximates 1/12). The
 * interpolated value is averaged, rounding up, with the pixel already in dst.
 * Reads one column beyond width and one line beyond height.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
/* Disabled. TPEL_WIDTH(width) would generate fixed-width put_tpel wrappers
 * (signature without the width argument) around the generic helpers above.
 * The wrapper bodies are plain calls; a stray "void" in front of each call
 * would have turned them into invalid declarations and has been removed. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
/*
 * H264_CHROMA_MC(OPNAME, OP) generates the 2-, 4- and 8-pixel-wide chroma
 * motion compensation functions OPNAME##h264_chroma_mc{2,4,8}_c.
 *
 * Each output pixel is a bilinear blend of the 2x2 source neighbourhood with
 * 1/8-pel fractions x, y (both 0..7, asserted). The weights A..D sum to 64;
 * the unnormalized sum is handed to OP, which is responsible for rounding,
 * scaling and storing.
 *
 * When D == 0 (x == 0 or y == 0) only one neighbour besides the current
 * pixel contributes: E = B + C is its weight and "step" selects the pixel
 * below (C != 0) or the pixel to the right (otherwise).
 *
 * src is read one column past the block width and one line past h.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            for(j=0; j<2; j++){\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<2; j++){\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            for(j=0; j<4; j++){\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<4; j++){\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i, j;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            for(j=0; j<8; j++){\
                OP(dst[j], (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            for(j=0; j<8; j++){\
                OP(dst[j], (A*src[j] + E*src[step+j]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

/* Store operators: round (+32), scale (>>6, weights sum to 64), then either
 * overwrite dst (put) or average with the existing dst value, rounding up
 * (avg). */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
/**
 * 8-pixel-wide bilinear chroma interpolation with 1/8-pel fractions and a
 * reduced rounding bias ("no rounding"): the bias is 32 - 4 = 28 instead of
 * 32 before the >>6 (the weights A..D sum to 64).
 *
 * @param dst     destination, 8 pixels written per line
 * @param src     source; columns 0..8 of lines 0..h are read
 * @param stride  line stride shared by dst and src
 * @param h       number of lines to produce
 * @param x, y    fractional position, each in 0..7 (asserted)
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        for(j=0; j<8; j++)
            dst[j] = (A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Averaging counterpart of put_no_rnd_vc1_chroma_mc8_c: the bilinear value
 * (weights A..D summing to 64, bias 32 - 4, >>6) is combined with the pixel
 * already in dst through avg2(), i.e. (old + new + 1) >> 1.
 *
 * @param dst     destination, 8 pixels read and rewritten per line
 * @param src     source; columns 0..8 of lines 0..h are read
 * @param stride  line stride shared by dst and src
 * @param h       number of lines to produce
 * @param x, y    fractional position, each in 0..7 (asserted)
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i, j;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        for(j=0; j<8; j++)
            dst[j] = avg2(dst[j], ((A*src[j] + B*src[j+1] + C*src[stride+j] + D*src[stride+j+1] + 32 - 4) >> 6));
        dst+= stride;
        src+= stride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal 8-tap lowpass over an 8-pixel-wide block, h lines.\
 * Taps are (-1, 3, -6, 20, 20, -6, 3, -1) centred between two pixels; taps\
 * that would fall outside src[0..8] are mirrored back into that range.\
 * The raw filter sum is handed to OP; "cm" is declared for OP's use\
 * (presumably a clipping table lookup -- OP is supplied by the instantiation). */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
1749 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1750 const int w=8;\
1751 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1752 int i;\
1753 for(i=0; i<w; i++)\
1755 const int src0= src[0*srcStride];\
1756 const int src1= src[1*srcStride];\
1757 const int src2= src[2*srcStride];\
1758 const int src3= src[3*srcStride];\
1759 const int src4= src[4*srcStride];\
1760 const int src5= src[5*srcStride];\
1761 const int src6= src[6*srcStride];\
1762 const int src7= src[7*srcStride];\
1763 const int src8= src[8*srcStride];\
1764 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1765 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1766 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1767 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1768 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1769 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1770 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1771 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1772 dst++;\
1773 src++;\
1777 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1778 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1779 int i;\
1781 for(i=0; i<h; i++)\
1783 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1784 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1785 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1786 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1787 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1788 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1789 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1790 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1791 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1792 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1793 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1794 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1795 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1796 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1797 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1798 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1799 dst+=dstStride;\
1800 src+=srcStride;\
1804 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1805 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1806 int i;\
1807 const int w=16;\
1808 for(i=0; i<w; i++)\
1810 const int src0= src[0*srcStride];\
1811 const int src1= src[1*srcStride];\
1812 const int src2= src[2*srcStride];\
1813 const int src3= src[3*srcStride];\
1814 const int src4= src[4*srcStride];\
1815 const int src5= src[5*srcStride];\
1816 const int src6= src[6*srcStride];\
1817 const int src7= src[7*srcStride];\
1818 const int src8= src[8*srcStride];\
1819 const int src9= src[9*srcStride];\
1820 const int src10= src[10*srcStride];\
1821 const int src11= src[11*srcStride];\
1822 const int src12= src[12*srcStride];\
1823 const int src13= src[13*srcStride];\
1824 const int src14= src[14*srcStride];\
1825 const int src15= src[15*srcStride];\
1826 const int src16= src[16*srcStride];\
1827 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1828 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1829 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1830 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1831 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1832 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1833 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1834 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1835 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1836 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1837 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1838 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1839 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1840 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1841 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1842 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1843 dst++;\
1844 src++;\
1848 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1849 OPNAME ## pixels8_c(dst, src, stride, 8);\
1852 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1853 uint8_t half[64];\
1854 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1855 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1858 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1859 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1862 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1863 uint8_t half[64];\
1864 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1865 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1868 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t full[16*9];\
1870 uint8_t half[64];\
1871 copy_block9(full, src, 16, stride, 9);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1873 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1876 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1877 uint8_t full[16*9];\
1878 copy_block9(full, src, 16, stride, 9);\
1879 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1882 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1883 uint8_t full[16*9];\
1884 uint8_t half[64];\
1885 copy_block9(full, src, 16, stride, 9);\
1886 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1887 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1889 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1890 uint8_t full[16*9];\
1891 uint8_t halfH[72];\
1892 uint8_t halfV[64];\
1893 uint8_t halfHV[64];\
1894 copy_block9(full, src, 16, stride, 9);\
1895 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1896 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1897 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1898 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1900 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1901 uint8_t full[16*9];\
1902 uint8_t halfH[72];\
1903 uint8_t halfHV[64];\
1904 copy_block9(full, src, 16, stride, 9);\
1905 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1907 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1910 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[16*9];\
1912 uint8_t halfH[72];\
1913 uint8_t halfV[64];\
1914 uint8_t halfHV[64];\
1915 copy_block9(full, src, 16, stride, 9);\
1916 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1917 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1918 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1919 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1921 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1922 uint8_t full[16*9];\
1923 uint8_t halfH[72];\
1924 uint8_t halfHV[64];\
1925 copy_block9(full, src, 16, stride, 9);\
1926 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1927 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1928 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1929 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1931 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1932 uint8_t full[16*9];\
1933 uint8_t halfH[72];\
1934 uint8_t halfV[64];\
1935 uint8_t halfHV[64];\
1936 copy_block9(full, src, 16, stride, 9);\
1937 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1938 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1939 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1940 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1942 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1943 uint8_t full[16*9];\
1944 uint8_t halfH[72];\
1945 uint8_t halfHV[64];\
1946 copy_block9(full, src, 16, stride, 9);\
1947 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1948 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1949 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1950 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1952 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1953 uint8_t full[16*9];\
1954 uint8_t halfH[72];\
1955 uint8_t halfV[64];\
1956 uint8_t halfHV[64];\
1957 copy_block9(full, src, 16, stride, 9);\
1958 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1959 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1963 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1964 uint8_t full[16*9];\
1965 uint8_t halfH[72];\
1966 uint8_t halfHV[64];\
1967 copy_block9(full, src, 16, stride, 9);\
1968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1973 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t halfH[72];\
1975 uint8_t halfHV[64];\
1976 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1977 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1978 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1980 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t halfH[72];\
1982 uint8_t halfHV[64];\
1983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1984 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1985 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1987 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1988 uint8_t full[16*9];\
1989 uint8_t halfH[72];\
1990 uint8_t halfV[64];\
1991 uint8_t halfHV[64];\
1992 copy_block9(full, src, 16, stride, 9);\
1993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1996 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1998 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1999 uint8_t full[16*9];\
2000 uint8_t halfH[72];\
2001 copy_block9(full, src, 16, stride, 9);\
2002 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2003 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2004 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2006 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t full[16*9];\
2008 uint8_t halfH[72];\
2009 uint8_t halfV[64];\
2010 uint8_t halfHV[64];\
2011 copy_block9(full, src, 16, stride, 9);\
2012 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2013 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2014 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2015 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2017 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t full[16*9];\
2019 uint8_t halfH[72];\
2020 copy_block9(full, src, 16, stride, 9);\
2021 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2022 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2023 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2025 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t halfH[72];\
2027 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2028 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2030 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2031 OPNAME ## pixels16_c(dst, src, stride, 16);\
2034 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t half[256];\
2036 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2037 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2040 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2041 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2044 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2045 uint8_t half[256];\
2046 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2047 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2050 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2051 uint8_t full[24*17];\
2052 uint8_t half[256];\
2053 copy_block17(full, src, 24, stride, 17);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2055 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2058 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2059 uint8_t full[24*17];\
2060 copy_block17(full, src, 24, stride, 17);\
2061 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2064 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2065 uint8_t full[24*17];\
2066 uint8_t half[256];\
2067 copy_block17(full, src, 24, stride, 17);\
2068 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2069 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2071 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2072 uint8_t full[24*17];\
2073 uint8_t halfH[272];\
2074 uint8_t halfV[256];\
2075 uint8_t halfHV[256];\
2076 copy_block17(full, src, 24, stride, 17);\
2077 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2078 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2079 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2080 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2082 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2083 uint8_t full[24*17];\
2084 uint8_t halfH[272];\
2085 uint8_t halfHV[256];\
2086 copy_block17(full, src, 24, stride, 17);\
2087 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2088 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2089 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2090 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2092 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2093 uint8_t full[24*17];\
2094 uint8_t halfH[272];\
2095 uint8_t halfV[256];\
2096 uint8_t halfHV[256];\
2097 copy_block17(full, src, 24, stride, 17);\
2098 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2099 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2100 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2101 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2103 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2104 uint8_t full[24*17];\
2105 uint8_t halfH[272];\
2106 uint8_t halfHV[256];\
2107 copy_block17(full, src, 24, stride, 17);\
2108 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2109 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2110 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2111 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2113 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2114 uint8_t full[24*17];\
2115 uint8_t halfH[272];\
2116 uint8_t halfV[256];\
2117 uint8_t halfHV[256];\
2118 copy_block17(full, src, 24, stride, 17);\
2119 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2120 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2121 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2122 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2124 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2125 uint8_t full[24*17];\
2126 uint8_t halfH[272];\
2127 uint8_t halfHV[256];\
2128 copy_block17(full, src, 24, stride, 17);\
2129 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2130 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2131 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2132 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2134 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2135 uint8_t full[24*17];\
2136 uint8_t halfH[272];\
2137 uint8_t halfV[256];\
2138 uint8_t halfHV[256];\
2139 copy_block17(full, src, 24, stride, 17);\
2140 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2141 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2145 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2146 uint8_t full[24*17];\
2147 uint8_t halfH[272];\
2148 uint8_t halfHV[256];\
2149 copy_block17(full, src, 24, stride, 17);\
2150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2155 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2156 uint8_t halfH[272];\
2157 uint8_t halfHV[256];\
2158 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2159 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2160 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2162 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2163 uint8_t halfH[272];\
2164 uint8_t halfHV[256];\
2165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2166 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2167 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2169 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2170 uint8_t full[24*17];\
2171 uint8_t halfH[272];\
2172 uint8_t halfV[256];\
2173 uint8_t halfHV[256];\
2174 copy_block17(full, src, 24, stride, 17);\
2175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2178 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2180 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2181 uint8_t full[24*17];\
2182 uint8_t halfH[272];\
2183 copy_block17(full, src, 24, stride, 17);\
2184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2185 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2186 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2188 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2189 uint8_t full[24*17];\
2190 uint8_t halfH[272];\
2191 uint8_t halfV[256];\
2192 uint8_t halfHV[256];\
2193 copy_block17(full, src, 24, stride, 17);\
2194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2195 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2197 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2199 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2200 uint8_t full[24*17];\
2201 uint8_t halfH[272];\
2202 copy_block17(full, src, 24, stride, 17);\
2203 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2204 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2205 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2207 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2208 uint8_t halfH[272];\
2209 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2210 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/*
 * Store operators handed to QPEL_MC as its OP argument.
 *
 * `a` is the destination lvalue, `b` an unnormalised filter sum.  The sum is
 * divided by 32 (>>5) and used as an index into `cm`.  `cm` is NOT a macro
 * parameter: a pointer with that name has to be in scope wherever one of
 * these macros is expanded.
 *
 *   op_put         : a = cm[(b + 16) >> 5]          (halves round up)
 *   op_put_no_rnd  : a = cm[(b + 15) >> 5]          (halves round down)
 *   op_avg         : average of the old `a` and the op_put value, +1 rounding
 *   op_avg_no_rnd  : average of the old `a` and the op_put_no_rnd value,
 *                    without the +1
 *
 * The avg forms evaluate `a` twice, so `a` must not have side effects.
 */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2218 QPEL_MC(0, put_ , _ , op_put)
2219 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2220 QPEL_MC(0, avg_ , _ , op_avg)
2221 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2222 #undef op_avg
2223 #undef op_avg_no_rnd
2224 #undef op_put
2225 #undef op_put_no_rnd
2227 #if 1
/**
 * Generate the 6-tap lowpass kernels used for H.264 luma interpolation.
 *
 * For each block size N in {2, 4, 8, 16} the expansion defines
 *   OPNAME h264_qpelN_h_lowpass (dst, src, dstStride, srcStride)
 *   OPNAME h264_qpelN_v_lowpass (dst, src, dstStride, srcStride)
 *   OPNAME h264_qpelN_hv_lowpass(dst, tmp, src, dstStride, tmpStride, srcStride)
 *
 * Every output sample is the tap sum (1, -5, 20, 20, -5, 1) over six
 * neighbouring inputs.  The taps add up to 32, so:
 *   - OP(lvalue, sum)  receives a single-pass sum (scale 32);
 *   - OP2(lvalue, sum) receives a sum that went through the filter twice
 *     (horizontal, then vertical; scale 32*32 = 1024).
 * Normalisation, clamping and the actual store are left to OP / OP2.  Each
 * function declares a local `cm` (ff_cropTbl + MAX_NEG_CROP) for them to use;
 * presumably this is a clamp-to-byte lookup table -- it is defined outside
 * this macro.
 *
 * Memory touched outside the NxN block:
 *   - h_lowpass reads columns -2 .. N+2 of every row;
 *   - v_lowpass reads rows    -2 .. N+2 of every column;
 *   - hv_lowpass reads both margins and writes N+5 rows of N int16_t values
 *     (row pitch tmpStride) into `tmp`.
 *
 * The 16x16 variants are built from four 8x8 calls.  The 2x2 variants are
 * tagged av_unused, so an expansion whose 2x2 kernels are never called does
 * not trigger unused-function warnings.
 *
 * @param OPNAME  prefix pasted in front of every generated function name
 * @param OP      store operator for single-pass sums
 * @param OP2     store operator for two-pass sums
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* one iteration per column; the whole column is loaded up front */\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* pass 1: horizontal filter over h+5 rows, starting two rows above */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the unnormalised intermediates */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* pass 1: horizontal filter over h+5 rows, starting two rows above */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the unnormalised intermediates */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    /* pass 1: horizontal filter over h+5 rows, starting two rows above */\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind tmp to the row that corresponds to output row 0 */\
    tmp -= tmpStride*(h+5-2);\
    /* pass 2: vertical filter over the unnormalised intermediates */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16x16 kernels: top-left, top-right, then the two bottom 8x8 quadrants */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
/* tmp is not advanced for the lower half: the same scratch rows are reused */\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
/**
 * Generate the sixteen H.264 quarter-pel motion-compensation entry points
 * OPNAME h264_qpelSIZE_mcXY_c(dst, src, stride) for one SIZE x SIZE block.
 *
 * Judging by the bodies, X is the horizontal and Y the vertical offset in
 * quarter samples:
 *   mc00            plain copy/average through OPNAME pixelsSIZE_c
 *   mc20, mc02      one horizontal / vertical lowpass, stored directly
 *   mc22            the two-pass (hv) lowpass, stored directly
 *   mc10, mc30      average of the horizontal half-sample plane with the
 *                   integer sample on its left (src) / right (src+1)
 *   mc01, mc03      average of the vertical half-sample plane with the
 *                   integer sample above (full_mid) / below (full_mid+SIZE)
 *   mc11/31/13/33   average of one horizontal and one vertical half-sample
 *                   plane; the planes are taken at src / src+stride and at
 *                   column 0 / column +1 respectively
 *   mc21, mc23      average of the hv plane with the horizontal plane at
 *                   row 0 / row +1
 *   mc12, mc32      average of the hv plane with the vertical plane at
 *                   column 0 / column +1
 *
 * Intermediate planes are always produced with the put_ lowpass kernels; only
 * the final store goes through OPNAME (directly, or via OPNAME pixelsSIZE_l2).
 *
 * Scratch buffers live on the stack:
 *   full  - SIZE x (SIZE+5) copy of the source starting two rows above the
 *           block, so the vertical kernel can run on a tightly packed buffer;
 *           full_mid points at the row matching `src`
 *   tmp   - SIZE x (SIZE+5) int16_t intermediates for the hv kernel
 *
 * Requires, with matching SIZE: copy_blockSIZE, OPNAME pixelsSIZE_c,
 * OPNAME pixelsSIZE_l2, the put_ lowpass kernels and the OPNAME lowpass
 * kernels.
 *
 * @param OPNAME  prefix selecting the store mode of the generated functions
 * @param SIZE    block edge length in samples
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
/*
 * Store operators handed to the H264_LOWPASS template below.
 * op_put/op_avg round with +16 and shift right by 5; op2_put/op2_avg round
 * with +512 and shift right by 10. The shifted value indexes `cm`, which is
 * not defined in these macros -- presumably a clipping table local to the
 * generated functions (TODO confirm against H264_LOWPASS).
 * The avg variants additionally average the result with the value already
 * stored in `a`, rounding up.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the low-pass filters for both store flavours ... */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
/* ... and the per-size motion compensation functions.
 * Note that the avg_ flavour is not instantiated for size 2. */
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The operators are only meaningful inside the templates above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2649 #endif
/* Per-sample operators used by the H264_WEIGHT template below. */
/* In-place weighting of block[x]. */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
/* Weighted blend of src[x] into dst[x]. */
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))

/**
 * Generates weight_h264_pixels<W>x<H>_c() and biweight_h264_pixels<W>x<H>_c()
 * for a W x H block.
 *
 * weight:   block[x] = clip((block[x]*weight + offset') >> log2_denom), where
 *           offset' is offset scaled by 2^log2_denom plus a rounding term
 *           (the rounding term is only added when log2_denom is non-zero).
 * biweight: dst[x] = clip((src[x]*weights + dst[x]*weightd + offset')
 *                         >> (log2_denom+1)),
 *           where offset' = ((offset + 1) | 1) scaled by 2^log2_denom.
 *
 * The offset is shifted as an unsigned value: it may be negative, and
 * left-shifting a negative signed integer is undefined behaviour in C.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int x, y; \
    offset = (unsigned)offset << log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        for(x=0; x<W; x++){ \
            op_scale1(x); \
        } \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int x, y; \
    offset = (unsigned)((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        for(x=0; x<W; x++){ \
            op_scale2(x); \
        } \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2721 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2722 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2723 int i;
2725 for(i=0; i<h; i++){
2726 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2727 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2728 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2729 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2730 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2731 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2732 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2733 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2734 dst+=dstStride;
2735 src+=srcStride;
#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points for CAVS: each one simply forwards to the generic
 * put/avg pixel routine of matching width, with the height fixed to the
 * block size (8 or 16).
 */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2757 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
#if CONFIG_VC1_DECODER
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/*
 * mc00 entry points for VC-1: forward to the generic 8-wide put/avg pixel
 * routines with a height of 8. The rnd argument is accepted but not used
 * by these two functions.
 */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */
2771 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2773 /* H264 specific */
2774 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2776 #if CONFIG_RV30_DECODER
2777 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2778 #endif /* CONFIG_RV30_DECODER */
#if CONFIG_RV40_DECODER
/*
 * mc33 entry points for RV40: each forwards to the generic xy2 put/avg
 * pixel routine of matching width, with the height fixed to the block size.
 */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */
2797 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2798 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2799 int i;
2801 for(i=0; i<w; i++){
2802 const int src_1= src[ -srcStride];
2803 const int src0 = src[0 ];
2804 const int src1 = src[ srcStride];
2805 const int src2 = src[2*srcStride];
2806 const int src3 = src[3*srcStride];
2807 const int src4 = src[4*srcStride];
2808 const int src5 = src[5*srcStride];
2809 const int src6 = src[6*srcStride];
2810 const int src7 = src[7*srcStride];
2811 const int src8 = src[8*srcStride];
2812 const int src9 = src[9*srcStride];
2813 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2814 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2815 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2816 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2817 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2818 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2819 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2820 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2821 src++;
2822 dst++;
/* mc00: no filtering, forwards to put_pixels8_c for an 8-row block. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/*
 * mc10: horizontally filter the block into a packed 8x8 temporary, then
 * combine it with the unfiltered source via put_pixels8_l2.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* mc20: horizontal low-pass written straight to dst (8 rows). */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/*
 * mc30: same as mc10, except that the filtered block is combined with the
 * source shifted one sample to the right (src+1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* mc02: vertical low-pass written straight to dst (8 columns). */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/*
 * mc12: combines a vertically filtered block with a horizontally-then-
 * vertically filtered block.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows of 8: source rows -1 .. 9, horizontally filtered */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* start one row above the block so the vertical filter has its top tap */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    /* halfH+8 skips the extra top row, i.e. points at block row 0 */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/*
 * mc32: same as mc12, except that the purely vertical filter reads the
 * source shifted one sample to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows of 8: source rows -1 .. 9, horizontally filtered */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    /* start one row above the block so the vertical filter has its top tap */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    /* halfH+8 skips the extra top row, i.e. points at block row 0 */
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/*
 * mc22: horizontal low-pass into an 11-row temporary (source rows -1 .. 9),
 * followed by a vertical low-pass of that temporary written to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    /* halfH+8 skips the extra top row, i.e. points at block row 0 */
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2874 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2875 if(CONFIG_ANY_H263) {
2876 int x;
2877 const int strength= ff_h263_loop_filter_strength[qscale];
2879 for(x=0; x<8; x++){
2880 int d1, d2, ad1;
2881 int p0= src[x-2*stride];
2882 int p1= src[x-1*stride];
2883 int p2= src[x+0*stride];
2884 int p3= src[x+1*stride];
2885 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2887 if (d<-2*strength) d1= 0;
2888 else if(d<- strength) d1=-2*strength - d;
2889 else if(d< strength) d1= d;
2890 else if(d< 2*strength) d1= 2*strength - d;
2891 else d1= 0;
2893 p1 += d1;
2894 p2 -= d1;
2895 if(p1&256) p1= ~(p1>>31);
2896 if(p2&256) p2= ~(p2>>31);
2898 src[x-1*stride] = p1;
2899 src[x+0*stride] = p2;
2901 ad1= FFABS(d1)>>1;
2903 d2= av_clip((p0-p3)/4, -ad1, ad1);
2905 src[x-2*stride] = p0 - d2;
2906 src[x+ stride] = p3 + d2;
2911 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2912 if(CONFIG_ANY_H263) {
2913 int y;
2914 const int strength= ff_h263_loop_filter_strength[qscale];
2916 for(y=0; y<8; y++){
2917 int d1, d2, ad1;
2918 int p0= src[y*stride-2];
2919 int p1= src[y*stride-1];
2920 int p2= src[y*stride+0];
2921 int p3= src[y*stride+1];
2922 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2924 if (d<-2*strength) d1= 0;
2925 else if(d<- strength) d1=-2*strength - d;
2926 else if(d< strength) d1= d;
2927 else if(d< 2*strength) d1= 2*strength - d;
2928 else d1= 0;
2930 p1 += d1;
2931 p2 -= d1;
2932 if(p1&256) p1= ~(p1>>31);
2933 if(p2&256) p2= ~(p2>>31);
2935 src[y*stride-1] = p1;
2936 src[y*stride+0] = p2;
2938 ad1= FFABS(d1)>>1;
2940 d2= av_clip((p0-p3)/4, -ad1, ad1);
2942 src[y*stride-2] = p0 - d2;
2943 src[y*stride+1] = p3 + d2;
/**
 * In-place separable (1,2,1) smoothing of an 8x8 block.
 * The vertical pass goes into a temporary; the first and last rows are only
 * scaled by 4. The horizontal pass then writes back to src; the first and
 * last columns are not filtered horizontally, only rescaled with rounding.
 *
 * @param src    top-left sample of the 8x8 block
 * @param stride distance in bytes between two rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int tmp[64];
    int row, col;

    /* vertical pass */
    for (col = 0; col < 8; col++) {
        tmp[col]       = 4*src[col];
        tmp[7*8 + col] = 4*src[7*stride + col];
    }
    for (row = 1; row < 7; row++) {
        const uint8_t *in = src + row*stride;
        int *out = tmp + row*8;

        for (col = 0; col < 8; col++)
            out[col] = in[col - stride] + 2*in[col] + in[col + stride];
    }

    /* horizontal pass */
    for (row = 0; row < 8; row++) {
        const int *in = tmp + row*8;
        uint8_t *out = src + row*stride;

        out[0] = (in[0] + 2) >> 2;
        out[7] = (in[7] + 2) >> 2;
        for (col = 1; col < 7; col++)
            out[col] = (in[col-1] + 2*in[col] + in[col+1] + 8) >> 4;
    }
}
/**
 * H.264 luma deblocking of one edge, 16 positions in 4 groups of 4.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between consecutive samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 * @param tc0     one clipping value per group of 4; a negative value skips
 *                the group
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc_base = tc0[i];

        if( tc_base < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            int tc = tc_base;
            int i_delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* each side that is smooth enough gets its second sample
             * filtered and widens the clipping range for p0/q0 by one */
            if( FFABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( FFABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            /* multiply instead of shifting: q0 - p0 may be negative and
             * left-shifting a negative value is undefined behaviour */
            i_delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
        }
    }
}
/* Calls the shared luma filter with xstride = stride and ystride = 1:
 * samples across the edge are one row apart, positions along it one byte apart. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Calls the shared luma filter with xstride = 1 and ystride = stride:
 * samples across the edge are one byte apart, positions along it one row apart. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 luma deblocking of one edge without tc0 clipping, 16 positions.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between consecutive samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0|, |q1 - q0|, |p2 - p0| and |q2 - q0|
 */
static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;

    for (n = 0; n < 16; n++, pix += ystride) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta ||
            FFABS(q1 - q0) >= beta)
            continue;

        if (FFABS(p0 - q0) >= ((alpha >> 2) + 2)) {
            /* large step across the edge: only p0' and q0' */
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
            pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
            continue;
        }

        if (FFABS(p2 - p0) < beta) {
            const int p3 = pix[-4*xstride];
            /* p0', p1', p2' */
            pix[-1*xstride] = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3;
            pix[-2*xstride] = (p2 + p1 + p0 + q0 + 2) >> 2;
            pix[-3*xstride] = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3;
        } else {
            /* p0' */
            pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2;
        }

        if (FFABS(q2 - q0) < beta) {
            const int q3 = pix[3*xstride];
            /* q0', q1', q2' */
            pix[0*xstride] = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3;
            pix[1*xstride] = (p0 + q0 + q1 + q2 + 2) >> 2;
            pix[2*xstride] = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3;
        } else {
            /* q0' */
            pix[0*xstride] = (2*q1 + q0 + p1 + 2) >> 2;
        }
    }
}
/* Calls the shared intra luma filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
/* Calls the shared intra luma filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * H.264 chroma deblocking of one edge, 8 positions in 4 groups of 2.
 * Only p0 and q0 are modified.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between consecutive samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 * @param tc0     one clipping value per group of 2; a value <= 0 skips
 *                the group
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];

        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++, pix += ystride ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            if( FFABS( p0 - q0 ) >= alpha ||
                FFABS( p1 - p0 ) >= beta ||
                FFABS( q1 - q0 ) >= beta )
                continue;

            /* multiply instead of shifting: q0 - p0 may be negative and
             * left-shifting a negative value is undefined behaviour */
            delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/* Calls the shared chroma filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Calls the shared chroma filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * H.264 chroma deblocking of one edge without tc0 clipping, 8 positions.
 * Only p0 and q0 are modified.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride distance between consecutive samples across the edge
 * @param ystride distance between consecutive positions along the edge
 * @param alpha   threshold on |p0 - q0|
 * @param beta    threshold on |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int n;

    for (n = 0; n < 8; n++, pix += ystride) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if (FFABS(p0 - q0) >= alpha ||
            FFABS(p1 - p0) >= beta ||
            FFABS(q1 - q0) >= beta)
            continue;

        pix[-xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[0]        = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}
/* Calls the shared intra chroma filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Calls the shared intra chroma filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences between two blocks 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and the
 * avg2() of each pix2 sample with its right-hand neighbour
 * (pix2[16] is read on every row).
 *
 * @param v         unused
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and the
 * avg2() of each pix2 sample with the sample one row below it
 * (h+1 rows of pix2 are read).
 *
 * @param v         unused
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences, 16 samples wide, between pix1 and the
 * avg4() of each 2x2 neighbourhood of pix2 (the sample, its right-hand
 * neighbour and the two samples below them).
 *
 * @param v         unused
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences between two blocks 8 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 * @return the accumulated absolute difference
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences, 8 samples wide, between pix1 and the
 * avg2() of each pix2 sample with its right-hand neighbour
 * (pix2[8] is read on every row).
 *
 * @param v         unused
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences, 8 samples wide, between pix1 and the
 * avg2() of each pix2 sample with the sample one row below it
 * (h+1 rows of pix2 are read).
 *
 * @param v         unused
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences, 8 samples wide, between pix1 and the
 * avg4() of each 2x2 neighbourhood of pix2 (the sample, its right-hand
 * neighbour and the two samples below them).
 *
 * @param v         unused
 * @param line_size distance in bytes between rows, used for both blocks
 * @param h         number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
3346 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3347 MpegEncContext *c = v;
3348 int score1=0;
3349 int score2=0;
3350 int x,y;
3352 for(y=0; y<h; y++){
3353 for(x=0; x<16; x++){
3354 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3356 if(y+1<h){
3357 for(x=0; x<15; x++){
3358 score2+= FFABS( s1[x ] - s1[x +stride]
3359 - s1[x+1] + s1[x+1+stride])
3360 -FFABS( s2[x ] - s2[x +stride]
3361 - s2[x+1] + s2[x+1+stride]);
3364 s1+= stride;
3365 s2+= stride;
3368 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3369 else return score1 + FFABS(score2)*8;
3372 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3373 MpegEncContext *c = v;
3374 int score1=0;
3375 int score2=0;
3376 int x,y;
3378 for(y=0; y<h; y++){
3379 for(x=0; x<8; x++){
3380 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3382 if(y+1<h){
3383 for(x=0; x<7; x++){
3384 score2+= FFABS( s1[x ] - s1[x +stride]
3385 - s1[x+1] + s1[x+1+stride])
3386 -FFABS( s2[x ] - s2[x +stride]
3387 - s2[x+1] + s2[x+1+stride]);
3390 s1+= stride;
3391 s2+= stride;
3394 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3395 else return score1 + FFABS(score2)*8;
3398 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3399 int i;
3400 unsigned int sum=0;
3402 for(i=0; i<8*8; i++){
3403 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3404 int w= weight[i];
3405 b>>= RECON_SHIFT;
3406 assert(-512<b && b<512);
3408 sum += (w*b)*(w*b)>>4;
3410 return sum>>2;
3413 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3414 int i;
3416 for(i=0; i<8*8; i++){
3417 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3422 * permutes an 8x8 block.
3423 * @param block the block which will be permuted according to the given permutation vector
3424 * @param permutation the permutation vector
3425 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3426 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3427 * (inverse) permutated to scantable order!
3429 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3431 int i;
3432 DCTELEM temp[64];
3434 if(last<=0) return;
3435 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3437 for(i=0; i<=last; i++){
3438 const int j= scantable[i];
3439 temp[j]= block[j];
3440 block[j]=0;
3443 for(i=0; i<=last; i++){
3444 const int j= scantable[i];
3445 const int perm_j= permutation[j];
3446 block[perm_j]= temp[j];
/* Comparison function that ignores all of its arguments and always returns 0;
 * ff_set_cmp() installs it for FF_CMP_ZERO. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
3454 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3455 int i;
3457 memset(cmp, 0, sizeof(void*)*6);
3459 for(i=0; i<6; i++){
3460 switch(type&0xFF){
3461 case FF_CMP_SAD:
3462 cmp[i]= c->sad[i];
3463 break;
3464 case FF_CMP_SATD:
3465 cmp[i]= c->hadamard8_diff[i];
3466 break;
3467 case FF_CMP_SSE:
3468 cmp[i]= c->sse[i];
3469 break;
3470 case FF_CMP_DCT:
3471 cmp[i]= c->dct_sad[i];
3472 break;
3473 case FF_CMP_DCT264:
3474 cmp[i]= c->dct264_sad[i];
3475 break;
3476 case FF_CMP_DCTMAX:
3477 cmp[i]= c->dct_max[i];
3478 break;
3479 case FF_CMP_PSNR:
3480 cmp[i]= c->quant_psnr[i];
3481 break;
3482 case FF_CMP_BIT:
3483 cmp[i]= c->bit[i];
3484 break;
3485 case FF_CMP_RD:
3486 cmp[i]= c->rd[i];
3487 break;
3488 case FF_CMP_VSAD:
3489 cmp[i]= c->vsad[i];
3490 break;
3491 case FF_CMP_VSSE:
3492 cmp[i]= c->vsse[i];
3493 break;
3494 case FF_CMP_ZERO:
3495 cmp[i]= zero_cmp;
3496 break;
3497 case FF_CMP_NSSE:
3498 cmp[i]= c->nsse[i];
3499 break;
3500 #if CONFIG_SNOW_ENCODER
3501 case FF_CMP_W53:
3502 cmp[i]= c->w53[i];
3503 break;
3504 case FF_CMP_W97:
3505 cmp[i]= c->w97[i];
3506 break;
3507 #endif
3508 default:
3509 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zeroes one block of 64 DCTELEM coefficients. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
/**
 * Zeroes 6 consecutive blocks of 64 DCTELEM coefficients each, i.e.
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3527 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3528 long i;
3529 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3530 long a = *(long*)(src+i);
3531 long b = *(long*)(dst+i);
3532 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3534 for(; i<w; i++)
3535 dst[i+0] += src[i+0];
3538 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3539 long i;
3540 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3541 long a = *(long*)(src1+i);
3542 long b = *(long*)(src2+i);
3543 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3545 for(; i<w; i++)
3546 dst[i] = src1[i]+src2[i];
3549 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3550 long i;
3551 #if !HAVE_FAST_UNALIGNED
3552 if((long)src2 & (sizeof(long)-1)){
3553 for(i=0; i+7<w; i+=8){
3554 dst[i+0] = src1[i+0]-src2[i+0];
3555 dst[i+1] = src1[i+1]-src2[i+1];
3556 dst[i+2] = src1[i+2]-src2[i+2];
3557 dst[i+3] = src1[i+3]-src2[i+3];
3558 dst[i+4] = src1[i+4]-src2[i+4];
3559 dst[i+5] = src1[i+5]-src2[i+5];
3560 dst[i+6] = src1[i+6]-src2[i+6];
3561 dst[i+7] = src1[i+7]-src2[i+7];
3563 }else
3564 #endif
3565 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3566 long a = *(long*)(src1+i);
3567 long b = *(long*)(src2+i);
3568 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3570 for(; i<w; i++)
3571 dst[i+0] = src1[i+0]-src2[i+0];
/**
 * Reconstructs a row from median-prediction residuals.
 * For each position the predictor is mid_pred() of the previous output, the
 * src1 sample, and (previous output + src1 sample - previous src1 sample)
 * masked to 8 bits; diff[i] is added to it and the sum is stored in dst[i]
 * as a byte.
 *
 * @param src1     row supplying the second predictor input
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: previous output before the row; out: last output
 * @param left_top in: previous src1 sample before the row; out: last src1 sample
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t top = src1[n];

        prev     = mid_pred(prev, top, (prev + top - prev_top)&0xFF) + diff[n];
        prev_top = top;
        dst[n]   = prev;
    }

    *left     = prev;
    *left_top = prev_top;
}
/**
 * Computes median-prediction residuals for a row.
 * For each position the predictor is mid_pred() of the previous src2
 * sample, the src1 sample, and (previous src2 sample + src1 sample -
 * previous src1 sample) masked to 8 bits; dst[i] receives src2[i] minus
 * that predictor, stored as a byte.
 *
 * @param src1     row supplying the second predictor input
 * @param src2     row being coded
 * @param w        number of samples
 * @param left     in: src2 sample preceding the row; out: last src2 sample
 * @param left_top in: src1 sample preceding the row; out: last src1 sample
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const uint8_t top = src1[n];
        const int pred = mid_pred(prev, top, (prev + top - prev_top)&0xFF);

        prev_top = top;
        prev     = src2[n];
        dst[n]   = prev - pred;
    }

    *left     = prev;
    *left_top = prev_top;
}
/* Butterfly from two inputs into two separate outputs: o1 = i1+i2, o2 = i1-i2.
 * Expands to two statements, so it must not be used as the unbraced body of
 * an if/for. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: (x, y) becomes (x+y, x-y). Both inputs are copied to
 * locals first, so x is read before it is overwritten. */
#define BUTTERFLY1(x,y) \
{\
int a,b;\
a= x;\
b= y;\
x= a+b;\
y= a-b;\
}

/* |x+y| + |x-y|: the absolute sum of a final butterfly stage without storing
 * its outputs. Note that x and y are each evaluated more than once. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * Sum of the absolute coefficients of the butterfly (Hadamard-style)
 * transform of the 8x8 difference src - dst.
 *
 * Three butterfly stages are applied along each row, then two along each
 * column; the third column stage is folded into the summation through
 * BUTTERFLYA.
 *
 * @param s      unused
 * @param stride distance in bytes between rows, used for both blocks
 * @param h      must be 8
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* rows */
    for (i = 0; i < 8; i++) {
        const uint8_t *ps = src + stride*i;
        const uint8_t *pd = dst + stride*i;
        int *row = temp + 8*i;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], ps[0]-pd[0], ps[1]-pd[1]);
        BUTTERFLY2(row[2], row[3], ps[2]-pd[2], ps[3]-pd[3]);
        BUTTERFLY2(row[4], row[5], ps[4]-pd[4], ps[5]-pd[5]);
        BUTTERFLY2(row[6], row[7], ps[6]-pd[6], ps[7]-pd[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* columns */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }
    return sum;
}
/**
 * Sum of the absolute coefficients of the butterfly (Hadamard-style)
 * transform of the 8x8 block src, with the mean term removed.
 *
 * Three butterfly stages are applied along each row, then two along each
 * column; the third column stage is folded into the summation through
 * BUTTERFLYA. Finally |temp[0] + temp[8*4]| is subtracted again.
 *
 * @param s      unused
 * @param dummy  unused
 * @param stride distance in bytes between rows of src
 * @param h      must be 8
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int i;

    assert(h==8);

    /* rows */
    for (i = 0; i < 8; i++) {
        const uint8_t *ps = src + stride*i;
        int *row = temp + 8*i;

        //FIXME try pointer walks
        BUTTERFLY2(row[0], row[1], ps[0], ps[1]);
        BUTTERFLY2(row[2], row[3], ps[2], ps[3]);
        BUTTERFLY2(row[4], row[5], ps[4], ps[5]);
        BUTTERFLY2(row[6], row[7], ps[6], ps[7]);

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* columns */
    for (i = 0; i < 8; i++) {
        int *col = temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3724 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3725 MpegEncContext * const s= (MpegEncContext *)c;
3726 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3727 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3729 assert(h==8);
3731 s->dsp.diff_pixels(temp, src1, src2, stride);
3732 s->dsp.fdct(temp);
3733 return s->dsp.sum_abs_dctelem(temp);
#if CONFIG_GPL
/*
 * One 8-point integer transform pass. The user supplies SRC(x) to read
 * input element x and DST(x,v) to consume output element x with value v.
 * The body is a braced block so it can follow a for() header directly.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Compare two 8x8 blocks using the DCT8_1D integer transform.
 *
 * The pixel difference is stored in dct[][] via s->dsp.diff_pixels, each row
 * is transformed in place, then each column is transformed and the absolute
 * values of the column outputs are summed.
 *
 * @param c      MpegEncContext, passed as void*
 * @param stride distance in bytes between two rows of src1/src2
 * @param h      not read by this function
 * @return sum of absolute values of the second-pass outputs
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: rows, written back into dct[][] */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: columns, outputs are only accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3789 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3790 MpegEncContext * const s= (MpegEncContext *)c;
3791 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3792 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3793 int sum=0, i;
3795 assert(h==8);
3797 s->dsp.diff_pixels(temp, src1, src2, stride);
3798 s->dsp.fdct(temp);
3800 for(i=0; i<64; i++)
3801 sum= FFMAX(sum, FFABS(temp[i]));
3803 return sum;
3806 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3807 MpegEncContext * const s= (MpegEncContext *)c;
3808 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3809 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3810 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3811 int sum=0, i;
3813 assert(h==8);
3814 s->mb_intra=0;
3816 s->dsp.diff_pixels(temp, src1, src2, stride);
3818 memcpy(bak, temp, 64*sizeof(DCTELEM));
3820 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3821 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3822 ff_simple_idct(temp); //FIXME
3824 for(i=0; i<64; i++)
3825 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3827 return sum;
3830 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3831 MpegEncContext * const s= (MpegEncContext *)c;
3832 const uint8_t *scantable= s->intra_scantable.permutated;
3833 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3834 DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
3835 DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
3836 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3837 uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
3838 uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
3839 int i, last, run, bits, level, distortion, start_i;
3840 const int esc_length= s->ac_esc_length;
3841 uint8_t * length;
3842 uint8_t * last_length;
3844 assert(h==8);
3846 copy_block8(lsrc1, src1, 8, stride, 8);
3847 copy_block8(lsrc2, src2, 8, stride, 8);
3849 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3851 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3853 bits=0;
3855 if (s->mb_intra) {
3856 start_i = 1;
3857 length = s->intra_ac_vlc_length;
3858 last_length= s->intra_ac_vlc_last_length;
3859 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3860 } else {
3861 start_i = 0;
3862 length = s->inter_ac_vlc_length;
3863 last_length= s->inter_ac_vlc_last_length;
3866 if(last>=start_i){
3867 run=0;
3868 for(i=start_i; i<last; i++){
3869 int j= scantable[i];
3870 level= temp[j];
3872 if(level){
3873 level+=64;
3874 if((level&(~127)) == 0){
3875 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3876 }else
3877 bits+= esc_length;
3878 run=0;
3879 }else
3880 run++;
3882 i= scantable[last];
3884 level= temp[i] + 64;
3886 assert(level - 64);
3888 if((level&(~127)) == 0){
3889 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3890 }else
3891 bits+= esc_length;
3895 if(last>=0){
3896 if(s->mb_intra)
3897 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3898 else
3899 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3902 s->dsp.idct_add(lsrc2, 8, temp);
3904 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3906 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3909 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3910 MpegEncContext * const s= (MpegEncContext *)c;
3911 const uint8_t *scantable= s->intra_scantable.permutated;
3912 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3913 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3914 int i, last, run, bits, level, start_i;
3915 const int esc_length= s->ac_esc_length;
3916 uint8_t * length;
3917 uint8_t * last_length;
3919 assert(h==8);
3921 s->dsp.diff_pixels(temp, src1, src2, stride);
3923 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3925 bits=0;
3927 if (s->mb_intra) {
3928 start_i = 1;
3929 length = s->intra_ac_vlc_length;
3930 last_length= s->intra_ac_vlc_last_length;
3931 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3932 } else {
3933 start_i = 0;
3934 length = s->inter_ac_vlc_length;
3935 last_length= s->inter_ac_vlc_last_length;
3938 if(last>=start_i){
3939 run=0;
3940 for(i=start_i; i<last; i++){
3941 int j= scantable[i];
3942 level= temp[j];
3944 if(level){
3945 level+=64;
3946 if((level&(~127)) == 0){
3947 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3948 }else
3949 bits+= esc_length;
3950 run=0;
3951 }else
3952 run++;
3954 i= scantable[last];
3956 level= temp[i] + 64;
3958 assert(level - 64);
3960 if((level&(~127)) == 0){
3961 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3962 }else
3963 bits+= esc_length;
3966 return bits;
/*
 * Generates vsad_intra<size>_c(): sum of absolute differences between each
 * pixel and the pixel directly below it, over a block `size` pixels wide and
 * h rows high (h-1 row pairs). Columns are handled four at a time, so size
 * must be a multiple of 4. The c and dummy arguments are not used.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/**
 * Vertical SAD of the difference of two 16-pixel-wide blocks.
 *
 * For every pair of vertically adjacent rows, sums
 * |(s1[x] - s2[x]) - (s1[x+stride] - s2[x+stride])| over 16 columns.
 *
 * @param c      unused context pointer
 * @param stride distance in bytes between two rows, shared by s1 and s2
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Square of a; note that the argument is evaluated twice. */
#define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra<size>_c(): sum of squared differences between each
 * pixel and the pixel directly below it, over a block `size` pixels wide and
 * h rows high (h-1 row pairs). Columns are handled four at a time, so size
 * must be a multiple of 4. The c and dummy arguments are not used.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/**
 * Vertical SSE of the difference of two 16-pixel-wide blocks.
 *
 * For every pair of vertically adjacent rows, sums the square of
 * (s1[x] - s2[x]) - (s1[x+stride] - s2[x+stride]) over 16 columns.
 *
 * @param c      unused context pointer
 * @param stride distance in bytes between two rows, shared by s1 and s2
 * @param h      number of rows; h-1 row pairs are evaluated
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/**
 * Sum of squared differences between an int8_t vector and an int16_t vector.
 *
 * @param pix1 first vector, size elements
 * @param pix2 second vector, size elements
 * @param size number of elements compared; nothing is read when size <= 0
 * @return sum over i of (pix1[i] - pix2[i])^2, accumulated in an int
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
4045 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4046 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4047 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4048 #if CONFIG_GPL
4049 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4050 #endif
4051 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4052 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4053 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4054 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * In-place element-wise product: dst[i] *= src[i] for i in [0, len).
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
/**
 * Element-wise product of src0 with src1 read backwards:
 * dst[i] = src0[i] * src1[len-1-i] for i in [0, len).
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1; /* point at the last element so src1[-i] walks backwards */
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/**
 * Multiply-add with strided output:
 * dst[i*step] = src0[i] * src1[i] + src2[i] + src3 for i in [0, len).
 *
 * @param src3 integer bias added to every element
 * @param step distance in elements between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
/**
 * Windowed overlap of two half-blocks.
 *
 * dst and win hold 2*len elements, src0 and src1 hold len elements each.
 * With i running over the first half and j = 2*len-1-i mirrored in the
 * second half:
 *   dst[i] = src0[i]*win[j] - src1[len-1-i]*win[i] + add_bias
 *   dst[j] = src0[i]*win[i] + src1[len-1-i]*win[j] + add_bias
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    /* re-base onto the midpoint so negative i addresses the first half */
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}
/**
 * Convert integers to float and scale: dst[i] = src[i] * mul for i in [0, len).
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}
/**
 * Clip one float, given as its raw 32-bit pattern, to [min, max] where
 * min < 0 < max.
 *
 * Works on bit patterns: as unsigned integers, negative floats compare
 * greater the larger their magnitude, so a > mini means "below min".
 * Flipping the sign bit of a and of max gives the same ordering for the
 * positive side.
 *
 * @param a        bit pattern of the value to clip
 * @param mini     bit pattern of min (negative)
 * @param maxi     bit pattern of max (positive)
 * @param maxisign maxi with the sign bit flipped
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;
    /* 1U: shifting a signed 1 into bit 31 overflows int */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max, using
 * integer comparisons on the raw bit patterns.
 *
 * Eight elements are handled per iteration, so when len is not a multiple
 * of 8 elements past len are read and written.
 *
 * NOTE(review): the floats are accessed through uint32_t pointers, as in the
 * original; this relies on the build tolerating that aliasing.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip every element of src to [min, max] and store the result in dst.
 *
 * When min < 0 < max the integer-comparison variant
 * vector_clipf_c_opposite_sign() is used, otherwise av_clipf() is applied
 * per element. Both paths handle eight elements per iteration, so when len
 * is not a multiple of 8 elements past len are read and written.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
4141 static av_always_inline int float_to_int16_one(const float *src){
4142 int_fast32_t tmp = *(const int32_t*)src;
4143 if(tmp & 0xf0000){
4144 tmp = (0x43c0ffff - tmp)>>31;
4145 // is this faster on some gcc/cpu combinations?
4146 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4147 // else tmp = 0;
4149 return tmp - 0x8000;
/**
 * Convert len floats to 16-bit samples with float_to_int16_one():
 * dst[i] = float_to_int16_one(src + i) for i in [0, len).
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}
/**
 * Convert planar float channels to interleaved 16-bit samples.
 *
 * Sample i of channel c is written to dst[i*channels + c]. Two channels are
 * handled by a dedicated loop; any other count goes channel by channel.
 *
 * @param dst      output, len*channels samples
 * @param src      array of `channels` pointers, each to len floats
 * @param len      samples per channel
 * @param channels number of channels
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
/**
 * In-place vector addition: v1[i] += v2[i] for `order` elements.
 * order is counted down to zero, so it must not be negative.
 */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
       *v1++ += *v2++;
}
/**
 * In-place vector subtraction: v1[i] -= v2[i] for `order` elements.
 * order is counted down to zero, so it must not be negative.
 */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ -= *v2++;
}
/**
 * Dot product of two int16_t vectors, with each product shifted right by
 * `shift` before it is accumulated.
 *
 * @param order number of elements; counted down to zero, so must not be
 *              negative
 * @param shift right shift applied to every individual product
 * @return sum over i of (v1[i] * v2[i]) >> shift
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/* Fixed-point cosine constants used by the WMV2 IDCT below. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/**
 * In-place 8-point row pass over b[0..7]; results are rounded and shifted
 * right by 8.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}

/**
 * In-place 8-point column pass over b[0], b[8], ..., b[56]; step 1 drops
 * 3 bits, the final results are rounded and shifted right by 14.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}

/**
 * In-place 8x8 inverse transform: the row pass is applied to each of the
 * 8 rows of block, then the column pass to each of the 8 columns.
 *
 * @param block 64 coefficients in row-major order, overwritten with the result
 */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
4266 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4267 converted */
4268 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4270 ff_wmv2_idct_c(block);
4271 put_pixels_clamped_c(block, dest, line_size);
4273 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4275 ff_wmv2_idct_c(block);
4276 add_pixels_clamped_c(block, dest, line_size);
4278 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4280 j_rev_dct (block);
4281 put_pixels_clamped_c(block, dest, line_size);
4283 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4285 j_rev_dct (block);
4286 add_pixels_clamped_c(block, dest, line_size);
4289 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4291 j_rev_dct4 (block);
4292 put_pixels_clamped4_c(block, dest, line_size);
4294 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4296 j_rev_dct4 (block);
4297 add_pixels_clamped4_c(block, dest, line_size);
4300 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4302 j_rev_dct2 (block);
4303 put_pixels_clamped2_c(block, dest, line_size);
4305 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4307 j_rev_dct2 (block);
4308 add_pixels_clamped2_c(block, dest, line_size);
4311 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4313 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4315 dest[0] = cm[(block[0] + 4)>>3];
4317 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4319 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4321 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4324 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4326 /* init static data */
4327 void dsputil_static_init(void)
4329 int i;
4331 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4332 for(i=0;i<MAX_NEG_CROP;i++) {
4333 ff_cropTbl[i] = 0;
4334 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4337 for(i=0;i<512;i++) {
4338 ff_squareTbl[i] = (i - 256) * (i - 256);
4341 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4344 int ff_check_alignment(void){
4345 static int did_fail=0;
4346 DECLARE_ALIGNED_16(int, aligned);
4348 if((intptr_t)&aligned & 15){
4349 if(!did_fail){
4350 #if HAVE_MMX || HAVE_ALTIVEC
4351 av_log(NULL, AV_LOG_ERROR,
4352 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4353 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4354 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4355 "Do not report crashes to FFmpeg developers.\n");
4356 #endif
4357 did_fail=1;
4359 return -1;
4361 return 0;
4364 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4366 int i;
4368 ff_check_alignment();
4370 #if CONFIG_ENCODERS
4371 if(avctx->dct_algo==FF_DCT_FASTINT) {
4372 c->fdct = fdct_ifast;
4373 c->fdct248 = fdct_ifast248;
4375 else if(avctx->dct_algo==FF_DCT_FAAN) {
4376 c->fdct = ff_faandct;
4377 c->fdct248 = ff_faandct248;
4379 else {
4380 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4381 c->fdct248 = ff_fdct248_islow;
4383 #endif //CONFIG_ENCODERS
4385 if(avctx->lowres==1){
4386 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4387 c->idct_put= ff_jref_idct4_put;
4388 c->idct_add= ff_jref_idct4_add;
4389 }else{
4390 c->idct_put= ff_h264_lowres_idct_put_c;
4391 c->idct_add= ff_h264_lowres_idct_add_c;
4393 c->idct = j_rev_dct4;
4394 c->idct_permutation_type= FF_NO_IDCT_PERM;
4395 }else if(avctx->lowres==2){
4396 c->idct_put= ff_jref_idct2_put;
4397 c->idct_add= ff_jref_idct2_add;
4398 c->idct = j_rev_dct2;
4399 c->idct_permutation_type= FF_NO_IDCT_PERM;
4400 }else if(avctx->lowres==3){
4401 c->idct_put= ff_jref_idct1_put;
4402 c->idct_add= ff_jref_idct1_add;
4403 c->idct = j_rev_dct1;
4404 c->idct_permutation_type= FF_NO_IDCT_PERM;
4405 }else{
4406 if(avctx->idct_algo==FF_IDCT_INT){
4407 c->idct_put= ff_jref_idct_put;
4408 c->idct_add= ff_jref_idct_add;
4409 c->idct = j_rev_dct;
4410 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4411 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4412 avctx->idct_algo==FF_IDCT_VP3){
4413 c->idct_put= ff_vp3_idct_put_c;
4414 c->idct_add= ff_vp3_idct_add_c;
4415 c->idct = ff_vp3_idct_c;
4416 c->idct_permutation_type= FF_NO_IDCT_PERM;
4417 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4418 c->idct_put= ff_wmv2_idct_put_c;
4419 c->idct_add= ff_wmv2_idct_add_c;
4420 c->idct = ff_wmv2_idct_c;
4421 c->idct_permutation_type= FF_NO_IDCT_PERM;
4422 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4423 c->idct_put= ff_faanidct_put;
4424 c->idct_add= ff_faanidct_add;
4425 c->idct = ff_faanidct;
4426 c->idct_permutation_type= FF_NO_IDCT_PERM;
4427 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4428 c->idct_put= ff_ea_idct_put_c;
4429 c->idct_permutation_type= FF_NO_IDCT_PERM;
4430 }else{ //accurate/default
4431 c->idct_put= ff_simple_idct_put;
4432 c->idct_add= ff_simple_idct_add;
4433 c->idct = ff_simple_idct;
4434 c->idct_permutation_type= FF_NO_IDCT_PERM;
4438 if (CONFIG_H264_DECODER) {
4439 c->h264_idct_add= ff_h264_idct_add_c;
4440 c->h264_idct8_add= ff_h264_idct8_add_c;
4441 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4442 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4443 c->h264_idct_add16 = ff_h264_idct_add16_c;
4444 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4445 c->h264_idct_add8 = ff_h264_idct_add8_c;
4446 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4449 c->get_pixels = get_pixels_c;
4450 c->diff_pixels = diff_pixels_c;
4451 c->put_pixels_clamped = put_pixels_clamped_c;
4452 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4453 c->add_pixels_clamped = add_pixels_clamped_c;
4454 c->add_pixels8 = add_pixels8_c;
4455 c->add_pixels4 = add_pixels4_c;
4456 c->sum_abs_dctelem = sum_abs_dctelem_c;
4457 c->gmc1 = gmc1_c;
4458 c->gmc = ff_gmc_c;
4459 c->clear_block = clear_block_c;
4460 c->clear_blocks = clear_blocks_c;
4461 c->pix_sum = pix_sum_c;
4462 c->pix_norm1 = pix_norm1_c;
4464 /* TODO [0] 16 [1] 8 */
4465 c->pix_abs[0][0] = pix_abs16_c;
4466 c->pix_abs[0][1] = pix_abs16_x2_c;
4467 c->pix_abs[0][2] = pix_abs16_y2_c;
4468 c->pix_abs[0][3] = pix_abs16_xy2_c;
4469 c->pix_abs[1][0] = pix_abs8_c;
4470 c->pix_abs[1][1] = pix_abs8_x2_c;
4471 c->pix_abs[1][2] = pix_abs8_y2_c;
4472 c->pix_abs[1][3] = pix_abs8_xy2_c;
4474 #define dspfunc(PFX, IDX, NUM) \
4475 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4476 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4477 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4478 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4480 dspfunc(put, 0, 16);
4481 dspfunc(put_no_rnd, 0, 16);
4482 dspfunc(put, 1, 8);
4483 dspfunc(put_no_rnd, 1, 8);
4484 dspfunc(put, 2, 4);
4485 dspfunc(put, 3, 2);
4487 dspfunc(avg, 0, 16);
4488 dspfunc(avg_no_rnd, 0, 16);
4489 dspfunc(avg, 1, 8);
4490 dspfunc(avg_no_rnd, 1, 8);
4491 dspfunc(avg, 2, 4);
4492 dspfunc(avg, 3, 2);
4493 #undef dspfunc
4495 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4496 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4498 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4499 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4500 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4501 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4502 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4503 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4504 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4505 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4506 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4508 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4509 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4510 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4511 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4512 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4513 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4514 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4515 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4516 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4518 #define dspfunc(PFX, IDX, NUM) \
4519 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4520 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4521 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4522 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4523 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4524 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4525 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4526 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4527 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4528 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4529 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4530 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4531 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4532 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4533 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4534 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4536 dspfunc(put_qpel, 0, 16);
4537 dspfunc(put_no_rnd_qpel, 0, 16);
4539 dspfunc(avg_qpel, 0, 16);
4540 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4542 dspfunc(put_qpel, 1, 8);
4543 dspfunc(put_no_rnd_qpel, 1, 8);
4545 dspfunc(avg_qpel, 1, 8);
4546 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4548 dspfunc(put_h264_qpel, 0, 16);
4549 dspfunc(put_h264_qpel, 1, 8);
4550 dspfunc(put_h264_qpel, 2, 4);
4551 dspfunc(put_h264_qpel, 3, 2);
4552 dspfunc(avg_h264_qpel, 0, 16);
4553 dspfunc(avg_h264_qpel, 1, 8);
4554 dspfunc(avg_h264_qpel, 2, 4);
4556 #undef dspfunc
4557 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4558 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4559 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4560 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4561 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4562 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4563 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4564 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4566 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4567 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4568 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4569 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4570 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4571 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4572 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4573 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4574 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4575 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4576 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4577 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4578 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4579 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4580 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4581 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4582 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4583 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4584 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4585 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4587 c->draw_edges = draw_edges_c;
4589 #if CONFIG_CAVS_DECODER
4590 ff_cavsdsp_init(c,avctx);
4591 #endif
4593 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4594 ff_mlp_init(c, avctx);
4595 #endif
4596 #if CONFIG_VC1_DECODER
4597 ff_vc1dsp_init(c,avctx);
4598 #endif
4599 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4600 ff_intrax8dsp_init(c,avctx);
4601 #endif
4602 #if CONFIG_RV30_DECODER
4603 ff_rv30dsp_init(c,avctx);
4604 #endif
4605 #if CONFIG_RV40_DECODER
4606 ff_rv40dsp_init(c,avctx);
4607 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4608 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4609 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4610 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4611 #endif
4613 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4614 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4615 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4616 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4617 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4618 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4619 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4620 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4622 #define SET_CMP_FUNC(name) \
4623 c->name[0]= name ## 16_c;\
4624 c->name[1]= name ## 8x8_c;
4626 SET_CMP_FUNC(hadamard8_diff)
4627 c->hadamard8_diff[4]= hadamard8_intra16_c;
4628 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4629 SET_CMP_FUNC(dct_sad)
4630 SET_CMP_FUNC(dct_max)
4631 #if CONFIG_GPL
4632 SET_CMP_FUNC(dct264_sad)
4633 #endif
4634 c->sad[0]= pix_abs16_c;
4635 c->sad[1]= pix_abs8_c;
4636 c->sse[0]= sse16_c;
4637 c->sse[1]= sse8_c;
4638 c->sse[2]= sse4_c;
4639 SET_CMP_FUNC(quant_psnr)
4640 SET_CMP_FUNC(rd)
4641 SET_CMP_FUNC(bit)
4642 c->vsad[0]= vsad16_c;
4643 c->vsad[4]= vsad_intra16_c;
4644 c->vsad[5]= vsad_intra8_c;
4645 c->vsse[0]= vsse16_c;
4646 c->vsse[4]= vsse_intra16_c;
4647 c->vsse[5]= vsse_intra8_c;
4648 c->nsse[0]= nsse16_c;
4649 c->nsse[1]= nsse8_c;
4650 #if CONFIG_SNOW_ENCODER
4651 c->w53[0]= w53_16_c;
4652 c->w53[1]= w53_8_c;
4653 c->w97[0]= w97_16_c;
4654 c->w97[1]= w97_8_c;
4655 #endif
4657 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4659 c->add_bytes= add_bytes_c;
4660 c->add_bytes_l2= add_bytes_l2_c;
4661 c->diff_bytes= diff_bytes_c;
4662 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4663 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4664 c->bswap_buf= bswap_buf;
4665 #if CONFIG_PNG_DECODER
4666 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4667 #endif
4669 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4670 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4671 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4672 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4673 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4674 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4675 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4676 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4677 c->h264_loop_filter_strength= NULL;
4679 if (CONFIG_ANY_H263) {
4680 c->h263_h_loop_filter= h263_h_loop_filter_c;
4681 c->h263_v_loop_filter= h263_v_loop_filter_c;
4684 if (CONFIG_VP3_DECODER) {
4685 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4686 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4688 if (CONFIG_VP6_DECODER) {
4689 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4692 c->h261_loop_filter= h261_loop_filter_c;
4694 c->try_8x8basis= try_8x8basis_c;
4695 c->add_8x8basis= add_8x8basis_c;
4697 #if CONFIG_SNOW_DECODER
4698 c->vertical_compose97i = ff_snow_vertical_compose97i;
4699 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4700 c->inner_add_yblock = ff_snow_inner_add_yblock;
4701 #endif
4703 #if CONFIG_VORBIS_DECODER
4704 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4705 #endif
4706 #if CONFIG_AC3_DECODER
4707 c->ac3_downmix = ff_ac3_downmix_c;
4708 #endif
4709 #if CONFIG_FLAC_ENCODER
4710 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4711 #endif
4712 c->vector_fmul = vector_fmul_c;
4713 c->vector_fmul_reverse = vector_fmul_reverse_c;
4714 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4715 c->vector_fmul_window = ff_vector_fmul_window_c;
4716 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4717 c->vector_clipf = vector_clipf_c;
4718 c->float_to_int16 = ff_float_to_int16_c;
4719 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4720 c->add_int16 = add_int16_c;
4721 c->sub_int16 = sub_int16_c;
4722 c->scalarproduct_int16 = scalarproduct_int16_c;
4724 c->shrink[0]= ff_img_copy_plane;
4725 c->shrink[1]= ff_shrink22;
4726 c->shrink[2]= ff_shrink44;
4727 c->shrink[3]= ff_shrink88;
4729 c->prefetch= just_return;
4731 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4732 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4734 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4735 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4736 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4737 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4738 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4739 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4740 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4741 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4742 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4744 for(i=0; i<64; i++){
4745 if(!c->put_2tap_qpel_pixels_tab[0][i])
4746 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4747 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4748 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4751 switch(c->idct_permutation_type){
4752 case FF_NO_IDCT_PERM:
4753 for(i=0; i<64; i++)
4754 c->idct_permutation[i]= i;
4755 break;
4756 case FF_LIBMPEG2_IDCT_PERM:
4757 for(i=0; i<64; i++)
4758 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4759 break;
4760 case FF_SIMPLE_IDCT_PERM:
4761 for(i=0; i<64; i++)
4762 c->idct_permutation[i]= simple_mmx_permutation[i];
4763 break;
4764 case FF_TRANSPOSE_IDCT_PERM:
4765 for(i=0; i<64; i++)
4766 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4767 break;
4768 case FF_PARTTRANS_IDCT_PERM:
4769 for(i=0; i<64; i++)
4770 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4771 break;
4772 case FF_SSE2_IDCT_PERM:
4773 for(i=0; i<64; i++)
4774 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4775 break;
4776 default:
4777 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");