/*
 * This is optimized for SH, which has post-increment addressing (*p++).
 * Some CPUs may be faster with indexed addressing (p[n]) than with
 * post-increment (*p++).
 */
#define LD(adr) *(uint32_t*)(adr)
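/*
 * PIXOP2() expands, for one OPNAME/OP pair, the family of two-source
 * averaging primitives used by the motion-compensation code below:
 * _pixels{4,8,16}_l2 average src1 and src2 into dst four bytes at a time,
 * the _aligned/_aligned1/_aligned2 variants record which source may be
 * read with an aligned LP() load instead of an unaligned LD32(), and the
 * _l4 variants average four sources through UNPACK()/rnd_PACK().
 * rnd_avg32()/no_rnd_avg32() come from the shared dsputil headers and
 * average two packed words of four pixels, with and without rounding up.
 */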
#define PIXOP2(OPNAME, OP) \
/*static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),no_rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
        OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
        OP(LP(dst+4),rnd_avg32(LD32(src1+4),LD32(src2+4)) ); \
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),rnd_avg32(LD32(src1  ),LD32(src2  )) ); \
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst   ),no_rnd_avg32(LD32(src1   ),LD32(src2   )) ); \
        OP(LP(dst+4 ),no_rnd_avg32(LD32(src1+4 ),LD32(src2+4 )) ); \
        OP(LP(dst+8 ),no_rnd_avg32(LD32(src1+8 ),LD32(src2+8 )) ); \
        OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst   ),rnd_avg32(LD32(src1   ),LD32(src2   )) ); \
        OP(LP(dst+4 ),rnd_avg32(LD32(src1+4 ),LD32(src2+4 )) ); \
        OP(LP(dst+8 ),rnd_avg32(LD32(src1+8 ),LD32(src2+8 )) ); \
        OP(LP(dst+12),rnd_avg32(LD32(src1+12),LD32(src2+12)) ); \
static inline void OPNAME ## _pixels4_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),rnd_avg32(LP(src1  ),LP(src2  )) ); \
static inline void OPNAME ## _pixels4_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),rnd_avg32(LD32(src1  ),LP(src2  )) ); \
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst   ),no_rnd_avg32(LD32(src1   ),LP(src2   )) ); \
        OP(LP(dst+4 ),no_rnd_avg32(LD32(src1+4 ),LP(src2+4 )) ); \
        OP(LP(dst+8 ),no_rnd_avg32(LD32(src1+8 ),LP(src2+8 )) ); \
        OP(LP(dst+12),no_rnd_avg32(LD32(src1+12),LP(src2+12)) ); \
static inline void OPNAME ## _pixels16_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst   ),rnd_avg32(LD32(src1   ),LP(src2   )) ); \
        OP(LP(dst+4 ),rnd_avg32(LD32(src1+4 ),LP(src2+4 )) ); \
        OP(LP(dst+8 ),rnd_avg32(LD32(src1+8 ),LP(src2+8 )) ); \
        OP(LP(dst+12),rnd_avg32(LD32(src1+12),LP(src2+12)) ); \
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
    do { /* only src2 aligned */\
        OP(LP(dst  ),no_rnd_avg32(LD32(src1  ),LP(src2  )) ); \
        OP(LP(dst+4),no_rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
static inline void OPNAME ## _pixels8_l2_aligned2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),rnd_avg32(LD32(src1  ),LP(src2  )) ); \
        OP(LP(dst+4),rnd_avg32(LD32(src1+4),LP(src2+4)) ); \
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),no_rnd_avg32(LP(src1  ),LP(src2  )) ); \
        OP(LP(dst+4),no_rnd_avg32(LP(src1+4),LP(src2+4)) ); \
static inline void OPNAME ## _pixels8_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst  ),rnd_avg32(LP(src1  ),LP(src2  )) ); \
        OP(LP(dst+4),rnd_avg32(LP(src1+4),LP(src2+4)) ); \
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst   ),no_rnd_avg32(LP(src1   ),LP(src2   )) ); \
        OP(LP(dst+4 ),no_rnd_avg32(LP(src1+4 ),LP(src2+4 )) ); \
        OP(LP(dst+8 ),no_rnd_avg32(LP(src1+8 ),LP(src2+8 )) ); \
        OP(LP(dst+12),no_rnd_avg32(LP(src1+12),LP(src2+12)) ); \
static inline void OPNAME ## _pixels16_l2_aligned(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
        OP(LP(dst   ),rnd_avg32(LP(src1   ),LP(src2   )) ); \
        OP(LP(dst+4 ),rnd_avg32(LP(src1+4 ),LP(src2+4 )) ); \
        OP(LP(dst+8 ),rnd_avg32(LP(src1+8 ),LP(src2+8 )) ); \
        OP(LP(dst+12),rnd_avg32(LP(src1+12),LP(src2+12)) ); \
static inline void OPNAME ## _no_rnd_pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _no_rnd_pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
static inline void OPNAME ## _pixels16_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _pixels16_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
static inline void OPNAME ## _no_rnd_pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _no_rnd_pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
static inline void OPNAME ## _pixels8_l2_aligned1(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, int src_stride1, int src_stride2, int h) \
{ OPNAME ## _pixels8_l2_aligned2(dst,src2,src1,dst_stride,src_stride2,src_stride1,h); } \
static inline void OPNAME ## _pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LP(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LP(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; /* only src1 is unaligned */\
        UNPACK(a0,a1,LD32(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _no_rnd_pixels8_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LD32(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LP(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
        UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
        OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
        UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
        OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LP(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+8),LP(src2+8)); \
        UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
        OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LP(src1+12),LP(src2+12)); \
        UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
        OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    do { /* src1 is unaligned */\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LD32(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
        UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
        OP(LP(dst+8),rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
        UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
        OP(LP(dst+12),rnd_PACK(a0,a1,a2,a3)); \
static inline void OPNAME ## _no_rnd_pixels16_l4_aligned0(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a0,a1,a2,a3; \
        UNPACK(a0,a1,LD32(src1),LP(src2)); \
        UNPACK(a2,a3,LP(src3),LP(src4)); \
        OP(LP(dst),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+4),LP(src2+4)); \
        UNPACK(a2,a3,LP(src3+4),LP(src4+4)); \
        OP(LP(dst+4),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+8),LP(src2+8)); \
        UNPACK(a2,a3,LP(src3+8),LP(src4+8)); \
        OP(LP(dst+8),no_rnd_PACK(a0,a1,a2,a3)); \
        UNPACK(a0,a1,LD32(src1+12),LP(src2+12)); \
        UNPACK(a2,a3,LP(src3+12),LP(src4+12)); \
        OP(LP(dst+12),no_rnd_PACK(a0,a1,a2,a3)); \
#define op_avg(a, b) a = rnd_avg32(a,b)
#define op_put(a, b) a = b
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
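/*
 * gmc1_c(): one-pass global motion compensation with a single fractional
 * offset.  The four bilinear weights always sum to 256: for example
 * x16=4, y16=12 gives A=48, B=16, C=144, D=48, so the "+rounder" and
 * ">>8" below renormalize the weighted sum back to an 8-bit pixel.
 */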
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
        uint8_t *s1 = src+stride;
        t0 = *s0++; t2 = *s1++;
        t1 = *s0++; t3 = *s1++;
        dst[0]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
        t0 = *s0++; t2 = *s1++;
        dst[1]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
        t1 = *s0++; t3 = *s1++;
        dst[2]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
        t0 = *s0++; t2 = *s1++;
        dst[3]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
        t1 = *s0++; t3 = *s1++;
        dst[4]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
        t0 = *s0++; t2 = *s1++;
        dst[5]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
        t1 = *s0++; t3 = *s1++;
        dst[6]= (A*t0 + B*t1 + C*t2 + D*t3 + rounder)>>8;
        t0 = *s0++; t2 = *s1++;
        dst[7]= (A*t1 + B*t0 + C*t3 + D*t2 + rounder)>>8;
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    const int s= 1<<shift;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
            if((unsigned)src_y < height){
                index= clip(src_x, 0, width) + src_y*stride;
                dst[y*stride + x]= ( (  src[index        ]*(s-frac_y)
                                      + src[index+stride ]*   frac_y )*s
                index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                dst[y*stride + x]= src[index];
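/*
 * H264_CHROMA_MC() generates the 2-, 4- and 8-wide H.264 chroma
 * interpolators.  A..D are the eighth-pel bilinear weights and always
 * sum to 64; the OP macros below fold in the "+32 >> 6" renormalization,
 * so x=y=0 degenerates to a plain copy (or average for the avg_ variant).
 */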
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        uint8_t *s1 = src+stride; \
        t0 = *s0++; t2 = *s1++; \
        t1 = *s0++; t3 = *s1++; \
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        uint8_t *s1 = src+stride; \
        t0 = *s0++; t2 = *s1++; \
        t1 = *s0++; t3 = *s1++; \
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        uint8_t *s1 = src+stride; \
        t0 = *s0++; t2 = *s1++; \
        t1 = *s0++; t3 = *s1++; \
        OP(dst[0], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[1], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[2], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[3], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[4], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[5], (A*t1 + B*t0 + C*t3 + D*t2));\
        t1 = *s0++; t3 = *s1++; \
        OP(dst[6], (A*t0 + B*t1 + C*t2 + D*t3));\
        t0 = *s0++; t2 = *s1++; \
        OP(dst[7], (A*t1 + B*t0 + C*t3 + D*t2));\
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
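/* op_put writes the renormalized 6-bit weighted sum; op_avg additionally
   averages it with the pixel already in dst, rounding up. */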
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
/* not yet optimized */
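/* copy_blockN(): plain block copies (N pixels wide, h rows) done with
   32-bit LD32()/ST32() accesses; used to gather a padded source block
   into a local buffer before the half-pel filters below. */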
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
        ST32(dst   , LD32(src   ));
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
/* end not optimized */
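/*
 * QPEL_MC() builds the MPEG-4 quarter-pel functions.  The 8-tap lowpass
 * uses the (-1, 3, -6, 20, 20, -6, 3, -1) kernel with mirrored edge taps;
 * its coefficients sum to 32, and op_put/op_avg below add 16 (15 for the
 * no_rnd variants) before the ">>5" renormalization via the crop table.
 */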
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
        OP(dst[0], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
        uint8_t *s = src, *d=dst;\
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
        src0 = *s; s+=srcStride; \
        src1 = *s; s+=srcStride; \
        src2 = *s; s+=srcStride; \
        src3 = *s; s+=srcStride; \
        src4 = *s; s+=srcStride; \
        OP(*d, (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));d+=dstStride;\
        src5 = *s; s+=srcStride; \
        OP(*d, (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));d+=dstStride;\
        src6 = *s; s+=srcStride; \
        OP(*d, (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));d+=dstStride;\
        src7 = *s; s+=srcStride; \
        OP(*d, (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));d+=dstStride;\
        OP(*d, (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));d+=dstStride;\
        OP(*d, (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));d+=dstStride;\
        OP(*d, (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));d+=dstStride;\
        OP(*d, (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
        int src9,src10,src11,src12,src13,src14,src15,src16;\
        OP(dst[ 0], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
        uint8_t *s = src, *d=dst;\
        int src0,src1,src2,src3,src4,src5,src6,src7,src8;\
        int src9,src10,src11,src12,src13,src14,src15,src16;\
        src0 = *s; s+=srcStride; \
        src1 = *s; s+=srcStride; \
        src2 = *s; s+=srcStride; \
        src3 = *s; s+=srcStride; \
        src4 = *s; s+=srcStride; \
        OP(*d, (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));d+=dstStride;\
        src5 = *s; s+=srcStride; \
        OP(*d, (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));d+=dstStride;\
        src6 = *s; s+=srcStride; \
        OP(*d, (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));d+=dstStride;\
        src7 = *s; s+=srcStride; \
        OP(*d, (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));d+=dstStride;\
        src8 = *s; s+=srcStride; \
        OP(*d, (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));d+=dstStride;\
        src9 = *s; s+=srcStride; \
        OP(*d, (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));d+=dstStride;\
        src10 = *s; s+=srcStride; \
        OP(*d, (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));d+=dstStride;\
        src11 = *s; s+=srcStride; \
        OP(*d, (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));d+=dstStride;\
        src12 = *s; s+=srcStride; \
        OP(*d, (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));d+=dstStride;\
        src13 = *s; s+=srcStride; \
        OP(*d, (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));d+=dstStride;\
        src14 = *s; s+=srcStride; \
        OP(*d, (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));d+=dstStride;\
        src15 = *s; s+=srcStride; \
        OP(*d, (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));d+=dstStride;\
        OP(*d, (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));d+=dstStride;\
        OP(*d, (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));d+=dstStride;\
        OP(*d, (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));d+=dstStride;\
        OP(*d, (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_aligned(dst, full, half, stride, 16, 8, 8);\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_aligned(dst, full+16, half, stride, 16, 8, 8);\
static void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
static void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
static void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_aligned(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_aligned0(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfH, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_aligned(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_aligned1(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_aligned2(dst, src, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_aligned2(dst, src+1, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_aligned(dst, full, half, stride, 24, 16, 16);\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_aligned(dst, full+24, half, stride, 24, 16, 16);\
static void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_aligned(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
static void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_aligned0(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
static void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_aligned(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_aligned0(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfH, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_aligned(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_aligned(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_aligned1(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg_no_rnd
#undef op_put_no_rnd
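/*
 * H264_LOWPASS() builds the H.264 luma half-pel filters from the 6-tap
 * (1, -5, 20, 20, -5, 1) kernel.  The h/v passes renormalize with
 * "+16 >> 5"; the hv pass keeps the first pass in a 16-bit tmp[] buffer
 * and applies OP2 ("+512 >> 10") after the second pass.
 */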
#define H264_LOWPASS(OPNAME, OP, OP2) \
static inline void OPNAME ## h264_qpel_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2;\
        OP(dst[0], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        if (w>4) { /* w is a constant per caller, so this test is optimized away */ \
            int src7,src8,src9,src10; \
            OP(dst[4], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
            OP(dst[5], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
            OP(dst[6], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
            OP(dst[7], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
            int src11,src12,src13,src14,src15,src16,src17,src18; \
            OP(dst[8] , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
            OP(dst[9] , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
            OP(dst[10], (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
            OP(dst[11], (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
            OP(dst[12], (src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
            OP(dst[13], (src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
            OP(dst[14], (src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
            OP(dst[15], (src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
static inline void OPNAME ## h264_qpel_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride,int w,int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2*srcStride,*d=dst;\
        srcB = *s; s+=srcStride;\
        srcA = *s; s+=srcStride;\
        src0 = *s; s+=srcStride;\
        src1 = *s; s+=srcStride;\
        src2 = *s; s+=srcStride;\
        src3 = *s; s+=srcStride;\
        OP(*d, (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));d+=dstStride;\
        src4 = *s; s+=srcStride;\
        OP(*d, (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));d+=dstStride;\
        src5 = *s; s+=srcStride;\
        OP(*d, (src2+src3)*20 - (src1+src4)*5 + (src0+src5));d+=dstStride;\
        src6 = *s; s+=srcStride;\
        OP(*d, (src3+src4)*20 - (src2+src5)*5 + (src1+src6));d+=dstStride;\
            int src7,src8,src9,src10; \
            src7 = *s; s+=srcStride;\
            OP(*d, (src4+src5)*20 - (src3+src6)*5 + (src2+src7));d+=dstStride;\
            src8 = *s; s+=srcStride;\
            OP(*d, (src5+src6)*20 - (src4+src7)*5 + (src3+src8));d+=dstStride;\
            src9 = *s; s+=srcStride;\
            OP(*d, (src6+src7)*20 - (src5+src8)*5 + (src4+src9));d+=dstStride;\
            src10 = *s; s+=srcStride;\
            OP(*d, (src7+src8)*20 - (src6+src9)*5 + (src5+src10));d+=dstStride;\
            int src11,src12,src13,src14,src15,src16,src17,src18; \
            src11 = *s; s+=srcStride;\
            OP(*d , (src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));d+=dstStride;\
            src12 = *s; s+=srcStride;\
            OP(*d , (src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));d+=dstStride;\
            src13 = *s; s+=srcStride;\
            OP(*d, (src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));d+=dstStride;\
            src14 = *s; s+=srcStride;\
            OP(*d, (src11+src12)*20 - (src10+src13)*5 + (src9 +src14));d+=dstStride;\
            src15 = *s; s+=srcStride;\
            OP(*d, (src12+src13)*20 - (src11+src14)*5 + (src10+src15));d+=dstStride;\
            src16 = *s; s+=srcStride;\
            OP(*d, (src13+src14)*20 - (src12+src15)*5 + (src11+src16));d+=dstStride;\
            src17 = *s; s+=srcStride;\
            OP(*d, (src14+src15)*20 - (src13+src16)*5 + (src12+src17));d+=dstStride;\
            src18 = *s; s+=srcStride;\
            OP(*d, (src15+src16)*20 - (src14+src17)*5 + (src13+src18));d+=dstStride;\
static inline void OPNAME ## h264_qpel_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride,int w,int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    src -= 2*srcStride;\
        int srcB,srcA,src0,src1,src2,src3,src4,src5,src6;\
        uint8_t *s = src-2;\
        tmp[0] = ((src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        tmp[1] = ((src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        tmp[2] = ((src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        tmp[3] = ((src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        if (w>4) { /* w is a constant per caller, so this test is optimized away */ \
            int src7,src8,src9,src10; \
            tmp[4] = ((src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
            tmp[5] = ((src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
            tmp[6] = ((src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
            tmp[7] = ((src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
            int src11,src12,src13,src14,src15,src16,src17,src18; \
            tmp[8] = ((src8 +src9 )*20 - (src7 +src10)*5 + (src6 +src11));\
            tmp[9] = ((src9 +src10)*20 - (src8 +src11)*5 + (src7 +src12));\
            tmp[10] = ((src10+src11)*20 - (src9 +src12)*5 + (src8 +src13));\
            tmp[11] = ((src11+src12)*20 - (src10+src13)*5 + (src9 +src14));\
            tmp[12] = ((src12+src13)*20 - (src11+src14)*5 + (src10+src15));\
            tmp[13] = ((src13+src14)*20 - (src12+src15)*5 + (src11+src16));\
            tmp[14] = ((src14+src15)*20 - (src13+src16)*5 + (src12+src17));\
            tmp[15] = ((src15+src16)*20 - (src14+src17)*5 + (src13+src18));\
    tmp -= tmpStride*(h+5-2);\
        int tmpB,tmpA,tmp0,tmp1,tmp2,tmp3,tmp4,tmp5,tmp6;\
        int16_t *s = tmp-2*tmpStride; \
        tmpB = *s; s+=tmpStride;\
        tmpA = *s; s+=tmpStride;\
        tmp0 = *s; s+=tmpStride;\
        tmp1 = *s; s+=tmpStride;\
        tmp2 = *s; s+=tmpStride;\
        tmp3 = *s; s+=tmpStride;\
        OP2(*d, (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));d+=dstStride;\
        tmp4 = *s; s+=tmpStride;\
        OP2(*d, (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));d+=dstStride;\
        tmp5 = *s; s+=tmpStride;\
        OP2(*d, (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));d+=dstStride;\
        tmp6 = *s; s+=tmpStride;\
        OP2(*d, (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));d+=dstStride;\
            int tmp7,tmp8,tmp9,tmp10; \
            tmp7 = *s; s+=tmpStride;\
            OP2(*d, (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));d+=dstStride;\
            tmp8 = *s; s+=tmpStride;\
            OP2(*d, (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));d+=dstStride;\
            tmp9 = *s; s+=tmpStride;\
            OP2(*d, (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));d+=dstStride;\
            tmp10 = *s; s+=tmpStride;\
            OP2(*d, (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));d+=dstStride;\
            int tmp11,tmp12,tmp13,tmp14,tmp15,tmp16,tmp17,tmp18; \
            tmp11 = *s; s+=tmpStride;\
            OP2(*d , (tmp8 +tmp9 )*20 - (tmp7 +tmp10)*5 + (tmp6 +tmp11));d+=dstStride;\
            tmp12 = *s; s+=tmpStride;\
            OP2(*d , (tmp9 +tmp10)*20 - (tmp8 +tmp11)*5 + (tmp7 +tmp12));d+=dstStride;\
            tmp13 = *s; s+=tmpStride;\
            OP2(*d, (tmp10+tmp11)*20 - (tmp9 +tmp12)*5 + (tmp8 +tmp13));d+=dstStride;\
            tmp14 = *s; s+=tmpStride;\
            OP2(*d, (tmp11+tmp12)*20 - (tmp10+tmp13)*5 + (tmp9 +tmp14));d+=dstStride;\
            tmp15 = *s; s+=tmpStride;\
            OP2(*d, (tmp12+tmp13)*20 - (tmp11+tmp14)*5 + (tmp10+tmp15));d+=dstStride;\
            tmp16 = *s; s+=tmpStride;\
            OP2(*d, (tmp13+tmp14)*20 - (tmp12+tmp15)*5 + (tmp11+tmp16));d+=dstStride;\
            tmp17 = *s; s+=tmpStride;\
            OP2(*d, (tmp14+tmp15)*20 - (tmp13+tmp16)*5 + (tmp12+tmp17));d+=dstStride;\
            tmp18 = *s; s+=tmpStride;\
            OP2(*d, (tmp15+tmp16)*20 - (tmp14+tmp17)*5 + (tmp13+tmp18));d+=dstStride;\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,4,4); \
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,8,8); \
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_h_lowpass(dst,src,dstStride,srcStride,16,16); \
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,4,4); \
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,8,8); \
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel_v_lowpass(dst,src,dstStride,srcStride,16,16); \
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,4,4); \
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,8,8); \
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel_hv_lowpass(dst,tmp,src,dstStride,tmpStride,srcStride,16,16); \
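/*
 * H264_MC() instantiates the sixteen quarter-pel positions (mc00..mc33)
 * for one block size: full-pel copy, h/v half-pel, the hv position
 * (_mc22) through the 16-bit tmp buffer, and the remaining positions as
 * averages of two half-pel planes via the _l2_aligned helpers.
 */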
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src, half, stride, stride, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned2(dst, src+1, half, stride, stride, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_aligned(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
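/* The two-stage hv path accumulates 16-bit intermediates, hence the
   op2_* variants below renormalize with ">>10" instead of ">>5". */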
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
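/*
 * WMV2 half-pel filters: a 4-tap (-1, 9, 9, -1) kernel renormalized with
 * "+8 >> 4" through the crop table; for equal neighbours p the result is
 * (9*2p - 2p + 8) >> 4 = p, so flat areas pass through unchanged.
 */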
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
        dst[0]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
        dst[2]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
        dst[3]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
        dst[4]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
        dst[5]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
        dst[6]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
        dst[7]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
        int src_1,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9;
        uint8_t *s = src,*d = dst;
        src_1 = *(s-srcStride);
        src0 = *s; s+=srcStride;
        src1 = *s; s+=srcStride;
        src2 = *s; s+=srcStride;
        *d= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4]; d+=dstStride;
        src3 = *s; s+=srcStride;
        *d= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4]; d+=dstStride;
        src4 = *s; s+=srcStride;
        *d= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4]; d+=dstStride;
        src5 = *s; s+=srcStride;
        *d= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4]; d+=dstStride;
        src6 = *s; s+=srcStride;
        *d= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4]; d+=dstStride;
        src7 = *s; s+=srcStride;
        *d= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4]; d+=dstStride;
        src8 = *s; s+=srcStride;
        *d= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4]; d+=dstStride;
        *d= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4]; d+=dstStride;
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src, half, stride, stride, 8, 8);
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_aligned2(dst, src+1, half, stride, stride, 8, 8);
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_aligned(dst, halfV, halfHV, stride, 8, 8, 8);
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
, 8, 8);