/*
 * AltiVec-optimized snow DSP utils
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "libavcodec/snow.h"

#include "gcc_fixes.h"
#include "dsputil_altivec.h"

//FIXME remove this replication
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))

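/* Lazily materialize one line of the slice buffer: return the cached line if
 * it is already present, otherwise pop a free buffer from the data stack and
 * register it as the backing store for that line. This duplicates the helper
 * of the same name in snow.c, hence the FIXME above. */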
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    int offset;
    DWTELEM * buffer;

//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);

    assert(buf->data_stack_top >= 0);
//  assert(!buf->line[line]);
    if (buf->line[line])
        return buf->line[line];

    offset = buf->line_width * line;
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);

    return buffer;
}

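/* Horizontal inverse 9/7-style lifting for one line: four lifting passes over
 * the low/high halves of b[] (the scalar reference code is kept in the
 * comments and #if 0 blocks, the unaligned tails go through the
 * snow_horizontal_compose_*lead_out() helpers from snow.h), followed by an
 * interleave of the two halves back into b[]. */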
void ff_snow_horizontal_compose97i_altivec(IDWTELEM *b, int width)
{
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(IDWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed short t1, t2, x, y, tmp1, tmp2;
    vector signed short *vbuf, *vtmp;
    vector unsigned char align;

    { // Lift 0
        IDWTELEM * const ref = b + w2 - 1;
        IDWTELEM b_0 = b[0];
        vector signed short v7 = vec_splat_s16(7);
        vbuf = (vector signed short *)b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1 = vec_perm(tmp1, tmp2, align);

        for (i=0; i<w_l-15; i+=16) {
        /*  b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
            b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
            b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
            b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);*/
            b[i+0] = b[i+0] + ((7 * (ref[i+0] + ref[i+1])-1) >> 8);

            tmp1 = vec_ld (0, ref+8+i);
            tmp2 = vec_ld (15, ref+8+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1, vec_sld(t1,t2,2));
//          y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+12+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub(*vbuf, y);

            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+12+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub(*vbuf, y);

            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+16+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub(*vbuf, y);

            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            *vbuf = vec_sub(*vbuf, y);

            t1 = t2;
            vbuf++;
        }

        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);

        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_r-3; i+=4) {
#if 0
            dst[i]   = dst[i]   - (b[i]   + b[i + 1]);
            dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
            dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
            dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else
            tmp1 = vec_ld(0, b+4+i);
            tmp2 = vec_ld(15, b+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1, vec_sld(t1,t2,4));
            *vbuf = vec_sub(*vbuf, y);

            t1 = t2;
            vbuf++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];

        i = 0;
        vbuf = (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_l-15; i+=16) {
#if 0
            b[i]   = b[i]   - (((8 -(ref[i]   + ref[i+1])) - (b[i]  <<2)) >> 4);
            b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
            b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
            b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+8+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+12+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+16+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;
            vbuf++;
#endif
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }

    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        for (; i<w_r-3; i+=4) {
#if 0
            temp[i]   = src[i]   - ((-3*(b[i]   + b[i+1]))>>1);
            temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
            temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
            temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            t1 = vec_sub(vec_splat_s32(0),t1); //bad!
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1,tmp2,align);
            t1 = vec_sra(t1,vec_splat_u32(1));

            *vtmp = vec_sub(t2,t1);

            vtmp++;
            vbuf++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }

    {
        // Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            a = i/4;

            v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
            v[a]=vec_mergeh(v[a>>1],t[a>>1]);
        }
    }
}

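/* Vertical counterpart: one call applies the four lifting steps across six
 * consecutive lines b0..b5 for a whole row. The main loop works on vectors of
 * four 32-bit coefficients; the width%4 tail is handled by the scalar loop
 * using the W_* lifting constants. */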
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    int i;
    vector signed int *v0, *v1,*v2,*v3,*v4,*v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< width/4; i++) {
#if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
        b3[i] -= ((b2[i] + b4[i]));
        b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
        b1[i] += (3*(b0[i] + b2[i]))>>1;
#else
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));
        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));
#endif
    }

    for(i*=4; i < width; i++)
    {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}

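/* Helpers shared by the inner_add_yblock variants below: LOAD_BLOCKS fetches
 * one row of each of the four prediction blocks into b0..b3, LOAD_OBMCS
 * fetches the matching rows of the four OBMC weight tables into ob1..ob4,
 * both using the usual vec_ld/vec_lvsl/vec_perm sequence to handle unaligned
 * sources. */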
#define LOAD_BLOCKS \
    tmp1 = vec_ld(0, &block[3][y*src_stride]);\
    align = vec_lvsl(0, &block[3][y*src_stride]);\
    tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
    b3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[2][y*src_stride]);\
    align = vec_lvsl(0, &block[2][y*src_stride]);\
    tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
    b2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[1][y*src_stride]);\
    align = vec_lvsl(0, &block[1][y*src_stride]);\
    tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
    b1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[0][y*src_stride]);\
    align = vec_lvsl(0, &block[0][y*src_stride]);\
    tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
    b0 = vec_perm(tmp1,tmp2,align);

#define LOAD_OBMCS \
    tmp1 = vec_ld(0, obmc1);\
    align = vec_lvsl(0, obmc1);\
    tmp2 = vec_ld(15, obmc1);\
\
    ob1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc2);\
    align = vec_lvsl(0, obmc2);\
    tmp2 = vec_ld(15, obmc2);\
\
    ob2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc3);\
    align = vec_lvsl(0, obmc3);\
    tmp2 = vec_ld(15, obmc3);\
\
    ob3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc4);\
    align = vec_lvsl(0, obmc4);\
    tmp2 = vec_ld(15, obmc4);\
\
    ob4 = vec_perm(tmp1,tmp2,align);

/*
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 */

#define STEPS_0_1\
    h1 = (vector unsigned short)\
         vec_mergeh(ob1, ob2);\
\
    h2 = (vector unsigned short)\
         vec_mergeh(ob3, ob4);\
\
    ih = (vector unsigned char)\
         vec_mergeh(h1, h2);\
\
    l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
    ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
    l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
    il = (vector unsigned char) vec_mergeh(l1, l2);\
\
    v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
    v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

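/* STEPS_0_1 (and STEPS_2_3 further down) do the actual weighting: OBMC
 * weights and block pixels are interleaved with vec_mergeh/vec_mergel as
 * sketched above, so that one vec_msum sums the four weight*pixel products of
 * each output pixel into a 32-bit accumulator in v[]. */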
#define FINAL_STEP_SCALAR\
        for(x=0; x<b_w; x++)\
            if(add){\
                vbuf[x] += dst[x + src_x];\
                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
                dst8[x + y*src_stride] = vbuf[x];\
            }else{\
                dst[x + src_x] -= vbuf[x];\
            }

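/* inner_add_yblock_*: OBMC reconstruction of one block of width b_w (8 or 16)
 * and height b_h. For every line the four weighted predictions are summed
 * into vbuf[]/v[], then either added to the slice buffer line, rounded,
 * clamped and stored to dst8 (add != 0), or subtracted from the slice buffer,
 * matching the C ff_snow_inner_add_yblock. The "_a_" variants further down
 * appear to be the destination-aligned versions and use the vectorized final
 * step. */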
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                  const int obmc_stride,
                                                  uint8_t * * block, int b_w,
                                                  int b_h, int src_x, int src_y,
                                                  int src_stride, slice_buffer * sb,
                                                  int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR
    }
}

#define STEPS_2_3\
    h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
    h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
    ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
    l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
    ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
    il = (vector unsigned char) vec_mergeh(l1,l2);\
\
    v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
    v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
                                                   uint8_t * * block, int b_w,
                                                   int b_h, int src_x, int src_y,
                                                   int src_stride, slice_buffer * sb,
                                                   int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_SCALAR
    }
}

#define FINAL_STEP_VEC \
\
    if(add)\
        {\
            for(x=0; x<b_w/4; x++)\
            {\
                v[x] = vec_add(v[x], d[x]);\
                v[x] = vec_sra(vec_add(v[x],\
                                       vec_sl( vec_splat_s32(1),\
                                               vec_splat_u32(7))),\
                               vec_splat_u32(8));\
\
                mask = (vector bool int) vec_sl((vector signed int)\
                        vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
                mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\
\
                mask = (vector bool int)\
                        vec_cmpeq((vector signed int)mask,\
                                  (vector signed int)vec_splat_u32(0));\
\
                vs = vec_sra(v[x],vec_splat_u32(8));\
                vs = vec_sra(v[x],vec_splat_u32(8));\
                vs = vec_sra(v[x],vec_splat_u32(15));\
\
                vs = vec_nor(vs,vs);\
\
                v[x]= vec_sel(v[x],vs,mask);\
            }\
\
            for(x=0; x<b_w; x++)\
                dst8[x + y*src_stride] = vbuf[x];\
\
        }\
    else\
        for(x=0; x<b_w/4; x++)\
            d[x] = vec_sub(d[x], v[x]);

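/* FINAL_STEP_VEC is the vectorized counterpart of FINAL_STEP_SCALAR used by
 * the "_a_" variants: for add it accumulates d[x] into v[x], rounds, and
 * applies a mask/vec_sel construction intended to mirror the scalar clamp
 * if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31); for the non-add case it
 * subtracts v[x] from the slice buffer directly. */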
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                    const int obmc_stride,
                                                    uint8_t * * block, int b_w,
                                                    int b_h, int src_x, int src_y,
                                                    int src_stride, slice_buffer * sb,
                                                    int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        //FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_VEC
    }
}

static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                     const int obmc_stride,
                                                     uint8_t * * block, int b_w,
                                                     int b_h, int src_x, int src_y,
                                                     int src_stride, slice_buffer * sb,
                                                     int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_VEC
    }
}

void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    if (src_x&15) {
        if (b_w == 16)
            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                   b_w, b_h, src_x, src_y,
                                                   src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                  b_w, b_h, src_x, src_y,
                                                  src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    } else {
        if (b_w == 16)
            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                     b_w, b_h, src_x, src_y,
                                                     src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                    b_w, b_h, src_x, src_y,
                                                    src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    }
}

void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
}