/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#include <assert.h>
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr)&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
/* this code assumes that stride % 16 == 0 */
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
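/*
 * The two CHROMA_MC8 cores above compute one row of bilinear chroma
 * interpolation, eight pixels at a time.  A scalar sketch of one output
 * pixel (illustration only, not part of this file):
 *
 *     A = (8-x)*(8-y);  B = x*(8-y);  C = (8-x)*y;  D = x*y;
 *     dst[i] = (A*src[i] + B*src[i+1] +
 *               C*src[i+stride] + D*src[i+stride+1] + bias) >> 6;
 *
 * For H.264 the bias is 32 (plain rounding); the VC-1 "no rounding" variant
 * below adds 28 via add28().  The SIMPLE core covers the degenerate cases
 * x == 0 or y == 0, where only two of the four taps are nonzero.
 */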
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1), vec_splat_u16(5));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
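    /* vec_ld only loads from 16-byte aligned addresses, so an unaligned row
     * is assembled from two aligned loads with vec_lvsl/vec_perm.  loadSecond
     * is set when the 9 source bytes needed per row spill into the next
     * 16-byte block; reallyBadAlign marks src % 16 == 15, where
     * vec_lvsl(1, src) wraps around to 0 and the usual permute would pick the
     * wrong block, so the second aligned load is used directly instead. */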
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
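    /* Each row produces only 8 result bytes, but vec_st always writes a full
     * aligned 16-byte block.  ppsum therefore duplicates the 8 results in
     * both halves, and fperm merges them into whichever half of the existing
     * dst vector has to change (perm indices 0x00-0x0F keep old dst bytes,
     * 0x10-0x1F take result bytes). */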
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride +  0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride +  0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld( 0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
/* this code assumes that stride % 16 == 0 */
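/*
 * Same bilinear chroma filter as above, but with the VC-1 "no rounding"
 * bias: the scalar equivalent adds 28 (i.e. 32 - 4) instead of 32 before the
 * >> 6, which is what v28ss and add28() implement here.
 */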
void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                          int stride, int h, int x, int y) {
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1), vec_splat_u16(5)), vec_splat_s16(4));
    const vec_u16 v6us  = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride +  0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE
/* this code assumes stride % 16 == 0 */
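/*
 * Horizontal H.264 6-tap luma lowpass over a 16x16 block.  A scalar sketch
 * of one output pixel (illustration only; av_clip_uint8 stands in for the
 * saturating pack done by vec_packsu):
 *
 *     dst[i] = av_clip_uint8((src[i-2] - 5*src[i-1] + 20*src[i] +
 *                             20*src[i+1] - 5*src[i+2] + src[i+3] + 16) >> 5);
 *
 * The vector code evaluates this as 20*(P0+P1) - 5*(M1+P2) + (M2+P3) + 16,
 * which is what the sum/pp/psum temporaries below correspond to.
 */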
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
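    /* Each iteration reads the 21 bytes src-2 .. src+18.  srcR1/srcR2 are the
     * two aligned 16-byte blocks around src-2; the switch on
     * align = (src-2) % 16 in the loop picks, for each filter tap offset,
     * either a vec_perm of those two blocks, srcR2 itself (when that tap's
     * vector starts exactly on the second block), or a perm of srcR2 with a
     * third block srcR3. */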
    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 */
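/*
 * Vertical counterpart of the filter above: the same taps
 * (1, -5, 20, 20, -5, 1) are applied along each column, so six consecutive
 * rows (M2, M1, P0, P1, P2, P3) are kept unpacked as 16-bit values and slid
 * down by one row per iteration instead of being reloaded.
 */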
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src,
                                                 int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm   = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us  = vec_splat_u16(5);
    const vec_s16 v5ss  = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2  = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1  = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0  = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1  = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2  = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;
    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the six-row window down by one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);
        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
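/*
 * Combined 2D (horizontal then vertical) 6-tap filter.  The first pass runs
 * the horizontal filter over 21 rows and stores the unrounded, unshifted
 * 16-bit results in tmp; the second pass runs the vertical filter over tmp
 * in 32-bit precision.  A scalar sketch of the final normalization
 * (illustration only): dst = av_clip_uint8((vertical_6tap(tmp) + 512) >> 10).
 */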
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src,
                                                  int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss  = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui  = vec_splat_u32(10);
    const vec_s16 v5ss   = vec_splat_s16(5);
    const vec_s16 v1ss   = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui  = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
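    /* The second pass below multiplies with vec_mule/vec_mulo, which return
     * the even- and odd-lane 32-bit products in separate vectors; mperm
     * re-interleaves the two packed results back into pixel order before the
     * store. */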
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;
    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }
        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
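    /* Second pass: vertical 6-tap filter over the 16-bit intermediates in
     * tmp, using a six-row window (tmpM2 .. tmpP3) that is loaded from tmpbis
     * and slid down by one row per iteration. */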
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the six-row window of 16-bit intermediates down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
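        /* Multiplied by 20, the 16-bit intermediates can overflow, so the
         * rest of the pass works in 32-bit lanes: vec_mule/vec_mulo give the
         * even/odd products, sum3A/B are widened by shifting the even lanes
         * ((vec_s32) cast + vec_sra by 16) and multiplying the odd lanes by
         * one, and the +512 rounding term is added before the arithmetic
         * shift right by 10. */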
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);
        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}