libavcodec/ppc/h264_template_altivec.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
/* Assert that ptr is 16-byte aligned when DEBUG_ALIGNMENT is enabled. */
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
/* this code assumes that stride % 16 == 0 */
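
/*
 * CHROMA_MC8_ALTIVEC_CORE produces one 8-pixel row of bilinear chroma
 * interpolation.  With A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y and
 * D = x*y it computes, roughly, the scalar equivalent below (a sketch
 * for reference only, not part of the build; OP is supplied by the
 * including file as a plain store for the "put" variant or an average
 * with dst for the "avg" variant):
 *
 *     for (j = 0; j < 8; j++)
 *         OP(dst[j], (A * src[j]          + B * src[j + 1] +
 *                     C * src[j + stride] + D * src[j + stride + 1] +
 *                     BIAS) >> 6);
 *
 * BIAS1/BIAS2 let the caller choose how the rounding constant is added:
 * BIAS1 is folded into the first vec_mladd, BIAS2 is applied to the
 * finished sum (see noop/add28 below).  The macro also rotates the
 * bottom row (vsrc2/3) into the top row (vsrc0/1) for the next line.
 */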
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
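
/*
 * Two-tap variant used when x == 0 or y == 0: only vA and one of vB/vC
 * are non-zero, so the caller folds them into vE = vB + vC (the zero
 * term drops out) and one pair of multiply-adds per row is enough.
 */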
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
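
/*
 * The H.264 entry point below instantiates the core as
 * CHROMA_MC8_ALTIVEC_CORE(v32ss, noop): the rounding constant 32 rides
 * along as the addend of the first multiply-add and the sum is then
 * shifted right by 6.  The VC-1 "no rounding" entry point further down
 * uses CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28) instead, i.e.
 * no bias during the multiply-adds and +28 added just before the shift.
 */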
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
    POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
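
    /* Each output row needs the 9 source bytes src[0..8].  If src lands
     * in the first half of a 16-byte line (src % 16 <= 7) a single
     * aligned vec_ld covers all of them; otherwise a second load from
     * the next line is required (loadSecond).  When src % 16 == 15,
     * vec_lvsl(1, src) wraps around to offset 0, so the usual
     * two-vector permute would pick the wrong bytes; in that case
     * (reallyBadAlign) the "src + 1" vector is simply the second load
     * itself. */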
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    /* Only 8 of the 16 bytes in the destination line are written; fperm
     * merges the 8 freshly computed bytes with the 8 untouched bytes of
     * vdst, depending on whether dst sits at offset 0 or 8 of its
     * 16-byte line. */
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
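
    /* ABCD[3] = x*y: non-zero means both x and y are non-zero and the
     * full four-tap bilinear core is needed.  Otherwise either B
     * (x == 0) or C (y == 0) vanishes and the cheaper two-tap core is
     * used with vE = vB + vC. */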
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}
/* this code assumes that stride % 16 == 0 */
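/*
 * VC-1 "no rounding" variant: same bilinear interpolation as above, but
 * the core is instantiated with a zero bias and add28, so the sum is
 * rounded with +28 (v28ss = 32 - 4) instead of +32 before the shift by
 * 6.
 */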
void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE
/* this code assumes that stride % 16 == 0 */
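/*
 * Horizontal 6-tap luma lowpass (half-pel) filter.  For each of the 16
 * output pixels per row this is, roughly, the scalar code below (a
 * sketch for reference only; OP_U8_ALTIVEC supplies the final put or
 * avg, and the pack saturates to 0..255):
 *
 *     dst[j] = (       src[j - 2] -  5 * src[j - 1] +
 *               20 * ( src[j]     +      src[j + 1]) -
 *                5 *   src[j + 2] +      src[j + 3]  + 16) >> 5;
 *
 * The vector version splits each 16-pixel row into two 8x16-bit halves
 * (the *A/*B variables) so the arithmetic can stay in 16 bits.
 */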
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);
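
        /* The six shifted source vectors cover bytes src-2 .. src+18,
         * i.e. 21 consecutive bytes.  For align <= 10 the two loads
         * above cover everything and plain permutes work.  Cases 11..15
         * are special: one or more of the shifted vectors starts exactly
         * at offset 0 of the next 16-byte line, where vec_lvsl wraps to
         * zero and the permute would pick the wrong input, so that
         * vector is taken directly from the raw load; cases 12..15 also
         * need a third load (srcR3). */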
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
/* this code assumes that stride % 16 == 0 */
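/*
 * Vertical 6-tap luma lowpass filter: the same (1, -5, 20, 20, -5, 1)
 * kernel as the horizontal pass, applied down each column.  The six
 * source rows are kept in registers (srcM2..srcP2 plus the freshly
 * loaded srcP3) and rotated at the end of every iteration, so each
 * output row costs only one new load.
 */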
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);
    //srcbis += srcStride;
    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);
    //srcbis += srcStride;
    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);
    //srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);
        //srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }

    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
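/*
 * Combined horizontal + vertical (hv) half-pel filter.  A first pass
 * runs the horizontal 6-tap filter over 21 rows (16 output rows plus
 * 2 above and 3 below) and stores the unrounded 16-bit sums in tmp.
 * A second pass applies the same 6-tap kernel vertically to tmp; since
 * those intermediates can exceed the 8-bit range considerably, the
 * vertical multiplies are widened to 32 bits (even/odd lanes) and the
 * final result is rounded with +512 and shifted right by 10.  Roughly,
 * per output pixel (a sketch for reference only; s* are the six
 * horizontal neighbours, t* the six vertical neighbours in tmp):
 *
 *     tmp[j] =  20*(s0 + s1) - 5*(sm1 + s2) + (sm2 + s3);
 *     dst[j] = (20*(t0 + t1) - 5*(tm1 + t2) + (tm2 + t3) + 512) >> 10;
 *
 * with saturation to 0..255 and the put/avg OP applied at the end.
 */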
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
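
    /* Second pass: run the 6-tap filter vertically over the 16-bit
     * intermediates in tmp.  tmpM2..tmpP2 hold the first five of the
     * six rows needed for an output row; the sixth (tmpP3) is loaded
     * inside the loop, and the registers are rotated every iteration
     * just like the source rows in the vertical-only filter. */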
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);
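
        /* The vec_mule/vec_mulo calls above produce 32-bit products of
         * the even/odd 16-bit lanes, so the rest of the sum is carried
         * out as separate even and odd halves.  sum3 only needs
         * widening, not scaling: reinterpreting it as 32-bit words and
         * shifting right by 16 sign-extends its even lanes, while the
         * multiply by 1 widens the odd ones. */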
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
    POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}