/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
//#define DEBUG_ALIGNMENT
#ifdef DEBUG_ALIGNMENT
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)ptr&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif
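
/* When DEBUG_ALIGNMENT is enabled, ASSERT_ALIGNED aborts if the pointer is not
 * 16-byte aligned; otherwise it compiles to nothing. The AltiVec vec_ld/vec_st
 * used below silently truncate addresses to a 16-byte boundary, so a misaligned
 * dst would corrupt neighbouring pixels instead of faulting. */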
/* this code assumes that stride % 16 == 0 */
#define CHROMA_MC8_ALTIVEC_CORE \
        vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);\
        vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8_t)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
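
/* CHROMA_MC8_ALTIVEC_CORE evaluates one row of the H.264 eighth-pel chroma
 * interpolation: with A=(8-x)*(8-y), B=x*(8-y), C=(8-x)*y and D=x*y it computes
 *   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
 * for 8 pixels at a time (v32ss is the rounding constant, v6us the shift count).
 * Roughly, in scalar form (illustrative sketch only, not part of the build):
 *
 *   for (i = 0; i < 8; i++)
 *       dst[i] = (A*s0[i] + B*s0[i+1] + C*s1[i] + D*s1[i+1] + 32) >> 6;
 *
 * vec_pack narrows the 16-bit results and fperm merges them into the correct
 * half of the 16-byte destination vector before the store. */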
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);\
        vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8_t)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
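
/* CHROMA_MC8_ALTIVEC_CORE_SIMPLE handles the degenerate cases where x == 0 or
 * y == 0: two of the four bilinear weights are zero, so each row reduces to a
 * two-tap filter.  vE = vB + vC collapses to whichever of B or C is non-zero,
 * and only two source vectors are needed per output row. */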
void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                    int stride, int h, int x, int y) {
  POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8_t fperm;
    const vec_s32_t vABCD = vec_ld(0, ABCD);
    const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
    const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
    const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
    const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
    LOAD_ZERO;
    const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16_t v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
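    /* vec_ld only fetches from 16-byte aligned addresses, so unaligned rows are
     * reassembled with vec_lvsl/vec_perm from one or two aligned loads.
     * loadSecond is set when the nine source bytes needed per row straddle a
     * 16-byte boundary and a second load is required; reallyBadAlign flags the
     * src % 16 == 15 case, where the row shifted by one byte starts exactly at
     * the second quadword and its vec_perm can be skipped. */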

    vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vec_u8_t vsrc0uc, vsrc1uc;
    vec_s16_t vsrc0ssH, vsrc1ssH;
    vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16_t vsrc2ssH, vsrc3ssH, psum;
    vec_u8_t vdst, ppsum, vfdst, fsum;

  POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
                              0x14, 0x15, 0x16, 0x17,
                              0x08, 0x09, 0x0A, 0x0B,
                              0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
                              0x04, 0x05, 0x06, 0x07,
                              0x18, 0x19, 0x1A, 0x1B,
                              0x1C, 0x1D, 0x1E, 0x1F);
    }
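    /* Only 8 output bytes are produced per row, but vec_st always writes a full
     * 16-byte vector.  fperm, chosen above from the alignment of dst, is the
     * vec_perm pattern that splices the 8 new bytes into one half of the
     * destination vector that was just loaded, so the neighbouring 8 bytes are
     * written back unchanged. */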

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
    vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        } else {
            vec_u8_t vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE
            }
        }
    } else {
        const vec_s16_t vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8_t vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8_t vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
}

#undef CHROMA_MC8_ALTIVEC_CORE
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8_t sum, vdst, fsum;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
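
    /* Each iteration filters one 16-pixel row with the H.264 6-tap luma kernel
     * (1, -5, 20, 20, -5, 1):
     *   out[i] = clip8((src[i-2] + src[i+3]
     *                   - 5*(src[i-1] + src[i+2])
     *                   + 20*(src[i] + src[i+1]) + 16) >> 5)
     * The 16 pixels are handled as two vectors of eight 16-bit values (the "A"
     * and "B" halves); vec_packsu performs the final clip to 0..255. */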

    for (i = 0 ; i < 16 ; i ++) {
      vec_u8_t srcR1 = vec_ld(-2, src);
      vec_u8_t srcR2 = vec_ld(14, src);

      switch (align) {
      default: {
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = vec_perm(srcR1, srcR2, permP3);
      } break;
      case 11: {
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = vec_perm(srcR1, srcR2, permP2);
        srcP3 = srcR2;
      } break;
      case 12: {
        vec_u8_t srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = vec_perm(srcR1, srcR2, permP1);
        srcP2 = srcR2;
        srcP3 = vec_perm(srcR2, srcR3, permP3);
      } break;
      case 13: {
        vec_u8_t srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = vec_perm(srcR1, srcR2, permP0);
        srcP1 = srcR2;
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
      } break;
      case 14: {
        vec_u8_t srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = vec_perm(srcR1, srcR2, permM1);
        srcP0 = srcR2;
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
      } break;
      case 15: {
        vec_u8_t srcR3 = vec_ld(30, src);
        srcM2 = vec_perm(srcR1, srcR2, permM2);
        srcM1 = srcR2;
        srcP0 = vec_perm(srcR2, srcR3, permP0);
        srcP1 = vec_perm(srcR2, srcR3, permP1);
        srcP2 = vec_perm(srcR2, srcR3, permP2);
        srcP3 = vec_perm(srcR2, srcR3, permP3);
      } break;
      }
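
      /* At this point srcM2..srcP3 hold the same 16 pixels re-read at byte
       * offsets -2..+3; the switch above only picks the cheapest way to build
       * them from the aligned quadword loads for the given source alignment. */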

      srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
      srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
      srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
      srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

      srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
      srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
      srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
      srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

      srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
      srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
      srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
      srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

      sum1A = vec_adds(srcP0A, srcP1A);
      sum1B = vec_adds(srcP0B, srcP1B);
      sum2A = vec_adds(srcM1A, srcP2A);
      sum2B = vec_adds(srcM1B, srcP2B);
      sum3A = vec_adds(srcM2A, srcP3A);
      sum3B = vec_adds(srcM2B, srcP3B);

      pp1A = vec_mladd(sum1A, v20ss, v16ss);
      pp1B = vec_mladd(sum1B, v20ss, v16ss);

      pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
      pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

      pp3A = vec_add(sum3A, pp1A);
      pp3B = vec_add(sum3B, pp1B);

      psumA = vec_sub(pp3A, pp2A);
      psumB = vec_sub(pp3B, pp2B);

      sumA = vec_sra(psumA, v5us);
      sumB = vec_sra(psumB, v5us);

      sum = vec_packsu(sumA, sumB);

      ASSERT_ALIGNED(dst);
      vdst = vec_ld(0, dst);

      OP_U8_ALTIVEC(fsum, sum, vdst);

      vec_st(fsum, 0, dst);

      src += srcStride;
      dst += dstStride;
    }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 */
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t perm = vec_lvsl(0, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16_t v5us = vec_splat_u16(5);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);
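    /* The vertical filter needs rows -2..+3 around each output row.  The five
     * upper rows are loaded once before the loop (srcM2..srcP2, split into
     * 16-bit "A"/"B" halves); each iteration then loads only the new bottom
     * row (srcP3) and slides the window down by one row. */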

    const vec_u8_t srcM2a = vec_ld(0, srcbis);
    const vec_u8_t srcM2b = vec_ld(16, srcbis);
    const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
    // srcbis += srcStride;
    const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcM1b = vec_ld(16, srcbis);
    const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
    // srcbis += srcStride;
    const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP0b = vec_ld(16, srcbis);
    const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
    // srcbis += srcStride;
    const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP1b = vec_ld(16, srcbis);
    const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
    // srcbis += srcStride;
    const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8_t srcP2b = vec_ld(16, srcbis);
    const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
    // srcbis += srcStride;

    vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
    vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
    vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
    vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
    vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
    vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
    vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
    vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
    vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
    vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

    vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8_t sum, vdst, fsum, srcP3a, srcP3b, srcP3;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
        // srcbis += srcStride;

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        /* slide the five-row window down by one row for the next iteration */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
}
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);

    register int i;

    LOAD_ZERO;
    const vec_u8_t permM2 = vec_lvsl(-2, src);
    const vec_u8_t permM1 = vec_lvsl(-1, src);
    const vec_u8_t permP0 = vec_lvsl(+0, src);
    const vec_u8_t permP1 = vec_lvsl(+1, src);
    const vec_u8_t permP2 = vec_lvsl(+2, src);
    const vec_u8_t permP3 = vec_lvsl(+3, src);
    const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32_t v10ui = vec_splat_u32(10);
    const vec_s16_t v5ss = vec_splat_s16(5);
    const vec_s16_t v1ss = vec_splat_s16(1);
    const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8_t mperm = (const vec_u8_t)
        AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
            0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
    int16_t *tmpbis = tmp;

    vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8_t fsum, sumv, sum, vdst;
    vec_s16_t ssume, ssumo;

  POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
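
    /* Two-pass half-pel filter: pass 1 runs the horizontal 6-tap kernel over
     * 16 + 5 = 21 rows and stores the unclipped, unrounded 16-bit intermediates
     * in tmp (no +16, no >>5 yet); pass 2 applies the same kernel vertically to
     * tmp with 32-bit accumulation, adds the combined rounding constant 512 and
     * shifts right by 10 before the final clip to 0..255. */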
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8_t srcR1 = vec_ld(-2, src);
        vec_u8_t srcR2 = vec_ld(14, src);

        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8_t srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }
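
    /* Pass 2: the intermediates for rows -2..+2 are preloaded from tmp below,
     * then each iteration reads one new row, applies the 6-tap kernel
     * vertically and narrows the 32-bit sums back to pixels. */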

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        /* slide the five-row window of intermediates down by one row */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);
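
        /* vec_mule/vec_mulo split each half into even and odd 32-bit lanes, so
         * ssume/ssumo hold the even- and odd-indexed results separately; mperm
         * re-interleaves them back into pixel order after the saturating packs. */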

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
  POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
}