/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mem.h"
/* this code assumes that stride % 16 == 0 */
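/*
 * Rough scalar sketch of what one 8-pixel row of the chroma MC cores below
 * computes (illustrative only, not code from the original file); the bias is
 * 32 for the H.264 functions and 28 for the no-rounding VC-1 variant, and the
 * final OP_U8_ALTIVEC step (store vs. average) is supplied by the including
 * file:
 *
 *     A = (8 - x) * (8 - y);  B = (    x) * (8 - y);
 *     C = (8 - x) * (    y);  D = (    x) * (    y);
 *     for (i = 0; i < 8; i++)
 *         dst[i] = OP((A * src[i]          + B * src[i + 1] +
 *                      C * src[i + stride] + D * src[i + stride + 1] +
 *                      bias) >> 6);
 */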
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;
#define noop(a) a
#define add28(a) vec_add(v28ss, a)
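/*
 * BIAS1/BIAS2 let the same core serve both rounding variants: the H.264
 * functions pass (v32ss, noop) for a +32 rounding bias, while the no-rounding
 * VC-1 function passes (vec_splat_s16(0), add28), so the bias folded in
 * before the >> 6 is 28 instead.
 */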
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
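    /*
     * src need not be 16-byte aligned: vec_ld only fetches aligned 16-byte
     * blocks, so each row is realigned with vec_lvsl/vec_perm.  loadSecond
     * flags offsets where the 9 source bytes needed per row spill into the
     * next aligned block; reallyBadAlign (src % 16 == 15) is the case where
     * the +1-shifted row comes entirely from that second block.
     */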
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
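    /*
     * Each row produces only 8 bytes, but vec_st always writes a full
     * aligned 16-byte vector: fperm merges the packed result (ppsum) into
     * one half of the existing destination vector (vdst), picked according
     * to whether dst itself is 16-byte aligned.
     */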
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);
    if (ABCD[3]) {
        if (!loadSecond) {// -> !reallyBadAlign
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride +  0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) { // x == 0 B == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride +  0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else { // y == 0 C == 0
            if (!loadSecond) {// -> !reallyBadAlign
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld( 0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif
/* this code assumes that stride % 16 == 0 */
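/* Same structure as the function above, but for the no-rounding VC-1 chroma
 * MC: the bias folded in before the shift is 28 (via add28) instead of 32. */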
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                                 int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond     = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;
    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }
    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);
    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride +  0, src);
            vsrcDuc = vec_ld(stride + 16, src);

            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE