/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
20 #include "../dsputil.h"
21 #include "dsputil_altivec.h"
24 #include <sys/sysctl.h>
/**
 * Sum of absolute differences between a 16-pixel-wide block of pix1 and
 * the horizontally interpolated block of pix2.
 *
 * For every row, each pix1[x] is compared against
 * vec_avg(pix2[x], pix2[x + 1]) for x = 0..15 (vec_avg rounds up).
 *
 * @param pix1      first block, may be unaligned
 * @param pix2      second block, may be unaligned; pix2[16] of each row is read
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of absolute differences
 *
 * NOTE(review): the declarations of s/i, the row loop, the pointer
 * advances and the return statement were missing from the damaged
 * listing; 16 rows is inferred from the function name and from the
 * explicit 16-iteration loops elsewhere in this file -- confirm.
 */
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char *tv, zero;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    zero = vec_splat_u8(0);
    sad = vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector (max - min avoids
           needing a signed type for unsigned bytes) */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between a 16-pixel-wide block of pix1 and
 * the vertically interpolated block of pix2.
 *
 * For every row, each pix1[x] is compared against
 * vec_avg(pix2[x], pix2[x + line_size]) for x = 0..15 (vec_avg rounds up).
 *
 * @param pix1      first block, may be unaligned
 * @param pix2      second block, may be unaligned; one extra row is read
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of absolute differences
 *
 * NOTE(review): the declarations of s/i, the row loop, the pointer
 * advances, the pix2v = pix3v carry-over and the return statement were
 * missing from the damaged listing and are reconstructed from the
 * surviving comments; 16 rows is inferred from the function name -- confirm.
 */
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char *tv, zero;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    zero = vec_splat_u8(0);
    sad = vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        /* This row's lower line is the next row's upper line */
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between a 16-pixel-wide block of pix1 and
 * the block of pix2 interpolated both horizontally and vertically.
 *
 * For every row, each pix1[x] is compared against
 * (pix2[x] + pix2[x + 1] + pix3[x] + pix3[x + 1] + 2) >> 2,
 * where pix3 = pix2 + line_size.
 *
 * @param pix1      first block, may be unaligned
 * @param pix2      second block, may be unaligned; one extra row and
 *                  one extra column are read
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of absolute differences
 *
 * NOTE(review): the declarations of s/i, the row loop, the pointer
 * advances, the t1/t2 carry-over and the return statement were missing
 * from the damaged listing and are reconstructed from the surviving
 * comments; 16 rows is inferred from the function name -- confirm.
 * NOTE(review): widening with vec_mergeh/vec_mergel(zero, x) places the
 * zero byte first in each pair, which looks like it assumes big-endian
 * element order -- confirm before building for little-endian targets.
 */
int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s = 0;
    uint8_t *pix3 = pix2 + line_size;
    vector unsigned char *tv, avgv, t5, zero;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv, two;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    zero = vec_splat_u8(0);
    two = vec_splat_u16(2);
    sad = vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < 16; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;

        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param pix1      first block, may be unaligned
 * @param pix2      second block, may be unaligned
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of |pix1[x] - pix2[x]| over all rows
 *
 * NOTE(review): the declarations of s/i, the row loop, the pointer
 * advances and the return statement were missing from the damaged
 * listing; 16 rows is inferred from the function name -- confirm.
 */
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad, zero;
    vector signed int sumdiffs;

    zero = (vector unsigned int) (0);
    sad = (vector unsigned int) (0);

    for (i = 0; i < 16; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * Sixteen bytes are loaded per row, but the last eight are masked to
 * zero in both operands so they do not contribute to the sum.
 *
 * @param pix1      first block, may be unaligned
 * @param pix2      second block, may be unaligned
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of |pix1[x] - pix2[x]| for x = 0..7 over all rows
 *
 * NOTE(review): the declarations of s/i, the row loop, the pointer
 * advances and the return statement were missing from the damaged
 * listing; 8 rows is inferred from the function name -- confirm.
 */
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad, zero;
    vector signed int sumdiffs;

    zero = (vector unsigned int) (0);
    sad = (vector unsigned int) (0);
    permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < 8; i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of squared pixel values over 16 rows of 16 pixels.
 *
 * @param pix       block, may be unaligned
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of pix[x] * pix[x]
 *
 * NOTE(review): the declarations of s/i, the pointer advance, the loop's
 * closing brace, the final store into s and the return statement were
 * missing from the damaged listing and are reconstructed to match the
 * sibling routines -- confirm.
 */
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char *tv, zero;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    zero = vec_splat_u8(0);
    sv = vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
/**
 * Sum of squared differences between two blocks over 16 rows of 16 pixels.
 *
 * @param pix1      first block, may be unaligned
 * @param pix2      second block, may be unaligned
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of (pix1[x] - pix2[x])^2
 *
 * NOTE(review): the declarations of s/i, the pointer advances, the
 * loop's closing brace, the final store into s and the return statement
 * were missing from the damaged listing and are reconstructed to match
 * the sibling routines -- confirm.
 */
int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char *tv, zero;
    vector unsigned char pix1v, pix2v, t5;
    vector unsigned int sv;
    vector signed int sum;

    zero = vec_splat_u8(0);
    sv = vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) pix2;
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix2));

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));

        /* Square the values and add them to our sum */
        sv = vec_msum(t5, t5, sv);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
/**
 * Sum of all pixel values over 16 rows of 16 pixels.
 *
 * @param pix       block, may be unaligned
 * @param line_size byte distance between consecutive rows
 * @return the accumulated sum of the pixel values
 *
 * NOTE(review): the declarations of s/i, the pointer advance, the loop's
 * closing brace and the return statement were missing from the damaged
 * listing and are reconstructed to match the sibling routines -- confirm.
 */
int pix_sum_altivec(UINT8 * pix, int line_size)
{
    int i;
    int s = 0;
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad, zero;
    vector signed int sumdiffs;

    zero = (vector unsigned int) (0);
    sad = (vector unsigned int) (0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Widen rows of 8 unsigned pixels into 16-bit values and store them
 * consecutively into block, 16 bytes per row.
 *
 * @param block     destination; assumed to be 16-byte aligned
 * @param pixels    source pixels, may be unaligned
 * @param line_size byte distance between consecutive source rows
 *
 * NOTE(review): the declaration of i, the row loop and the source
 * pointer advance were missing from the damaged listing; 8 rows is
 * inferred from the 8-pixel rows and the i*16 store offset -- confirm.
 * NOTE(review): vec_mergeh(zero, bytes) places the zero byte first in
 * each pair, which looks like it assumes big-endian element order.
 */
void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    vector unsigned char zero = (vector unsigned char) (0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
/**
 * Store the per-pixel difference s1 - s2 of rows of 8 pixels into block
 * as 16-bit values, 8 values per row.
 *
 * @param block  destination; assumed to be 16-byte aligned
 * @param s1     minuend pixels, may be unaligned
 * @param s2     subtrahend pixels, may be unaligned
 * @param stride byte distance between consecutive rows of s1 and of s2
 *
 * NOTE(review): the declaration of i, the loop, and the s1/s2/block
 * advances were missing from the damaged listing; 4 iterations of the
 * two-row unrolled body (8 rows in total) is inferred from the 8-pixel
 * rows and the sibling get_pixels routine -- confirm.
 * NOTE(review): vec_mergeh(zero, bytes) places the zero byte first in
 * each pair, which looks like it assumes big-endian element order.
 */
void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
                         const UINT8 *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    vector unsigned char zero = (vector unsigned char) (0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
/**
 * Report whether the CPU has a vector unit, using the hw.vectorunit
 * sysctl where the platform headers provide it.
 *
 * @return 1 if the sysctl succeeds and reports a non-zero vector unit;
 *         0 if it reports none, if the query fails, or if the
 *         CTL_HW / HW_VECTORUNIT selectors are not defined on this platform
 *
 * NOTE(review): the declarations of has_vu and err and the fallback
 * return were missing from the damaged listing and are reconstructed;
 * the preprocessor guard is new and only keeps the function buildable
 * where <sys/sysctl.h> does not define these selectors.
 */
int has_altivec(void)
{
#if defined(CTL_HW) && defined(HW_VECTORUNIT)
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0)
        return (has_vu != 0);
#endif
    /* Query unavailable or failed: report no vector unit */
    return 0;
}