hvirtual/quicktime/libavcodec/ppc/dsputil_altivec.c

   1 /*
   2  * Copyright (c) 2002 Brian Foley
   3  * Copyright (c) 2002 Dieter Shirley
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  */
  19
  20 #include "../dsputil.h"
  21 #include "dsputil_altivec.h"
  22
  23 #if CONFIG_DARWIN
  24 #include <sys/sysctl.h>
  25 #endif
  26
  27 int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  28 {
  29     int s, i;
  30     vector unsigned char *tv, zero;
  31     vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  32     vector unsigned int sad;
  33     vector signed int sumdiffs;
  34
  35     s = 0;
  36     zero = vec_splat_u8(0);
  37     sad = vec_splat_u32(0);
  38     for(i=0;i<16;i++) {
  39         /*
  40            Read unaligned pixels into our vectors. The vectors are as follows:
  41            pix1v: pix1[0]-pix1[15]
  42            pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
  43         */
  44         tv = (vector unsigned char *) pix1;
  45         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  46
  47         tv = (vector unsigned char *) &pix2[0];
  48         pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  49
  50         tv = (vector unsigned char *) &pix2[1];
  51         pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  52
  53         /* Calculate the average vector */
  54         avgv = vec_avg(pix2v, pix2iv);
  55
  56         /* Calculate a sum of abs differences vector */
  57         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  58
  59         /* Add each 4 pixel group together and put 4 results into sad */
  60         sad = vec_sum4s(t5, sad);
  61
  62         pix1 += line_size;
  63         pix2 += line_size;
  64     }
  65     /* Sum up the four partial sums, and put the result into s */
  66     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  67     sumdiffs = vec_splat(sumdiffs, 3);
  68     vec_ste(sumdiffs, 0, &s);
  69
  70     return s;
  71 }
  72
  73 int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  74 {
  75     int s, i;
  76     vector unsigned char *tv, zero;
  77     vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
  78     vector unsigned int sad;
  79     vector signed int sumdiffs;
  80     uint8_t *pix3 = pix2 + line_size;
  81
  82     s = 0;
  83     zero = vec_splat_u8(0);
  84     sad = vec_splat_u32(0);
  85
  86     /*
  87        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
  88        iteration becomes pix2 in the next iteration. We can use this
  89        fact to avoid a potentially expensive unaligned read, each
  90        time around the loop.
  91        Read unaligned pixels into our vectors. The vectors are as follows:
  92        pix2v: pix2[0]-pix2[15]
  93        Split the pixel vectors into shorts
  94     */
  95     tv = (vector unsigned char *) &pix2[0];
  96     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  97
  98     for(i=0;i<16;i++) {
  99         /*
 100            Read unaligned pixels into our vectors. The vectors are as follows:
 101            pix1v: pix1[0]-pix1[15]
 102            pix3v: pix3[0]-pix3[15]
 103         */
 104         tv = (vector unsigned char *) pix1;
 105         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 106
 107         tv = (vector unsigned char *) &pix3[0];
 108         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 109
 110         /* Calculate the average vector */
 111         avgv = vec_avg(pix2v, pix3v);
 112
 113         /* Calculate a sum of abs differences vector */
 114         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 115
 116         /* Add each 4 pixel group together and put 4 results into sad */
 117         sad = vec_sum4s(t5, sad);
 118
 119         pix1 += line_size;
 120         pix2v = pix3v;
 121         pix3 += line_size;
 122
 123     }
 124
 125     /* Sum up the four partial sums, and put the result into s */
 126     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 127     sumdiffs = vec_splat(sumdiffs, 3);
 128     vec_ste(sumdiffs, 0, &s);
 129     return s;
 130 }
 131
 132 int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 133 {
 134     int s, i;
 135     uint8_t *pix3 = pix2 + line_size;
 136     vector unsigned char *tv, avgv, t5, zero;
 137     vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
 138     vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
 139     vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
 140     vector unsigned short avghv, avglv, two;
 141     vector unsigned short t1, t2, t3, t4;
 142     vector unsigned int sad;
 143     vector signed int sumdiffs;
 144
 145     zero = vec_splat_u8(0);
 146     two = vec_splat_u16(2);
 147     sad = vec_splat_u32(0);
 148
 149     s = 0;
 150
 151     /*
 152        Due to the fact that pix3 = pix2 + line_size, the pix3 of one
 153        iteration becomes pix2 in the next iteration. We can use this
 154        fact to avoid a potentially expensive unaligned read, as well
 155        as some splitting, and vector addition each time around the loop.
 156        Read unaligned pixels into our vectors. The vectors are as follows:
 157        pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
 158        Split the pixel vectors into shorts
 159     */
 160     tv = (vector unsigned char *) &pix2[0];
 161     pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
 162
 163     tv = (vector unsigned char *) &pix2[1];
 164     pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
 165
 166     pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
 167     pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
 168     pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
 169     pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
 170     t1 = vec_add(pix2hv, pix2ihv);
 171     t2 = vec_add(pix2lv, pix2ilv);
 172
 173     for(i=0;i<16;i++) {
 174         /*
 175            Read unaligned pixels into our vectors. The vectors are as follows:
 176            pix1v: pix1[0]-pix1[15]
 177            pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
 178         */
 179         tv = (vector unsigned char *) pix1;
 180         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 181
 182         tv = (vector unsigned char *) &pix3[0];
 183         pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
 184
 185         tv = (vector unsigned char *) &pix3[1];
 186         pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
 187
 188         /*
 189           Note that Altivec does have vec_avg, but this works on vector pairs
 190           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
 191           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
 192           Instead, we have to split the pixel vectors into vectors of shorts,
 193           and do the averaging by hand.
 194         */
 195
 196         /* Split the pixel vectors into shorts */
 197         pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
 198         pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
 199         pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
 200         pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
 201
 202         /* Do the averaging on them */
 203         t3 = vec_add(pix3hv, pix3ihv);
 204         t4 = vec_add(pix3lv, pix3ilv);
 205
 206         avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
 207         avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
 208
 209         /* Pack the shorts back into a result */
 210         avgv = vec_pack(avghv, avglv);
 211
 212         /* Calculate a sum of abs differences vector */
 213         t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
 214
 215         /* Add each 4 pixel group together and put 4 results into sad */
 216         sad = vec_sum4s(t5, sad);
 217
 218         pix1 += line_size;
 219         pix3 += line_size;
 220         /* Transfer the calculated values for pix3 into pix2 */
 221         t1 = t3;
 222         t2 = t4;
 223     }
 224     /* Sum up the four partial sums, and put the result into s */
 225     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 226     sumdiffs = vec_splat(sumdiffs, 3);
 227     vec_ste(sumdiffs, 0, &s);
 228
 229     return s;
 230 }
 231
 232 int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 233 {
 234     int i, s;
 235     vector unsigned char perm1, perm2, *pix1v, *pix2v;
 236     vector unsigned char t1, t2, t3,t4, t5;
 237     vector unsigned int sad, zero;
 238     vector signed int sumdiffs;
 239
 240     zero = (vector unsigned int) (0);
 241     sad = (vector unsigned int) (0);
 242
 243
 244     for(i=0;i<16;i++) {
 245         /* Read potentially unaligned pixels into t1 and t2 */
 246         perm1 = vec_lvsl(0, pix1);
 247         pix1v = (vector unsigned char *) pix1;
 248         perm2 = vec_lvsl(0, pix2);
 249         pix2v = (vector unsigned char *) pix2;
 250         t1 = vec_perm(pix1v[0], pix1v[1], perm1);
 251         t2 = vec_perm(pix2v[0], pix2v[1], perm2);
 252
 253         /* Calculate a sum of abs differences vector */
 254         t3 = vec_max(t1, t2);
 255         t4 = vec_min(t1, t2);
 256         t5 = vec_sub(t3, t4);
 257
 258         /* Add each 4 pixel group together and put 4 results into sad */
 259         sad = vec_sum4s(t5, sad);
 260
 261         pix1 += line_size;
 262         pix2 += line_size;
 263     }
 264
 265     /* Sum up the four partial sums, and put the result into s */
 266     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 267     sumdiffs = vec_splat(sumdiffs, 3);
 268     vec_ste(sumdiffs, 0, &s);
 269
 270     return s;
 271 }
 272
 273 int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 274 {
 275     int i, s;
 276     vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
 277     vector unsigned char t1, t2, t3,t4, t5;
 278     vector unsigned int sad, zero;
 279     vector signed int sumdiffs;
 280
 281     zero = (vector unsigned int) (0);
 282     sad = (vector unsigned int) (0);
 283     permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
 284
 285     for(i=0;i<8;i++) {
 286         /* Read potentially unaligned pixels into t1 and t2
 287            Since we're reading 16 pixels, and actually only want 8,
 288            mask out the last 8 pixels. The 0s don't change the sum. */
 289         perm1 = vec_lvsl(0, pix1);
 290         pix1v = (vector unsigned char *) pix1;
 291         perm2 = vec_lvsl(0, pix2);
 292         pix2v = (vector unsigned char *) pix2;
 293         t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
 294         t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
 295
 296         /* Calculate a sum of abs differences vector */
 297         t3 = vec_max(t1, t2);
 298         t4 = vec_min(t1, t2);
 299         t5 = vec_sub(t3, t4);
 300
 301         /* Add each 4 pixel group together and put 4 results into sad */
 302         sad = vec_sum4s(t5, sad);
 303
 304         pix1 += line_size;
 305         pix2 += line_size;
 306     }
 307
 308     /* Sum up the four partial sums, and put the result into s */
 309     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 310     sumdiffs = vec_splat(sumdiffs, 3);
 311     vec_ste(sumdiffs, 0, &s);
 312
 313     return s;
 314 }
 315
 316 int pix_norm1_altivec(uint8_t *pix, int line_size)
 317 {
 318     int s, i;
 319     vector unsigned char *tv, zero;
 320     vector unsigned char pixv;
 321     vector unsigned int sv;
 322     vector signed int sum;
 323
 324     zero = vec_splat_u8(0);
 325     sv = vec_splat_u32(0);
 326
 327     s = 0;
 328     for (i = 0; i < 16; i++) {
 329         /* Read in the potentially unaligned pixels */
 330         tv = (vector unsigned char *) pix;
 331         pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
 332
 333         /* Square the values, and add them to our sum */
 334         sv = vec_msum(pixv, pixv, sv);
 335
 336         pix += line_size;
 337     }
 338     /* Sum up the four partial sums, and put the result into s */
 339     sum = vec_sums((vector signed int) sv, (vector signed int) zero);
 340     sum = vec_splat(sum, 3);
 341     vec_ste(sum, 0, &s);
 342
 343     return s;
 344 }
 345
 346
 347 int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
 348 {
 349     int s, i;
 350     vector unsigned char *tv, zero;
 351     vector unsigned char pix1v, pix2v, t5;
 352     vector unsigned int sv;
 353     vector signed int sum;
 354
 355     zero = vec_splat_u8(0);
 356     sv = vec_splat_u32(0);
 357     s = 0;
 358     for (i = 0; i < 16; i++) {
 359         /* Read in the potentially unaligned pixels */
 360         tv = (vector unsigned char *) pix1;
 361         pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
 362
 363         tv = (vector unsigned char *) pix2;
 364         pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix2));
 365
 366         /*
 367            Since we want to use unsigned chars, we can take advantage
 368            of the fact that abs(a-b)^2 = (a-b)^2.
 369         */
 370
 371         /* Calculate a sum of abs differences vector */
 372         t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
 373
 374         /* Square the values and add them to our sum */
 375         sv = vec_msum(t5, t5, sv);
 376
 377         pix1 += line_size;
 378         pix2 += line_size;
 379     }
 380     /* Sum up the four partial sums, and put the result into s */
 381     sum = vec_sums((vector signed int) sv, (vector signed int) zero);
 382     sum = vec_splat(sum, 3);
 383     vec_ste(sum, 0, &s);
 384     return s;
 385 }
 386
 387
 388 int pix_sum_altivec(UINT8 * pix, int line_size)
 389 {
 390
 391     vector unsigned char perm, *pixv;
 392     vector unsigned char t1;
 393     vector unsigned int sad, zero;
 394     vector signed int sumdiffs;
 395
 396     int s, i;
 397
 398     zero = (vector unsigned int) (0);
 399     sad = (vector unsigned int) (0);
 400
 401     for (i = 0; i < 16; i++) {
 402         /* Read the potentially unaligned 16 pixels into t1 */
 403         perm = vec_lvsl(0, pix);
 404         pixv = (vector unsigned char *) pix;
 405         t1 = vec_perm(pixv[0], pixv[1], perm);
 406
 407         /* Add each 4 pixel group together and put 4 results into sad */
 408         sad = vec_sum4s(t1, sad);
 409
 410         pix += line_size;
 411     }
 412
 413     /* Sum up the four partial sums, and put the result into s */
 414     sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
 415     sumdiffs = vec_splat(sumdiffs, 3);
 416     vec_ste(sumdiffs, 0, &s);
 417
 418     return s;
 419 }
 420
 421 void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
 422 {
 423     int i;
 424     vector unsigned char perm, bytes, *pixv;
 425     vector unsigned char zero = (vector unsigned char) (0);
 426     vector signed short shorts;
 427
 428     for(i=0;i<8;i++)
 429     {
 430         // Read potentially unaligned pixels.
 431         // We're reading 16 pixels, and actually only want 8,
 432         // but we simply ignore the extras.
 433         perm = vec_lvsl(0, pixels);
 434         pixv = (vector unsigned char *) pixels;
 435         bytes = vec_perm(pixv[0], pixv[1], perm);
 436
 437         // convert the bytes into shorts
 438         shorts = (vector signed short)vec_mergeh(zero, bytes);
 439
 440         // save the data to the block, we assume the block is 16-byte aligned
 441         vec_st(shorts, i*16, (vector signed short*)block);
 442
 443         pixels += line_size;
 444     }
 445 }
 446
 447 void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
 448         const UINT8 *s2, int stride)
 449 {
 450     int i;
 451     vector unsigned char perm, bytes, *pixv;
 452     vector unsigned char zero = (vector unsigned char) (0);
 453     vector signed short shorts1, shorts2;
 454
 455     for(i=0;i<4;i++)
 456     {
 457         // Read potentially unaligned pixels
 458         // We're reading 16 pixels, and actually only want 8,
 459         // but we simply ignore the extras.
 460         perm = vec_lvsl(0, s1);
 461         pixv = (vector unsigned char *) s1;
 462         bytes = vec_perm(pixv[0], pixv[1], perm);
 463
 464         // convert the bytes into shorts
 465         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 466
 467         // Do the same for the second block of pixels
 468         perm = vec_lvsl(0, s2);
 469         pixv = (vector unsigned char *) s2;
 470         bytes = vec_perm(pixv[0], pixv[1], perm);
 471
 472         // convert the bytes into shorts
 473         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 474
 475         // Do the subtraction
 476         shorts1 = vec_sub(shorts1, shorts2);
 477
 478         // save the data to the block, we assume the block is 16-byte aligned
 479         vec_st(shorts1, 0, (vector signed short*)block);
 480
 481         s1 += stride;
 482         s2 += stride;
 483         block += 8;
 484
 485
 486         // The code below is a copy of the code above... This is a manual
 487         // unroll.
 488
 489         // Read potentially unaligned pixels
 490         // We're reading 16 pixels, and actually only want 8,
 491         // but we simply ignore the extras.
 492         perm = vec_lvsl(0, s1);
 493         pixv = (vector unsigned char *) s1;
 494         bytes = vec_perm(pixv[0], pixv[1], perm);
 495
 496         // convert the bytes into shorts
 497         shorts1 = (vector signed short)vec_mergeh(zero, bytes);
 498
 499         // Do the same for the second block of pixels
 500         perm = vec_lvsl(0, s2);
 501         pixv = (vector unsigned char *) s2;
 502         bytes = vec_perm(pixv[0], pixv[1], perm);
 503
 504         // convert the bytes into shorts
 505         shorts2 = (vector signed short)vec_mergeh(zero, bytes);
 506
 507         // Do the subtraction
 508         shorts1 = vec_sub(shorts1, shorts2);
 509
 510         // save the data to the block, we assume the block is 16-byte aligned
 511         vec_st(shorts1, 0, (vector signed short*)block);
 512
 513         s1 += stride;
 514         s2 += stride;
 515         block += 8;
 516     }
 517 }
 518
 519
 520 int has_altivec(void)
 521 {
 522 #if CONFIG_DARWIN
 523     int sels[2] = {CTL_HW, HW_VECTORUNIT};
 524     int has_vu = 0;
 525     size_t len = sizeof(has_vu);
 526     int err;
 527
 528     err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
 529
 530     if (err == 0) return (has_vu != 0);
 531 #endif
 532     return 0;
 533 }
 534