libavcodec/ppc/imgresample_altivec.c

   1 /*
   2  * High quality image resampling with polyphase filters
   3  * Copyright (c) 2001 Fabrice Bellard.
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file imgresample_altivec.c
  24  * High quality image resampling with polyphase filters - AltiVec bits
  25  */
  26
  27 #include "gcc_fixes.h"
  28
  29 typedef         union {
  30     vector unsigned char v;
  31     unsigned char c[16];
  32 } vec_uc_t;
  33
  34 typedef         union {
  35     vector signed short v;
  36     signed short s[8];
  37 } vec_ss_t;
  38
  39 void v_resample16_altivec(uint8_t *dst, int dst_width, const uint8_t *src,
  40                           int wrap, int16_t *filter)
  41 {
  42     int sum, i;
  43     const uint8_t *s;
  44     vector unsigned char *tv, tmp, dstv, zero;
  45     vec_ss_t srchv[4], srclv[4], fv[4];
  46     vector signed short zeros, sumhv, sumlv;
  47     s = src;
  48
  49     for(i=0;i<4;i++)
  50     {
  51         /*
  52            The vec_madds later on does an implicit >>15 on the result.
  53            Since FILTER_BITS is 8, and we have 15 bits of magnitude in
  54            a signed short, we have just enough bits to pre-shift our
  55            filter constants <<7 to compensate for vec_madds.
  56         */
  57         fv[i].s[0] = filter[i] << (15-FILTER_BITS);
  58         fv[i].v = vec_splat(fv[i].v, 0);
  59     }
  60
  61     zero = vec_splat_u8(0);
  62     zeros = vec_splat_s16(0);
  63
  64
  65     /*
  66        When we're resampling, we'd ideally like both our input buffers,
  67        and output buffers to be 16-byte aligned, so we can do both aligned
  68        reads and writes. Sadly we can't always have this at the moment, so
  69        we opt for aligned writes, as unaligned writes have a huge overhead.
  70        To do this, do enough scalar resamples to get dst 16-byte aligned.
  71     */
  72     i = (-(int)dst) & 0xf;
  73     while(i>0) {
  74         sum = s[0 * wrap] * filter[0] +
  75         s[1 * wrap] * filter[1] +
  76         s[2 * wrap] * filter[2] +
  77         s[3 * wrap] * filter[3];
  78         sum = sum >> FILTER_BITS;
  79         if (sum<0) sum = 0; else if (sum>255) sum=255;
  80         dst[0] = sum;
  81         dst++;
  82         s++;
  83         dst_width--;
  84         i--;
  85     }
  86
  87     /* Do our altivec resampling on 16 pixels at once. */
  88     while(dst_width>=16) {
  89         /*
  90            Read 16 (potentially unaligned) bytes from each of
  91            4 lines into 4 vectors, and split them into shorts.
  92            Interleave the multipy/accumulate for the resample
  93            filter with the loads to hide the 3 cycle latency
  94            the vec_madds have.
  95         */
  96         tv = (vector unsigned char *) &s[0 * wrap];
  97         tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[i * wrap]));
  98         srchv[0].v = (vector signed short) vec_mergeh(zero, tmp);
  99         srclv[0].v = (vector signed short) vec_mergel(zero, tmp);
 100         sumhv = vec_madds(srchv[0].v, fv[0].v, zeros);
 101         sumlv = vec_madds(srclv[0].v, fv[0].v, zeros);
 102
 103         tv = (vector unsigned char *) &s[1 * wrap];
 104         tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[1 * wrap]));
 105         srchv[1].v = (vector signed short) vec_mergeh(zero, tmp);
 106         srclv[1].v = (vector signed short) vec_mergel(zero, tmp);
 107         sumhv = vec_madds(srchv[1].v, fv[1].v, sumhv);
 108         sumlv = vec_madds(srclv[1].v, fv[1].v, sumlv);
 109
 110         tv = (vector unsigned char *) &s[2 * wrap];
 111         tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[2 * wrap]));
 112         srchv[2].v = (vector signed short) vec_mergeh(zero, tmp);
 113         srclv[2].v = (vector signed short) vec_mergel(zero, tmp);
 114         sumhv = vec_madds(srchv[2].v, fv[2].v, sumhv);
 115         sumlv = vec_madds(srclv[2].v, fv[2].v, sumlv);
 116
 117         tv = (vector unsigned char *) &s[3 * wrap];
 118         tmp = vec_perm(tv[0], tv[1], vec_lvsl(0, &s[3 * wrap]));
 119         srchv[3].v = (vector signed short) vec_mergeh(zero, tmp);
 120         srclv[3].v = (vector signed short) vec_mergel(zero, tmp);
 121         sumhv = vec_madds(srchv[3].v, fv[3].v, sumhv);
 122         sumlv = vec_madds(srclv[3].v, fv[3].v, sumlv);
 123
 124         /*
 125            Pack the results into our destination vector,
 126            and do an aligned write of that back to memory.
 127         */
 128         dstv = vec_packsu(sumhv, sumlv) ;
 129         vec_st(dstv, 0, (vector unsigned char *) dst);
 130
 131         dst+=16;
 132         s+=16;
 133         dst_width-=16;
 134     }
 135
 136     /*
 137        If there are any leftover pixels, resample them
 138        with the slow scalar method.
 139     */
 140     while(dst_width>0) {
 141         sum = s[0 * wrap] * filter[0] +
 142         s[1 * wrap] * filter[1] +
 143         s[2 * wrap] * filter[2] +
 144         s[3 * wrap] * filter[3];
 145         sum = sum >> FILTER_BITS;
 146         if (sum<0) sum = 0; else if (sum>255) sum=255;
 147         dst[0] = sum;
 148         dst++;
 149         s++;
 150         dst_width--;
 151     }
 152 }
 153