quicktime/ffmpeg/libavcodec/i386/motion_est_mmx.c

   1 /*
   2  * MMX optimized motion estimation
   3  * Copyright (c) 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  19  *
  20  * mostly by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22 #include "../dsputil.h"
  23 #include "mmx.h"
  24
  /*
   * Per-word rounding constants: entry i holds the value i replicated in each
   * of the four 16-bit lanes of a 64-bit quantity.  The PIX_SAD wrappers in
   * this file load entry 1 (x2 / y2 variants) or entry 2 (xy2 variants) into
   * mm5 before calling the averaging helpers.
   */
  25 static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
  26 0x0000000000000000ULL,
  27 0x0001000100010001ULL,
  28 0x0002000200020002ULL,
  29 };
  30
  /*
   * The byte value 0x01 replicated in all eight bytes.  sad8_4_mmx2 loads it
   * into mm5 through MANGLE(bone), i.e. it is referenced by symbol name from
   * inline asm rather than through an asm operand.
   */
  31 static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
  32
  /*
   * Add the sum of absolute differences between two 8-pixel-wide blocks to
   * the four 16-bit lanes of mm6, using plain MMX instructions.
   *
   * blk1, blk2: first byte of each block; stride: byte distance between
   * consecutive rows (used for both blocks); h: number of rows.
   *
   * Nothing is returned: the running total stays in mm6 and is read back by
   * sum_mmx().  On entry mm7 must be zero and mm6 must hold the running total;
   * the PIX_SAD wrappers clear both with pxor.  The asm statement declares no
   * clobbers although it overwrites mm0-mm6.
   *
   * Each loop pass handles two rows, so h is expected to be even; with an odd
   * h the last pass would also process the row just past the block (assuming
   * stride > 0).
   */
  33 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
  34 {
         /* Negative byte span of the block.  The pointer operands below are
          * biased by -len, so (ptr, REG_a) addresses row 0 first and REG_a
          * counts upward until it is no longer negative.  REG_a is defined
          * outside this file; presumably it names the register selected by
          * the "a" constraint -- TODO confirm. */
  35     long len= -(stride*h);
  36     asm volatile(
  37         ".balign 16                     \n\t"
  38         "1:                             \n\t"
             /* row 0: mm0 = blk1 row, mm2 = mm4 = blk2 row */
  39         "movq (%1, %%"REG_a"), %%mm0    \n\t"
  40         "movq (%2, %%"REG_a"), %%mm2    \n\t"
  41         "movq (%2, %%"REG_a"), %%mm4    \n\t"
             /* step to row 1 */
  42         "add %3, %%"REG_a"              \n\t"
             /* saturating subtraction in both directions; OR-ing the two
              * results further down yields the per-byte absolute difference */
  43         "psubusb %%mm0, %%mm2           \n\t"
  44         "psubusb %%mm4, %%mm0           \n\t"
             /* row 1: mm1 = blk1 row, mm3 = mm5 = blk2 row */
  45         "movq (%1, %%"REG_a"), %%mm1    \n\t"
  46         "movq (%2, %%"REG_a"), %%mm3    \n\t"
  47         "movq (%2, %%"REG_a"), %%mm5    \n\t"
  48         "psubusb %%mm1, %%mm3           \n\t"
  49         "psubusb %%mm5, %%mm1           \n\t"
             /* mm0 = |diff| of row 0, mm3 = |diff| of row 1 */
  50         "por %%mm2, %%mm0               \n\t"
  51         "por %%mm1, %%mm3               \n\t"
             /* widen bytes to words by interleaving with the zero in mm7 */
  52         "movq %%mm0, %%mm1              \n\t"
  53         "movq %%mm3, %%mm2              \n\t"
  54         "punpcklbw %%mm7, %%mm0         \n\t"
  55         "punpckhbw %%mm7, %%mm1         \n\t"
  56         "punpcklbw %%mm7, %%mm3         \n\t"
  57         "punpckhbw %%mm7, %%mm2         \n\t"
             /* fold both rows together and add them to the accumulator */
  58         "paddw %%mm1, %%mm0             \n\t"
  59         "paddw %%mm3, %%mm2             \n\t"
  60         "paddw %%mm2, %%mm0             \n\t"
  61         "paddw %%mm0, %%mm6             \n\t"
             /* step to the next row pair; loop while the offset is negative */
  62         "add %3, %%"REG_a"              \n\t"
  63         " js 1b                         \n\t"
  64         : "+a" (len)
  65         : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
  66     );
  67 }
  68
  /*
   * MMX2 counterpart of sad8_1_mmx: add the sum of absolute differences
   * between two 8-pixel-wide blocks to mm6, using psadbw.
   *
   * blk1, blk2: first byte of each block; stride: byte distance between
   * consecutive rows (used for both blocks); h: number of rows.
   *
   * Nothing is returned: the total is accumulated in mm6 and read back by
   * sum_mmx2().  mm6 must hold the running total on entry (the PIX_SAD
   * wrappers clear it).  The asm statement declares no clobbers although it
   * overwrites mm0-mm3 and mm6.
   *
   * Two rows are handled per loop pass, so h is expected to be even.
   */
  69 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
  70 {
         /* Negative byte span; the pointer operands are biased by -len so
          * that the index register runs from len up to zero. */
  71     long len= -(stride*h);
  72     asm volatile(
  73         ".balign 16                     \n\t"
  74         "1:                             \n\t"
             /* row 0: psadbw leaves the 8-byte SAD in mm0 */
  75         "movq (%1, %%"REG_a"), %%mm0    \n\t"
  76         "movq (%2, %%"REG_a"), %%mm2    \n\t"
  77         "psadbw %%mm2, %%mm0            \n\t"
  78         "add %3, %%"REG_a"              \n\t"
             /* row 1: SAD ends up in mm3 */
  79         "movq (%1, %%"REG_a"), %%mm1    \n\t"
  80         "movq (%2, %%"REG_a"), %%mm3    \n\t"
  81         "psadbw %%mm1, %%mm3            \n\t"
             /* add both rows to the accumulator */
  82         "paddw %%mm3, %%mm0             \n\t"
  83         "paddw %%mm0, %%mm6             \n\t"
  84         "add %3, %%"REG_a"              \n\t"
  85         " js 1b                         \n\t"
  86         : "+a" (len)
  87         : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
  88     );
  89 }
  90
  /*
   * MMX2 SAD of an 8-pixel-wide block against the byte-wise average of two
   * source blocks, accumulated into mm6.
   *
   * blk1a, blk1b: the two blocks that are averaged with pavgb;
   * blk2: block the average is compared against; stride: byte distance
   * between rows, shared by all three blocks; h: number of rows.
   *
   * Nothing is returned: the total stays in mm6 and is read back by
   * sum_mmx2().  This helper does not read mm5 or mm7.  Two rows are handled
   * per loop pass, so h is expected to be even.
   */
  91 static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
  92 {
         /* Negative byte span; all pointer operands are biased by -len. */
  93     long len= -(stride*h);
  94     asm volatile(
  95         ".balign 16                     \n\t"
  96         "1:                             \n\t"
             /* row 0: mm0 = avg(blk1a, blk1b), then SAD against blk2 */
  97         "movq (%1, %%"REG_a"), %%mm0    \n\t"
  98         "movq (%2, %%"REG_a"), %%mm2    \n\t"
  99         "pavgb %%mm2, %%mm0             \n\t"
 100         "movq (%3, %%"REG_a"), %%mm2    \n\t"
 101         "psadbw %%mm2, %%mm0            \n\t"
 102         "add %4, %%"REG_a"              \n\t"
             /* row 1: mm3 = avg(blk1a, blk1b), then SAD against blk2 */
 103         "movq (%1, %%"REG_a"), %%mm1    \n\t"
 104         "movq (%2, %%"REG_a"), %%mm3    \n\t"
 105         "pavgb %%mm1, %%mm3             \n\t"
 106         "movq (%3, %%"REG_a"), %%mm1    \n\t"
 107         "psadbw %%mm1, %%mm3            \n\t"
             /* add both rows to the accumulator */
 108         "paddw %%mm3, %%mm0             \n\t"
 109         "paddw %%mm0, %%mm6             \n\t"
 110         "add %4, %%"REG_a"              \n\t"
 111         " js 1b                         \n\t"
 112         : "+a" (len)
 113         : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
 114     );
 115 }
 116
 /*
  * MMX2 SAD of an 8-pixel-wide block against a 2x2-neighbourhood average of
  * blk1, accumulated into mm6.
  *
  * blk1: source block; for every pixel the bytes at offsets 0, 1, stride and
  * stride+1 are combined; blk2: block compared against; stride: byte
  * distance between rows for both blocks; h: number of rows.
  *
  * The four-point average is built from three pavgb operations, with one of
  * the intermediate averages decremented by 1 (saturating) to offset the
  * upward rounding of pavgb.  This does not equal (a+b+c+d+2)>>2 for every
  * input (e.g. 1,0,0,0 gives 1 rather than 0).
  * NOTE(review): presumably this is why dsputil_init_pix_mmx only installs
  * the xy2 MMX2 functions when CODEC_FLAG_BITEXACT is clear -- confirm.
  *
  * Nothing is returned: the total stays in mm6 and is read back by
  * sum_mmx2().  mm5 is overwritten with bone.  Two rows are handled per loop
  * pass, so h is expected to be even.
  *
  * NOTE(review): ".balign 16" precedes the bone load, so it is that
  * instruction and not the loop label "1:" which gets aligned.
  */
 117 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 118 { //FIXME reuse src
          /* Negative byte span; all pointer operands are biased by -len. */
 119     long len= -(stride*h);
 120     asm volatile(
 121         ".balign 16                     \n\t"
             /* mm5 = 0x01 in every byte */
 122         "movq "MANGLE(bone)", %%mm5     \n\t"
 123         "1:                             \n\t"
             /* row 0: %1 is the current source row, %2 the row below it */
 124         "movq (%1, %%"REG_a"), %%mm0    \n\t"
 125         "movq (%2, %%"REG_a"), %%mm2    \n\t"
 126         "movq 1(%1, %%"REG_a"), %%mm1   \n\t"
 127         "movq 1(%2, %%"REG_a"), %%mm3   \n\t"
             /* vertical averages at x and x+1, the latter minus one, then
              * averaged with each other */
 128         "pavgb %%mm2, %%mm0             \n\t"
 129         "pavgb %%mm1, %%mm3             \n\t"
 130         "psubusb %%mm5, %%mm3           \n\t"
 131         "pavgb %%mm3, %%mm0             \n\t"
             /* SAD of the interpolated row against blk2 */
 132         "movq (%3, %%"REG_a"), %%mm2    \n\t"
 133         "psadbw %%mm2, %%mm0            \n\t"
 134         "add %4, %%"REG_a"              \n\t"
             /* row 1: same computation with different scratch registers;
              * the result is left in mm2 */
 135         "movq (%1, %%"REG_a"), %%mm1    \n\t"
 136         "movq (%2, %%"REG_a"), %%mm3    \n\t"
 137         "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
 138         "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
 139         "pavgb %%mm3, %%mm1             \n\t"
 140         "pavgb %%mm4, %%mm2             \n\t"
 141         "psubusb %%mm5, %%mm2           \n\t"
 142         "pavgb %%mm1, %%mm2             \n\t"
 143         "movq (%3, %%"REG_a"), %%mm1    \n\t"
 144         "psadbw %%mm1, %%mm2            \n\t"
             /* add both rows to the accumulator */
 145         "paddw %%mm2, %%mm0             \n\t"
 146         "paddw %%mm0, %%mm6             \n\t"
 147         "add %4, %%"REG_a"              \n\t"
 148         " js 1b                         \n\t"
 149         : "+a" (len)
 150         : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
 151     );
 152 }
 153
 /*
  * Plain-MMX SAD of an 8-pixel-wide block against the rounded average of two
  * source blocks, accumulated into the four 16-bit lanes of mm6.
  *
  * blk1a, blk1b: the two blocks that are averaged byte by byte;
  * blk2: block the average is compared against; stride: byte distance
  * between rows, shared by all three blocks; h: number of rows.
  *
  * On entry mm7 must be zero and mm5 must hold the per-word term that is
  * added before the right shift by one; the PIX_SAD wrappers load
  * round_tab[1] into mm5 before calling this helper.  Nothing is returned:
  * the total stays in mm6 and is read back by sum_mmx().
  *
  * One row is handled per loop pass.
  */
 154 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
 155 {
          /* Negative byte span; all pointer operands are biased by -len. */
 156     long len= -(stride*h);
 157     asm volatile(
 158         ".balign 16                     \n\t"
 159         "1:                             \n\t"
             /* each source row is loaded twice so that its low and high
              * four bytes can be widened to words separately */
 160         "movq (%1, %%"REG_a"), %%mm0    \n\t"
 161         "movq (%2, %%"REG_a"), %%mm1    \n\t"
 162         "movq (%1, %%"REG_a"), %%mm2    \n\t"
 163         "movq (%2, %%"REG_a"), %%mm3    \n\t"
 164         "punpcklbw %%mm7, %%mm0         \n\t"
 165         "punpcklbw %%mm7, %%mm1         \n\t"
 166         "punpckhbw %%mm7, %%mm2         \n\t"
 167         "punpckhbw %%mm7, %%mm3         \n\t"
             /* mm1 / mm3 = blk1a + blk1b for the low / high four pixels */
 168         "paddw %%mm0, %%mm1             \n\t"
 169         "paddw %%mm2, %%mm3             \n\t"
             /* two copies of the blk2 row for the two-way subtraction */
 170         "movq (%3, %%"REG_a"), %%mm4    \n\t"
 171         "movq (%3, %%"REG_a"), %%mm2    \n\t"
             /* add the rounding term, halve, and pack back to bytes */
 172         "paddw %%mm5, %%mm1             \n\t"
 173         "paddw %%mm5, %%mm3             \n\t"
 174         "psrlw $1, %%mm1                \n\t"
 175         "psrlw $1, %%mm3                \n\t"
 176         "packuswb %%mm3, %%mm1          \n\t"
             /* per-byte absolute difference: saturating subtraction in both
              * directions, OR-ed together */
 177         "psubusb %%mm1, %%mm4           \n\t"
 178         "psubusb %%mm2, %%mm1           \n\t"
 179         "por %%mm4, %%mm1               \n\t"
             /* widen to words and add to the accumulator */
 180         "movq %%mm1, %%mm0              \n\t"
 181         "punpcklbw %%mm7, %%mm0         \n\t"
 182         "punpckhbw %%mm7, %%mm1         \n\t"
 183         "paddw %%mm1, %%mm0             \n\t"
 184         "paddw %%mm0, %%mm6             \n\t"
 185         "add %4, %%"REG_a"              \n\t"
 186         " js 1b                         \n\t"
 187         : "+a" (len)
 188         : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
 189     );
 190 }
 191
 /*
  * Plain-MMX SAD of an 8-pixel-wide block against a 2x2-neighbourhood
  * average of blk1, accumulated into the four 16-bit lanes of mm6.
  *
  * blk1: source block; for every pixel the bytes at offsets 0, 1, stride and
  * stride+1 are summed as 16-bit words; blk2: block compared against;
  * stride: byte distance between rows for both blocks; h: number of rows.
  *
  * On entry mm7 must be zero and mm5 must hold the per-word term that is
  * added before the right shift by two; the PIX_SAD wrappers load
  * round_tab[2] into mm5 before calling this helper.  Nothing is returned:
  * the total stays in mm6 and is read back by sum_mmx().
  *
  * One row is handled per loop pass.
  */
 192 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 193 {
          /* Negative byte span; all pointer operands are biased by -len. */
 194     long len= -(stride*h);
 195     asm volatile(
 196         ".balign 16                     \n\t"
 197         "1:                             \n\t"
             /* %1 is the current source row, %2 the row below it */
 198         "movq (%1, %%"REG_a"), %%mm0    \n\t"
 199         "movq (%2, %%"REG_a"), %%mm1    \n\t"
 200         "movq %%mm0, %%mm4              \n\t"
 201         "movq %%mm1, %%mm2              \n\t"
 202         "punpcklbw %%mm7, %%mm0         \n\t"
 203         "punpcklbw %%mm7, %%mm1         \n\t"
 204         "punpckhbw %%mm7, %%mm4         \n\t"
 205         "punpckhbw %%mm7, %%mm2         \n\t"
             /* mm0 / mm4 = vertical pair sums for the low / high pixels */
 206         "paddw %%mm1, %%mm0             \n\t"
 207         "paddw %%mm2, %%mm4             \n\t"
             /* the same two rows shifted right by one pixel */
 208         "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
 209         "movq 1(%2, %%"REG_a"), %%mm3   \n\t"
 210         "movq %%mm2, %%mm1              \n\t"
 211         "punpcklbw %%mm7, %%mm2         \n\t"
 212         "punpckhbw %%mm7, %%mm1         \n\t"
 213         "paddw %%mm0, %%mm2             \n\t"
 214         "paddw %%mm4, %%mm1             \n\t"
 215         "movq %%mm3, %%mm4              \n\t"
 216         "punpcklbw %%mm7, %%mm3         \n\t"
 217         "punpckhbw %%mm7, %%mm4         \n\t"
             /* mm2 / mm1 = four-pixel sums for the low / high pixels */
 218         "paddw %%mm3, %%mm2             \n\t"
 219         "paddw %%mm4, %%mm1             \n\t"
             /* two copies of the blk2 row for the two-way subtraction */
 220         "movq (%3, %%"REG_a"), %%mm3    \n\t"
 221         "movq (%3, %%"REG_a"), %%mm4    \n\t"
             /* add the rounding term, divide by four, pack back to bytes */
 222         "paddw %%mm5, %%mm2             \n\t"
 223         "paddw %%mm5, %%mm1             \n\t"
 224         "psrlw $2, %%mm2                \n\t"
 225         "psrlw $2, %%mm1                \n\t"
 226         "packuswb %%mm1, %%mm2          \n\t"
             /* per-byte absolute difference against blk2 */
 227         "psubusb %%mm2, %%mm3           \n\t"
 228         "psubusb %%mm4, %%mm2           \n\t"
 229         "por %%mm3, %%mm2               \n\t"
             /* widen to words and add to the accumulator */
 230         "movq %%mm2, %%mm0              \n\t"
 231         "punpcklbw %%mm7, %%mm0         \n\t"
 232         "punpckhbw %%mm7, %%mm2         \n\t"
 233         "paddw %%mm2, %%mm0             \n\t"
 234         "paddw %%mm0, %%mm6             \n\t"
 235         "add %4, %%"REG_a"              \n\t"
 236         " js 1b                         \n\t"
 237         : "+a" (len)
 238         : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
 239     );
 240 }
 241
 /*
  * Collapse the four 16-bit lanes of mm6 into one value and return it.
  *
  * The lanes are added with wrapping 16-bit arithmetic (paddw) and the
  * result is masked with 0xFFFF, so the return value is the lane total
  * modulo 65536.  mm0 and mm6 are modified; mm6 must have been filled by the
  * preceding asm statements, nothing is passed in as an operand.
  */
 242 static inline int sum_mmx(void)
 243 {
 244     int ret;
 245     asm volatile(
             /* add the upper two lanes onto the lower two */
 246         "movq %%mm6, %%mm0              \n\t"
 247         "psrlq $32, %%mm6               \n\t"
 248         "paddw %%mm0, %%mm6             \n\t"
             /* add lane 1 onto lane 0 */
 249         "movq %%mm6, %%mm0              \n\t"
 250         "psrlq $16, %%mm6               \n\t"
 251         "paddw %%mm0, %%mm6             \n\t"
             /* fetch the low 32 bits; only the low 16 are meaningful */
 252         "movd %%mm6, %0                 \n\t"
 253         : "=r" (ret)
 254     );
 255     return ret&0xFFFF;
 256 }
 257
 /*
  * Return the low 32 bits of mm6 without any lane folding or masking.
  *
  * The *_mmx2 helpers in this file accumulate psadbw results, which psadbw
  * places in the lowest word of its destination, so the total is already in
  * the low lane of mm6.  mm6 must have been filled by the preceding asm
  * statements; nothing is passed in as an operand.
  */
 258 static inline int sum_mmx2(void)
 259 {
 260     int ret;
 261     asm volatile(
 262         "movd %%mm6, %0                 \n\t"
 263         : "=r" (ret)
 264     );
 265     return ret;
 266 }
 267
 268
 /*
  * PIX_SAD(suf) defines eight static functions on top of the helper family
  * selected by suf (instantiated below for mmx and mmx2):
  *
  *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf    8 columns wide
  *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf   16 columns wide
  *
  * All of them take (void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
  * and return the value of sum_suf().  v is never used.  Note that blk2 comes
  * before blk1 in the parameter list.
  *
  *   plain : blk1 compared with blk2                      (sad8_1_suf)
  *   x2    : average of blk1 and blk1+1 vs. blk2          (sad8_2_suf)
  *   y2    : average of blk1 and blk1+stride vs. blk2     (sad8_2_suf)
  *   xy2   : 2x2 neighbourhood of blk1 vs. blk2           (sad8_4_suf)
  *
  * Every function first clears mm7 and mm6 with pxor.  The x2 and y2 forms
  * then load round_tab[1] into mm5 and the xy2 forms round_tab[2]; this is
  * done for both suffixes, although sad8_2_mmx2 does not read mm5 and
  * sad8_4_mmx2 overwrites it.  The scheme depends on the MMX registers
  * keeping their contents between the separate asm statements.
  *
  * The 8-wide functions assert(h==8) and pass a literal 8 to the helper.  The
  * 16-wide functions forward h and run the 8-wide helper twice, once at
  * column 0 and once at column 8.
  */
 269 #define PIX_SAD(suf)\
 270 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 271 {\
 272     assert(h==8);\
 273     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 274                  "pxor %%mm6, %%mm6             \n\t":);\
 275 \
 276     sad8_1_ ## suf(blk1, blk2, stride, 8);\
 277 \
 278     return sum_ ## suf();\
 279 }\
 280 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 281 {\
 282     assert(h==8);\
 283     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 284                  "pxor %%mm6, %%mm6             \n\t"\
 285                  "movq %0, %%mm5                \n\t"\
 286                  :: "m"(round_tab[1]) \
 287                  );\
 288 \
 289     sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
 290 \
 291     return sum_ ## suf();\
 292 }\
 293 \
 294 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 295 {\
 296     assert(h==8);\
 297     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 298                  "pxor %%mm6, %%mm6             \n\t"\
 299                  "movq %0, %%mm5                \n\t"\
 300                  :: "m"(round_tab[1]) \
 301                  );\
 302 \
 303     sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
 304 \
 305     return sum_ ## suf();\
 306 }\
 307 \
 308 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 309 {\
 310     assert(h==8);\
 311     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 312                  "pxor %%mm6, %%mm6             \n\t"\
 313                  "movq %0, %%mm5                \n\t"\
 314                  :: "m"(round_tab[2]) \
 315                  );\
 316 \
 317     sad8_4_ ## suf(blk1, blk2, stride, 8);\
 318 \
 319     return sum_ ## suf();\
 320 }\
 321 \
 322 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 323 {\
 324     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 325                  "pxor %%mm6, %%mm6             \n\t":);\
 326 \
 327     sad8_1_ ## suf(blk1  , blk2  , stride, h);\
 328     sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
 329 \
 330     return sum_ ## suf();\
 331 }\
 332 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 333 {\
 334     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 335                  "pxor %%mm6, %%mm6             \n\t"\
 336                  "movq %0, %%mm5                \n\t"\
 337                  :: "m"(round_tab[1]) \
 338                  );\
 339 \
 340     sad8_2_ ## suf(blk1  , blk1+1, blk2  , stride, h);\
 341     sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
 342 \
 343     return sum_ ## suf();\
 344 }\
 345 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 346 {\
 347     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 348                  "pxor %%mm6, %%mm6             \n\t"\
 349                  "movq %0, %%mm5                \n\t"\
 350                  :: "m"(round_tab[1]) \
 351                  );\
 352 \
 353     sad8_2_ ## suf(blk1  , blk1+stride,  blk2  , stride, h);\
 354     sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
 355 \
 356     return sum_ ## suf();\
 357 }\
 358 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 359 {\
 360     asm volatile("pxor %%mm7, %%mm7             \n\t"\
 361                  "pxor %%mm6, %%mm6             \n\t"\
 362                  "movq %0, %%mm5                \n\t"\
 363                  :: "m"(round_tab[2]) \
 364                  );\
 365 \
 366     sad8_4_ ## suf(blk1  , blk2  , stride, h);\
 367     sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
 368 \
 369     return sum_ ## suf();\
 370 }\
 371
 /*
  * Instantiate the eight SAD functions once per helper family:
  * sad8_mmx ... sad16_xy2_mmx and sad8_mmx2 ... sad16_xy2_mmx2.
  */
 372 PIX_SAD(mmx)
 373 PIX_SAD(mmx2)
 374
 /*
  * Install the SAD functions defined in this file into a DSPContext,
  * depending on the MM_MMX and MM_MMXEXT bits of mm_flags.
  *
  * c:     context whose sad[0..1] and pix_abs[0..1][0..3] entries are set.
  * avctx: codec context; only avctx->flags is read, and only when MM_MMXEXT
  *        is set in mm_flags.
  *
  * The MMX functions are installed first; the MMX2 ones then replace them
  * where applicable.  If neither bit is set, c is left untouched.
  */
 375 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
 376 {
 377     const int use_mmx  = (mm_flags & MM_MMX)    != 0;
 378     const int use_mmx2 = (mm_flags & MM_MMXEXT) != 0;
 379
 380     if (use_mmx) {
 381         /* plain SAD entry points */
 382         c->sad[0] = sad16_mmx;
 383         c->sad[1] = sad8_mmx;
 384
 385         /* 16-wide: plain, x2, y2, xy2 */
 386         c->pix_abs[0][0] = sad16_mmx;
 387         c->pix_abs[0][1] = sad16_x2_mmx;
 388         c->pix_abs[0][2] = sad16_y2_mmx;
 389         c->pix_abs[0][3] = sad16_xy2_mmx;
 390
 391         /* 8-wide: plain, x2, y2, xy2 */
 392         c->pix_abs[1][0] = sad8_mmx;
 393         c->pix_abs[1][1] = sad8_x2_mmx;
 394         c->pix_abs[1][2] = sad8_y2_mmx;
 395         c->pix_abs[1][3] = sad8_xy2_mmx;
 396     }
 397
 398     if (!use_mmx2)
 399         return;
 400
 401     /* the non-interpolated MMX2 functions are installed unconditionally */
 402     c->sad[0]        = sad16_mmx2;
 403     c->sad[1]        = sad8_mmx2;
 404     c->pix_abs[0][0] = sad16_mmx2;
 405     c->pix_abs[1][0] = sad8_mmx2;
 406
 407     /* the interpolated MMX2 functions are skipped when CODEC_FLAG_BITEXACT is set */
 408     if (avctx->flags & CODEC_FLAG_BITEXACT)
 409         return;
 410
 411     c->pix_abs[0][1] = sad16_x2_mmx2;
 412     c->pix_abs[0][2] = sad16_y2_mmx2;
 413     c->pix_abs[0][3] = sad16_xy2_mmx2;
 414     c->pix_abs[1][1] = sad8_x2_mmx2;
 415     c->pix_abs[1][2] = sad8_y2_mmx2;
 416     c->pix_abs[1][3] = sad8_xy2_mmx2;
 417 }