quicktime/ffmpeg/libavcodec/armv4l/dsputil_iwmmxt_rnd.h

   1 void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
   2 {
   3     int stride = line_size;
   4     __asm__ __volatile__ (
   5         "and r12, %[pixels], #7 \n\t"
   6         "bic %[pixels], %[pixels], #7 \n\t"
   7         "tmcr wcgr1, r12 \n\t"
   8         "add r4, %[pixels], %[line_size] \n\t"
   9         "add r5, %[block], %[line_size] \n\t"
  10         "mov %[line_size], %[line_size], lsl #1 \n\t"
  11         "1: \n\t"
  12         "wldrd wr0, [%[pixels]] \n\t"
  13         "subs %[h], %[h], #2 \n\t"
  14         "wldrd wr1, [%[pixels], #8] \n\t"
  15         "add %[pixels], %[pixels], %[line_size] \n\t"
  16         "wldrd wr3, [r4] \n\t"
  17         "pld [%[pixels]] \n\t"
  18         "pld [%[pixels], #32] \n\t"
  19         "wldrd wr4, [r4, #8] \n\t"
  20         "add r4, r4, %[line_size] \n\t"
  21         "walignr1 wr8, wr0, wr1 \n\t"
  22         "pld [r4] \n\t"
  23         "pld [r4, #32] \n\t"
  24         "walignr1 wr10, wr3, wr4 \n\t"
  25         "wstrd wr8, [%[block]] \n\t"
  26         "add %[block], %[block], %[line_size] \n\t"
  27         "wstrd wr10, [r5] \n\t"
  28         "add r5, r5, %[line_size] \n\t"
  29         "bne 1b \n\t"
  30         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  31         :
  32         : "memory", "r4", "r5", "r12");
  33 }
  34
  35 void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  36 {
  37     int stride = line_size;
  38     __asm__ __volatile__ (
  39         "and r12, %[pixels], #7 \n\t"
  40         "bic %[pixels], %[pixels], #7 \n\t"
  41         "tmcr wcgr1, r12 \n\t"
  42         "add r4, %[pixels], %[line_size] \n\t"
  43         "add r5, %[block], %[line_size] \n\t"
  44         "mov %[line_size], %[line_size], lsl #1 \n\t"
  45         "1: \n\t"
  46         "wldrd wr0, [%[pixels]] \n\t"
  47         "subs %[h], %[h], #2 \n\t"
  48         "wldrd wr1, [%[pixels], #8] \n\t"
  49         "add %[pixels], %[pixels], %[line_size] \n\t"
  50         "wldrd wr3, [r4] \n\t"
  51         "pld [%[pixels]] \n\t"
  52         "pld [%[pixels], #32] \n\t"
  53         "wldrd wr4, [r4, #8] \n\t"
  54         "add r4, r4, %[line_size] \n\t"
  55         "walignr1 wr8, wr0, wr1 \n\t"
  56         "wldrd wr0, [%[block]] \n\t"
  57         "wldrd wr2, [r5] \n\t"
  58         "pld [r4] \n\t"
  59         "pld [r4, #32] \n\t"
  60         "walignr1 wr10, wr3, wr4 \n\t"
  61         WAVG2B" wr8, wr8, wr0 \n\t"
  62         WAVG2B" wr10, wr10, wr2 \n\t"
  63         "wstrd wr8, [%[block]] \n\t"
  64         "add %[block], %[block], %[line_size] \n\t"
  65         "wstrd wr10, [r5] \n\t"
  66         "pld [%[block]] \n\t"
  67         "pld [%[block], #32] \n\t"
  68         "add r5, r5, %[line_size] \n\t"
  69         "pld [r5] \n\t"
  70         "pld [r5, #32] \n\t"
  71         "bne 1b \n\t"
  72         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
  73         :
  74         : "memory", "r4", "r5", "r12");
  75 }
  76
  77 void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
  78 {
  79     int stride = line_size;
  80     __asm__ __volatile__ (
  81         "and r12, %[pixels], #7 \n\t"
  82         "bic %[pixels], %[pixels], #7 \n\t"
  83         "tmcr wcgr1, r12 \n\t"
  84         "add r4, %[pixels], %[line_size] \n\t"
  85         "add r5, %[block], %[line_size] \n\t"
  86         "mov %[line_size], %[line_size], lsl #1 \n\t"
  87         "1: \n\t"
  88         "wldrd wr0, [%[pixels]] \n\t"
  89         "wldrd wr1, [%[pixels], #8] \n\t"
  90         "subs %[h], %[h], #2 \n\t"
  91         "wldrd wr2, [%[pixels], #16] \n\t"
  92         "add %[pixels], %[pixels], %[line_size] \n\t"
  93         "wldrd wr3, [r4] \n\t"
  94         "pld [%[pixels]] \n\t"
  95         "pld [%[pixels], #32] \n\t"
  96         "walignr1 wr8, wr0, wr1 \n\t"
  97         "wldrd wr4, [r4, #8] \n\t"
  98         "walignr1 wr9, wr1, wr2 \n\t"
  99         "wldrd wr5, [r4, #16] \n\t"
 100         "add r4, r4, %[line_size] \n\t"
 101         "pld [r4] \n\t"
 102         "pld [r4, #32] \n\t"
 103         "walignr1 wr10, wr3, wr4 \n\t"
 104         "wstrd wr8, [%[block]] \n\t"
 105         "walignr1 wr11, wr4, wr5 \n\t"
 106         "wstrd wr9, [%[block], #8] \n\t"
 107         "add %[block], %[block], %[line_size] \n\t"
 108         "wstrd wr10, [r5] \n\t"
 109         "wstrd wr11, [r5, #8] \n\t"
 110         "add r5, r5, %[line_size] \n\t"
 111         "bne 1b \n\t"
 112         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
 113         :
 114         : "memory", "r4", "r5", "r12");
 115 }
 116
 117 void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 118 {
 119     int stride = line_size;
 120     __asm__ __volatile__ (
 121         "pld [%[pixels]]                \n\t"
 122         "pld [%[pixels], #32]           \n\t"
 123         "pld [%[block]]                 \n\t"
 124         "pld [%[block], #32]            \n\t"
 125         "and r12, %[pixels], #7         \n\t"
 126         "bic %[pixels], %[pixels], #7   \n\t"
 127         "tmcr wcgr1, r12                \n\t"
 128         "add r4, %[pixels], %[line_size]\n\t"
 129         "add r5, %[block], %[line_size] \n\t"
 130         "mov %[line_size], %[line_size], lsl #1 \n\t"
 131         "1:                             \n\t"
 132         "wldrd wr0, [%[pixels]]         \n\t"
 133         "wldrd wr1, [%[pixels], #8]     \n\t"
 134         "subs %[h], %[h], #2            \n\t"
 135         "wldrd wr2, [%[pixels], #16]    \n\t"
 136         "add %[pixels], %[pixels], %[line_size] \n\t"
 137         "wldrd wr3, [r4]                \n\t"
 138         "pld [%[pixels]]                \n\t"
 139         "pld [%[pixels], #32]           \n\t"
 140         "walignr1 wr8, wr0, wr1         \n\t"
 141         "wldrd wr4, [r4, #8]            \n\t"
 142         "walignr1 wr9, wr1, wr2         \n\t"
 143         "wldrd wr5, [r4, #16]           \n\t"
 144         "add r4, r4, %[line_size]       \n\t"
 145         "wldrd wr0, [%[block]]          \n\t"
 146         "pld [r4]                       \n\t"
 147         "wldrd wr1, [%[block], #8]      \n\t"
 148         "pld [r4, #32]                  \n\t"
 149         "wldrd wr2, [r5]                \n\t"
 150         "walignr1 wr10, wr3, wr4        \n\t"
 151         "wldrd wr3, [r5, #8]            \n\t"
 152         WAVG2B" wr8, wr8, wr0           \n\t"
 153         WAVG2B" wr9, wr9, wr1           \n\t"
 154         WAVG2B" wr10, wr10, wr2         \n\t"
 155         "wstrd wr8, [%[block]]          \n\t"
 156         "walignr1 wr11, wr4, wr5        \n\t"
 157         WAVG2B" wr11, wr11, wr3         \n\t"
 158         "wstrd wr9, [%[block], #8]      \n\t"
 159         "add %[block], %[block], %[line_size] \n\t"
 160         "wstrd wr10, [r5]               \n\t"
 161         "pld [%[block]]                 \n\t"
 162         "pld [%[block], #32]            \n\t"
 163         "wstrd wr11, [r5, #8]           \n\t"
 164         "add r5, r5, %[line_size]       \n\t"
 165         "pld [r5]                       \n\t"
 166         "pld [r5, #32]                  \n\t"
 167         "bne 1b \n\t"
 168         : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h)
 169         :
 170         : "memory", "r4", "r5", "r12");
 171 }
 172
 173 void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 174 {
 175     int stride = line_size;
 176     // [wr0 wr1 wr2 wr3] for previous line
 177     // [wr4 wr5 wr6 wr7] for current line
 178     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 179     __asm__ __volatile__(
 180         "pld [%[pixels]]                \n\t"
 181         "pld [%[pixels], #32]           \n\t"
 182         "and r12, %[pixels], #7         \n\t"
 183         "bic %[pixels], %[pixels], #7   \n\t"
 184         "tmcr wcgr1, r12                \n\t"
 185         "add r12, r12, #1               \n\t"
 186         "add r4, %[pixels], %[line_size]\n\t"
 187         "tmcr wcgr2, r12                \n\t"
 188         "add r5, %[block], %[line_size] \n\t"
 189         "mov %[line_size], %[line_size], lsl #1 \n\t"
 190
 191         "1:                             \n\t"
 192         "wldrd wr10, [%[pixels]]        \n\t"
 193         "cmp r12, #8                    \n\t"
 194         "wldrd wr11, [%[pixels], #8]    \n\t"
 195         "add %[pixels], %[pixels], %[line_size] \n\t"
 196         "wldrd wr13, [r4]               \n\t"
 197         "pld [%[pixels]]                \n\t"
 198         "wldrd wr14, [r4, #8]           \n\t"
 199         "pld [%[pixels], #32]           \n\t"
 200         "add r4, r4, %[line_size]       \n\t"
 201         "walignr1 wr0, wr10, wr11       \n\t"
 202         "pld [r4]                       \n\t"
 203         "pld [r4, #32]                  \n\t"
 204         "walignr1 wr2, wr13, wr14       \n\t"
 205         "wmoveq wr4, wr11               \n\t"
 206         "wmoveq wr6, wr14               \n\t"
 207         "walignr2ne wr4, wr10, wr11     \n\t"
 208         "walignr2ne wr6, wr13, wr14     \n\t"
 209         WAVG2B" wr0, wr0, wr4           \n\t"
 210         WAVG2B" wr2, wr2, wr6           \n\t"
 211         "wstrd wr0, [%[block]]          \n\t"
 212         "subs %[h], %[h], #2            \n\t"
 213         "wstrd wr2, [r5]                \n\t"
 214         "add %[block], %[block], %[line_size]   \n\t"
 215         "add r5, r5, %[line_size]       \n\t"
 216         "bne 1b                         \n\t"
 217         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 218         :
 219         : "r4", "r5", "r12", "memory");
 220 }
 221
 222 void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 223 {
 224     int stride = line_size;
 225     // [wr0 wr1 wr2 wr3] for previous line
 226     // [wr4 wr5 wr6 wr7] for current line
 227     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 228     __asm__ __volatile__(
 229         "pld [%[pixels]]                \n\t"
 230         "pld [%[pixels], #32]           \n\t"
 231         "and r12, %[pixels], #7         \n\t"
 232         "bic %[pixels], %[pixels], #7   \n\t"
 233         "tmcr wcgr1, r12                \n\t"
 234         "add r12, r12, #1               \n\t"
 235         "add r4, %[pixels], %[line_size]\n\t"
 236         "tmcr wcgr2, r12                \n\t"
 237         "add r5, %[block], %[line_size] \n\t"
 238         "mov %[line_size], %[line_size], lsl #1 \n\t"
 239
 240         "1:                             \n\t"
 241         "wldrd wr10, [%[pixels]]        \n\t"
 242         "cmp r12, #8                    \n\t"
 243         "wldrd wr11, [%[pixels], #8]    \n\t"
 244         "wldrd wr12, [%[pixels], #16]   \n\t"
 245         "add %[pixels], %[pixels], %[line_size] \n\t"
 246         "wldrd wr13, [r4]               \n\t"
 247         "pld [%[pixels]]                \n\t"
 248         "wldrd wr14, [r4, #8]           \n\t"
 249         "pld [%[pixels], #32]           \n\t"
 250         "wldrd wr15, [r4, #16]          \n\t"
 251         "add r4, r4, %[line_size]       \n\t"
 252         "walignr1 wr0, wr10, wr11       \n\t"
 253         "pld [r4]                       \n\t"
 254         "pld [r4, #32]                  \n\t"
 255         "walignr1 wr1, wr11, wr12       \n\t"
 256         "walignr1 wr2, wr13, wr14       \n\t"
 257         "walignr1 wr3, wr14, wr15       \n\t"
 258         "wmoveq wr4, wr11               \n\t"
 259         "wmoveq wr5, wr12               \n\t"
 260         "wmoveq wr6, wr14               \n\t"
 261         "wmoveq wr7, wr15               \n\t"
 262         "walignr2ne wr4, wr10, wr11     \n\t"
 263         "walignr2ne wr5, wr11, wr12     \n\t"
 264         "walignr2ne wr6, wr13, wr14     \n\t"
 265         "walignr2ne wr7, wr14, wr15     \n\t"
 266         WAVG2B" wr0, wr0, wr4           \n\t"
 267         WAVG2B" wr1, wr1, wr5           \n\t"
 268         "wstrd wr0, [%[block]]          \n\t"
 269         WAVG2B" wr2, wr2, wr6           \n\t"
 270         "wstrd wr1, [%[block], #8]      \n\t"
 271         WAVG2B" wr3, wr3, wr7           \n\t"
 272         "add %[block], %[block], %[line_size]   \n\t"
 273         "wstrd wr2, [r5]                \n\t"
 274         "subs %[h], %[h], #2            \n\t"
 275         "wstrd wr3, [r5, #8]            \n\t"
 276         "add r5, r5, %[line_size]       \n\t"
 277         "bne 1b                         \n\t"
 278         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 279         :
 280         : "r4", "r5", "r12", "memory");
 281 }
 282
 283 void DEF(avg, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 284 {
 285     int stride = line_size;
 286     // [wr0 wr1 wr2 wr3] for previous line
 287     // [wr4 wr5 wr6 wr7] for current line
 288     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 289     __asm__ __volatile__(
 290         "pld [%[pixels]]                \n\t"
 291         "pld [%[pixels], #32]           \n\t"
 292         "pld [%[block]]                 \n\t"
 293         "pld [%[block], #32]            \n\t"
 294         "and r12, %[pixels], #7         \n\t"
 295         "bic %[pixels], %[pixels], #7   \n\t"
 296         "tmcr wcgr1, r12                \n\t"
 297         "add r12, r12, #1               \n\t"
 298         "add r4, %[pixels], %[line_size]\n\t"
 299         "tmcr wcgr2, r12                \n\t"
 300         "add r5, %[block], %[line_size] \n\t"
 301         "mov %[line_size], %[line_size], lsl #1 \n\t"
 302         "pld [r5]                       \n\t"
 303         "pld [r5, #32]                  \n\t"
 304
 305         "1:                             \n\t"
 306         "wldrd wr10, [%[pixels]]        \n\t"
 307         "cmp r12, #8                    \n\t"
 308         "wldrd wr11, [%[pixels], #8]    \n\t"
 309         "add %[pixels], %[pixels], %[line_size] \n\t"
 310         "wldrd wr13, [r4]               \n\t"
 311         "pld [%[pixels]]                \n\t"
 312         "wldrd wr14, [r4, #8]           \n\t"
 313         "pld [%[pixels], #32]           \n\t"
 314         "add r4, r4, %[line_size]       \n\t"
 315         "walignr1 wr0, wr10, wr11       \n\t"
 316         "pld [r4]                       \n\t"
 317         "pld [r4, #32]                  \n\t"
 318         "walignr1 wr2, wr13, wr14       \n\t"
 319         "wmoveq wr4, wr11               \n\t"
 320         "wmoveq wr6, wr14               \n\t"
 321         "walignr2ne wr4, wr10, wr11     \n\t"
 322         "wldrd wr10, [%[block]]         \n\t"
 323         "walignr2ne wr6, wr13, wr14     \n\t"
 324         "wldrd wr12, [r5]               \n\t"
 325         WAVG2B" wr0, wr0, wr4           \n\t"
 326         WAVG2B" wr2, wr2, wr6           \n\t"
 327         WAVG2B" wr0, wr0, wr10          \n\t"
 328         WAVG2B" wr2, wr2, wr12          \n\t"
 329         "wstrd wr0, [%[block]]          \n\t"
 330         "subs %[h], %[h], #2            \n\t"
 331         "wstrd wr2, [r5]                \n\t"
 332         "add %[block], %[block], %[line_size]   \n\t"
 333         "add r5, r5, %[line_size]       \n\t"
 334         "pld [%[block]]                 \n\t"
 335         "pld [%[block], #32]            \n\t"
 336         "pld [r5]                       \n\t"
 337         "pld [r5, #32]                  \n\t"
 338         "bne 1b                         \n\t"
 339         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 340         :
 341         : "r4", "r5", "r12", "memory");
 342 }
 343
 344 void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 345 {
 346     int stride = line_size;
 347     // [wr0 wr1 wr2 wr3] for previous line
 348     // [wr4 wr5 wr6 wr7] for current line
 349     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 350     __asm__ __volatile__(
 351         "pld [%[pixels]]                \n\t"
 352         "pld [%[pixels], #32]           \n\t"
 353         "pld [%[block]]                 \n\t"
 354         "pld [%[block], #32]            \n\t"
 355         "and r12, %[pixels], #7         \n\t"
 356         "bic %[pixels], %[pixels], #7   \n\t"
 357         "tmcr wcgr1, r12                \n\t"
 358         "add r12, r12, #1               \n\t"
 359         "add r4, %[pixels], %[line_size]\n\t"
 360         "tmcr wcgr2, r12                \n\t"
 361         "add r5, %[block], %[line_size] \n\t"
 362         "mov %[line_size], %[line_size], lsl #1 \n\t"
 363         "pld [r5]                       \n\t"
 364         "pld [r5, #32]                  \n\t"
 365
 366         "1:                             \n\t"
 367         "wldrd wr10, [%[pixels]]        \n\t"
 368         "cmp r12, #8                    \n\t"
 369         "wldrd wr11, [%[pixels], #8]    \n\t"
 370         "wldrd wr12, [%[pixels], #16]   \n\t"
 371         "add %[pixels], %[pixels], %[line_size] \n\t"
 372         "wldrd wr13, [r4]               \n\t"
 373         "pld [%[pixels]]                \n\t"
 374         "wldrd wr14, [r4, #8]           \n\t"
 375         "pld [%[pixels], #32]           \n\t"
 376         "wldrd wr15, [r4, #16]          \n\t"
 377         "add r4, r4, %[line_size]       \n\t"
 378         "walignr1 wr0, wr10, wr11       \n\t"
 379         "pld [r4]                       \n\t"
 380         "pld [r4, #32]                  \n\t"
 381         "walignr1 wr1, wr11, wr12       \n\t"
 382         "walignr1 wr2, wr13, wr14       \n\t"
 383         "walignr1 wr3, wr14, wr15       \n\t"
 384         "wmoveq wr4, wr11               \n\t"
 385         "wmoveq wr5, wr12               \n\t"
 386         "wmoveq wr6, wr14               \n\t"
 387         "wmoveq wr7, wr15               \n\t"
 388         "walignr2ne wr4, wr10, wr11     \n\t"
 389         "walignr2ne wr5, wr11, wr12     \n\t"
 390         "walignr2ne wr6, wr13, wr14     \n\t"
 391         "walignr2ne wr7, wr14, wr15     \n\t"
 392         "wldrd wr10, [%[block]]         \n\t"
 393         WAVG2B" wr0, wr0, wr4           \n\t"
 394         "wldrd wr11, [%[block], #8]     \n\t"
 395         WAVG2B" wr1, wr1, wr5           \n\t"
 396         "wldrd wr12, [r5]               \n\t"
 397         WAVG2B" wr2, wr2, wr6           \n\t"
 398         "wldrd wr13, [r5, #8]           \n\t"
 399         WAVG2B" wr3, wr3, wr7           \n\t"
 400         WAVG2B" wr0, wr0, wr10          \n\t"
 401         WAVG2B" wr1, wr1, wr11          \n\t"
 402         WAVG2B" wr2, wr2, wr12          \n\t"
 403         WAVG2B" wr3, wr3, wr13          \n\t"
 404         "wstrd wr0, [%[block]]          \n\t"
 405         "subs %[h], %[h], #2            \n\t"
 406         "wstrd wr1, [%[block], #8]      \n\t"
 407         "add %[block], %[block], %[line_size]   \n\t"
 408         "wstrd wr2, [r5]                \n\t"
 409         "pld [%[block]]                 \n\t"
 410         "wstrd wr3, [r5, #8]            \n\t"
 411         "add r5, r5, %[line_size]       \n\t"
 412         "pld [%[block], #32]            \n\t"
 413         "pld [r5]                       \n\t"
 414         "pld [r5, #32]                  \n\t"
 415         "bne 1b                         \n\t"
 416         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 417         :
 418         :"r4", "r5", "r12", "memory");
 419 }
 420
 421 void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 422 {
 423     int stride = line_size;
 424     // [wr0 wr1 wr2 wr3] for previous line
 425     // [wr4 wr5 wr6 wr7] for current line
 426     __asm__ __volatile__(
 427         "pld            [%[pixels]]                             \n\t"
 428         "pld            [%[pixels], #32]                        \n\t"
 429         "and            r12, %[pixels], #7                      \n\t"
 430         "tmcr           wcgr1, r12                              \n\t"
 431         "bic            %[pixels], %[pixels], #7                \n\t"
 432
 433         "wldrd          wr10, [%[pixels]]                       \n\t"
 434         "wldrd          wr11, [%[pixels], #8]                   \n\t"
 435         "pld            [%[block]]                              \n\t"
 436         "add            %[pixels], %[pixels], %[line_size]      \n\t"
 437         "walignr1       wr0, wr10, wr11                         \n\t"
 438         "pld            [%[pixels]]                             \n\t"
 439         "pld            [%[pixels], #32]                        \n\t"
 440
 441       "1:                                                       \n\t"
 442         "wldrd          wr10, [%[pixels]]                       \n\t"
 443         "wldrd          wr11, [%[pixels], #8]                   \n\t"
 444         "add            %[pixels], %[pixels], %[line_size]      \n\t"
 445         "pld            [%[pixels]]                             \n\t"
 446         "pld            [%[pixels], #32]                        \n\t"
 447         "walignr1       wr4, wr10, wr11                         \n\t"
 448         "wldrd          wr10, [%[block]]                        \n\t"
 449          WAVG2B"        wr8, wr0, wr4                           \n\t"
 450          WAVG2B"        wr8, wr8, wr10                          \n\t"
 451         "wstrd          wr8, [%[block]]                         \n\t"
 452         "add            %[block], %[block], %[line_size]        \n\t"
 453
 454         "wldrd          wr10, [%[pixels]]                       \n\t"
 455         "wldrd          wr11, [%[pixels], #8]                   \n\t"
 456         "pld            [%[block]]                              \n\t"
 457         "add            %[pixels], %[pixels], %[line_size]      \n\t"
 458         "pld            [%[pixels]]                             \n\t"
 459         "pld            [%[pixels], #32]                        \n\t"
 460         "walignr1       wr0, wr10, wr11                         \n\t"
 461         "wldrd          wr10, [%[block]]                        \n\t"
 462          WAVG2B"        wr8, wr0, wr4                           \n\t"
 463          WAVG2B"        wr8, wr8, wr10                          \n\t"
 464         "wstrd          wr8, [%[block]]                         \n\t"
 465         "add            %[block], %[block], %[line_size]        \n\t"
 466
 467         "subs           %[h], %[h], #2                          \n\t"
 468         "pld            [%[block]]                              \n\t"
 469         "bne            1b                                      \n\t"
 470         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 471         :
 472         : "cc", "memory", "r12");
 473 }
 474
 475 void DEF(put, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 476 {
 477     int stride = line_size;
 478     // [wr0 wr1 wr2 wr3] for previous line
 479     // [wr4 wr5 wr6 wr7] for current line
 480     __asm__ __volatile__(
 481         "pld [%[pixels]]                \n\t"
 482         "pld [%[pixels], #32]           \n\t"
 483         "and r12, %[pixels], #7         \n\t"
 484         "tmcr wcgr1, r12                \n\t"
 485         "bic %[pixels], %[pixels], #7   \n\t"
 486
 487         "wldrd wr10, [%[pixels]]        \n\t"
 488         "wldrd wr11, [%[pixels], #8]    \n\t"
 489         "wldrd wr12, [%[pixels], #16]   \n\t"
 490         "add %[pixels], %[pixels], %[line_size] \n\t"
 491         "pld [%[pixels]]                \n\t"
 492         "pld [%[pixels], #32]           \n\t"
 493         "walignr1 wr0, wr10, wr11       \n\t"
 494         "walignr1 wr1, wr11, wr12       \n\t"
 495
 496         "1:                             \n\t"
 497         "wldrd wr10, [%[pixels]]        \n\t"
 498         "wldrd wr11, [%[pixels], #8]    \n\t"
 499         "wldrd wr12, [%[pixels], #16]   \n\t"
 500         "add %[pixels], %[pixels], %[line_size] \n\t"
 501         "pld [%[pixels]]                \n\t"
 502         "pld [%[pixels], #32]           \n\t"
 503         "walignr1 wr4, wr10, wr11       \n\t"
 504         "walignr1 wr5, wr11, wr12       \n\t"
 505         WAVG2B" wr8, wr0, wr4           \n\t"
 506         WAVG2B" wr9, wr1, wr5           \n\t"
 507         "wstrd wr8, [%[block]]          \n\t"
 508         "wstrd wr9, [%[block], #8]      \n\t"
 509         "add %[block], %[block], %[line_size]   \n\t"
 510
 511         "wldrd wr10, [%[pixels]]        \n\t"
 512         "wldrd wr11, [%[pixels], #8]    \n\t"
 513         "wldrd wr12, [%[pixels], #16]   \n\t"
 514         "add %[pixels], %[pixels], %[line_size] \n\t"
 515         "pld [%[pixels]]                \n\t"
 516         "pld [%[pixels], #32]           \n\t"
 517         "walignr1 wr0, wr10, wr11       \n\t"
 518         "walignr1 wr1, wr11, wr12       \n\t"
 519         WAVG2B" wr8, wr0, wr4           \n\t"
 520         WAVG2B" wr9, wr1, wr5           \n\t"
 521         "wstrd wr8, [%[block]]          \n\t"
 522         "wstrd wr9, [%[block], #8]      \n\t"
 523         "add %[block], %[block], %[line_size]   \n\t"
 524
 525         "subs %[h], %[h], #2            \n\t"
 526         "bne 1b                         \n\t"
 527         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 528         :
 529         : "r4", "r5", "r12", "memory");
 530 }
 531
 532 void DEF(avg, pixels16_y2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 533 {
 534     int stride = line_size;
 535     // [wr0 wr1 wr2 wr3] for previous line
 536     // [wr4 wr5 wr6 wr7] for current line
 537     __asm__ __volatile__(
 538         "pld [%[pixels]]                \n\t"
 539         "pld [%[pixels], #32]           \n\t"
 540         "and r12, %[pixels], #7         \n\t"
 541         "tmcr wcgr1, r12                \n\t"
 542         "bic %[pixels], %[pixels], #7   \n\t"
 543
 544         "wldrd wr10, [%[pixels]]        \n\t"
 545         "wldrd wr11, [%[pixels], #8]    \n\t"
 546         "pld [%[block]]                 \n\t"
 547         "wldrd wr12, [%[pixels], #16]   \n\t"
 548         "add %[pixels], %[pixels], %[line_size] \n\t"
 549         "pld [%[pixels]]                \n\t"
 550         "pld [%[pixels], #32]           \n\t"
 551         "walignr1 wr0, wr10, wr11       \n\t"
 552         "walignr1 wr1, wr11, wr12       \n\t"
 553
 554         "1:                             \n\t"
 555         "wldrd wr10, [%[pixels]]        \n\t"
 556         "wldrd wr11, [%[pixels], #8]    \n\t"
 557         "wldrd wr12, [%[pixels], #16]   \n\t"
 558         "add %[pixels], %[pixels], %[line_size] \n\t"
 559         "pld [%[pixels]]                \n\t"
 560         "pld [%[pixels], #32]           \n\t"
 561         "walignr1 wr4, wr10, wr11       \n\t"
 562         "walignr1 wr5, wr11, wr12       \n\t"
 563         "wldrd wr10, [%[block]]         \n\t"
 564         "wldrd wr11, [%[block], #8]     \n\t"
 565         WAVG2B" wr8, wr0, wr4           \n\t"
 566         WAVG2B" wr9, wr1, wr5           \n\t"
 567         WAVG2B" wr8, wr8, wr10          \n\t"
 568         WAVG2B" wr9, wr9, wr11          \n\t"
 569         "wstrd wr8, [%[block]]          \n\t"
 570         "wstrd wr9, [%[block], #8]      \n\t"
 571         "add %[block], %[block], %[line_size]   \n\t"
 572
 573         "wldrd wr10, [%[pixels]]        \n\t"
 574         "wldrd wr11, [%[pixels], #8]    \n\t"
 575         "pld [%[block]]                 \n\t"
 576         "wldrd wr12, [%[pixels], #16]   \n\t"
 577         "add %[pixels], %[pixels], %[line_size] \n\t"
 578         "pld [%[pixels]]                \n\t"
 579         "pld [%[pixels], #32]           \n\t"
 580         "walignr1 wr0, wr10, wr11       \n\t"
 581         "walignr1 wr1, wr11, wr12       \n\t"
 582         "wldrd wr10, [%[block]]         \n\t"
 583         "wldrd wr11, [%[block], #8]     \n\t"
 584         WAVG2B" wr8, wr0, wr4           \n\t"
 585         WAVG2B" wr9, wr1, wr5           \n\t"
 586         WAVG2B" wr8, wr8, wr10          \n\t"
 587         WAVG2B" wr9, wr9, wr11          \n\t"
 588         "wstrd wr8, [%[block]]          \n\t"
 589         "wstrd wr9, [%[block], #8]      \n\t"
 590         "add %[block], %[block], %[line_size]   \n\t"
 591
 592         "subs %[h], %[h], #2            \n\t"
 593         "pld [%[block]]                 \n\t"
 594         "bne 1b                         \n\t"
 595         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride)
 596         :
 597         : "r4", "r5", "r12", "memory");
 598 }
 599
 600 void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 601 {
 602     // [wr0 wr1 wr2 wr3] for previous line
 603     // [wr4 wr5 wr6 wr7] for current line
 604     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 605     __asm__ __volatile__(
 606         "pld [%[pixels]]                \n\t"
 607         "mov r12, #2                    \n\t"
 608         "pld [%[pixels], #32]           \n\t"
 609         "tmcr wcgr0, r12                \n\t" /* for shift value */
 610         "and r12, %[pixels], #7         \n\t"
 611         "bic %[pixels], %[pixels], #7   \n\t"
 612         "tmcr wcgr1, r12                \n\t"
 613
 614         // [wr0 wr1 wr2 wr3] <= *
 615         // [wr4 wr5 wr6 wr7]
 616         "wldrd wr12, [%[pixels]]        \n\t"
 617         "add r12, r12, #1               \n\t"
 618         "wldrd wr13, [%[pixels], #8]    \n\t"
 619         "tmcr wcgr2, r12                \n\t"
 620         "add %[pixels], %[pixels], %[line_size] \n\t"
 621         "cmp r12, #8                    \n\t"
 622         "pld [%[pixels]]                \n\t"
 623         "pld [%[pixels], #32]           \n\t"
 624         "walignr1 wr2, wr12, wr13       \n\t"
 625         "wmoveq wr10, wr13              \n\t"
 626         "walignr2ne wr10, wr12, wr13    \n\t"
 627         "wunpckelub wr0, wr2            \n\t"
 628         "wunpckehub wr1, wr2            \n\t"
 629         "wunpckelub wr8, wr10           \n\t"
 630         "wunpckehub wr9, wr10           \n\t"
 631         "waddhus wr0, wr0, wr8          \n\t"
 632         "waddhus wr1, wr1, wr9          \n\t"
 633
 634         "1:                             \n\t"
 635         // [wr0 wr1 wr2 wr3]
 636         // [wr4 wr5 wr6 wr7] <= *
 637         "wldrd wr12, [%[pixels]]        \n\t"
 638         "cmp r12, #8                    \n\t"
 639         "wldrd wr13, [%[pixels], #8]    \n\t"
 640         "add %[pixels], %[pixels], %[line_size] \n\t"
 641         "walignr1 wr6, wr12, wr13       \n\t"
 642         "pld [%[pixels]]                \n\t"
 643         "pld [%[pixels], #32]           \n\t"
 644         "wmoveq wr10, wr13              \n\t"
 645         "walignr2ne wr10, wr12, wr13    \n\t"
 646         "wunpckelub wr4, wr6            \n\t"
 647         "wunpckehub wr5, wr6            \n\t"
 648         "wunpckelub wr8, wr10           \n\t"
 649         "wunpckehub wr9, wr10           \n\t"
 650         "waddhus wr4, wr4, wr8          \n\t"
 651         "waddhus wr5, wr5, wr9          \n\t"
 652         "waddhus wr8, wr0, wr4          \n\t"
 653         "waddhus wr9, wr1, wr5          \n\t"
 654         "waddhus wr8, wr8, wr15         \n\t"
 655         "waddhus wr9, wr9, wr15         \n\t"
 656         "wsrlhg wr8, wr8, wcgr0         \n\t"
 657         "wsrlhg wr9, wr9, wcgr0         \n\t"
 658         "wpackhus wr8, wr8, wr9         \n\t"
 659         "wstrd wr8, [%[block]]          \n\t"
 660         "add %[block], %[block], %[line_size]   \n\t"
 661
 662         // [wr0 wr1 wr2 wr3] <= *
 663         // [wr4 wr5 wr6 wr7]
 664         "wldrd wr12, [%[pixels]]        \n\t"
 665         "wldrd wr13, [%[pixels], #8]    \n\t"
 666         "add %[pixels], %[pixels], %[line_size] \n\t"
 667         "walignr1 wr2, wr12, wr13       \n\t"
 668         "pld [%[pixels]]                \n\t"
 669         "pld [%[pixels], #32]           \n\t"
 670         "wmoveq wr10, wr13              \n\t"
 671         "walignr2ne wr10, wr12, wr13    \n\t"
 672         "wunpckelub wr0, wr2            \n\t"
 673         "wunpckehub wr1, wr2            \n\t"
 674         "wunpckelub wr8, wr10           \n\t"
 675         "wunpckehub wr9, wr10           \n\t"
 676         "waddhus wr0, wr0, wr8          \n\t"
 677         "waddhus wr1, wr1, wr9          \n\t"
 678         "waddhus wr8, wr0, wr4          \n\t"
 679         "waddhus wr9, wr1, wr5          \n\t"
 680         "waddhus wr8, wr8, wr15         \n\t"
 681         "waddhus wr9, wr9, wr15         \n\t"
 682         "wsrlhg wr8, wr8, wcgr0         \n\t"
 683         "wsrlhg wr9, wr9, wcgr0         \n\t"
 684         "wpackhus wr8, wr8, wr9         \n\t"
 685         "subs %[h], %[h], #2            \n\t"
 686         "wstrd wr8, [%[block]]          \n\t"
 687         "add %[block], %[block], %[line_size]   \n\t"
 688         "bne 1b                         \n\t"
 689         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 690         : [line_size]"r"(line_size)
 691         : "r12", "memory");
 692 }
 693
 694 void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 695 {
     /*
      * 16-pixel-wide "xy2" (2x2 neighbourhood) interpolation stored to block.
      * With p and q being two consecutive source rows (line_size bytes apart),
      * every output byte is
      *     block[x] = (p[x] + p[x+1] + q[x] + q[x+1] + bias) >> 2
      * where bias is the per-halfword value expected in wr15 (see SET_RND).
      *
      * block     - destination; 16 bytes are written per row with two
      *             doubleword stores (presumably expected to be 8-byte
      *             aligned - TODO confirm against callers).
      * pixels    - source, any alignment; the address is rounded down to
      *             8 bytes and 24 bytes are loaded per row, h + 1 rows in all.
      * line_size - byte stride applied to both source and destination.
      * h         - number of output rows; two rows are produced per loop pass
      *             and the loop only ends when h reaches exactly 0, so h must
      *             be a positive even number.
      *
      * "cc" is in the clobber list because cmp/subs modify the condition flags.
      * NOTE(review): the wr and wcgr registers used below are overwritten
      * without being declared as clobbers, and wr15 is expected to still hold
      * what SET_RND (defined elsewhere) set up - confirm this is safe.
      */
 696     // [wr0 wr1 wr2 wr3] for previous line
 697     // [wr4 wr5 wr6 wr7] for current line
 698     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 699     __asm__ __volatile__(
 700         "pld [%[pixels]]                \n\t"
 701         "mov r12, #2                    \n\t"
 702         "pld [%[pixels], #32]           \n\t"
 703         "tmcr wcgr0, r12                \n\t" /* for shift value */
 704         /* alignment */
 705         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset inside the doubleword */
 706         "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to 8 bytes */
 707         "tmcr wcgr1, r12                \n\t" /* walignr1 offset: column x */
 708         "add r12, r12, #1               \n\t"
 709         "tmcr wcgr2, r12                \n\t" /* walignr2 offset: column x + 1 */
 710
 711         // Prime the loop with the first source row:
 712         // [wr0 wr1 wr2 wr3] <= p[x] + p[x+1], widened to 16 halfwords
 713         "wldrd wr12, [%[pixels]]        \n\t"
 714         "cmp r12, #8                    \n\t" /* eq: the x + 1 data starts exactly at the next doubleword */
 715         "wldrd wr13, [%[pixels], #8]    \n\t"
 716         "wldrd wr14, [%[pixels], #16]   \n\t"
 717         "add %[pixels], %[pixels], %[line_size] \n\t"
 718         "pld [%[pixels]]                \n\t"
 719         "walignr1 wr2, wr12, wr13       \n\t"
 720         "pld [%[pixels], #32]           \n\t"
 721         "walignr1 wr3, wr13, wr14       \n\t"
 722         "wmoveq wr10, wr13              \n\t"
 723         "wmoveq wr11, wr14              \n\t"
 724         "walignr2ne wr10, wr12, wr13    \n\t"
 725         "walignr2ne wr11, wr13, wr14    \n\t"
 726         "wunpckelub wr0, wr2            \n\t"
 727         "wunpckehub wr1, wr2            \n\t"
 728         "wunpckelub wr2, wr3            \n\t"
 729         "wunpckehub wr3, wr3            \n\t"
 730         "wunpckelub wr8, wr10           \n\t"
 731         "wunpckehub wr9, wr10           \n\t"
 732         "wunpckelub wr10, wr11          \n\t"
 733         "wunpckehub wr11, wr11          \n\t"
 734         "waddhus wr0, wr0, wr8          \n\t"
 735         "waddhus wr1, wr1, wr9          \n\t"
 736         "waddhus wr2, wr2, wr10         \n\t"
 737         "waddhus wr3, wr3, wr11         \n\t"
 738
 739         "1:                             \n\t"
 740         // [wr0 wr1 wr2 wr3] holds the pair sums of the previous row
 741         // [wr4 wr5 wr6 wr7] <= pair sums of the row loaded here
 742         "wldrd wr12, [%[pixels]]        \n\t"
 743         "cmp r12, #8                    \n\t" /* re-establish the flags (the subs at the loop end changed them) */
 744         "wldrd wr13, [%[pixels], #8]    \n\t"
 745         "wldrd wr14, [%[pixels], #16]   \n\t"
 746         "add %[pixels], %[pixels], %[line_size] \n\t"
 747         "walignr1 wr6, wr12, wr13       \n\t"
 748         "pld [%[pixels]]                \n\t"
 749         "pld [%[pixels], #32]           \n\t"
 750         "walignr1 wr7, wr13, wr14       \n\t"
 751         "wmoveq wr10, wr13              \n\t"
 752         "wmoveq wr11, wr14              \n\t"
 753         "walignr2ne wr10, wr12, wr13    \n\t"
 754         "walignr2ne wr11, wr13, wr14    \n\t"
 755         "wunpckelub wr4, wr6            \n\t"
 756         "wunpckehub wr5, wr6            \n\t"
 757         "wunpckelub wr6, wr7            \n\t"
 758         "wunpckehub wr7, wr7            \n\t"
 759         "wunpckelub wr8, wr10           \n\t"
 760         "wunpckehub wr9, wr10           \n\t"
 761         "wunpckelub wr10, wr11          \n\t"
 762         "wunpckehub wr11, wr11          \n\t"
 763         "waddhus wr4, wr4, wr8          \n\t"
 764         "waddhus wr5, wr5, wr9          \n\t"
 765         "waddhus wr6, wr6, wr10         \n\t"
 766         "waddhus wr7, wr7, wr11         \n\t"
 767         "waddhus wr8, wr0, wr4          \n\t" /* previous row + current row */
 768         "waddhus wr9, wr1, wr5          \n\t"
 769         "waddhus wr10, wr2, wr6         \n\t"
 770         "waddhus wr11, wr3, wr7         \n\t"
 771         "waddhus wr8, wr8, wr15         \n\t" /* + bias from wr15 */
 772         "waddhus wr9, wr9, wr15         \n\t"
 773         "waddhus wr10, wr10, wr15       \n\t"
 774         "waddhus wr11, wr11, wr15       \n\t"
 775         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 (wcgr0 was loaded with 2) */
 776         "wsrlhg wr9, wr9, wcgr0         \n\t"
 777         "wsrlhg wr10, wr10, wcgr0       \n\t"
 778         "wsrlhg wr11, wr11, wcgr0       \n\t"
 779         "wpackhus wr8, wr8, wr9         \n\t" /* halfwords back to bytes */
 780         "wpackhus wr9, wr10, wr11       \n\t"
 781         "wstrd wr8, [%[block]]          \n\t"
 782         "wstrd wr9, [%[block], #8]      \n\t"
 783         "add %[block], %[block], %[line_size]   \n\t"
 784
 785         // [wr0 wr1 wr2 wr3] <= pair sums of the next row (replaces the oldest row)
 786         // [wr4 wr5 wr6 wr7] now plays the role of the previous row
 787         "wldrd wr12, [%[pixels]]        \n\t"
 788         "wldrd wr13, [%[pixels], #8]    \n\t"
 789         "wldrd wr14, [%[pixels], #16]   \n\t"
 790         "add %[pixels], %[pixels], %[line_size] \n\t"
 791         "walignr1 wr2, wr12, wr13       \n\t"
 792         "pld [%[pixels]]                \n\t"
 793         "pld [%[pixels], #32]           \n\t"
 794         "walignr1 wr3, wr13, wr14       \n\t"
 795         "wmoveq wr10, wr13              \n\t" /* flags are still those of the cmp at the top of the loop */
 796         "wmoveq wr11, wr14              \n\t"
 797         "walignr2ne wr10, wr12, wr13    \n\t"
 798         "walignr2ne wr11, wr13, wr14    \n\t"
 799         "wunpckelub wr0, wr2            \n\t"
 800         "wunpckehub wr1, wr2            \n\t"
 801         "wunpckelub wr2, wr3            \n\t"
 802         "wunpckehub wr3, wr3            \n\t"
 803         "wunpckelub wr8, wr10           \n\t"
 804         "wunpckehub wr9, wr10           \n\t"
 805         "wunpckelub wr10, wr11          \n\t"
 806         "wunpckehub wr11, wr11          \n\t"
 807         "waddhus wr0, wr0, wr8          \n\t"
 808         "waddhus wr1, wr1, wr9          \n\t"
 809         "waddhus wr2, wr2, wr10         \n\t"
 810         "waddhus wr3, wr3, wr11         \n\t"
 811         "waddhus wr8, wr0, wr4          \n\t"
 812         "waddhus wr9, wr1, wr5          \n\t"
 813         "waddhus wr10, wr2, wr6         \n\t"
 814         "waddhus wr11, wr3, wr7         \n\t"
 815         "waddhus wr8, wr8, wr15         \n\t"
 816         "waddhus wr9, wr9, wr15         \n\t"
 817         "waddhus wr10, wr10, wr15       \n\t"
 818         "waddhus wr11, wr11, wr15       \n\t"
 819         "wsrlhg wr8, wr8, wcgr0         \n\t"
 820         "wsrlhg wr9, wr9, wcgr0         \n\t"
 821         "wsrlhg wr10, wr10, wcgr0       \n\t"
 822         "wsrlhg wr11, wr11, wcgr0       \n\t"
 823         "wpackhus wr8, wr8, wr9         \n\t"
 824         "wpackhus wr9, wr10, wr11       \n\t"
 825         "wstrd wr8, [%[block]]          \n\t"
 826         "wstrd wr9, [%[block], #8]      \n\t"
 827         "add %[block], %[block], %[line_size]   \n\t"
 828
 829         "subs %[h], %[h], #2            \n\t" /* two output rows done per pass */
 830         "bne 1b                         \n\t"
 831         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 832         : [line_size]"r"(line_size)
 833         : "r12", "cc", "memory"); /* "cc": cmp and subs change the flags */
 834 }
 835
 836 void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 837 {
     /*
      * 8-pixel-wide "xy2" (2x2 neighbourhood) interpolation, averaged into the
      * bytes already present in block. With p and q being two consecutive
      * source rows (line_size bytes apart), the interpolated byte is
      *     t[x] = (p[x] + p[x+1] + q[x] + q[x+1] + bias) >> 2
      * where bias is the per-halfword value expected in wr15 (see SET_RND),
      * and the stored result is WAVG2B(t[x], block[x]). WAVG2B is a macro
      * defined elsewhere; presumably it names the byte-average instruction
      * matching the rnd / no_rnd variant - TODO confirm.
      *
      * block     - destination; 8 bytes per row are read and rewritten with
      *             doubleword accesses (presumably expected to be 8-byte
      *             aligned - TODO confirm against callers).
      * pixels    - source, any alignment; the address is rounded down to
      *             8 bytes and 16 bytes are loaded per row, h + 1 rows in all.
      * line_size - byte stride applied to both source and destination.
      * h         - number of output rows; two rows are produced per loop pass
      *             and the loop only ends when h reaches exactly 0, so h must
      *             be a positive even number.
      *
      * "cc" is in the clobber list because cmp/subs modify the condition flags.
      * NOTE(review): the wr and wcgr registers used below are overwritten
      * without being declared as clobbers, and wr15 is expected to still hold
      * what SET_RND (defined elsewhere) set up - confirm this is safe.
      */
 838     // [wr0 wr1] pair sums of the previous line (wr2 is scratch for the aligned row)
 839     // [wr4 wr5] pair sums of the current line  (wr6 is scratch for the aligned row)
 840     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 841     __asm__ __volatile__(
 842         "pld [%[block]]                 \n\t"
 843         "pld [%[block], #32]            \n\t"
 844         "pld [%[pixels]]                \n\t"
 845         "mov r12, #2                    \n\t"
 846         "pld [%[pixels], #32]           \n\t"
 847         "tmcr wcgr0, r12                \n\t" /* for shift value */
 848         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset inside the doubleword */
 849         "bic %[pixels], %[pixels], #7   \n\t" /* round the source pointer down to 8 bytes */
 850         "tmcr wcgr1, r12                \n\t" /* walignr1 offset: column x */
 851
 852         // Prime the loop with the first source row:
 853         // [wr0 wr1] <= p[x] + p[x+1], widened to 8 halfwords
 854         "wldrd wr12, [%[pixels]]        \n\t"
 855         "add r12, r12, #1               \n\t"
 856         "wldrd wr13, [%[pixels], #8]    \n\t"
 857         "tmcr wcgr2, r12                \n\t" /* walignr2 offset: column x + 1 */
 858         "add %[pixels], %[pixels], %[line_size] \n\t"
 859         "cmp r12, #8                    \n\t" /* eq: the x + 1 data starts exactly at the next doubleword */
 860         "pld [%[pixels]]                \n\t"
 861         "pld [%[pixels], #32]           \n\t"
 862         "walignr1 wr2, wr12, wr13       \n\t"
 863         "wmoveq wr10, wr13              \n\t"
 864         "walignr2ne wr10, wr12, wr13    \n\t"
 865         "wunpckelub wr0, wr2            \n\t"
 866         "wunpckehub wr1, wr2            \n\t"
 867         "wunpckelub wr8, wr10           \n\t"
 868         "wunpckehub wr9, wr10           \n\t"
 869         "waddhus wr0, wr0, wr8          \n\t"
 870         "waddhus wr1, wr1, wr9          \n\t"
 871
 872         "1:                             \n\t"
 873         // [wr0 wr1] holds the pair sums of the previous row
 874         // [wr4 wr5] <= pair sums of the row loaded here
 875         "wldrd wr12, [%[pixels]]        \n\t"
 876         "cmp r12, #8                    \n\t" /* re-establish the flags (the subs at the loop end changed them) */
 877         "wldrd wr13, [%[pixels], #8]    \n\t"
 878         "add %[pixels], %[pixels], %[line_size] \n\t"
 879         "walignr1 wr6, wr12, wr13       \n\t"
 880         "pld [%[pixels]]                \n\t"
 881         "pld [%[pixels], #32]           \n\t"
 882         "wmoveq wr10, wr13              \n\t"
 883         "walignr2ne wr10, wr12, wr13    \n\t"
 884         "wunpckelub wr4, wr6            \n\t"
 885         "wunpckehub wr5, wr6            \n\t"
 886         "wunpckelub wr8, wr10           \n\t"
 887         "wunpckehub wr9, wr10           \n\t"
 888         "waddhus wr4, wr4, wr8          \n\t"
 889         "waddhus wr5, wr5, wr9          \n\t"
 890         "waddhus wr8, wr0, wr4          \n\t" /* previous row + current row */
 891         "waddhus wr9, wr1, wr5          \n\t"
 892         "waddhus wr8, wr8, wr15         \n\t" /* + bias from wr15 */
 893         "waddhus wr9, wr9, wr15         \n\t"
 894         "wldrd wr12, [%[block]]         \n\t" /* existing destination bytes */
 895         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 (wcgr0 was loaded with 2) */
 896         "wsrlhg wr9, wr9, wcgr0         \n\t"
 897         "wpackhus wr8, wr8, wr9         \n\t" /* halfwords back to bytes */
 898         WAVG2B" wr8, wr8, wr12          \n\t" /* average with the destination */
 899         "wstrd wr8, [%[block]]          \n\t"
 900         "add %[block], %[block], %[line_size]   \n\t"
 901         "wldrd wr12, [%[pixels]]        \n\t" /* first doubleword of the next source row, loaded early */
 902         "pld [%[block]]                 \n\t"
 903         "pld [%[block], #32]            \n\t"
 904
 905         // [wr0 wr1] <= pair sums of the next row (replaces the oldest row)
 906         // [wr4 wr5] now plays the role of the previous row
 907         "wldrd wr13, [%[pixels], #8]    \n\t"
 908         "add %[pixels], %[pixels], %[line_size] \n\t"
 909         "walignr1 wr2, wr12, wr13       \n\t"
 910         "pld [%[pixels]]                \n\t"
 911         "pld [%[pixels], #32]           \n\t"
 912         "wmoveq wr10, wr13              \n\t" /* flags are still those of the cmp at the top of the loop */
 913         "walignr2ne wr10, wr12, wr13    \n\t"
 914         "wunpckelub wr0, wr2            \n\t"
 915         "wunpckehub wr1, wr2            \n\t"
 916         "wunpckelub wr8, wr10           \n\t"
 917         "wunpckehub wr9, wr10           \n\t"
 918         "waddhus wr0, wr0, wr8          \n\t"
 919         "waddhus wr1, wr1, wr9          \n\t"
 920         "waddhus wr8, wr0, wr4          \n\t"
 921         "waddhus wr9, wr1, wr5          \n\t"
 922         "waddhus wr8, wr8, wr15         \n\t"
 923         "waddhus wr9, wr9, wr15         \n\t"
 924         "wldrd wr12, [%[block]]         \n\t"
 925         "wsrlhg wr8, wr8, wcgr0         \n\t"
 926         "wsrlhg wr9, wr9, wcgr0         \n\t"
 927         "wpackhus wr8, wr8, wr9         \n\t"
 928         "subs %[h], %[h], #2            \n\t" /* two output rows done per pass */
 929         WAVG2B" wr8, wr8, wr12          \n\t"
 930         "wstrd wr8, [%[block]]          \n\t"
 931         "add %[block], %[block], %[line_size]   \n\t"
 932         "pld [%[block]]                 \n\t"
 933         "pld [%[block], #32]            \n\t"
 934         "bne 1b                         \n\t"
 935         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
 936         : [line_size]"r"(line_size)
 937         : "r12", "cc", "memory"); /* "cc": cmp and subs change the flags */
 938 }
 939
 940 void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h)
 941 {
     /*
      * 16-pixel-wide "xy2" (2x2 neighbourhood) interpolation, averaged into
      * the bytes already present in block. With p and q being two consecutive
      * source rows (line_size bytes apart), the interpolated byte is
      *     t[x] = (p[x] + p[x+1] + q[x] + q[x+1] + bias) >> 2
      * where bias is the per-halfword value expected in wr15 (see SET_RND),
      * and the stored result is WAVG2B(t[x], block[x]). WAVG2B is a macro
      * defined elsewhere; presumably it names the byte-average instruction
      * matching the rnd / no_rnd variant - TODO confirm.
      *
      * block     - destination; 16 bytes per row are read and rewritten with
      *             doubleword accesses (presumably expected to be 8-byte
      *             aligned - TODO confirm against callers).
      * pixels    - source, any alignment; the address is rounded down to
      *             8 bytes and 24 bytes are loaded per row, h + 1 rows in all.
      * line_size - byte stride applied to both source and destination.
      * h         - number of output rows; two rows are produced per loop pass
      *             and the loop only ends when h reaches exactly 0, so h must
      *             be a positive even number.
      *
      * "cc" is in the clobber list because cmp/subs modify the condition flags.
      * NOTE(review): the wr and wcgr registers used below are overwritten
      * without being declared as clobbers, and wr15 is expected to still hold
      * what SET_RND (defined elsewhere) set up - confirm this is safe.
      */
 942     // [wr0 wr1 wr2 wr3] for previous line
 943     // [wr4 wr5 wr6 wr7] for current line
 944     SET_RND(wr15); // =2 for rnd  and  =1 for no_rnd version
 945     __asm__ __volatile__(
 946         "pld [%[block]]                 \n\t"
 947         "pld [%[block], #32]            \n\t"
 948         "pld [%[pixels]]                \n\t"
 949         "mov r12, #2                    \n\t"
 950         "pld [%[pixels], #32]           \n\t"
 951         "tmcr wcgr0, r12                \n\t" /* for shift value */
 952         /* alignment */
 953         "and r12, %[pixels], #7         \n\t" /* r12 = byte offset inside the doubleword */
 954         "bic %[pixels], %[pixels], #7           \n\t" /* round the source pointer down to 8 bytes */
 955         "tmcr wcgr1, r12                \n\t" /* walignr1 offset: column x */
 956         "add r12, r12, #1               \n\t"
 957         "tmcr wcgr2, r12                \n\t" /* walignr2 offset: column x + 1 */
 958
 959         // Prime the loop with the first source row:
 960         // [wr0 wr1 wr2 wr3] <= p[x] + p[x+1], widened to 16 halfwords
 961         "wldrd wr12, [%[pixels]]        \n\t"
 962         "cmp r12, #8                    \n\t" /* eq: the x + 1 data starts exactly at the next doubleword */
 963         "wldrd wr13, [%[pixels], #8]    \n\t"
 964         "wldrd wr14, [%[pixels], #16]   \n\t"
 965         "add %[pixels], %[pixels], %[line_size] \n\t"
 966         "pld [%[pixels]]                \n\t"
 967         "walignr1 wr2, wr12, wr13       \n\t"
 968         "pld [%[pixels], #32]           \n\t"
 969         "walignr1 wr3, wr13, wr14       \n\t"
 970         "wmoveq wr10, wr13              \n\t"
 971         "wmoveq wr11, wr14              \n\t"
 972         "walignr2ne wr10, wr12, wr13    \n\t"
 973         "walignr2ne wr11, wr13, wr14    \n\t"
 974         "wunpckelub wr0, wr2            \n\t"
 975         "wunpckehub wr1, wr2            \n\t"
 976         "wunpckelub wr2, wr3            \n\t"
 977         "wunpckehub wr3, wr3            \n\t"
 978         "wunpckelub wr8, wr10           \n\t"
 979         "wunpckehub wr9, wr10           \n\t"
 980         "wunpckelub wr10, wr11          \n\t"
 981         "wunpckehub wr11, wr11          \n\t"
 982         "waddhus wr0, wr0, wr8          \n\t"
 983         "waddhus wr1, wr1, wr9          \n\t"
 984         "waddhus wr2, wr2, wr10         \n\t"
 985         "waddhus wr3, wr3, wr11         \n\t"
 986
 987         "1:                             \n\t"
 988         // [wr0 wr1 wr2 wr3] holds the pair sums of the previous row
 989         // [wr4 wr5 wr6 wr7] <= pair sums of the row loaded here
 990         "wldrd wr12, [%[pixels]]        \n\t"
 991         "cmp r12, #8                    \n\t" /* re-establish the flags (the subs at the loop end changed them) */
 992         "wldrd wr13, [%[pixels], #8]    \n\t"
 993         "wldrd wr14, [%[pixels], #16]   \n\t"
 994         "add %[pixels], %[pixels], %[line_size] \n\t"
 995         "walignr1 wr6, wr12, wr13       \n\t"
 996         "pld [%[pixels]]                \n\t"
 997         "pld [%[pixels], #32]           \n\t"
 998         "walignr1 wr7, wr13, wr14       \n\t"
 999         "wmoveq wr10, wr13              \n\t"
1000         "wmoveq wr11, wr14              \n\t"
1001         "walignr2ne wr10, wr12, wr13    \n\t"
1002         "walignr2ne wr11, wr13, wr14    \n\t"
1003         "wunpckelub wr4, wr6            \n\t"
1004         "wunpckehub wr5, wr6            \n\t"
1005         "wunpckelub wr6, wr7            \n\t"
1006         "wunpckehub wr7, wr7            \n\t"
1007         "wunpckelub wr8, wr10           \n\t"
1008         "wunpckehub wr9, wr10           \n\t"
1009         "wunpckelub wr10, wr11          \n\t"
1010         "wunpckehub wr11, wr11          \n\t"
1011         "waddhus wr4, wr4, wr8          \n\t"
1012         "waddhus wr5, wr5, wr9          \n\t"
1013         "waddhus wr6, wr6, wr10         \n\t"
1014         "waddhus wr7, wr7, wr11         \n\t"
1015         "waddhus wr8, wr0, wr4          \n\t" /* previous row + current row */
1016         "waddhus wr9, wr1, wr5          \n\t"
1017         "waddhus wr10, wr2, wr6         \n\t"
1018         "waddhus wr11, wr3, wr7         \n\t"
1019         "waddhus wr8, wr8, wr15         \n\t" /* + bias from wr15 */
1020         "waddhus wr9, wr9, wr15         \n\t"
1021         "waddhus wr10, wr10, wr15       \n\t"
1022         "waddhus wr11, wr11, wr15       \n\t"
1023         "wsrlhg wr8, wr8, wcgr0         \n\t" /* >> 2 (wcgr0 was loaded with 2) */
1024         "wsrlhg wr9, wr9, wcgr0         \n\t"
1025         "wldrd wr12, [%[block]]         \n\t" /* existing destination bytes 0..7 */
1026         "wldrd wr13, [%[block], #8]     \n\t" /* existing destination bytes 8..15 */
1027         "wsrlhg wr10, wr10, wcgr0       \n\t"
1028         "wsrlhg wr11, wr11, wcgr0       \n\t"
1029         "wpackhus wr8, wr8, wr9         \n\t" /* halfwords back to bytes */
1030         "wpackhus wr9, wr10, wr11       \n\t"
1031         WAVG2B" wr8, wr8, wr12          \n\t" /* average with the destination */
1032         WAVG2B" wr9, wr9, wr13          \n\t"
1033         "wstrd wr8, [%[block]]          \n\t"
1034         "wstrd wr9, [%[block], #8]      \n\t"
1035         "add %[block], %[block], %[line_size]   \n\t"
1036
1037         // [wr0 wr1 wr2 wr3] <= pair sums of the next row (replaces the oldest row)
1038         // [wr4 wr5 wr6 wr7] now plays the role of the previous row
1039         "wldrd wr12, [%[pixels]]        \n\t"
1040         "pld [%[block]]                 \n\t"
1041         "wldrd wr13, [%[pixels], #8]    \n\t"
1042         "pld [%[block], #32]            \n\t"
1043         "wldrd wr14, [%[pixels], #16]   \n\t"
1044         "add %[pixels], %[pixels], %[line_size] \n\t"
1045         "walignr1 wr2, wr12, wr13       \n\t"
1046         "pld [%[pixels]]                \n\t"
1047         "pld [%[pixels], #32]           \n\t"
1048         "walignr1 wr3, wr13, wr14       \n\t"
1049         "wmoveq wr10, wr13              \n\t" /* flags are still those of the cmp at the top of the loop */
1050         "wmoveq wr11, wr14              \n\t"
1051         "walignr2ne wr10, wr12, wr13    \n\t"
1052         "walignr2ne wr11, wr13, wr14    \n\t"
1053         "wunpckelub wr0, wr2            \n\t"
1054         "wunpckehub wr1, wr2            \n\t"
1055         "wunpckelub wr2, wr3            \n\t"
1056         "wunpckehub wr3, wr3            \n\t"
1057         "wunpckelub wr8, wr10           \n\t"
1058         "wunpckehub wr9, wr10           \n\t"
1059         "wunpckelub wr10, wr11          \n\t"
1060         "wunpckehub wr11, wr11          \n\t"
1061         "waddhus wr0, wr0, wr8          \n\t"
1062         "waddhus wr1, wr1, wr9          \n\t"
1063         "waddhus wr2, wr2, wr10         \n\t"
1064         "waddhus wr3, wr3, wr11         \n\t"
1065         "waddhus wr8, wr0, wr4          \n\t"
1066         "waddhus wr9, wr1, wr5          \n\t"
1067         "waddhus wr10, wr2, wr6         \n\t"
1068         "waddhus wr11, wr3, wr7         \n\t"
1069         "waddhus wr8, wr8, wr15         \n\t"
1070         "waddhus wr9, wr9, wr15         \n\t"
1071         "waddhus wr10, wr10, wr15       \n\t"
1072         "waddhus wr11, wr11, wr15       \n\t"
1073         "wsrlhg wr8, wr8, wcgr0         \n\t"
1074         "wsrlhg wr9, wr9, wcgr0         \n\t"
1075         "wldrd wr12, [%[block]]         \n\t"
1076         "wldrd wr13, [%[block], #8]     \n\t"
1077         "wsrlhg wr10, wr10, wcgr0       \n\t"
1078         "wsrlhg wr11, wr11, wcgr0       \n\t"
1079         "wpackhus wr8, wr8, wr9         \n\t"
1080         "wpackhus wr9, wr10, wr11       \n\t"
1081         WAVG2B" wr8, wr8, wr12          \n\t"
1082         WAVG2B" wr9, wr9, wr13          \n\t"
1083         "wstrd wr8, [%[block]]          \n\t"
1084         "wstrd wr9, [%[block], #8]      \n\t"
1085         "add %[block], %[block], %[line_size]   \n\t"
1086         "subs %[h], %[h], #2            \n\t" /* two output rows done per pass */
1087         "pld [%[block]]                 \n\t"
1088         "pld [%[block], #32]            \n\t"
1089         "bne 1b                         \n\t"
1090         : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block)
1091         : [line_size]"r"(line_size)
1092         : "r12", "cc", "memory"); /* "cc": cmp and subs change the flags */
1093 }