quicktime/ffmpeg/libavcodec/armv4l/simple_idct_arm.S

   1 /*
   2  * simple_idct_arm.S
   3  * Copyright (C) 2002 Frederic 'dilb' Boulay.
   4  * All Rights Reserved.
   5  *
   6  * Author: Frederic Boulay <dilb@handhelds.org>
   7  *
   8  * You can redistribute this file and/or modify
   9  * it under the terms of the GNU General Public License (version 2)
  10  * as published by the Free Software Foundation.
  11  *
  12  * This file is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this library; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  *
  22  * The function defined in this file, is derived from the simple_idct function
  23  * from the libavcodec library part of the ffmpeg project.
  24  */
  25
  26 /* useful constants for the algorithm, they are save in __constant_ptr__ at */
  27 /* the end of the source code.*/
  28 #define W1  22725
  29 #define W2  21407
  30 #define W3  19266
  31 #define W4  16383
  32 #define W5  12873
  33 #define W6  8867
  34 #define W7  4520
  35 #define MASK_MSHW 0xFFFF0000
  36
  37 /* offsets of the constants in the vector */
  38 #define offW1  0
  39 #define offW2  4
  40 #define offW3  8
  41 #define offW4  12
  42 #define offW5  16
  43 #define offW6  20
  44 #define offW7  24
  45 #define offMASK_MSHW 28
  46
  47 #define ROW_SHIFT 11
  48 #define ROW_SHIFT2MSHW (16-11)
  49 #define COL_SHIFT 20
  50 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1) */
  51 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1) */
  52
  53
  54         .text
  55         .align
  56         .global simple_idct_ARM
  57
  58 simple_idct_ARM:
  59         @@ void simple_idct_ARM(int16_t *block)
  60         @@ save stack for reg needed (take all of them),
  61         @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
  62         @@ so it must not be overwritten, if it is not saved!!
  63         @@ R12 is another scratch register, so it should not be saved too
  64         @@ save all registers
  65         stmfd sp!, {r4-r11, r14} @ R14 is also called LR
  66         @@ at this point, R0=block, other registers are free.
  67         add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R12=block.
  68         add r12, pc, #(__constant_ptr__-.-8) @ R12=__constant_ptr__, the vector containing the constants, probably not necessary to reserve a register for it
  69         @@ add 2 temporary variables in the stack: R0 and R14
  70         sub sp, sp, #8          @ allow 2 local variables
  71         str r0, [sp, #0]        @ save block in sp[0]
  72         @@ stack status
  73         @@ sp+4   free
  74         @@ sp+0   R0  (block)
  75
  76
  77         @@ at this point, R0=block, R14=&block[56], R12=__const_ptr_, R1-R11 free
  78
  79
  80 __row_loop:
  81         @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimise ldr accesses (i.e. split 32bits in 2 16bits words), at least it gives more usable registers :)
  82         ldr r1, [r14, #0]        @ R1=(int32)(R12)[0]=ROWr32[0] (relative row cast to a 32b pointer)
  83         ldr r2, [r14, #4]        @ R2=(int32)(R12)[1]=ROWr32[1]
  84         ldr r3, [r14, #8]        @ R3=ROWr32[2]
  85         ldr r4, [r14, #12]       @ R4=ROWr32[3]
  86         @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
  87         @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
  88         @@ else follow the complete algorithm.
  89         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
  90         @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
  91         orr r5, r4, r3           @ R5=R4 | R3
  92         orr r5, r5, r2           @ R5=R4 | R3 | R2
  93         orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
  94         beq __end_row_loop
  95         mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
  96         ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
  97         orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
  98         beq __almost_empty_row
  99
 100 __b_evaluation:
 101         @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
 102         @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
 103         @@     R12=__const_ptr_, R14=&block[n]
 104         @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
 105
 106         @@ MUL16(b0, W1, row[1]);
 107         @@ MUL16(b1, W3, row[1]);
 108         @@ MUL16(b2, W5, row[1]);
 109         @@ MUL16(b3, W7, row[1]);
 110         @@ MAC16(b0, W3, row[3]);
 111         @@ MAC16(b1, -W7, row[3]);
 112         @@ MAC16(b2, -W1, row[3]);
 113         @@ MAC16(b3, -W5, row[3]);
 114         ldr r8, [r12, #offW1]    @ R8=W1
 115         mov r2, r2, asr #16      @ R2=ROWr16[3]
 116         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 117         ldr r9, [r12, #offW3]    @ R9=W3
 118         ldr r10, [r12, #offW5]   @ R10=W5
 119         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 120         ldr r11, [r12, #offW7]   @ R11=W7
 121         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 122         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 123                 teq r2, #0               @ if null avoid muls
 124                 mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 125         rsbne r2, r2, #0         @ R2=-ROWr16[3]
 126         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 127         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 128         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 129
 130         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 131         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 132         @@     R12=__const_ptr_, R14=&block[n]
 133         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 134         @@ if (temp != 0) {}
 135         orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3]
 136         beq __end_b_evaluation
 137
 138         @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
 139         @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 140         @@     R12=__const_ptr_, R14=&block[n]
 141         @@ MAC16(b0, W5, row[5]);
 142         @@ MAC16(b2, W7, row[5]);
 143         @@ MAC16(b3, W3, row[5]);
 144         @@ MAC16(b1, -W1, row[5]);
 145         @@ MAC16(b0, W7, row[7]);
 146         @@ MAC16(b2, W3, row[7]);
 147         @@ MAC16(b3, -W1, row[7]);
 148         @@ MAC16(b1, -W5, row[7]);
 149         mov r3, r3, asr #16      @ R3=ROWr16[5]
 150                 teq r3, #0               @ if null avoid muls
 151         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
 152         mov r4, r4, asr #16      @ R4=ROWr16[7]
 153         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
 154         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
 155         rsbne r3, r3, #0         @ R3=-ROWr16[5]
 156         mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5]=b1
 157         @@ R3 is free now
 158                 teq r4, #0               @ if null avoid muls
 159         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
 160         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
 161         rsbne r4, r4, #0         @ R4=-ROWr16[7]
 162         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
 163         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
 164         @@ R4 is free now
 165 __end_b_evaluation:
 166         @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
 167         @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 168         @@     R12=__const_ptr_, R14=&block[n]
 169
 170 __a_evaluation:
 171         @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
 172         @@ a1 = a0 + W6 * row[2];
 173         @@ a2 = a0 - W6 * row[2];
 174         @@ a3 = a0 - W2 * row[2];
 175         @@ a0 = a0 + W2 * row[2];
 176         ldr r9, [r12, #offW4]    @ R9=W4
 177         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 178         ldr r10, [r12, #offW6]   @ R10=W6
 179         ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
 180         add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
 181
 182         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 183         ldr r8, [r12, #offW2]    @ R8=W2
 184         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 185         @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
 186         @@ if (temp != 0) {}
 187         teq r2, #0
 188         beq __end_bef_a_evaluation
 189
 190         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 191         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 192         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 193         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 194
 195
 196         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 197         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 198         @@     R12=__const_ptr_, R14=&block[n]
 199
 200
 201         @@ a0 += W4*row[4]
 202         @@ a1 -= W4*row[4]
 203         @@ a2 -= W4*row[4]
 204         @@ a3 += W4*row[4]
 205         ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
 206                 teq r11, #0              @ if null avoid muls
 207         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 208         @@ R9 is free now
 209         ldrsh r9, [r14, #12]     @ R9=ROWr16[6]
 210         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 211         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 212         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 213         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 214         @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
 215                 teq r9, #0               @ if null avoid muls
 216         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 217         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 218         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 219         @@ a0 += W6*row[6];
 220         @@ a3 -= W6*row[6];
 221         @@ a1 -= W2*row[6];
 222         @@ a2 += W2*row[6];
 223         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 224         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 225         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 226
 227 __end_a_evaluation:
 228         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 229         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 230         @@     R12=__const_ptr_, R14=&block[n]
 231         @@ row[0] = (a0 + b0) >> ROW_SHIFT;
 232         @@ row[1] = (a1 + b1) >> ROW_SHIFT;
 233         @@ row[2] = (a2 + b2) >> ROW_SHIFT;
 234         @@ row[3] = (a3 + b3) >> ROW_SHIFT;
 235         @@ row[4] = (a3 - b3) >> ROW_SHIFT;
 236         @@ row[5] = (a2 - b2) >> ROW_SHIFT;
 237         @@ row[6] = (a1 - b1) >> ROW_SHIFT;
 238         @@ row[7] = (a0 - b0) >> ROW_SHIFT;
 239         add r8, r6, r0           @ R8=a0+b0
 240         add r9, r2, r1           @ R9=a1+b1
 241         @@ put 2 16 bits half-words in a 32bits word
 242         @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only Little Endian compliant then!!!)
 243         ldr r10, [r12, #offMASK_MSHW] @ R10=0xFFFF0000
 244         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
 245         mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
 246         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
 247         orr r8, r8, r9
 248         str r8, [r14, #0]
 249
 250         add r8, r3, r5           @ R8=a2+b2
 251         add r9, r4, r7           @ R9=a3+b3
 252         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
 253         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
 254         orr r8, r8, r9
 255         str r8, [r14, #4]
 256
 257         sub r8, r4, r7           @ R8=a3-b3
 258         sub r9, r3, r5           @ R9=a2-b2
 259         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
 260         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
 261         orr r8, r8, r9
 262         str r8, [r14, #8]
 263
 264         sub r8, r2, r1           @ R8=a1-b1
 265         sub r9, r6, r0           @ R9=a0-b0
 266         and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
 267         and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
 268         orr r8, r8, r9
 269         str r8, [r14, #12]
 270
 271         bal __end_row_loop
 272
 273 __almost_empty_row:
 274         @@ the row was empty, except ROWr16[0], now, management of this special case
 275         @@ at this point, R0=block, R14=&block[n], R12=__const_ptr_, R1=ROWr32[0], R2=ROWr32[1],
 276         @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
 277         @@                R8=0xFFFF (temp), R9-R11 free
 278         mov r8, #0x10000         @ R8=0xFFFF (2 steps needed!) it saves a ldr call (because of delay run).
 279         sub r8, r8, #1           @ R8 is now ready.
 280         and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
 281         orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16)
 282         str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
 283         str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
 284         str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
 285         str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
 286
 287 __end_row_loop:
 288         @@ at this point, R0-R11 (free)
 289         @@     R12=__const_ptr_, R14=&block[n]
 290         ldr r0, [sp, #0]         @ R0=block
 291         teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
 292         sub r14, r14, #16
 293         bne __row_loop
 294
 295
 296
 297         @@ at this point, R0=block, R1-R11 (free)
 298         @@     R12=__const_ptr_, R14=&block[n]
 299         add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
 300 __col_loop:
 301
 302 __b_evaluation2:
 303         @@ at this point, R0=block (temp),  R1-R11 (free)
 304         @@     R12=__const_ptr_, R14=&block[n]
 305         @@ proceed with b0-b3 first, followed by a0-a3
 306         @@ MUL16(b0, W1, col[8x1]);
 307         @@ MUL16(b1, W3, col[8x1]);
 308         @@ MUL16(b2, W5, col[8x1]);
 309         @@ MUL16(b3, W7, col[8x1]);
 310         @@ MAC16(b0, W3, col[8x3]);
 311         @@ MAC16(b1, -W7, col[8x3]);
 312         @@ MAC16(b2, -W1, col[8x3]);
 313         @@ MAC16(b3, -W5, col[8x3]);
 314         ldr r8, [r12, #offW1]    @ R8=W1
 315         ldrsh r7, [r14, #16]
 316         mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 317         ldr r9, [r12, #offW3]    @ R9=W3
 318         ldr r10, [r12, #offW5]   @ R10=W5
 319         mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 320         ldr r11, [r12, #offW7]   @ R11=W7
 321         mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 322         ldrsh r2, [r14, #48]
 323         mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
 324         teq r2, #0               @ if 0, then avoid muls
 325         mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 326         rsbne r2, r2, #0         @ R2=-ROWr16[3]
 327         mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 328         mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 329         mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
 330
 331         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 332         @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
 333         @@     R12=__const_ptr_, R14=&block[n]
 334         @@ MAC16(b0, W5, col[5x8]);
 335         @@ MAC16(b2, W7, col[5x8]);
 336         @@ MAC16(b3, W3, col[5x8]);
 337         @@ MAC16(b1, -W1, col[5x8]);
 338         @@ MAC16(b0, W7, col[7x8]);
 339         @@ MAC16(b2, W3, col[7x8]);
 340         @@ MAC16(b3, -W1, col[7x8]);
 341         @@ MAC16(b1, -W5, col[7x8]);
 342         ldrsh r3, [r14, #80]     @ R3=COLr16[5x8]
 343         teq r3, #0               @ if 0 then avoid muls
 344         mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5x8]=b0
 345         mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5x8]=b2
 346         mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5x8]=b3
 347         rsbne r3, r3, #0         @ R3=-ROWr16[5x8]
 348         ldrsh r4, [r14, #112]    @ R4=COLr16[7x8]
 349         mlane r1, r8, r3, r1     @ R7-=W1*ROWr16[5x8]=b1
 350         @@ R3 is free now
 351         teq r4, #0               @ if 0 then avoid muls
 352         mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7x8]=b0
 353         mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7x8]=b2
 354         rsbne r4, r4, #0         @ R4=-ROWr16[7x8]
 355         mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7x8]=b3
 356         mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7x8]=b1
 357         @@ R4 is free now
 358 __end_b_evaluation2:
 359         @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
 360         @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 361         @@     R12=__const_ptr_, R14=&block[n]
 362
 363 __a_evaluation2:
 364         @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
 365         @@ a1 = a0 + W6 * row[2];
 366         @@ a2 = a0 - W6 * row[2];
 367         @@ a3 = a0 - W2 * row[2];
 368         @@ a0 = a0 + W2 * row[2];
 369         ldrsh r6, [r14, #0]
 370         ldr r9, [r12, #offW4]    @ R9=W4
 371         mul r6, r9, r6           @ R6=W4*ROWr16[0]
 372         ldr r10, [r12, #offW6]   @ R10=W6
 373         ldrsh r4, [r14, #32]      @ R4=ROWr16[2] (a3 not defined yet)
 374         add r6, r6, #COL_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(COL_SHIFT-1) (a0)
 375         mul r11, r10, r4         @ R11=W6*ROWr16[2]
 376         ldr r8, [r12, #offW2]    @ R8=W2
 377         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 378         sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
 379         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 380         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 381         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 382
 383         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 384         @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
 385         @@     R12=__const_ptr_, R14=&block[n]
 386         @@ a0 += W4*row[4]
 387         @@ a1 -= W4*row[4]
 388         @@ a2 -= W4*row[4]
 389         @@ a3 += W4*row[4]
 390         ldrsh r11, [r14, #64]     @ R11=ROWr16[4]
 391         teq r11, #0              @ if null avoid muls
 392         mulne r11, r9, r11       @ R11=W4*ROWr16[4]
 393         @@ R9 is free now
 394         addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
 395         subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
 396         subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
 397         ldrsh r9, [r14, #96]     @ R9=ROWr16[6]
 398         addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
 399         @@ W6 alone is no more useful, save W2*ROWr16[6] in it instead
 400         teq r9, #0               @ if null avoid muls
 401         mulne r11, r10, r9       @ R11=W6*ROWr16[6]
 402         addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
 403         mulne r10, r8, r9        @ R10=W2*ROWr16[6]
 404         @@ a0 += W6*row[6];
 405         @@ a3 -= W6*row[6];
 406         @@ a1 -= W2*row[6];
 407         @@ a2 += W2*row[6];
 408         subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
 409         subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
 410         addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
 411 __end_a_evaluation2:
 412         @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
 413         @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
 414         @@     R12=__const_ptr_, R14=&block[n]
 415         @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
 416         @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
 417         @@ col[16] = ((a2 + b2) >> COL_SHIFT);
 418         @@ col[24] = ((a3 + b3) >> COL_SHIFT);
 419         @@ col[32] = ((a3 - b3) >> COL_SHIFT);
 420         @@ col[40] = ((a2 - b2) >> COL_SHIFT);
 421         @@ col[48] = ((a1 - b1) >> COL_SHIFT);
 422         @@ col[56] = ((a0 - b0) >> COL_SHIFT);
 423         @@@@@ no optimisation here @@@@@
 424         add r8, r6, r0           @ R8=a0+b0
 425         add r9, r2, r1           @ R9=a1+b1
 426         mov r8, r8, asr #COL_SHIFT
 427         mov r9, r9, asr #COL_SHIFT
 428         strh r8, [r14, #0]
 429         strh r9, [r14, #16]
 430         add r8, r3, r5           @ R8=a2+b2
 431         add r9, r4, r7           @ R9=a3+b3
 432         mov r8, r8, asr #COL_SHIFT
 433         mov r9, r9, asr #COL_SHIFT
 434         strh r8, [r14, #32]
 435         strh r9, [r14, #48]
 436         sub r8, r4, r7           @ R8=a3-b3
 437         sub r9, r3, r5           @ R9=a2-b2
 438         mov r8, r8, asr #COL_SHIFT
 439         mov r9, r9, asr #COL_SHIFT
 440         strh r8, [r14, #64]
 441         strh r9, [r14, #80]
 442         sub r8, r2, r1           @ R8=a1-b1
 443         sub r9, r6, r0           @ R9=a0-b0
 444         mov r8, r8, asr #COL_SHIFT
 445         mov r9, r9, asr #COL_SHIFT
 446         strh r8, [r14, #96]
 447         strh r9, [r14, #112]
 448
 449 __end_col_loop:
 450         @@ at this point, R0-R11 (free)
 451         @@     R12=__const_ptr_, R14=&block[n]
 452         ldr r0, [sp, #0]         @ R0=block
 453         teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
 454         sub r14, r14, #2
 455         bne __col_loop
 456
 457
 458
 459
 460 __end_simple_idct_ARM:
 461         @@ restore registers to previous status!
 462         add sp, sp, #8 @@ the local variables!
 463         ldmfd sp!, {r4-r11, r15} @@ update PC with LR content.
 464
 465
 466
 467 @@ kind of sub-function, here not to overload the common case.
 468 __end_bef_a_evaluation:
 469         add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
 470         mul r11, r8, r4          @ R11=W2*ROWr16[2]
 471         sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
 472         add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
 473         bal __end_a_evaluation
 474
 475
 476 __constant_ptr__:  @@ see #defines at the beginning of the source code for values.
 477         .align
 478         .word   W1
 479         .word   W2
 480         .word   W3
 481         .word   W4
 482         .word   W5
 483         .word   W6
 484         .word   W7
 485         .word   MASK_MSHW