apps/codecs/libffmpegFLAC/arm.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2006 by Thom Johansen
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License
  14  * as published by the Free Software Foundation; either version 2
  15  * of the License, or (at your option) any later version.
  16  *
  17  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  18  * KIND, either express or implied.
  19  *
  20  ****************************************************************************/
  21
  22 #include "config.h"
  23
  24 /* The following is an assembler optimised version of the LPC filtering
  25    routines needed for FLAC decoding. It is optimised for use with ARM
  26    processors.
  27    All LPC filtering up to order 9 is done in specially optimised unrolled
  28    loops, while every order above this is handled by a slower default routine.
  29  */
  30 #ifdef USE_IRAM
  31     .section .icode,"ax",%progbits
  32 #else
  33     .text
  34 #endif
  35     .global lpc_decode_arm
     /* lpc_decode_arm: in-place LPC reconstruction over a buffer of residuals.
        In:  r0 = blocksize (number of samples to produce, treated as signed),
             r1 = qlevel (arithmetic right shift applied to each prediction sum),
             r2 = pred_order, r3 = data (first residual to overwrite),
             [sp] = coeffs (fifth argument, passed on the stack).
        The pred_order words just below 'data' are read as the initial history.
        Each output is residual + (sum of coef * history products >> qlevel),
        accumulated in 32 bits. r4-r11 and lr are saved and restored; r12 is
        used as scratch by the order loops.
        NOTE(review): presumably called from C as
        lpc_decode_arm(blocksize, qlevel, pred_order, data, coeffs) - confirm
        against the prototype used by the caller.
      */
  36 lpc_decode_arm:
  37     stmdb sp!, { r4-r11, lr }
  38     ldr r4, [sp, #36]         @ fifth argument sits above the 9 words just pushed
  39     /* r0 = blocksize, r1 = qlevel, r2 = pred_order
  40        r3 = data, r4 = coeffs
  41      */
  42
  43     /* the history pointer always trails the data pointer by 'pred_order'
  44        samples. since we have one loop for each order, we can hard code this
  45        and free a register by keeping only the history pointer.
  46      */
  47     sub r3, r3, r2, lsl #2    @ r3 = history = data - 4 * pred_order
  48     cmp r0, #0                @ no samples to process
  49     ble .exit                 @ signed test: a negative count would otherwise
     @ only terminate after the subs/bne loops wrapped through 2^32 iterations
  50     cmp r2, #9                @ check if order is too high for unrolled loops
  51     addls pc, pc, r2, lsl #2  @ jump to our unrolled decode loop if it exists
  52 @ jumptable: pc reads as the addls address + 8, so order N lands on slot N + 1;
     @ the first slot is only reached when the addls above is skipped (order > 9)
  53     b .default                @ order too high, go to default routine
  54     b .exit                   @ zero order filter isn't possible, exit function
  55     b .order1
  56     b .order2
  57     b .order3
  58     b .order4
  59     b .order5
  60     b .order6
  61     b .order7
  62     b .order8
  63
  64 @ last jump table entry coincides with target, so leave it out
  65 .order9:
     /* order 9: all nine coefs stay in registers for the whole block. r4 (the
        coeffs pointer) and r2 (pred_order) are not needed again on this path
        and are reused as the sample temporary and the 32-bit accumulator.
        history is read oldest first and paired with the coefs in reverse. */
  66     ldmia r4, { r5-r12, r14 } @ fetch coefs: r5 = coeffs[0] ... r12 = coeffs[7], r14 = coeffs[8]
  67 .loop9:
  68     ldr r4, [r3], #4          @ load first (oldest) history sample
  69     mul r2, r4, r14           @ multiply with last coef
  70     ldr r4, [r3], #4          @ rinse and repeat while accumulating sum in r2
  71     mla r2, r4, r12, r2
  72     ldr r4, [r3], #4
  73     mla r2, r4, r11, r2
  74     ldr r4, [r3], #4
  75     mla r2, r4, r10, r2
  76     ldr r4, [r3], #4
  77     mla r2, r4, r9, r2
  78     ldr r4, [r3], #4
  79     mla r2, r4, r8, r2
  80     ldr r4, [r3], #4
  81     mla r2, r4, r7, r2
  82     ldr r4, [r3], #4
  83     mla r2, r4, r6, r2
  84     ldr r4, [r3], #4          @ newest history sample, r3 now at the residual
  85     mla r2, r4, r5, r2        @ ... times coeffs[0]
  86     ldr r4, [r3]              @ r4 = residual
  87     add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
  88     str r2, [r3], #-8*4       @ save result and wrap history pointer back (9 forward, 8 back)
  89     subs r0, r0, #1           @ check if we're done
  90     bne .loop9                @ nope, jump back
  91     b .exit
  92
  93 .order8:
     /* order 8: coeffs[0..7] are held in r5-r12. r2 is the accumulator and
        r4/r14 receive two history samples per ldmia (lower address in r4). */
  94     ldmia r4, { r5-r12 }      @ r5 = coeffs[0] ... r12 = coeffs[7]
  95 .loop8:
  96     @ we have more registers to spare here, so start block reading
  97     ldmia r3!, { r4, r14 }
  98     mul r2, r4, r12           @ oldest sample times last coef starts the sum
  99     mla r2, r14, r11, r2
 100     ldmia r3!, { r4, r14 }
 101     mla r2, r4, r10, r2
 102     mla r2, r14, r9, r2
 103     ldmia r3!, { r4, r14 }
 104     mla r2, r4, r8, r2
 105     mla r2, r14, r7, r2
 106     ldmia r3!, { r4, r14 }
 107     mla r2, r4, r6, r2
 108     mla r2, r14, r5, r2       @ newest sample times coeffs[0]
 109     ldr r4, [r3]              @ r4 = residual, r3 is now at the current sample
 110     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 111     str r2, [r3], #-7*4       @ store, then rewind r3 to the start of the next window
 112     subs r0, r0, #1           @ one sample fewer to go
 113     bne .loop8
 114     b .exit
 115
 116 .order7:
     /* order 7: coeffs[0..6] are held in r5-r11. history is read as two
        three-word bursts into r4/r12/r14 plus one single load. */
 117     ldmia r4, { r5-r11 }      @ r5 = coeffs[0] ... r11 = coeffs[6]
 118 .loop7:
 119     ldmia r3!, { r4, r12, r14 }
 120     mul r2, r4, r11           @ oldest sample times last coef starts the sum
 121     mla r2, r12, r10, r2
 122     mla r2, r14, r9, r2
 123     ldmia r3!, { r4, r12, r14 }
 124     mla r2, r4, r8, r2
 125     mla r2, r12, r7, r2
 126     mla r2, r14, r6, r2
 127     ldr r4, [r3], #4          @ seventh (newest) history sample
 128     mla r2, r4, r5, r2        @ ... times coeffs[0]
 129     ldr r4, [r3]              @ r4 = residual
 130     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 131     str r2, [r3], #-6*4       @ store, then rewind r3 to the start of the next window
 132     subs r0, r0, #1           @ one sample fewer to go
 133     bne .loop7
 134     b .exit
 135
 136 .order6:
     /* order 6: coeffs[0..5] are held in r5-r10, leaving r11, r12 and r14 free
        as extra sample temporaries next to r4. */
 137     ldmia r4, { r5-r10 }      @ r5 = coeffs[0] ... r10 = coeffs[5]
 138 .loop6:
 139     ldmia r3!, { r4, r11-r12, r14 }  @ four oldest samples, lowest address in r4
 140     mul r2, r4, r10           @ oldest sample times last coef starts the sum
 141     mla r2, r11, r9, r2
 142     mla r2, r12, r8, r2
 143     mla r2, r14, r7, r2
 144     ldmia r3!, { r4, r11 }    @ remaining two samples
 145     mla r2, r4, r6, r2
 146     mla r2, r11, r5, r2       @ newest sample times coeffs[0]
 147     ldr r4, [r3]              @ r4 = residual
 148     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 149     str r2, [r3], #-5*4       @ store, then rewind r3 to the start of the next window
 150     subs r0, r0, #1           @ one sample fewer to go
 151     bne .loop6
 152     b .exit
 153
 154 .order5:
     /* order 5: coeffs[0..4] are held in r5-r9; the whole history window is
        fetched with a single five-word ldmia. */
 155     ldmia r4, { r5-r9 }       @ r5 = coeffs[0] ... r9 = coeffs[4]
 156 .loop5:
 157     ldmia r3!, { r4, r10-r12, r14 }  @ five samples, oldest in r4, newest in r14
 158     mul r2, r4, r9            @ oldest sample times last coef starts the sum
 159     mla r2, r10, r8, r2
 160     mla r2, r11, r7, r2
 161     mla r2, r12, r6, r2
 162     mla r2, r14, r5, r2       @ newest sample times coeffs[0]
 163     ldr r4, [r3]              @ r4 = residual
 164     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 165     str r2, [r3], #-4*4       @ store, then rewind r3 to the start of the next window
 166     subs r0, r0, #1           @ one sample fewer to go
 167     bne .loop5
 168     b .exit
 169
 170 .order4:
     /* order 4: coeffs[0..3] are held in r5-r8; the history window is fetched
        with a single four-word ldmia. */
 171     ldmia r4, { r5-r8 }       @ r5 = coeffs[0] ... r8 = coeffs[3]
 172 .loop4:
 173     ldmia r3!, { r4, r11-r12, r14 }  @ four samples, oldest in r4, newest in r14
 174     mul r2, r4, r8            @ oldest sample times last coef starts the sum
 175     mla r2, r11, r7, r2
 176     mla r2, r12, r6, r2
 177     mla r2, r14, r5, r2       @ newest sample times coeffs[0]
 178     ldr r4, [r3]              @ r4 = residual
 179     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 180     str r2, [r3], #-3*4       @ store, then rewind r3 to the start of the next window
 181     subs r0, r0, #1           @ one sample fewer to go
 182     bne .loop4
 183     b .exit
 184
 185 .order3:
     /* order 3: coeffs[0..2] are held in r5-r7; the history window is fetched
        with a single three-word ldmia. */
 186     ldmia r4, { r5-r7 }       @ r5 = coeffs[0], r6 = coeffs[1], r7 = coeffs[2]
 187 .loop3:
 188     ldmia r3!, { r4, r12, r14 }  @ three samples, oldest in r4, newest in r14
 189     mul r2, r4, r7            @ oldest sample times last coef starts the sum
 190     mla r2, r12, r6, r2
 191     mla r2, r14, r5, r2       @ newest sample times coeffs[0]
 192     ldr r4, [r3]              @ r4 = residual
 193     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 194     str r2, [r3], #-2*4       @ store, then rewind r3 to the start of the next window
 195     subs r0, r0, #1           @ one sample fewer to go
 196     bne .loop3
 197     b .exit
 198
 199 .order2:
     /* order 2: coeffs[0..1] are held in r5-r6; both history samples are
        fetched with one ldmia. */
 200     ldmia r4, { r5-r6 }       @ r5 = coeffs[0], r6 = coeffs[1]
 201 .loop2:
 202     ldmia r3!, { r4, r14 }    @ r4 = older sample, r14 = newer sample
 203     mul r2, r4, r6            @ older sample times coeffs[1] starts the sum
 204     mla r2, r14, r5, r2       @ newer sample times coeffs[0]
 205     ldr r4, [r3]              @ r4 = residual
 206     add r2, r4, r2, asr r1    @ result = residual + (sum >> qlevel)
 207     str r2, [r3], #-1*4       @ store, then rewind r3 to the start of the next window
 208     subs r0, r0, #1           @ one sample fewer to go
 209     bne .loop2
 210     b .exit
 211
 212 .order1:
     /* order 1: the single history sample is carried in r4 from one iteration
        to the next, so memory is only touched to read the residual and write
        the result; r3 simply walks forward through the data. */
 213     ldr r5, [r4]            @ load the one coef we need
 214     ldr r4, [r3], #4        @ load one history sample, r3 now points to residual
 215 .loop1:
 216     mul r2, r4, r5          @ multiply coef by history sample
 217     ldr r4, [r3]            @ load residual
 218     add r4, r4, r2, asr r1  @ add result to residual
 219     str r4, [r3], #4        @ place r3 at next residual, we already have
 220     subs r0, r0, #1         @ the current sample in r4 for the next iteration
 221     bne .loop1
 222     b .exit
 223
 224 .default:
     /* generic path for orders without an unrolled loop. it is re-entered once
        per output sample because r5, r7 and r14 are consumed below, while r4
        (coeffs) and r2 (pred_order) are left untouched so they can be reused.
        coefs are fetched from memory for every sample instead of being kept in
        registers. */
 225     /* we do the filtering in an unrolled by 4 loop as far as we can, and then
 226        do the rest by jump table. */
 227     add r5, r4, r2, lsl #2   @ need to start in the other end of coefs
 228     mov r7, r2, lsr #2       @ r7 = number of coefs / 4 (full groups of four)
 229     mov r14, #0              @ init accumulator
 230 .dloop1:
 231     ldmdb r5!, { r8-r11 }     @ next four coefs walking downwards, r11 = highest index
 232     ldmia r3!, { r6, r12 }    @ two history samples, older one in r6
 233     mla r14, r6, r11, r14
 234     mla r14, r12, r10, r14
 235     ldmia r3!, { r6, r12 }
 236     mla r14, r6, r9, r14
 237     mla r14, r12, r8, r14
 238     subs r7, r7, #1
 239     bne .dloop1               @ count is tested after the body, so this needs
     @ pred_order >= 4; the dispatch only branches here for orders above 9
 240
 241     and r7, r2, #3            @ r7 = pred_order mod 4, taps still to accumulate
 242     add pc, pc, r7, lsl #2    @ jump into accumulator chain
 243 @ jumptable: pc reads as the add address + 8, so r7 = 0 selects the second slot
 244     b .dsave @ padding
 245     b .dsave
 246     b .oneleft
 247     b .twoleft
 248 @ implicit .threeleft
 249     ldr r12, [r5, #-4]!       @ r12 = next coef downwards
 250     ldr r8, [r3], #4          @ r8 = next history sample upwards
 251     mla r14, r12, r8, r14
 252 .twoleft:
 253     ldr r12, [r5, #-4]!
 254     ldr r8, [r3], #4
 255     mla r14, r12, r8, r14
 256 .oneleft:
 257     ldr r12, [r5, #-4]!
 258     ldr r8, [r3], #4
 259     mla r14, r12, r8, r14
 260
 261 .dsave:
 262     ldr r12, [r3]             @ load residual
 263     add r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
 264     str r14, [r3], #4         @ store result
 265     sub r3, r3, r2, lsl #2    @ and wrap history pointer back to next first pos
 266     subs r0, r0, #1           @ are we done?
 267     bne .default              @ no, prepare for next sample
 268
 269 .exit:
     @ common return path: every order loop branches here and .dsave falls through
 270     ldmia sp!, { r4-r11, pc }  @ restore r4-r11 and return by popping the saved lr into pc
 271