apps/codecs/libwavpack/arml.S

   1 /***************************************************************************
   2  *             __________               __   ___.
   3  *   Open      \______   \ ____   ____ |  | _\_ |__   _______  ___
   4  *   Source     |       _//  _ \_/ ___\|  |/ /| __ \ /  _ \  \/  /
   5  *   Jukebox    |    |   (  <_> )  \___|    < | \_\ (  <_> > <  <
   6  *   Firmware   |____|_  /\____/ \___  >__|_ \|___  /\____/__/\_ \
   7  *                     \/            \/     \/    \/            \/
   8  * $Id$
   9  *
  10  * Copyright (C) 2006 by David Bryant
  11  *
  12  * This program is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU General Public License
  14  * as published by the Free Software Foundation; either version 2
  15  * of the License, or (at your option) any later version.
  16  *
  17  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
  18  * KIND, either express or implied.
  19  *
  20  ****************************************************************************/
  21
  22 /* This is an assembly optimized version of the following WavPack function:
  23  *
  24  * void decorr_stereo_pass_cont_arml (struct decorr_pass *dpp,
  25  *                                    long *buffer, long sample_count);
  26  *
  27  * It performs a single pass of stereo decorrelation on the provided buffer.
  28  * Note that this version of the function requires that the 8 previous stereo
  29  * samples are visible and correct. In other words, it ignores the "samples_*"
  30  * fields in the decorr_pass structure and gets the history data directly
  31  * from the buffer. It does, however, return the appropriate history samples
  32  * to the decorr_pass structure before returning.
  33  *
   34  * This is written to work on an ARM7TDMI processor. This version uses the
  35  * 64-bit multiply-accumulate instruction and so can be used with all
  36  * WavPack files. However, for optimum performance with 16-bit WavPack
  37  * files, there is a faster version that only uses the 32-bit MLA
  38  * instruction.
  39  */
  40
  41         .text
  42         .align
  43         .global         decorr_stereo_pass_cont_arml
  44
  45 /*
  46  * on entry:
  47  *
  48  * r0 = struct decorr_pass *dpp
  49  * r1 = long *buffer
  50  * r2 = long sample_count
  51  */
  52
  53 decorr_stereo_pass_cont_arml:
  54
  55         stmfd   sp!, {r4 - r8, r10, r11, lr}
  56         mov     r5, r0                  @ r5 = dpp
  57         mov     r11, #512               @ r11 = 512 for rounding
  58         ldrsh   r6, [r0, #2]            @ r6 = dpp->delta
  59         ldrsh   r4, [r0, #4]            @ r4 = dpp->weight_A
  60         ldrsh   r0, [r0, #6]            @ r0 = dpp->weight_B
  61         cmp     r2, #0                  @ exit if no samples to process
  62         beq     common_exit
  63
  64         mov     r0, r0, asl #18         @ for 64-bit math we use weights << 18
  65         mov     r4, r4, asl #18
  66         mov     r6, r6, asl #18
  67         add     r7, r1, r2, asl #3      @ r7 = buffer ending position
  68         ldrsh   r2, [r5, #0]            @ r2 = dpp->term
  69         cmp     r2, #0
  70         blt     minus_term
  71
  72         ldr     lr, [r1, #-16]          @ load 2 sample history from buffer
  73         ldr     r10, [r1, #-12]         @  for terms 2, 17, and 18
  74         ldr     r8, [r1, #-8]
  75         ldr     r3, [r1, #-4]
  76
  77         cmp     r2, #18
  78         beq     term_18_loop
  79         mov     lr, lr, asl #4
  80         mov     r10, r10, asl #4
  81         cmp     r2, #2
  82         beq     term_2_loop
  83         cmp     r2, #17
  84         beq     term_17_loop
  85         b       term_default_loop
  86
  87 minus_term:
  88         mov     r10, #(1024 << 18)      @ r10 = -1024 << 18 for weight clipping
  89         rsb     r10, r10, #0            @  (only used for negative terms)
  90         cmn     r2, #1
  91         beq     term_minus_1
  92         cmn     r2, #2
  93         beq     term_minus_2
  94         cmn     r2, #3
  95         beq     term_minus_3
  96         b       common_exit
  97
  98 /*
  99  ******************************************************************************
 100  * Loop to handle term = 17 condition
 101  *
 102  * r0 = dpp->weight_B           r8 = previous left sample
 103  * r1 = bptr                    r9 =
 104  * r2 = current sample          r10 = second previous left sample << 4
 105  * r3 = previous right sample   r11 = lo accumulator (for rounding)
 106  * r4 = dpp->weight_A           ip = current decorrelation value
 107  * r5 = dpp                     sp =
 108  * r6 = dpp->delta              lr = second previous right sample << 4
 109  * r7 = eptr                    pc =
 110  *******************************************************************************
 111  */
 112
 113 term_17_loop:
 114         rsbs    ip, lr, r8, asl #5      @ decorr value = (2 * prev) - 2nd prev
 115         mov     lr, r8, asl #4          @ previous becomes 2nd previous
 116         ldr     r2, [r1], #4            @ get sample & update pointer
 117         mov     r11, #0x80000000
 118         mov     r8, r2
 119         smlalne r11, r8, r4, ip
 120         strne   r8, [r1, #-4]           @ if change possible, store sample back
 121         cmpne   r2, #0
 122         beq     .L325
 123         teq     ip, r2                  @ update weight based on signs
 124         submi   r4, r4, r6
 125         addpl   r4, r4, r6
 126
 127 .L325:  rsbs    ip, r10, r3, asl #5     @ do same thing for right channel
 128         mov     r10, r3, asl #4
 129         ldr     r2, [r1], #4
 130         mov     r11, #0x80000000
 131         mov     r3, r2
 132         smlalne r11, r3, r0, ip
 133         strne   r3, [r1, #-4]
 134         cmpne   r2, #0
 135         beq     .L329
 136         teq     ip, r2
 137         submi   r0, r0, r6
 138         addpl   r0, r0, r6
 139
 140 .L329:  cmp     r7, r1                  @ loop back if more samples to do
 141         bhi     term_17_loop
 142         mov     lr, lr, asr #4
 143         mov     r10, r10, asr #4
 144         b       store_1718              @ common exit for terms 17 & 18
 145
 146 /*
 147  ******************************************************************************
 148  * Loop to handle term = 18 condition
 149  *
 150  * r0 = dpp->weight_B           r8 = previous left sample
 151  * r1 = bptr                    r9 =
 152  * r2 = current sample          r10 = second previous left sample
 153  * r3 = previous right sample   r11 = lo accumulator (for rounding)
 154  * r4 = dpp->weight_A           ip = decorrelation value
 155  * r5 = dpp                     sp =
 156  * r6 = dpp->delta              lr = second previous right sample
 157  * r7 = eptr                    pc =
 158  *******************************************************************************
 159  */
 160
 161 term_18_loop:
 162         rsb     ip, lr, r8              @ decorr value =
 163         mov     lr, r8                  @  ((3 * prev) - 2nd prev) >> 1
 164         add     ip, lr, ip, asr #1
 165         movs    ip, ip, asl #4
 166         ldr     r2, [r1], #4            @ get sample & update pointer
 167         mov     r11, #0x80000000
 168         mov     r8, r2
 169         smlalne r11, r8, r4, ip
 170         strne   r8, [r1, #-4]           @ if change possible, store sample back
 171         cmpne   r2, #0
 172         beq     .L337
 173         teq     ip, r2                  @ update weight based on signs
 174         submi   r4, r4, r6
 175         addpl   r4, r4, r6
 176
 177 .L337:  rsb     ip, r10, r3             @ do same thing for right channel
 178         mov     r10, r3
 179         add     ip, r10, ip, asr #1
 180         movs    ip, ip, asl #4
 181         ldr     r2, [r1], #4
 182         mov     r11, #0x80000000
 183         mov     r3, r2
 184         smlalne r11, r3, r0, ip
 185         strne   r3, [r1, #-4]
 186         cmpne   r2, #0
 187         beq     .L341
 188         teq     ip, r2
 189         submi   r0, r0, r6
 190         addpl   r0, r0, r6
 191
 192 .L341:  cmp     r7, r1                  @ loop back if more samples to do
 193         bhi     term_18_loop
 194
 195 /* common exit for terms 17 & 18 */
 196
 197 store_1718:
 198         str     r3, [r5, #40]           @ store sample history into struct
 199         str     r8, [r5, #8]
 200         str     r10, [r5, #44]
 201         str     lr, [r5, #12]
 202         b       common_exit             @ and return
 203
 204 /*
 205  ******************************************************************************
 206  * Loop to handle term = 2 condition
 207  * (note that this case can be handled by the default term handler (1-8), but
 208  * this special case is faster because it doesn't have to read memory twice)
 209  *
 210  * r0 = dpp->weight_B           r8 = previous left sample
 211  * r1 = bptr                    r9 =
 212  * r2 = current sample          r10 = second previous left sample << 4
 213  * r3 = previous right sample   r11 = lo accumulator (for rounding)
 214  * r4 = dpp->weight_A           ip = decorrelation value
 215  * r5 = dpp                     sp =
 216  * r6 = dpp->delta              lr = second previous right sample << 4
 217  * r7 = eptr                    pc =
 218  *******************************************************************************
 219  */
 220
 221 term_2_loop:
 222         movs    ip, lr                  @ get decorrelation value & test
 223         ldr     r2, [r1], #4            @ get sample & update pointer
 224         mov     lr, r8, asl #4          @ previous becomes 2nd previous
 225         mov     r11, #0x80000000
 226         mov     r8, r2
 227         smlalne r11, r8, r4, ip
 228         strne   r8, [r1, #-4]           @ if change possible, store sample back
 229         cmpne   r2, #0
 230         beq     .L225
 231         teq     ip, r2                  @ update weight based on signs
 232         submi   r4, r4, r6
 233         addpl   r4, r4, r6
 234
 235 .L225:  movs    ip, r10                 @ do same thing for right channel
 236         ldr     r2, [r1], #4
 237         mov     r10, r3, asl #4
 238         mov     r11, #0x80000000
 239         mov     r3, r2
 240         smlalne r11, r3, r0, ip
 241         strne   r3, [r1, #-4]
 242         cmpne   r2, #0
 243         beq     .L229
 244         teq     ip, r2
 245         submi   r0, r0, r6
 246         addpl   r0, r0, r6
 247
 248 .L229:  cmp     r7, r1                  @ loop back if more samples to do
 249         bhi     term_2_loop
 250
 251         b       default_term_exit       @ this exit updates all dpp->samples
 252
 253 /*
 254  ******************************************************************************
 255  * Loop to handle default term condition
 256  *
 257  * r0 = dpp->weight_B           r8 = result accumulator
 258  * r1 = bptr                    r9 =
 259  * r2 = dpp->term               r10 =
 260  * r3 = decorrelation value     r11 = lo accumulator (for rounding)
 261  * r4 = dpp->weight_A           ip = current sample
 262  * r5 = dpp                     sp =
 263  * r6 = dpp->delta              lr =
 264  * r7 = eptr                    pc =
 265  *******************************************************************************
 266  */
 267
 268 term_default_loop:
 269         ldr     r3, [r1, -r2, asl #3]   @ get decorrelation value based on term
 270         ldr     ip, [r1], #4            @ get original sample and bump ptr
 271         movs    r3, r3, asl #4
 272         mov     r11, #0x80000000
 273         mov     r8, ip
 274         smlalne r11, r8, r4, r3
 275         strne   r8, [r1, #-4]           @ if possibly changed, store updated sample
 276         cmpne   ip, #0
 277         beq     .L350
 278         teq     ip, r3                  @ update weight based on signs
 279         submi   r4, r4, r6
 280         addpl   r4, r4, r6
 281
 282 .L350:  ldr     r3, [r1, -r2, asl #3]   @ do the same thing for right channel
 283         ldr     ip, [r1], #4
 284         movs    r3, r3, asl #4
 285         mov     r11, #0x80000000
 286         mov     r8, ip
 287         smlalne r11, r8, r0, r3
 288         strne   r8, [r1, #-4]
 289         cmpne   ip, #0
 290         beq     .L354
 291         teq     ip, r3
 292         submi   r0, r0, r6
 293         addpl   r0, r0, r6
 294
 295 .L354:  cmp     r7, r1                  @ loop back if more samples to do
 296         bhi     term_default_loop
 297
 298 /*
 299  * This exit is used by terms 1-8 to store the previous 8 samples into the decorr
 300  * structure (even if they are not all used for the given term)
 301  */
 302
 303 default_term_exit:
 304         ldrsh   r3, [r5, #0]
 305         sub     ip, r3, #1
 306         mov     lr, #7
 307
 308 .L358:  and     r3, ip, #7
 309         add     r3, r5, r3, asl #2
 310         ldr     r2, [r1, #-4]
 311         str     r2, [r3, #40]
 312         ldr     r2, [r1, #-8]!
 313         str     r2, [r3, #8]
 314         sub     ip, ip, #1
 315         sub     lr, lr, #1
 316         cmn     lr, #1
 317         bne     .L358
 318         b       common_exit
 319
 320 /*
 321  ******************************************************************************
 322  * Loop to handle term = -1 condition
 323  *
 324  * r0 = dpp->weight_B           r8 =
 325  * r1 = bptr                    r9 =
 326  * r2 = intermediate result     r10 = -1024 (for clipping)
 327  * r3 = previous right sample   r11 = lo accumulator (for rounding)
 328  * r4 = dpp->weight_A           ip = current sample
 329  * r5 = dpp                     sp =
 330  * r6 = dpp->delta              lr = updated left sample
 331  * r7 = eptr                    pc =
 332  *******************************************************************************
 333  */
 334
 335 term_minus_1:
 336         ldr     r3, [r1, #-4]
 337
 338 term_minus_1_loop:
 339         ldr     ip, [r1], #8            @ for left channel the decorrelation value
 340         movs    r3, r3, asl #4          @  is the previous right sample (in r3)
 341         mov     r11, #0x80000000
 342         mov     lr, ip
 343         smlalne r11, lr, r4, r3
 344         strne   lr, [r1, #-8]
 345         cmpne   ip, #0
 346         beq     .L361
 347         teq     ip, r3                  @ update weight based on signs
 348         submi   r4, r4, r6
 349         addpl   r4, r4, r6
 350         cmp     r4, #(1024 << 18)
 351         movgt   r4, #(1024 << 18)
 352         cmp     r4, r10
 353         movlt   r4, r10
 354
 355 .L361:  ldr     r2, [r1, #-4]           @ for right channel the decorrelation value
 356         movs    lr, lr, asl #4
 357         mov     r11, #0x80000000
 358         mov     r3, r2
 359         smlalne r11, r3, r0, lr
 360         strne   r3, [r1, #-4]
 361         cmpne   r2, #0
 362         beq     .L369
 363         teq     r2, lr
 364         submi   r0, r0, r6
 365         addpl   r0, r0, r6
 366         cmp     r0, #(1024 << 18)               @ then clip weight to +/-1024
 367         movgt   r0, #(1024 << 18)
 368         cmp     r0, r10
 369         movlt   r0, r10
 370
 371 .L369:  cmp     r7, r1                  @ loop back if more samples to do
 372         bhi     term_minus_1_loop
 373
 374         str     r3, [r5, #8]            @ else store right sample and exit
 375         b       common_exit
 376
 377 /*
 378  ******************************************************************************
 379  * Loop to handle term = -2 condition
 380  * (note that the channels are processed in the reverse order here)
 381  *
 382  * r0 = dpp->weight_B           r8 =
 383  * r1 = bptr                    r9 =
 384  * r2 = intermediate result     r10 = -1024 (for clipping)
 385  * r3 = previous left sample    r11 = lo accumulator (for rounding)
 386  * r4 = dpp->weight_A           ip = current sample
 387  * r5 = dpp                     sp =
 388  * r6 = dpp->delta              lr = updated right sample
 389  * r7 = eptr                    pc =
 390  *******************************************************************************
 391  */
 392
 393 term_minus_2:
 394         ldr     r3, [r1, #-8]
 395
 396 term_minus_2_loop:
 397         ldr     ip, [r1, #4]            @ for right channel the decorrelation value
 398         movs    r3, r3, asl #4          @  is the previous left sample (in r3)
 399         mov     r11, #0x80000000
 400         mov     lr, ip
 401         smlalne r11, lr, r0, r3
 402         strne   lr, [r1, #4]
 403         cmpne   ip, #0
 404         beq     .L380
 405         teq     ip, r3                  @ update weight based on signs
 406         submi   r0, r0, r6
 407         addpl   r0, r0, r6
 408         cmp     r0, #(1024 << 18)               @ then clip weight to +/-1024
 409         movgt   r0, #(1024 << 18)
 410         cmp     r0, r10
 411         movlt   r0, r10
 412
 413 .L380:  ldr     r2, [r1], #8            @ for left channel the decorrelation value
 414         movs    lr, lr, asl #4
 415         mov     r11, #0x80000000
 416         mov     r3, r2
 417         smlalne r11, r3, r4, lr
 418         strne   r3, [r1, #-8]
 419         cmpne   r2, #0
 420         beq     .L388
 421         teq     r2, lr
 422         submi   r4, r4, r6
 423         addpl   r4, r4, r6
 424         cmp     r4, #(1024 << 18)
 425         movgt   r4, #(1024 << 18)
 426         cmp     r4, r10
 427         movlt   r4, r10
 428
 429 .L388:  cmp     r7, r1                  @ loop back if more samples to do
 430         bhi     term_minus_2_loop
 431
 432         str     r3, [r5, #40]           @ else store left channel and exit
 433         b       common_exit
 434
 435 /*
 436  ******************************************************************************
 437  * Loop to handle term = -3 condition
 438  *
 439  * r0 = dpp->weight_B           r8 = previous left sample
 440  * r1 = bptr                    r9 =
 441  * r2 = current left sample     r10 = -1024 (for clipping)
 442  * r3 = previous right sample   r11 = lo accumulator (for rounding)
 443  * r4 = dpp->weight_A           ip = intermediate result
 444  * r5 = dpp                     sp =
 445  * r6 = dpp->delta              lr =
 446  * r7 = eptr                    pc =
 447  *******************************************************************************
 448  */
 449
 450 term_minus_3:
 451         ldr     r3, [r1, #-4]           @ load previous samples
 452         ldr     r8, [r1, #-8]
 453
 454 term_minus_3_loop:
 455         ldr     ip, [r1], #4
 456         movs    r3, r3, asl #4
 457         mov     r11, #0x80000000
 458         mov     r2, ip
 459         smlalne r11, r2, r4, r3
 460         strne   r2, [r1, #-4]
 461         cmpne   ip, #0
 462         beq     .L399
 463         teq     ip, r3                  @ update weight based on signs
 464         submi   r4, r4, r6
 465         addpl   r4, r4, r6
 466         cmp     r4, #(1024 << 18)       @ then clip weight to +/-1024
 467         movgt   r4, #(1024 << 18)
 468         cmp     r4, r10
 469         movlt   r4, r10
 470
 471 .L399:  movs    ip, r8, asl #4          @ ip = previous left we use now
 472         mov     r8, r2                  @ r8 = current left we use next time
 473         ldr     r2, [r1], #4
 474         mov     r11, #0x80000000
 475         mov     r3, r2
 476         smlalne r11, r3, r0, ip
 477         strne   r3, [r1, #-4]
 478         cmpne   r2, #0
 479         beq     .L407
 480         teq     ip, r2
 481         submi   r0, r0, r6
 482         addpl   r0, r0, r6
 483         cmp     r0, #(1024 << 18)
 484         movgt   r0, #(1024 << 18)
 485         cmp     r0, r10
 486         movlt   r0, r10
 487
 488 .L407:  cmp     r7, r1                  @ loop back if more samples to do
 489         bhi     term_minus_3_loop
 490
 491         str     r3, [r5, #8]            @ else store previous samples & exit
 492         str     r8, [r5, #40]
 493
 494 /*
 495  * Before finally exiting we must store weights back for next time
 496  */
 497
 498 common_exit:
 499         mov     r0, r0, asr #18         @ restore weights to real magnitude
 500         mov     r4, r4, asr #18
 501         strh    r4, [r5, #4]
 502         strh    r0, [r5, #6]
 503         ldmfd   sp!, {r4 - r8, r10, r11, pc}
 504