1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
10 * Copyright (C) 2005 by David Bryant
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version 2
15 * of the License, or (at your option) any later version.
17 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
18 * KIND, either express or implied.
20 ****************************************************************************/
22 /* This is an assembly optimized version of the following WavPack function:
24 * void decorr_stereo_pass_cont_mcf5249 (struct decorr_pass *dpp,
25 * long *buffer, long sample_count);
27 * It performs a single pass of stereo decorrelation on the provided buffer.
28 * Note that this version of the function requires that the 8 previous stereo
29 * samples are visible and correct. In other words, it ignores the "samples_*"
30 * fields in the decorr_pass structure and gets the history data directly
31 * from the buffer. It does, however, return the appropriate history samples
32 * to the decorr_pass structure before returning.
34 * This is written to work on a MCF5249 processor, or any processor based on
35 * the ColdFire V2 core with an EMAC unit. The EMAC is perfectly suited for
36 * the "apply_weight" function of WavPack decorrelation because it provides
37 * the required 40-bit product. The fractional rounding mode of the EMAC is not
38 * configurable and uses "round to even" while WavPack uses "round to larger",
39 * so the rounding has to be done manually.
44 .global decorr_stereo_pass_cont_mcf5249
46 decorr_stereo_pass_cont_mcf5249:
49 movem.l %d2-%d7/%a2-%a6, (%sp)
50 move.l 44+4(%sp), %a2 | a2 = dpp->
51 move.l 44+8(%sp), %a1 | a1 = bptr
52 move.w 2(%a2), %a3 | a3 = dpp->delta
53 move.w 4(%a2), %d3 | d3 = dpp->weight_A (sign extended)
55 move.w 6(%a2), %d4 | d4 = dpp->weight_B (sign extended)
57 move.l 44+12(%sp), %d0 | d0 = sample_count
58 jbeq return_only | if zero, nothing to do
60 lsl.l #3, %d0 | d5 = bptr + (sample_count * 8)
64 moveq.l #17, %d0 | left shift weights & delta 17 places
72 move.l %d6, %macsr | set fractional mode for MAC
73 move.l #0x800000, %accext01 | acc1 = 0x00 0000 80 (for rounding)
75 move.l #1024<<17, %d6 | d6 & d7 are weight clipping limits
76 move.l #-1024<<17, %d7 | (only used by negative terms)
78 move.w (%a2), %d0 | d0 = term
81 jbeq term_17 | term = 17
83 jbeq term_18 | term = 18
85 jbeq term_minus_1 | term = -1
87 jbeq term_minus_2 | term = -2
89 jbeq term_minus_3 | term = -3
90 jbra term_default | default term = 1 - 8
92 |------------------------------------------------------------------------------
93 | Loop to handle term = 17 condition
95 | a0 = d0 = (2 * bptr [-1]) - bptr [-2]
96 | a1 = bptr d1 = initial bptr [0]
97 | a2 = dpp-> d2 = updated bptr [0]
98 | a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
99 | a4 = d4 = dpp->weight_B << 17
101 | macsr = 0x20 acc1 = 0x00 0000 80
102 |------------------------------------------------------------------------------
105 move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2]
108 beq .L251 | if zero, skip calculation
110 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
111 mac.l %d0, %d3, %acc0
114 eor.l %d1, %d0 | else compare signs
115 bge .L256 | if same, add delta to weight
116 sub.l %a3, %d3 | else subtract delta from weight
117 sub.l %a3, %d3 | subtract again instead of branch
118 .L256: add.l %a3, %d3 | add delta to weight
120 .L255: move.l %acc0, %d2 | d2 = rounded product
121 add.l %d1, %d2 | update bptr [0] and store
124 .L253: move.l -8(%a1), %d0 | d0 = 2 * bptr [-1] - bptr [-2]
127 beq .L257 | if zero, skip calculations
129 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
130 mac.l %d0, %d4, %acc0
133 eor.l %d1, %d0 | else compare signs
134 bge .L259 | if same, add delta to weight
135 sub.l %a3, %d4 | else subtract delta from weight
136 sub.l %a3, %d4 | subtract again instead of branch
137 .L259: add.l %a3, %d4 | add delta to weight
139 .L254: move.l %acc0, %d2 | d2 = rounded product
140 add.l %d1, %d2 | update bptr [0] and store
143 .L252: cmp.l %a1, %d5 | loop if bptr < eptr
145 bra term_17_18_finish | exit through common path
147 .L251: addq.l #4, %a1 | update pointer and jump back into loop
150 .L257: addq.l #4, %a1 | update pointer and jump back into loop
153 |------------------------------------------------------------------------------
154 | Loop to handle term = 18 condition
156 | a0 = d0 = ((3 * bptr [-1]) - bptr [-2]) >> 1
157 | a1 = bptr d1 = initial bptr [0]
158 | a2 = dpp-> d2 = updated bptr [0]
159 | a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
160 | a4 = d4 = dpp->weight_B << 17
162 | macsr = 0x20 acc1 = 0x00 0000 80
163 |------------------------------------------------------------------------------
166 move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
167 lea (%a0,%a0.l*2), %a0
173 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
174 mac.l %d0, %d3, %acc0
177 eor.l %d1, %d0 | else compare signs
178 bge .L267 | if same, add delta to weight
179 sub.l %a3, %d3 | else subtract delta from weight
180 sub.l %a3, %d3 | subtract again instead of branch
181 .L267: add.l %a3, %d3 | add delta to weight
183 .L266: move.l %acc0, %d2 | d2 = rounded product
184 add.l %d1, %d2 | add applied weight to bptr [0], store
187 .L268: move.l -8(%a1), %a0 | d0 = (3 * bptr [-1] - bptr [-2]) >> 1
188 lea (%a0,%a0.l*2), %a0
194 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
195 mac.l %d0, %d4, %acc0
198 eor.l %d1, %d0 | else compare signs
199 bge .L270 | if same, add delta to weight
200 sub.l %a3, %d4 | else subtract delta from weight
201 sub.l %a3, %d4 | subtract again instead of branch
202 .L270: add.l %a3, %d4 | add delta to weight
204 .L265: move.l %acc0, %d2 | d2 = rounded product
205 add.l %d1, %d2 | add applied weight to bptr [0], store
208 .L269: cmp.l %a1, %d5 | loop if bptr < eptr
210 bra term_17_18_finish | exit through common path
212 .L260: addq.l #4, %a1 | bump pointer and jump back into loop
215 .L261: addq.l #4, %a1 | bump pointer and jump back into loop
219 move.l -4(%a1), 40(%a2) | restore dpp->samples_A [0-1], B [0-1]
220 move.l -8(%a1), 8(%a2)
221 move.l -12(%a1), 44(%a2)
222 move.l -16(%a1), 12(%a2)
225 |------------------------------------------------------------------------------
226 | Loop to handle default terms (i.e. 1 - 8)
228 | a0 = tptr d0 = tptr [0]
229 | a1 = bptr d1 = initial bptr [0]
230 | a2 = dpp-> d2 = updated bptr [0]
231 | a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
232 | a4 = d4 = dpp->weight_B << 17
234 | macsr = 0x20 acc1 = 0x00 0000 80
235 |------------------------------------------------------------------------------
238 move.w (%a2), %d0 | a0 = a1 - (dpp->term * 8)
245 move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero
248 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_A
249 mac.l %d0, %d3, %acc0
252 eor.l %d1, %d0 | else compare signs
253 bge .L278 | if same, add delta to weight
254 sub.l %a3, %d3 | else subtract delta from weight
255 sub.l %a3, %d3 | subtract again instead of branch
256 .L278: add.l %a3, %d3 | add delta to weight
258 .L277: move.l %acc0, %d2 | d2 = rounded product
259 add.l %d1, %d2 | add applied weight to bptr [0], store
262 .L275: move.l (%a0)+, %d0 | d0 = tptr [0], skip ahead if zero
265 asl.l #4, %d0 | acc0 = acc1 + (d0 << 4) * weight_B
266 mac.l %d0, %d4, %acc0
269 eor.l %d1, %d0 | else compare signs
270 bge .L281 | if same, add delta to weight
271 sub.l %a3, %d4 | else subtract delta from weight
272 sub.l %a3, %d4 | subtract again instead of branch
273 .L281: add.l %a3, %d4 | add delta to weight
275 .L276: move.l %acc0, %d2 | d2 = rounded product
276 add.l %d1, %d2 | add applied weight to bptr [0], store
279 .L274: cmp.l %a1, %d5 | loop back if bptr < eptr
280 jbhi term_default_loop
281 move.w (%a2), %d0 | d0 = term - 1
282 moveq.l #8, %d1 | d1 = loop counter
284 .L323: subq.l #1, %d0 | back up & mask index
286 move.l -(%a1), 40(%a2,%d0.l*4) | store dpp->samples_B [d0]
287 move.l -(%a1), 8(%a2,%d0.l*4) | store dpp->samples_A [d0]
288 subq.l #1, %d1 | loop on count
292 .L271: addq.l #4, %a1 | bump pointer and jump back into loop
295 .L272: addq.l #4, %a1 | bump pointer and jump back into loop
299 |------------------------------------------------------------------------------
300 | Loop to handle term = -1 condition
302 | a0 = d0 = decorrelation sample
303 | a1 = bptr d1 = initial bptr [0]
304 | a2 = dpp-> d2 = updated bptr [0]
305 | a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
306 | a4 = d4 = dpp->weight_B << 17
308 | a6 = d6 = 1024 << 17
309 | a7 = d7 = -1024 << 17
310 | macsr = 0x20 acc1 = 0x00 0000 80
311 |------------------------------------------------------------------------------
314 move.l -4(%a1), %d0 | d0 = bptr [-1]
317 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
318 mac.l %d0, %d3, %acc0
321 eor.l %d1, %d0 | else compare signs
322 bge .L404 | if same, add delta to weight
323 sub.l %a3, %d3 | else subtract delta from weight
324 cmp.l %d7, %d3 | check for negative clip limit
329 .L404: add.l %a3, %d3 | add delta to weight
330 cmp.l %d6, %d3 | check for positive clip limit
334 .L405: move.l %acc0, %d0 | d0 = rounded product
335 add.l %d1, %d0 | add applied weight to bptr [0], store
339 .L410: move.l %acc1, %acc0
340 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
341 mac.l %d0, %d4, %acc0
344 eor.l %d1, %d0 | else compare signs
345 bge .L407 | if same, add delta to weight
346 sub.l %a3, %d4 | else subtract delta from weight
347 cmp.l %d7, %d4 | check for negative clip limit
352 .L407: add.l %a3, %d4 | add delta to weight
353 cmp.l %d6, %d4 | check for positive clip limit
357 .L403: move.l %acc0, %d2 | d2 = rounded product
358 add.l %d1, %d2 | add applied weight to bptr [1], store
361 .L411: cmp.l %a1, %d5 | loop back if bptr < eptr
363 move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1]
366 .L402: move.l (%a1)+, %d0
369 .L401: addq.l #4, %a1
373 |------------------------------------------------------------------------------
374 | Loop to handle term = -2 condition
376 | a0 = d0 = decorrelation sample
377 | a1 = bptr d1 = initial bptr [0]
378 | a2 = dpp-> d2 = updated bptr [0]
379 | a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
380 | a4 = d4 = dpp->weight_B << 17
382 | a6 = d6 = 1024 << 17
383 | a7 = d7 = -1024 << 17
384 | macsr = 0x20 acc1 = 0x00 0000 80
385 |------------------------------------------------------------------------------
388 move.l -8(%a1), %d0 | d0 = bptr [-2]
391 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
392 mac.l %d0, %d4, %acc0
395 eor.l %d1, %d0 | else compare signs
396 bge .L504 | if same, add delta to weight
397 sub.l %a3, %d4 | else subtract delta from weight
398 cmp.l %d7, %d4 | check for negative clip limit
403 .L504: add.l %a3, %d4 | add delta to weight
404 cmp.l %d6, %d4 | check for positive clip limit
408 .L505: move.l %acc0, %d0 | d0 = rounded product
409 add.l %d1, %d0 | add applied weight to bptr [0], store
413 .L510: move.l %acc1, %acc0
414 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
415 mac.l %d0, %d3, %acc0
418 eor.l %d1, %d0 | else compare signs
419 bge .L507 | if same, add delta to weight
420 sub.l %a3, %d3 | else subtract delta from weight
421 cmp.l %d7, %d3 | check for negative clip limit
426 .L507: add.l %a3, %d3 | add delta to weight
427 cmp.l %d6, %d3 | check for positive clip limit
431 .L503: move.l %acc0, %d2 | d2 = rounded product
432 add.l %d1, %d2 | add applied weight to bptr [1], store
435 .L512: addq.l #8, %a1
436 cmp.l %a1, %d5 | loop if bptr < eptr
438 move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2]
441 .L511: move.l 4(%a1), %d0
446 |------------------------------------------------------------------------------
447 | Loop to handle term = -3 condition
449 | a0 = d0 = decorrelation sample
450 | a1 = bptr d1 = initial bptr [0]
451 | a2 = dpp-> d2 = updated bptr [0]
452 | a3 = dpp->delta << 17 d3 = dpp->weight_A << 17
453 | a4 = d4 = dpp->weight_B << 17
455 | a6 = d6 = 1024 << 17
456 | a7 = d7 = -1024 << 17
457 | macsr = 0x20 acc1 = 0x00 0000 80
458 |------------------------------------------------------------------------------
461 move.l -4(%a1), %d0 | d0 = bptr [-1]
464 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_A)
465 mac.l %d0, %d3, %acc0
468 eor.l %d1, %d0 | else compare signs
469 bge .L319 | if same, add delta to weight
470 sub.l %a3, %d3 | else subtract delta from weight
471 cmp.l %d7, %d3 | check for negative clip limit
476 .L319: add.l %a3, %d3 | add delta to weight
477 cmp.l %d6, %d3 | check for positive clip limit
481 .L320: move.l %acc0, %d2 | d2 = rounded product
482 add.l %d1, %d2 | add applied weight to bptr [0], store
485 .L330: move.l -12(%a1), %d0 | d0 = bptr [-2]
488 asl.l #4, %d0 | acc0 = acc1 + ((d0 << 4) * weight_B)
489 mac.l %d0, %d4, %acc0
492 eor.l %d1, %d0 | else compare signs
493 bge .L322 | if same, add delta to weight
494 sub.l %a3, %d4 | else subtract delta from weight
495 cmp.l %d7, %d4 | check for negative clip limit
500 .L322: add.l %a3, %d4 | add delta to weight
501 cmp.l %d6, %d4 | check for positive clip limit
505 .L318: move.l %acc0, %d2 | d2 = rounded product
506 add.l %d1, %d2 | add applied weight to bptr [1], store
509 .L331: cmp.l %a1, %d5 | bptr, eptr
511 move.l -4(%a1), 8(%a2) | dpp->samples_A [0] = bptr [-1]
512 move.l -8(%a1), 40(%a2) | dpp->samples_B [0] = bptr [-2]
515 .L301: addq.l #4, %a1
518 .L302: addq.l #4, %a1
527 move.w %d3, 4(%a2) | weight_A, dpp->weight_A
528 move.w %d4, 6(%a2) | weight_B, dpp->weight_B
530 clr.l %d0 | clear up EMAC
535 movem.l (%sp), %d2-%d7/%a2-%a6