1 /***************************************************************************
3 * Open \______ \ ____ ____ | | _\_ |__ _______ ___
4 * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ /
5 * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < <
6 * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \
12 * Copyright (C) 2009 Andrew Mahone asm versions of the C IDCT algorithms used
15 * This program is free software; you can redistribute it and/or
16 * modify it under the terms of the GNU General Public License
17 * as published by the Free Software Foundation; either version 2
18 * of the License, or (at your option) any later version.
20 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
21 * KIND, either express or implied.
23 ****************************************************************************/
/* Tag each IDCT entry point as a function symbol. */
29 .type jpeg_idct1h, %function
31 .type jpeg_idct2v, %function
33 .type jpeg_idct2h, %function
35 .type jpeg_idct4v, %function
37 .type jpeg_idct4h, %function
39 .type jpeg_idct8v, %function
41 .type jpeg_idct8h, %function
44 /* In the common case of one pass through the loop, the extra add should be
45 cheaper than saving registers to stack and loading the value 4112. */
/* Executed only when HI: r12 = ~(r12 asr 31), i.e. 0 for a negative value and
   all ones (low byte 255) otherwise. Presumably the clamp that follows a
   compare against 255 which is not shown here. */
53 mvnhi r12, r12, asr #31
/* r12 = (r12 asr 5) saturated to the unsigned 8-bit range 0..255 */
55 usat r12, #8, r12, asr #5
63 .size jpeg_idct1h, .-jpeg_idct1h
67 /* Use SWAR (SIMD within a register) tricks to fake partitioned add and subtract. This is slightly faster
68 than loading two values in each register and using shifts and strh, and
69 requires fewer fixup operations than splitting the values, calculating, and
93 /* ARMv6 offers partitioned adds and subtracts, used here to unroll the loop
108 .size jpeg_idct2v, .-jpeg_idct2v
112 /* Using LDR and shifts here would cost two more ops, and is no faster as
113 results cannot be stored merged.
115 stmdb sp!, { r4-r5, lr }
130 mvnhi r5, r5, asr #31
132 mvnhi r4, r4, asr #31
133 #ifdef HAVE_LCD_COLOR
144 ldmia sp!, { r4-r5, pc }
146 stmdb sp!, { r4, lr }
151 saddsubx r12, r12, r12
152 usat r4, #8, r12, asr #21
154 usat r12, #8, r12, asr #5
155 #ifdef HAVE_LCD_COLOR
166 ldmia sp!, { r4, pc }
168 .size jpeg_idct2h, .-jpeg_idct2h
172 stmdb sp!, { r4-r7, lr }
180 add r6, r2, r4 /* r6 = tmp10 >> 2 = d0 + d2 */
181 sub r2, r2, r4 /* r2 = tmp12 >> 2 = d0 - d2 */
182 add r4, r3, r5 /* r4 = z1 = d1 + d3 */
/* multiply z1 by 4433 = (9 * 16 - 1) * (32 - 1) using shifted adds only */
183 add r7, r4, r4, lsl #3 /* r7 = z1 * 9 */
184 rsb r4, r4, r7, lsl #4 /* r4 = z1 * 9 * 16 - z1 = z1 * 143 */
185 rsb r4, r4, r4, lsl #5 /* r4 = z1 * 143 * 32 - z1 * 143 = z1 * 4433 */
187 mla r3, r12, r3, r4 /* r3 = tmp2 = z1 + z2 * 6270 */
188 mla r5, r14, r5, r4 /* r5 = tmp0 = z1 - z3 * 15137 */
189 mov r6, r6, lsl #2 /* r6 <<= 2, so r6 = tmp10 */
190 mov r2, r2, lsl #2 /* r2 <<= 2, so r2 = tmp12 */
191 add r7, r6, r3, asr #11 /* r7 = o0 = tmp10 + (tmp2 >> 11) */
192 sub r3, r6, r3, asr #11 /* r3 = o3 = tmp10 - (tmp2 >> 11) */
193 add r6, r2, r5, asr #11 /* r6 = o1 = tmp12 + (tmp0 >> 11) */
194 sub r2, r2, r5, asr #11 /* r2 = o2 = tmp12 - (tmp0 >> 11) */
202 ldmia sp!, { r4-r7, pc } /* restore r4-r7; the saved lr is loaded into pc */
/* The code above builds the 4433 multiply from shifts and uses mla for the
   other constants. The variant below does the same arithmetic with the 16x16
   smla<x><y> multiply-accumulates. Which variant gets assembled is presumably
   chosen by ARM_ARCH conditionals that are not shown here - confirm. */
204 stmdb sp!, { r4-r8, lr }
212 add r6, r3, r14 /* r6 = z1 = d1 + d3 */
213 add r7, r2, r12 /* r7 = tmp10 >> 2 = d0 + d2 */
/* r6 = z1 * 4433 + r8; r8 is set on a line not shown here, presumably the
   rounding bias */
214 smlabb r6, r5, r6, r8 /* z1 *= 4433 */
215 sub r2, r2, r12 /* r2 = tmp12 >> 2 = d0 - d2 */
216 smlatb r3, r5, r3, r6 /* r3 = tmp2 = z1 + z2 * 6270 */
217 smlabb r14, r4, r14, r6 /* r14 = tmp0 = z1 - z3 * 15137 */
220 add r12, r7, r3, asr #11 /* r12 = o0 */
221 sub r7, r7, r3, asr #11 /* r7 = o3 */
222 add r3, r2, r14, asr #11 /* r3 = o1 */
223 sub r2, r2, r14, asr #11 /* r2 = o2 */
231 ldmia sp!, { r4-r8, pc } /* restore r4-r8; the saved lr is loaded into pc */
/* Variant using the ARMv6 partitioned halfword instructions (sadd16, ssub16,
   pkhbt): two columns are packed per register, one in each halfword. */
233 stmdb sp!, { r4-r10, lr }
241 /* this part is being done in parallel on two columns */
242 sadd16 r8, r4, r6 /* r8 = d0 + d2 */
243 ssub16 r4, r4, r6 /* r4 = d0 - d2 */
244 sadd16 r6, r5, r7 /* r6 = d1 + d3 */
245 /* there is no parallel shift operation, but we can fake it with bic
249 /* multiplication expands values beyond 16 bits, so this part needs to be
250 split. the values will be merged below so that the rest of the addition
251 can be done in parallel */
252 smlabb r9, r3, r6, r12 /* r9 = z1[0] = (d1 + d3) * 4433 + 1024 */
253 smlabt r6, r3, r6, r12 /* r6 = z1[1] = (d1 + d3) * 4433 + 1024 */
254 smlatb r10, r3, r5, r9 /* r10 = tmp2[0] = z1 + d1 * 6270 */
255 smlabb r14, r2, r7, r9 /* r14 = tmp0[0] = z1 - d3 * 15137 */
256 smlatt r5, r3, r5, r6 /* r5 = tmp2[1] */
257 smlabt r6, r2, r7, r6 /* r6 = tmp0[1] */
258 mov r8, r8, lsl #2 /* complete the parallel shift started */
259 mov r4, r4, lsl #2 /* with the earlier bic instructions */
260 /* tmp2 are in r10, r5; tmp0 are in r14, r6 */
261 /* tmp10, tmp12 are in r8, r4 */
262 mov r10, r10, asr #11 /* r10 = tmp2[0] >> 11 */
263 mov r14, r14, asr #11 /* r14 = tmp0[0] >> 11 */
/* pkhbt keeps the low halfword of the first source and takes the high
   halfword from the second source shifted left by 5, which is bits 26..11 of
   the 32-bit value, i.e. the low 16 bits of (value >> 11) */
264 pkhbt r5, r10, r5, lsl #5 /* parallel tmp2 */
265 pkhbt r6, r14, r6, lsl #5 /* parallel tmp0 */
266 sadd16 r10, r8, r5 /* d0 = tmp10 + tmp2 (o0 in the variants above) */
267 ssub16 r5, r8, r5 /* d3 = tmp10 - tmp2 (o3) */
268 sadd16 r14, r4, r6 /* d1 = tmp12 + tmp0 (o1) */
269 ssub16 r6, r4, r6 /* d2 = tmp12 - tmp0 (o2) */
277 ldmia sp!, { r4-r10, pc } /* restore r4-r10; the saved lr is loaded into pc */
279 .size jpeg_idct4v, .-jpeg_idct4v
/* jpeg_idct4h: horizontal 4-point pass. Three variants follow, separated by
   ARM_ARCH preprocessor conditionals (the opening #if is not shown here).
   This first variant uses only shifts, adds and mla. */
294 stmdb sp!, { r4-r10, lr }
304 add r8, r4, r6 /* r8 = tmp10 >> 13 = d0 + d2 */
305 sub r4, r4, r6 /* r4 = tmp12 >> 13 = d0 - d2 */
306 add r6, r5, r7 /* r6 = z1 = d1 + d3 */
/* multiply z1 by 4433 = (9 * 16 - 1) * (32 - 1) using shifted adds only */
307 add r9, r6, r6, lsl #3 /* r9 = z1 * 9 */
308 rsb r6, r6, r9, lsl #4 /* r6 = z1 * 143 */
309 rsb r6, r6, r6, lsl #5 /* z1 *= 4433 */
310 mla r7, r10, r7, r6 /* r7 = tmp0 = z1 - z3 * 15137 */
311 mla r5, r12, r5, r6 /* r5 = tmp2 = z1 + z2 * 6270 */
312 add r9, r5, r8, lsl #13 /* r9 = o0 = tmp10 + tmp2 */
313 rsb r5, r5, r8, lsl #13 /* r5 = o3 = tmp10 - tmp2 */
314 add r8, r7, r4, lsl #13 /* r8 = o1 = tmp12 + tmp0 */
315 rsb r4, r7, r4, lsl #13 /* r4 = o2 = tmp12 - tmp0 */
/* Each mvnhi below runs only when HI and replaces the register with
   ~(reg asr 31): 0 for a negative value, all ones (low byte 255) otherwise.
   Presumably each one follows a compare against 255 that is not shown. */
321 mvnhi r9, r9, asr #31
323 mvnhi r8, r8, asr #31
325 mvnhi r4, r4, asr #31
327 mvnhi r5, r5, asr #31
328 #ifdef HAVE_LCD_COLOR
343 ldmia sp!, { r4-r10, pc }
/* NOTE(review): the trailing "|| 1" makes this condition always true, so the
   branch after it (the one using sadd16, ssub16 and usat) can never be
   assembled. This looks like a leftover debugging override - confirm whether
   that path is meant to stay disabled. */
344 #elif ARM_ARCH < 6 || 1
345 stmdb sp!, { r4-r9, lr }
352 add r8, r14, r7 /* r8 = z1 = d1 + d3 */
/* add the top halfword of r4 (a constant; presumably the rounding and level
   shift bias) to d0 */
353 add r12, r12, r4, lsr #16
354 smulbb r8, r5, r8 /* z1 *= 4433 */
355 add r9, r12, r6 /* r9 = tmp10 >> 13 = d0 + d2 */
356 smlatb r14, r5, r14, r8 /* r14= tmp2 = z1 + z2 * 6270 */
357 smlabb r7, r4, r7, r8 /* r7 = tmp0 = z1 - z3 * 15137 */
358 sub r12, r12, r6 /* r12= tmp12 >> 13 = d0 - d2 */
359 add r6, r14, r9, lsl #13 /* r6 = o0 */
360 rsb r9, r14, r9, lsl #13 /* r9 = o3 */
361 add r14, r7, r12, lsl #13 /* r14= o1 */
362 rsb r12, r7, r12, lsl #13 /* r12= o2 */
364 mov r14, r14, asr #18 /* descale o1 by 18 bits */
365 mov r12, r12, asr #18 /* descale o2 by 18 bits */
368 mvnhi r6, r6, asr #31 /* when HI: 0 if negative, else all ones */
370 mvnhi r14, r14, asr #31
372 mvnhi r12, r12, asr #31
374 mvnhi r9, r9, asr #31
375 #ifdef HAVE_LCD_COLOR
390 ldmia sp!, { r4-r9, pc }
392 stmdb sp!, { r4-r9, lr }
396 ldmia r0, { r12, r14 } /* load two words from r0, r0 is not advanced */
398 sadd16 r6, r12, r14 /* r6lo = d0 + d2, r6hi = d1 + d3 */
399 ssub16 r7, r12, r14 /* r7lo = d0 - d2 */
402 smlatt r12, r5, r12, r8 /* r12= tmp2 = z1 + z2 * 6270 */
403 smlabt r14, r4, r14, r8 /* r14= tmp0 = z1 - z3 * 15137 */
405 add r8, r12, r6, lsl #13 /* r8 = o0 */
406 rsb r6, r12, r6, lsl #13 /* r6 = o3 */
407 add r12, r14, r7, lsl #13 /* r12= o1 */
408 rsb r14, r14, r7, lsl #13 /* r14= o2 */
/* descale by 18 bits and saturate to the unsigned 8-bit range 0..255 */
409 usat r8, #8, r8, asr #18
410 usat r6, #8, r6, asr #18
411 usat r12, #8, r12, asr #18
412 usat r14, #8, r14, asr #18
413 #ifdef HAVE_LCD_COLOR
428 ldmia sp!, { r4-r9, pc }
430 .size jpeg_idct4h, .-jpeg_idct4h
/* jpeg_idct8v: vertical 8-point pass. Two variants appear before the .size
   directive: the first uses mul and mla, the second (starting at the
   "r10[15:0] = d2 + d6" line) uses the 16x16 smul and smla multiplies.
   Presumably they are selected by ARM_ARCH conditionals not shown here. */
434 stmdb sp!, { r4-r11, lr }
/* when EQ: r9 = r5 | (r4 lsr 16), updating the flags. Looks like one step of
   an all-zero test on the coefficients; the preceding steps are not shown. */
441 orreqs r9, r5, r4, lsr #16
455 ldmia sp!, { r4-r11, pc }
461 mov r10, r10, asr #16 /* r10 = z2 = d2 */
462 mov r11, r11, asr #16 /* r11 = z3 = d6 */
465 mov r8, r8, asr #3 /* r8 = z4 = (d0 << 13) + 1024 */
466 mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
468 mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
469 mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */
470 mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */
471 add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */
472 sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */
473 add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */
474 sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */
475 add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */
476 sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */
/* Spill to the four words just below sp; sp is not moved (no writeback).
   NOTE(review): relies on nothing else writing below sp before the values
   are read back - confirm for the target. */
477 stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12 */
478 mov r4, r4, asr #16 /* r4 = tmp3 = d1 */
479 mov r5, r5, asr #16 /* r5 = tmp2 = d3 */
480 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
481 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
484 add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */
485 add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */
486 add r9, r12, r14 /* r9 = z3 + z4 */
487 mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */
489 mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */
491 mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */
493 add r9, r4, r7 /* r9 = tmp0 + tmp3 */
494 mla r8, r11, r9, r12 /* r8 = z1 + z3 */
495 mla r9, r11, r9, r14 /* r9 = z1 + z4 */
497 mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
499 mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
501 add r9, r5, r6 /* r9 = tmp1 + tmp2 */
502 mla r12, r10, r9, r12 /* r12 = z2 + z3 */
503 mla r14, r10, r9, r14 /* r14 = z2 + z4 */
505 mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
506 mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
/* From here r8-r11 are used as tmp10 tmp13 tmp11 tmp12 again; presumably
   they were read back from the scratch area on a line not shown here. */
508 add r12, r8, r4 /* o0 */
509 sub r14, r8, r4 /* o7 */
510 add r8, r9, r7 /* o3 */
511 sub r9, r9, r7 /* o4 */
512 add r4, r10, r5 /* o1 */
513 sub r5, r10, r5 /* o6 */
514 add r10, r11, r6 /* o2 */
515 sub r11, r11, r6 /* o5 */
516 /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
517 mov r12, r12, asr #11 /* descale o0 by 11 bits */
519 mov r10, r10, asr #11 /* descale o2 by 11 bits */
522 mov r11, r11, asr #11 /* descale o5 by 11 bits */
524 mov r14, r14, asr #11 /* descale o7 by 11 bits */
/* when EQ: r9 = r5 | (r4 lsr 16), updating the flags (same test step as in
   the variant above) */
536 orreqs r9, r5, r4, lsr #16
538 mov r12, r12, asr #14
550 ldmia sp!, { r4-r11, pc }
554 add r10, r5, r7 /* r10[15:0] = d2 + d6 */
555 sub r14, r12, r6, lsl #16 /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
556 smulbb r10, r8, r10 /* r10 = z1 = (d2 + d6) * 4433 */
557 add r12, r12, r6, lsl #16 /* r12 = tmp0 << 3 = (d0 + d4) << 16 */
558 smlatb r11, r8, r7, r10 /* r11 = tmp2 = z1 - d6 * 15137 */
559 smlabb r10, r9, r5, r10 /* r10 = tmp3 = z1 + d2 * 6270 */
560 add r8, r11, r14, asr #3 /* r8 = tmp11 */
561 rsb r11, r11, r14, asr #3 /* r11 = tmp12 */
562 add r14, r10, r12, asr #3 /* r14 = tmp10 */
563 rsb r12, r10, r12, asr #3 /* r12 = tmp13 */
/* spill to the four words just below sp; sp is not moved (no writeback) */
564 stmdb sp, { r8, r11, r12, r14 }/* tmp11 tmp12 tmp13 tmp10 */
565 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
566 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
567 add r12, r6, r4, asr #16 /* r12 = z4 = tmp1 + tmp3 */
568 add r14, r7, r5, asr #16 /* r14 = z3 = tmp0 + tmp2 */
569 add r8, r12, r14 /* r8 = z3 + z4 */
571 smultb r8, r9, r8 /* r8 = z5 = (z3 + z4) * 9633 */
572 add r9, r7, r4, asr #16 /* r9 = z1 = tmp0 + tmp3 */
573 smlabb r14, r10, r14, r8 /* r14 = z3 = z5 - z3 * 16069 */
574 smlatb r12, r10, r12, r8 /* r12 = z4 = z5 - z4 * 3196 */
575 smlabb r8, r11, r9, r14 /* r8 = z3 - z1 * 7373 */
576 smlabb r9, r11, r9, r12 /* r9 = z4 - z1 * 7373 */
577 add r10, r6, r5, asr #16 /* r10 = z2 = tmp1 + tmp2 */
578 smlatb r12, r11, r10, r12 /* r12 = z4 - z2 * 20995 */
579 smlatb r14, r11, r10, r14 /* r14 = z3 - z2 * 20995 */
581 smlabb r7, r10, r7, r8 /* r7 = tmp0 */
582 smlatt r4, r10, r4, r9 /* r4 = tmp3 */
583 smlabb r6, r11, r6, r12 /* r6 = tmp1 */
584 smlatt r5, r11, r5, r14 /* r5 = tmp2 */
/* read the four spilled words back; the order matches the stmdb above */
585 ldmdb sp, { r8-r11 } /* tmp11 tmp12 tmp13 tmp10 */
586 add r12, r8, r5 /* o1 */
587 sub r14, r8, r5 /* o6 */
588 add r8, r9, r6 /* o2 */
589 sub r9, r9, r6 /* o5 */
590 add r6, r10, r7 /* o3 */
591 sub r7, r10, r7 /* o4 */
592 add r10, r11, r4 /* o0 */
593 sub r11, r11, r4 /* o7 */
594 mov r12, r12, asr #11 /* descale o1 by 11 bits */
595 mov r14, r14, asr #11 /* descale o6 by 11 bits */
600 mov r10, r10, asr #11 /* descale o0 by 11 bits */
601 mov r11, r11, asr #11 /* descale o7 by 11 bits */
614 ldmia sp!, { r4-r11, pc }
615 .size jpeg_idct8v, .-jpeg_idct8v
/* jpeg_idct8h: horizontal 8-point pass. Two variants appear before the .size
   directive: the first uses mul and mla, the second (starting at the
   "r10[15:0] = d2 + d6" line) uses the 16x16 smul and smla multiplies.
   Results are descaled by 18 bits and clamped with the mvnhi idiom. */
636 stmdb sp!, { r4-r11, lr }
/* r8 = (d0 << 16) + r14; r14 presumably holds the 4112 bias already shifted
   into place (see the z4 comment further down) */
641 add r8, r14, r4, lsl #16
/* when EQ: r9 = r5 | (r4 lsr 16), updating the flags. Looks like one step of
   an all-zero test on the coefficients; the preceding steps are not shown. */
643 orreqs r9, r5, r4, lsr #16
/* when HI: r8 = ~(r8 asr 31), i.e. 0 for a negative value, all ones (low
   byte 255) otherwise; presumably follows a compare against 255 */
647 mvnhi r8, r8, asr #31
648 #ifdef HAVE_LCD_COLOR
670 ldmia sp!, { r4-r11, pc }
676 mov r10, r10, asr #16 /* r10 = z2 = d2 */
677 mov r11, r11, asr #16 /* r11 = z3 = d6 */
679 mov r8, r8, asr #3 /* r8 = z4 = (d0 + 4112) << 13 */
680 mul r9, r14, r9 /* r9 = z1 = (z2 + z3) * 4433 */
682 mla r11, r12, r11, r9 /* r11 = tmp2 = z1 - z3 * 15137 */
683 mla r10, r14, r10, r9 /* r10 = tmp3 = z1 + z2 * 6270 */
684 mov r9, r6, lsl #16 /* r9 = z5 << 3 = d4 << 16 */
685 add r12, r8, r9, asr #3 /* r12 = tmp0 = z4 + z5 */
686 sub r14, r8, r9, asr #3 /* r14 = tmp1 = z4 - z5 */
687 add r8, r12, r10 /* r8 = tmp10 = tmp0 + tmp3 */
688 sub r9, r12, r10 /* r9 = tmp13 = tmp0 - tmp3 */
689 add r10, r14, r11 /* r10 = tmp11 = tmp1 + tmp2 */
690 sub r11, r14, r11 /* r11 = tmp12 = tmp1 - tmp2 */
/* Spill to the four words just below sp; sp is not moved (no writeback).
   NOTE(review): relies on nothing else writing below sp before the values
   are read back - confirm for the target. */
691 stmdb sp, { r8-r11 } /* tmp10 tmp13 tmp11 tmp12 */
692 mov r4, r4, asr #16 /* r4 = tmp3 = d1 */
693 mov r5, r5, asr #16 /* r5 = tmp2 = d3 */
694 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
695 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
698 add r12, r5, r7 /* r12 = z3 = tmp0 + tmp2 */
699 add r14, r4, r6 /* r14 = z4 = tmp1 + tmp3 */
700 add r9, r12, r14 /* r9 = z3 + z4 */
701 mul r9, r10, r9 /* r9 = z5 = (z3 + z4) * 9633 */
703 mla r12, r11, r12, r9 /* r12 = z3 = z5 - z3 * 16069 */
705 mla r14, r10, r14, r9 /* r14 = z4 = z5 - z4 * 3196 */
707 add r9, r4, r7 /* r9 = tmp0 + tmp3 */
708 mla r8, r11, r9, r12 /* r8 = z1 + z3 */
709 mla r9, r11, r9, r14 /* r9 = z1 + z4 */
711 mla r7, r10, r7, r8 /* r7 = tmp0 = z1 + z3 + tmp0 * 2446 */
713 mla r4, r11, r4, r9 /* r4 = tmp3 = z1 + z4 + tmp3 * 12299 */
715 add r9, r5, r6 /* r9 = tmp1 + tmp2 */
716 mla r12, r10, r9, r12 /* r12 = z2 + z3 */
717 mla r14, r10, r9, r14 /* r14 = z2 + z4 */
719 mla r5, r11, r5, r12 /* r5 = tmp2 = z2 + z3 + tmp2 * 25172 */
720 mla r6, r10, r6, r14 /* r6 = tmp1 = z2 + z4 + tmp1 * 16819 */
/* From here r8-r11 are used as tmp10 tmp13 tmp11 tmp12 again; presumably
   they were read back from the scratch area on a line not shown here. */
722 add r12, r8, r4 /* o0 */
723 sub r14, r8, r4 /* o7 */
724 add r8, r9, r7 /* o3 */
725 sub r9, r9, r7 /* o4 */
726 add r4, r10, r5 /* o1 */
727 sub r5, r10, r5 /* o6 */
728 add r10, r11, r6 /* o2 */
729 sub r11, r11, r6 /* o5 */
730 /* output in order: r12 r4 r10 r8 r9 r11 r5 r14 */
/* Each output is descaled by 18 bits; each mvnhi then runs only when HI and
   yields 0 for a negative value, all ones (low byte 255) otherwise. */
731 mov r12, r12, asr #18
733 mvnhi r12, r12, asr #31
736 mvnhi r4, r4, asr #31
737 mov r10, r10, asr #18
739 mvnhi r10, r10, asr #31
742 mvnhi r8, r8, asr #31
745 mvnhi r9, r9, asr #31
746 mov r11, r11, asr #18
748 mvnhi r11, r11, asr #31
751 mvnhi r5, r5, asr #31
752 mov r14, r14, asr #18
754 mvnhi r14, r14, asr #31
755 #ifdef HAVE_LCD_COLOR
/* r12 = (d0 << 16) + r14; r14 presumably holds the pre-shifted bias */
775 add r12, r14, r4, lsl #16
/* when EQ: r9 = r5 | (r4 lsr 16), updating the flags (all-zero test step) */
777 orreqs r9, r5, r4, lsr #16
779 mov r12, r12, asr #21 /* r12 = ((d0 << 16) + r14) >> 21 */
781 mvnhi r12, r12, asr #31 /* when HI: 0 if negative, else all ones */
782 #ifdef HAVE_LCD_COLOR
804 ldmia sp!, { r4-r11, pc }
807 add r10, r5, r7 /* r10[15:0] = d2 + d6 */
808 sub r14, r12, r6, lsl #16 /* r14 = tmp1 << 3 = (d0 - d4) << 16 */
809 smulbb r10, r8, r10 /* r10 = z1 = (d2 + d6) * 4433 */
810 add r12, r12, r6, lsl #16 /* r12 = tmp0 << 3 = (d0 + d4) << 16 */
811 smlatb r11, r8, r7, r10 /* r11 = tmp2 = z1 - d6 * 15137 */
812 smlabb r10, r9, r5, r10 /* r10 = tmp3 = z1 + d2 * 6270 */
813 add r8, r11, r14, asr #3 /* r8 = tmp11 */
814 rsb r11, r11, r14, asr #3 /* r11 = tmp12 */
815 add r14, r10, r12, asr #3 /* r14 = tmp10 */
816 rsb r12, r10, r12, asr #3 /* r12 = tmp13 */
/* spill to the four words just below sp; sp is not moved (no writeback) */
817 stmdb sp, { r8, r11, r12, r14 }/* tmp11 tmp12 tmp13 tmp10 */
818 mov r6, r6, asr #16 /* r6 = tmp1 = d5 */
819 mov r7, r7, asr #16 /* r7 = tmp0 = d7 */
820 add r12, r6, r4, asr #16 /* r12 = z4 = tmp1 + tmp3 */
821 add r14, r7, r5, asr #16 /* r14 = z3 = tmp0 + tmp2 */
822 add r8, r12, r14 /* r8 = z3 + z4 */
824 smultb r8, r9, r8 /* r8 = z5 = (z3 + z4) * 9633 */
825 add r9, r7, r4, asr #16 /* r9 = z1 = tmp0 + tmp3 */
826 smlabb r14, r10, r14, r8 /* r14 = z3 = z5 - z3 * 16069 */
827 smlatb r12, r10, r12, r8 /* r12 = z4 = z5 - z4 * 3196 */
828 smlabb r8, r11, r9, r14 /* r8 = z3 - z1 * 7373 */
829 smlabb r9, r11, r9, r12 /* r9 = z4 - z1 * 7373 */
830 add r10, r6, r5, asr #16 /* r10 = z2 = tmp1 + tmp2 */
831 smlatb r12, r11, r10, r12 /* r12 = z4 - z2 * 20995 */
832 smlatb r14, r11, r10, r14 /* r14 = z3 - z2 * 20995 */
834 smlabb r7, r10, r7, r8 /* r7 = tmp0 */
835 smlatt r4, r10, r4, r9 /* r4 = tmp3 */
836 smlabb r6, r11, r6, r12 /* r6 = tmp1 */
837 smlatt r5, r11, r5, r14 /* r5 = tmp2 */
/* read the four spilled words back; the order matches the stmdb above */
838 ldmdb sp, { r8-r11 } /* tmp11 tmp12 tmp13 tmp10 */
839 add r12, r8, r5 /* o1 */
840 sub r14, r8, r5 /* o6 */
841 add r8, r9, r6 /* o2 */
842 sub r9, r9, r6 /* o5 */
843 add r6, r10, r7 /* o3 */
844 sub r7, r10, r7 /* o4 */
845 add r10, r11, r4 /* o0 */
846 sub r11, r11, r4 /* o7 */
847 /* output in order: r10 r12 r8 r6 r7 r9 r14 r11 */
/* descale by 18 bits, then clamp with the same conditional mvn idiom */
848 mov r10, r10, asr #18
850 mvnhi r10, r10, asr #31
851 mov r12, r12, asr #18
853 mvnhi r12, r12, asr #31
856 mvnhi r8, r8, asr #31
859 mvnhi r6, r6, asr #31
862 mvnhi r7, r7, asr #31
865 mvnhi r9, r9, asr #31
866 mov r14, r14, asr #18
868 mvnhi r14, r14, asr #31
869 mov r11, r11, asr #18
871 mvnhi r11, r11, asr #31
872 #ifdef HAVE_LCD_COLOR
895 ldmia sp!, { r4-r11, pc }
896 .size jpeg_idct8h, .-jpeg_idct8h
/* jpeg_idct8v, variant using the ARMv6 packed-halfword instructions (pkhtb,
   sadd16, smuad). In the pair notation below the low halfword is listed
   first. Presumably assembled instead of the earlier jpeg_idct8v definition
   under an ARM_ARCH conditional that is not shown here. */
899 stmdb sp!, { r4-r11, lr }
/* when EQ: r9 = r5 | (r4 lsr 16), updating the flags. Looks like one step of
   an all-zero test on the coefficients; the preceding steps are not shown. */
904 orreqs r9, r5, r4, lsr #16
918 ldmia sp!, { r4-r11, pc }
922 add r10, r5, r7 /* r10 = d2 + d6 */
924 add r3, r12, r6, lsl #16 /* tmp0 */
925 sub r12, r12, r6, lsl #16 /* tmp1 */
926 pkhtb r4, r5, r4, asr #16 /* r4 = (tmp3[o], tmp2[o]) = (d1, d3) */
927 smulbb r14, r8, r10 /* r14 = z1[e] = (d2 + d6) * 4433 */
928 pkhtb r6, r6, r7, asr #16 /* r6 = (tmp0[o], tmp1[o]) = (d7, d5) */
929 smlatb r7, r8, r7, r14 /* r7 = tmp2[e] = z1 - d6 * 15137 */
930 smlabb r5, r9, r5, r14 /* r5 = tmp3[e] = z1 + d2 * 6270 */
931 pkhtb r9, r9, r9, asr #16 /* r9 = (9633, 9633) */
932 add r10, r5, r3, asr #3 /* r10 = tmp10 */
933 rsb r11, r5, r3, asr #3 /* r11 = tmp13 */
935 rsb r14, r7, r12, asr #3 /* r14 = tmp12 */
936 add r12, r7, r12, asr #3 /* r12 = tmp11 */
937 sadd16 r8, r3, r6 /* z3, z4 */
/* Spill to the four words just below sp; sp is not moved (no writeback).
   NOTE(review): relies on nothing else writing below sp before the values
   are read back - confirm for the target. */
938 stmdb sp, { r10-r12, r14 } /* tmp10 tmp13 tmp11 tmp12 */
/* smuad sums both halfword products: 9633 * z3 + 9633 * z4 */
939 smuad r5, r9, r8 /* r5 = z5 = (z3[o] + z4[o]) * 9633 */
941 sadd16 r7, r4, r6 /* r7 = (z1, z2) */
/* NOTE(review): the non-ARMv6 variants in this file pair z3 with 16069 and
   z4 with 3196; the constants named in the next two comments look swapped.
   Confirm against the constant pool, which is not shown here. */
942 smlatt r9, r10, r8, r5 /* r9 = z4 = z5 - z4 * 16069 */
943 smlabb r8, r10, r8, r5 /* r8 = z3 = z5 - z3 * 3196 */
944 smlabb r14, r11, r7, r9 /* r14 = z1 + z4 */
945 smlabb r12, r11, r7, r8 /* r12 = z1 + z3 */
946 smlatt r5, r11, r7, r9 /* r5 = z2 + z4 */
947 smlatt r7, r11, r7, r8 /* r7 = z2 + z3 */
949 smlabt r7, r9, r4, r7 /* r7 = tmp2 */
950 smlatb r14, r9, r4, r14 /* r14 = tmp3 */
/* read the four spilled words back; the order matches the stmdb above */
951 ldmdb sp, { r4, r9-r11 } /* tmp10 tmp13 tmp11 tmp12 */
952 smlabb r12, r8, r6, r12 /* r12 = tmp0 */
953 smlatt r5, r8, r6, r5 /* r5 = tmp1 */
954 /* used: r4, r5, r7, r9-r12, r14 */
955 add r6, r4, r14 /* o0 */
956 sub r8, r4, r14 /* o7 */
957 add r14, r9, r12 /* o3 */
958 sub r12, r9, r12 /* o4 */
959 add r4, r10, r7 /* o1 */
960 sub r7, r10, r7 /* o6 */
961 add r9, r11, r5 /* o2 */
962 sub r10, r11, r5 /* o5 */
966 mov r14, r14, asr #11 /* descale o3 by 11 bits */
967 mov r12, r12, asr #11 /* descale o4 by 11 bits */
968 mov r10, r10, asr #11 /* descale o5 by 11 bits */
982 ldmia sp!, { r4-r11, pc }
983 .size jpeg_idct8v, .-jpeg_idct8v
/* jpeg_idct8h, variant using the ARMv6 packed-halfword instructions (sadd16,
   ssub16, pkhtb, smuad, usat). In the pair notation below the low halfword
   is listed first. Presumably assembled instead of the earlier jpeg_idct8h
   definition under an ARM_ARCH conditional that is not shown here. */
1002 stmdb sp!, { r4-r11, lr }
/* load four words and advance r0 by 16 bytes; per the comments below they
   hold the coefficient pairs (d0, d1), (d2, d3), (d4, d5), (d6, d7) */
1005 ldmia r0!, { r4-r7 }
/* when EQ: r9 = r5 | (r4 lsr 16), updating the flags. Looks like one step of
   an all-zero test on the coefficients; the preceding steps are not shown. */
1008 orreqs r9, r5, r4, lsr #16
/* r4 = (r4 asr 5) saturated to the unsigned 8-bit range 0..255 */
1011 usat r4, #8, r4, asr #5
1012 #ifdef HAVE_LCD_COLOR
1034 ldmia sp!, { r4-r11, pc }
1037 sadd16 r10, r5, r7 /* r10 = (d2 + d6, d3 + d7) */
1038 ssub16 r12, r4, r6 /* r12 = (d0 - d4, d1 - d5) */
1039 sadd16 r11, r4, r6 /* r11 = (d0 + d4, d1 + d5) */
1040 pkhtb r4, r5, r4, asr #16 /* r4 = (tmp3[o], tmp2[o]) = (d1, d3) */
1041 smulbb r14, r8, r10 /* r14 = z1[e] = (d2 + d6) * 4433 */
1042 pkhtb r6, r6, r7, asr #16 /* r6 = (tmp0[o], tmp1[o]) = (d7, d5) */
1043 smlatb r7, r8, r7, r14 /* r7 = tmp2[e] = z1 - d6 * 15137 */
1044 smlabb r5, r9, r5, r14 /* r5 = tmp3[e] = z1 + d2 * 6270 */
1045 sxth r12, r12 /* r12 = tmp1[e] = d0 - d4 */
1046 pkhtb r8, r11, r10, asr #16 /* r8 = (z3[o], z4[o]) */
1047 sxth r14, r11 /* r14 = tmp0[e] */
1048 pkhtb r9, r9, r9, asr #16 /* r9 = (9633, 9633) */
1049 add r10, r5, r14, lsl #13 /* r10 = tmp10 */
1050 rsb r11, r5, r14, lsl #13 /* r11 = tmp13 */
1051 rsb r14, r7, r12, lsl #13 /* r14 = tmp12 */
1052 add r12, r7, r12, lsl #13 /* r12 = tmp11 */
/* Spill to the four words just below sp; sp is not moved (no writeback).
   NOTE(review): relies on nothing else writing below sp before the values
   are read back - confirm for the target. */
1053 stmdb sp, { r10-r12, r14 } /* tmp10 tmp13 tmp11 tmp12 */
/* smuad sums both halfword products: 9633 * z3 + 9633 * z4 */
1054 smuad r5, r9, r8 /* r5 = z5 = (z3[o] + z4[o]) * 9633 */
1056 sadd16 r7, r4, r6 /* r7 = (z1, z2) */
/* NOTE(review): the non-ARMv6 variants in this file pair z3 with 16069 and
   z4 with 3196; the constants named in the next two comments look swapped.
   Confirm against the constant pool, which is not shown here. */
1057 smlatt r9, r10, r8, r5 /* r9 = z4 = z5 - z4 * 16069 */
1058 smlabb r8, r10, r8, r5 /* r8 = z3 = z5 - z3 * 3196 */
1059 smlabb r14, r11, r7, r9 /* r14 = z1 + z4 */
1060 smlabb r12, r11, r7, r8 /* r12 = z1 + z3 */
1061 smlatt r5, r11, r7, r9 /* r5 = z2 + z4 */
1062 smlatt r7, r11, r7, r8 /* r7 = z2 + z3 */
1064 smlabt r7, r9, r4, r7 /* r7 = tmp2 */
1065 smlatb r14, r9, r4, r14 /* r14 = tmp3 */
/* read the four spilled words back; the order matches the stmdb above */
1066 ldmdb sp, { r4, r9-r11 } /* tmp10 tmp13 tmp11 tmp12 */
1067 smlabb r12, r8, r6, r12 /* r12 = tmp0 */
1068 smlatt r5, r8, r6, r5 /* r5 = tmp1 */
1069 /* used: r4, r5, r7, r9-r12, r14 */
1070 add r6, r4, r14 /* o0 */
1071 sub r8, r4, r14 /* o7 */
1072 add r14, r9, r12 /* o3 */
1073 sub r12, r9, r12 /* o4 */
1074 add r4, r10, r7 /* o1 */
1075 sub r7, r10, r7 /* o6 */
1076 add r9, r11, r5 /* o2 */
1077 sub r10, r11, r5 /* o5 */
/* descale each output by 18 bits and saturate to the unsigned 8-bit range
   0..255, in the order o0 o1 o2 o3 o4 o5 o6 o7 */
1078 usat r6, #8, r6, asr #18
1079 usat r4, #8, r4, asr #18
1080 usat r9, #8, r9, asr #18
1081 usat r14, #8, r14, asr #18
1082 usat r12, #8, r12, asr #18
1083 usat r10, #8, r10, asr #18
1084 usat r7, #8, r7, asr #18
1085 usat r8, #8, r8, asr #18
1086 #ifdef HAVE_LCD_COLOR
1108 ldmia sp!, { r4-r11, pc }
1109 .size jpeg_idct8h, .-jpeg_idct8h