; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
    EXPORT  |vp8_fast_fdct4x4_neon|

    AREA    ||.text||, CODE, READONLY, ALIGN=2

;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
;
;The input *src_diff. src_diff is calculated as:
;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
;In which *src_ptr and *pred_ptr both are unsigned char.
;Therefore, *src_diff should be in the range of [-255, 255].
;
;The input values of the 25th block are set in the vp8_build_dcblock function,
;and those values fall outside [-255, 255].  However, the VP8 encoder only
;uses vp8_short_fdct4x4_c for the 25th block, never vp8_fast_fdct4x4_c.
;That makes it safe to assume *input is in [-255, 255] in vp8_fast_fdct4x4_c,
;but not safe in vp8_short_fdct4x4_c.
;-----------------------------------------------------------------------
; void vp8_fast_fdct4x4_neon(short *input, short *output, int pitch)
;
; "Fast" 4x4 forward DCT of a block of source-difference values
; (row pass, then column pass; see the range note above).
;
; In:      r0  = input  (short *; rows are `pitch` bytes apart)
;          r1  = output (short *; 16 contiguous coefficients)
;          r2  = pitch  (row stride in bytes, used as a post-increment)
; Writes:  r0 (advanced past the block), r12, d2-d17
; Reads:   d0 -- expected to hold the packed fdct coefficients from the
;          DCD pool at the end of the file; the `vld1.16 {d0}, [r12]`
;          load is not visible in this chunk (see NOTE below).
;
; NOTE(review): this extraction appears to have dropped several lines of
; the upstream file: the d0 coefficient load, the vtrn.32/vtrn.16
; transposes referenced by the "transpose" comments, the vshr.s16 #1
; shifts of q6/q8 paired with the visible q7 shift in each pass, the
; final rounding shifts before the store, and the bx lr / ENDP epilogue.
; Restore them from the upstream source before assembling.
;-----------------------------------------------------------------------
|vp8_fast_fdct4x4_neon| PROC
    vld1.16         {d2}, [r0], r2          ;load input row 0 (4 shorts), step by pitch
    ldr             r12, _ffdct_coeff_      ;r12 = address of coefficient pool
    vld1.16         {d3}, [r0], r2          ;row 1
    vld1.16         {d4}, [r0], r2          ;row 2
    vld1.16         {d5}, [r0], r2          ;row 3

    ;First pass (rows)
    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
    ;NOTE(review): the vtrn transpose instructions this comment refers to
    ;are not present in this chunk -- verify against upstream.

    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
    vshl.i16        q3, q3, #1              ;a1 (d6), b1 (d7): doubled sums
    vshl.i16        q4, q4, #1              ;c1 (d8), d1 (d9): doubled diffs

    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1

    vqdmulh.s16     q6, q5, d0[1]           ;(temp1,temp2) * x_c2, Q15 doubling mul
    vqdmulh.s16     q8, q4, d0[0]           ;(c1,d1) * x_c1
    vqdmulh.s16     q7, q4, d0[2]           ;(c1,d1) * x_c3

    ;NOTE(review): upstream also shifts q6 and q8 right by 1 here; those
    ;lines are missing from this chunk.
    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2 -- q is not necessary, just for protection
    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2

    ;Second pass (columns)
    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
    ;NOTE(review): the vtrn transpose instructions this comment refers to
    ;are not present in this chunk -- verify against upstream.

    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]

    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1

    vqdmulh.s16     q6, q5, d0[1]           ;(temp1,temp2) * x_c2
    vqdmulh.s16     q8, q4, d0[0]           ;(c1,d1) * x_c1
    vqdmulh.s16     q7, q4, d0[2]           ;(c1,d1) * x_c3

    ;NOTE(review): upstream also shifts q6 and q8 right by 1 here; those
    ;lines are missing from this chunk.
    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16; d15:temp1 = (d1 * x_c3)>>16
    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1; d17:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2 -- q is not necessary, just for protection
    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2

    ;NOTE(review): upstream rounds each result right by 1 (vrshr) before
    ;the store; those lines are missing from this chunk.
    vst1.16         {q1, q2}, [r1]          ;store the 16 output coefficients

    ;NOTE(review): no return (bx lr) or ENDP is visible before the data
    ;pool below -- the epilogue appears to have been dropped; restore it.
; fdct coefficient pool: three 16-bit Q15-style constants packed into
; 64 bits, referenced by the vqdmulh lanes above (assuming d0 is loaded
; from here via r12 -- the load is not visible in this chunk; verify):
;   d0[0] = 0xEC83 = 60547 (x_c1)
;   d0[1] = 0xB505 = 46341 (x_c2, ~= 2^15 * sqrt(2))
;   d0[2] = 0x61F8 = 25080 (x_c3)
; NOTE(review): the `_ffdct_coeff_` pointer word and `ffdct_coeff` label
; that `ldr r12, _ffdct_coeff_` expects are not visible here -- restore
; them from upstream before assembling.
    DCD     0xB505EC83, 0x000061F8