2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; Make both transform entry points visible to other object files.
12 EXPORT |vp8_short_fdct4x4_neon|
13 EXPORT |vp8_short_fdct8x4_neon|
; Open the section that holds everything below: a read-only code area named
; .text. In armasm, ALIGN=2 asks for 2^2 = 4-byte alignment of the area.
19 AREA ||.text||
, CODE
, READONLY
, ALIGN=2
24 ; Input has a pitch, output is contiguous
; 4x4 fdct computed as two passes of widening multiply-accumulate against a
; 4x4 table of signed 16-bit coefficients held in d4-d7.
;   r12   : address the 16 coefficients are loaded from
;   r1    : address the 16 output halfwords are stored to
;   d0-d3 : the four 4-element vectors consumed by the first pass
; NOTE(review): the instructions that set r12 and fill d0-d3 are not present
; in this listing (the embedded numbering jumps from 25 to 31). Presumably
; they fetch the table address and load the pitched input rows - confirm
; against the complete file.
25 |vp8_short_fdct4x4_neon|
PROC
; Load 16 halfword coefficients from [r12]: q2 = d4:d5, q3 = d6:d7,
; i.e. four halfwords per d register.
31 vld1.16
{q2, q3}, [r12
]
; ---- Pass 1 ----
; For each vector d_i (i = 0..3):
;   q(11+i) = d4*d_i[0] + d5*d_i[1] + d6*d_i[2] + d7*d_i[3]
; using signed 16x16 -> 32-bit products, four lanes at a time.
; Start each accumulator with the d4 * element-0 product.
34 vmull.s16 q11
, d4
, d0
[0] ;i=0
35 vmull.s16 q12
, d4
, d1
[0] ;i=1
36 vmull.s16 q13
, d4
, d2
[0] ;i=2
37 vmull.s16 q14
, d4
, d3
[0] ;i=3
; Accumulate the d5 * element-1 products.
39 vmlal.s16 q11
, d5
, d0
[1]
40 vmlal.s16 q12
, d5
, d1
[1]
41 vmlal.s16 q13
, d5
, d2
[1]
42 vmlal.s16 q14
, d5
, d3
[1]
; Accumulate the d6 * element-2 products.
44 vmlal.s16 q11
, d6
, d0
[2]
45 vmlal.s16 q12
, d6
, d1
[2]
46 vmlal.s16 q13
, d6
, d2
[2]
47 vmlal.s16 q14
, d6
, d3
[2]
; Accumulate the d7 * element-3 products; q11-q14 now hold the full sums.
49 vmlal.s16 q11
, d7
, d0
[3] ;sumtemp for i=0
50 vmlal.s16 q12
, d7
, d1
[3] ;sumtemp for i=1
51 vmlal.s16 q13
, d7
, d2
[3] ;sumtemp for i=2
52 vmlal.s16 q14
, d7
, d3
[3] ;sumtemp for i=3
; Round, shift right by 14 and narrow each 32-bit sum back to 16 bits.
; d22/d24/d26/d28 are the low halves of q11/q12/q13/q14, so each result
; overwrites the lower part of its own accumulator.
55 vrshrn.i32 d22
, q11
, #
14
56 vrshrn.i32 d24
, q12
, #
14
57 vrshrn.i32 d26
, q13
, #
14
58 vrshrn.i32 d28
, q14
, #
14
; ---- Pass 2 ----
; Operand roles swap: the pass-1 results are the vector operands and the
; coefficients supply the scalars:
;   q(4+i) = d22*d4[i] + d24*d5[i] + d26*d6[i] + d28*d7[i]
; Start each accumulator with the d22 * d4[i] product.
61 vmull.s16 q4
, d22
, d4
[0] ;i=0
62 vmull.s16 q5
, d22
, d4
[1] ;i=1
63 vmull.s16 q6
, d22
, d4
[2] ;i=2
64 vmull.s16 q7
, d22
, d4
[3] ;i=3
; Accumulate the d24 * d5[i] products.
66 vmlal.s16 q4
, d24
, d5
[0]
67 vmlal.s16 q5
, d24
, d5
[1]
68 vmlal.s16 q6
, d24
, d5
[2]
69 vmlal.s16 q7
, d24
, d5
[3]
; Accumulate the d26 * d6[i] products.
71 vmlal.s16 q4
, d26
, d6
[0]
72 vmlal.s16 q5
, d26
, d6
[1]
73 vmlal.s16 q6
, d26
, d6
[2]
74 vmlal.s16 q7
, d26
, d6
[3]
; Accumulate the d28 * d7[i] products; q4-q7 now hold the pass-2 sums.
76 vmlal.s16 q4
, d28
, d7
[0] ;sumtemp for i=0
77 vmlal.s16 q5
, d28
, d7
[1] ;sumtemp for i=1
78 vmlal.s16 q6
, d28
, d7
[2] ;sumtemp for i=2
79 vmlal.s16 q7
, d28
, d7
[3] ;sumtemp for i=3
; Store q0:q1 (32 bytes = 16 halfwords) contiguously at [r1].
; NOTE(review): nothing visible here moves the pass-2 sums from q4-q7 into
; q0/q1 (the embedded numbering jumps from 79 to 91), and no return or ENDP
; follows the store. Presumably a round/narrow step and the function
; epilogue were dropped from this listing - confirm against the complete file.
91 vst1.16
{q0, q1}, [r1
]
; 8x4 fdct built from two invocations of vp8_short_fdct4x4_neon: the first
; is a call (bl) that returns here, the second is a plain branch (b).
; NOTE(review): the comments below describe saving/restoring the link
; register and input pointer and advancing to the next block, but the
; instructions doing so are not present in this listing (the embedded
; numbering jumps 104 -> 107 -> 111 -> 115). Confirm against the complete file.
100 |vp8_short_fdct8x4_neon|
PROC
101 ; Store link register and input before calling
102 ; first 4x4 fdct. Do not need to worry about
103 ; output or pitch because those pointers are not
104 ; touched in the 4x4 fdct function
; bl overwrites lr with the address of the following instruction, which is
; why lr has to be preserved around this call.
107 bl vp8_short_fdct4x4_neon
111 ; Move to the next block of data.
115 ; Second time through do not store off the
116 ; link register, just return from the 4x4 fdct
; b leaves lr untouched, so when the 4x4 routine returns through lr it goes
; straight to this routine's caller (assuming lr holds that address here).
117 b vp8_short_fdct4x4_neon
119 ; Should never get to this.
; 4x4 table of signed 16-bit coefficients: 16 halfwords = 32 bytes.
; The commented-out DCW rows give the values in readable decimal form:
129 ; DCW 23170, 30274, 23170, 12540
130 ; DCW 23170, 12540, -23170,-30274
131 ; DCW 23170, -12540, -23170, 30274
132 ; DCW 23170, -30274, 23170,-12540
; The live data packs two halfwords into each 32-bit DCD word. Taking the
; low halfword first, each DCD row below reproduces the DCW row above:
;   0x5a82 =  23170   0x7642 =  30274   0x30fc =  12540
;   0xa57e = -23170   0x89be = -30274   0xcf04 = -12540
; NOTE(review): this ordering matches the DCW rows in memory only on a
; little-endian target; the table's label is not visible in this listing.
; Row 0: 23170, 30274, 23170, 12540
139 DCD
0x76425a82, 0x30fc5a82
; Row 1: 23170, 12540, -23170, -30274
140 DCD
0x30fc5a82, 0x89bea57e
; Row 2: 23170, -12540, -23170, 30274
141 DCD
0xcf045a82, 0x7642a57e
; Row 3: 23170, -30274, 23170, -12540
142 DCD
0x89be5a82, 0xcf045a82