Set cpu_used range to [-16, 16] in real-time mode
[libvpx.git] / vp8 / encoder / arm / neon / shortfdct_neon.asm
blob1b7f36277765c973b31d486495e4e0d12adfc34f
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; Make both transform entry points visible to the linker.
12 EXPORT |vp8_short_fdct4x4_neon|
13 EXPORT |vp8_short_fdct8x4_neon|
; Assemble what follows as ARM (not Thumb) instructions.
14 ARM
; Declare that this code requires 8-byte stack alignment from its callers
; (REQUIRE8) and keeps the stack 8-byte aligned itself (PRESERVE8).
15 REQUIRE8
16 PRESERVE8
; Read-only code section; ALIGN=2 requests 2^2 = 4-byte section alignment.
19 AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; vp8_short_fdct4x4_neon
;
; Transforms one 4x4 block of 16-bit samples by running two
; multiply-accumulate passes against the 4x4 coefficient table at
; dct_matrix.
;
; In:    r0  short *input    first row of the block
;        r1  short *output   receives 16 contiguous shorts (32 bytes)
;        r2  int    pitch    added to r0 after each row load, so it is a
;                            byte distance between input rows
; Out:   16 shorts written at [r1]
; Clobb: r0 (left pointing at the fourth input row), r12,
;        q0-q3, q8-q15
; Keeps: r1 and r2 are never written; vp8_short_fdct8x4_neon relies on
;        that. Only caller-saved NEON registers are used (q4-q7 are
;        callee-saved under AAPCS and are deliberately avoided), so
;        nothing has to be spilled to the stack.
; Input has a pitch, output is contiguous
;-----------------------------------------------------------------------
|vp8_short_fdct4x4_neon| PROC
    ldr             r12, _dct_matrix_       ; r12 = address of dct_matrix

    ; d0..d3 = input rows 0..3 (four shorts each)
    vld1.16         d0, [r0], r2
    vld1.16         d1, [r0], r2
    vld1.16         d2, [r0], r2
    vld1.16         d3, [r0]

    ; d4..d7 = the four 4-coefficient groups of the table
    vld1.16         {q2, q3}, [r12]

    ; First stage: for input row i (i = 0..3),
    ;   q(11+i) = sum over k of  d(4+k) * row_i[k]
    ; accumulated in 32 bits.
    vmull.s16       q11, d4, d0[0]          ; i=0
    vmull.s16       q12, d4, d1[0]          ; i=1
    vmull.s16       q13, d4, d2[0]          ; i=2
    vmull.s16       q14, d4, d3[0]          ; i=3

    vmlal.s16       q11, d5, d0[1]
    vmlal.s16       q12, d5, d1[1]
    vmlal.s16       q13, d5, d2[1]
    vmlal.s16       q14, d5, d3[1]

    vmlal.s16       q11, d6, d0[2]
    vmlal.s16       q12, d6, d1[2]
    vmlal.s16       q13, d6, d2[2]
    vmlal.s16       q14, d6, d3[2]

    vmlal.s16       q11, d7, d0[3]          ; sum complete for i=0
    vmlal.s16       q12, d7, d1[3]          ; sum complete for i=1
    vmlal.s16       q13, d7, d2[3]          ; sum complete for i=2
    vmlal.s16       q14, d7, d3[3]          ; sum complete for i=3

    ; Round, shift right by 14 and narrow back to 16 bits.
    vrshrn.i32      d22, q11, #14
    vrshrn.i32      d24, q12, #14
    vrshrn.i32      d26, q13, #14
    vrshrn.i32      d28, q14, #14

    ; Second stage: for output row j (j = 0..3),
    ;   acc_j = sum over i of  stage1_row_i * d(4+i)[j]
    ; The accumulators live in q8, q9, q10 and q15: all caller-saved and
    ; not overlapping the first-stage results in d22/d24/d26/d28.
    vmull.s16       q8,  d22, d4[0]         ; j=0
    vmull.s16       q9,  d22, d4[1]         ; j=1
    vmull.s16       q10, d22, d4[2]         ; j=2
    vmull.s16       q15, d22, d4[3]         ; j=3

    vmlal.s16       q8,  d24, d5[0]
    vmlal.s16       q9,  d24, d5[1]
    vmlal.s16       q10, d24, d5[2]
    vmlal.s16       q15, d24, d5[3]

    vmlal.s16       q8,  d26, d6[0]
    vmlal.s16       q9,  d26, d6[1]
    vmlal.s16       q10, d26, d6[2]
    vmlal.s16       q15, d26, d6[3]

    vmlal.s16       q8,  d28, d7[0]         ; sum complete for j=0
    vmlal.s16       q9,  d28, d7[1]         ; sum complete for j=1
    vmlal.s16       q10, d28, d7[2]         ; sum complete for j=2
    vmlal.s16       q15, d28, d7[3]         ; sum complete for j=3

    ; Round, shift right by 16 and narrow in one step. This produces the
    ; same 16 result bits as a rounding shift followed by a separate
    ; narrowing move, and mirrors the first-stage rounding above.
    vrshrn.i32      d0, q8,  #16
    vrshrn.i32      d1, q9,  #16
    vrshrn.i32      d2, q10, #16
    vrshrn.i32      d3, q15, #16

    ; q0:q1 = d0..d3 = the four output rows, stored back to back.
    vst1.16         {q0, q1}, [r1]

    bx              lr

    ENDP
;-----------------------------------------------------------------------
; vp8_short_fdct8x4_neon
;
; Runs vp8_short_fdct4x4_neon twice: once on the block at r0/r1 and once
; on the block 8 input bytes / 32 output bytes further on.
;
; In:    r0  short *input
;        r1  short *output
;        r2  int    pitch
;-----------------------------------------------------------------------
|vp8_short_fdct8x4_neon| PROC
    ; The first pass is a real call, so the return address and the
    ; original input pointer go on the stack (the 4x4 routine advances
    ; r0). The output pointer and pitch need no saving because the 4x4
    ; routine does not touch them.
    stmfd           sp!, {r0, lr}

    bl              vp8_short_fdct4x4_neon

    ldmfd           sp!, {r0, lr}

    ; Step both pointers to the second block: 32 bytes on in the
    ; output buffer, 8 bytes on along the input row.
    add             r1, r1, #32
    add             r0, r0, #8

    ; Second pass is a tail call: lr already holds our caller's return
    ; address, so the 4x4 fdct returns straight to it.
    b               vp8_short_fdct4x4_neon

    ; Never reached; kept as a guard after the tail call.
    bx              lr

    ENDP
124 ;-----------------
; Coefficient table used by vp8_short_fdct4x4_neon.
;
; _dct_matrix_ is a literal word holding the address of dct_matrix; the
; 4x4 routine fetches it with "ldr r12, _dct_matrix_" and then loads all
; 32 bytes of the table into q2/q3.
126 _dct_matrix_
127 DCD dct_matrix
128 dct_matrix
; The table is 16 signed 16-bit coefficients. The intended rows are shown
; below as (commented-out) DCW halfword lists, followed by the hex value
; of each distinct coefficient. The live DCD lines pack two coefficients
; per 32-bit word, first coefficient in the low halfword.
; NOTE(review): the DCD packing reproduces the DCW rows only when words
; are stored little-endian -- confirm for every build target.
; NOTE(review): the constants look like cos(pi/4), cos(pi/8) and
; sin(pi/8) scaled by 2^15 -- confirm against the C reference.
129 ; DCW 23170, 30274, 23170, 12540
130 ; DCW 23170, 12540, -23170,-30274
131 ; DCW 23170, -12540, -23170, 30274
132 ; DCW 23170, -30274, 23170,-12540
133 ; 23170 = 0x5a82
134 ; -23170 = 0xa57e
135 ; 30274 = 0x7642
136 ; -30274 = 0x89be
137 ; 12540 = 0x30fc
138 ; -12540 = 0xcf04
139 DCD 0x76425a82, 0x30fc5a82 ; 23170,  30274,  23170,  12540
140 DCD 0x30fc5a82, 0x89bea57e ; 23170,  12540, -23170, -30274
141 DCD 0xcf045a82, 0x7642a57e ; 23170, -12540, -23170,  30274
142 DCD 0x89be5a82, 0xcf045a82 ; 23170, -30274,  23170, -12540