[libvpx.git] / vp8/encoder/arm/neon/variance_neon.asm
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
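;The C-reference prototype is the same as for the other kernels below
;(shown here for symmetry):
;unsigned int vp8_variance16x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)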
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements in a vector and accumulates
    ;the results into the elements of the destination vector. The
    ;explanation in the ARM guide is wrong.
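    ;For example, vpadal.s16 q8, q11 updates each 32-bit lane i of q8 as
    ;    q8.s32[i] += (s32)q11.s16[2*i] + (s32)q11.s16[2*i+1]
    ;folding eight s16 diffs into four running s32 sums per instruction.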
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, asr #8

    ;sum is in [-255x256, 255x256]. sum*sum is 32-bit. The right shift
    ;must have sign-bit extension, which is vshr.s. Have to use s32 to
    ;make it right.
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #8
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
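
;For reference, the whole kernel computes the textbook variance. A
;minimal C sketch (illustrative only; variable names are ours, not
;vp8's):
;
;    int sum = 0;
;    unsigned int sse = 0;
;    for (i = 0; i < 16; i++)
;        for (j = 0; j < 16; j++) {
;            int d = src_ptr[i * source_stride + j]
;                  - ref_ptr[i * recon_stride + j];
;            sum += d;
;            sse += d * d;
;        }
;    *sse_ptr = sse;
;    return sse - ((unsigned int)(sum * sum) >> 8);  /* 16x16 = 256 px */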

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
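
;Note: identical to the 16x16 kernel except that the loop runs four
;iterations (4 x 2 = 8 rows) and the final shift is 7, since
;16x8 = 128 = 2^7 pixels.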

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
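
;Note: 8-pixel-wide rows fit in d registers rather than q registers, so
;each iteration loads two rows and produces two diff vectors; eight
;iterations cover 16 rows, and 8x16 = 128 pixels again gives a shift
;of 7.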

;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
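;As above, the prototype matches the C reference (shown for symmetry):
;unsigned int vp8_variance8x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)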
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #6
    vsub.s32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
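
;Summary: every kernel returns sse - sum*sum/N for N = w*h pixels, with
;the division implemented as a shift by log2(N):
;    16x16 -> #8, 16x8 and 8x16 -> #7, 8x8 -> #6.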