;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    AREA ||.text||, CODE, READONLY, ALIGN=2
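
; In C terms, each variance routine below computes roughly the following
; (a rough sketch of the scalar equivalent, not the actual C reference code):
;     sum = 0; sse = 0;
;     for (each of the w*h pixels) {
;         diff = src_ptr[i] - ref_ptr[i];
;         sum += diff;
;         sse += diff * diff;
;     }
;     *sse = sse;
;     return sse - (sum * sum) / (w * h);
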
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;loop counter: 8 iterations x 2 rows

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
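    ;Concretely, with signed 16-bit inputs, vpadal.s16 q8, q11 performs, for
    ;each 32-bit lane n = 0..3: q8.s32[n] += q11.s16[2n] + q11.s16[2n+1],
    ;folding the eight 16-bit diffs into the running 32-bit sum.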
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;decrement loop counter

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;sub            r0, r1, r0, asr #8

    ;sum is in [-255x256, 255x256]. sumxsum is 32-bit. Shift to the right should
    ;have sign-bit extension, which is vshr.s. Have to use s32 to make it right.
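    ;For this 16x16 block the divisor is 256 pixels, so the value returned
    ;below works out to sse - ((sum * sum) >> 8); the 16x8/8x16 and 8x8
    ;variants further down shift by #7 and #6 instead.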
    vst1.32         {d1[0]}, [r12]              ;store sse
    vmov.32         r0, d0[0]                   ;return

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4                     ;loop counter: 4 iterations x 2 rows

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;decrement loop counter

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7                ;(sum * sum) >> 7 (16x8 = 128 pixels)
    vmov.32         r0, d0[0]                   ;return

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8                     ;loop counter: 8 iterations x 2 rows

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;decrement loop counter

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #7                ;(sum * sum) >> 7 (8x16 = 128 pixels)
    vmov.32         r0, d0[0]                   ;return

;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                     ;loop counter: 2 iterations x 4 rows

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;decrement loop counter

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27

    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.s32        d10, d10, #6                ;(sum * sum) >> 6 (8x8 = 64 pixels)
    vmov.32         r0, d0[0]                   ;return