Set cpu_used range to [-16, 16] in real-time mode
[libvpx.git] / vp8 / encoder / arm / neon / vp8_mse16x16_neon.asm
blob6af4e87bab62f9609d78b7d6de7efdcc833cf7b3
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 EXPORT |vp8_mse16x16_neon| ; make the three entry points below visible to the linker
13 EXPORT |vp8_get16x16pred_error_neon|
14 EXPORT |vp8_get4x4sse_cs_neon|
16 ARM ; assemble the following code as ARM (not Thumb) instructions
17 REQUIRE8 ; this code requires 8-byte stack alignment
18 PRESERVE8 ; this code preserves 8-byte stack alignment
20 AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code section, aligned to 2^2 = 4 bytes
;============================
; vp8_mse16x16_neon
;
; Sum of squared differences between a 16x16 block of source pixels and a
; 16x16 block of reference pixels.
;
; In:
;   r0      unsigned char  *src_ptr
;   r1      int             source_stride
;   r2      unsigned char  *ref_ptr
;   r3      int             recon_stride
;   [sp]    unsigned int   *sse
; Out:
;   r0      sum of squared differences (the same value is stored to *sse)
; Clobbers:
;   r12, flags, q0-q3, q8-q15.
;   Only caller-saved NEON registers are used: d8-d15 (q4-q7) are callee-saved
;   under the AAPCS and are deliberately left untouched.
;
; Note: the sum of differences is never needed here, so the sum accumulation
; that vp8_variance() performs is omitted.
|vp8_mse16x16_neon| PROC
    vmov.i8         q15, #0                     ;q15, q8, q9, q10 - sse accumulators
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;source row n
    vld1.8          {q2}, [r2], r3              ;reference row n
    vld1.8          {q1}, [r0], r1              ;source row n+1
    vld1.8          {q3}, [r2], r3              ;reference row n+1

    vsubl.u8        q11, d0, d4                 ;widen to 16 bit: src - ref
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmlal.s16       q15, d22, d22               ;accumulate diff * diff
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1                ;flags are consumed by the bne below

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q15, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q15, q15, q8                ;fold the four accumulators
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]                   ;load sse pointer (5th argument) from stack

    vadd.u32        q10, q15, q9
    vpaddl.u32      q1, q10                     ;horizontal add of the 4 lanes
    vadd.u64        d0, d2, d3

    vst1.32         {d0[0]}, [r12]              ;*sse = result
    vmov.32         r0, d0[0]                   ;return result

    bx              lr

    ENDP
;============================
; vp8_get16x16pred_error_neon
;
; Prediction error of a 16x16 block:
;   sse - ((sum * sum) >> 8)
; where sum is the sum of (src - ref) over the block and sse is the sum of
; (src - ref)^2 over the block.
;
; In:
;   r0      unsigned char  *src_ptr
;   r1      int             src_stride
;   r2      unsigned char  *ref_ptr
;   r3      int             ref_stride
; Out:
;   r0      prediction error
; Clobbers:
;   r12, flags, q0-q3, q8-q14.
;   Only caller-saved NEON registers are used: d8-d15 (q4-q7) are callee-saved
;   under the AAPCS and are deliberately left untouched.
|vp8_get16x16pred_error_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - pred_error (sse)
    vmov.i8         q10, #0

    mov             r12, #8                     ;8 iterations x 2 rows = 16 rows

get16x16pred_error_neon_loop
    vld1.8          {q0}, [r0], r1              ;source row n
    vld1.8          {q2}, [r2], r3              ;reference row n
    vld1.8          {q1}, [r0], r1              ;source row n+1
    vld1.8          {q3}, [r2], r3              ;reference row n+1

    vsubl.u8        q11, d0, d4                 ;widen to 16 bit: src - ref
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;sum += diff
    vmlal.s16       q9, d22, d22                ;sse += diff * diff
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1                ;flags are consumed by the bne below

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             get16x16pred_error_neon_loop

    vadd.u32        q10, q9, q10                ;fold the two sse accumulators
    vpaddl.s32      q0, q8                      ;horizontal add of sum lanes

    vpaddl.u32      q1, q10                     ;horizontal add of sse lanes
    vadd.s64        d0, d0, d1                  ;d0 = sum
    vadd.u64        d1, d2, d3                  ;d1 = sse

    ;|sum| <= 16*16*255 = 65280, so sum*sum <= 0xFE010000 fits in an unsigned
    ;32-bit lane but may have bit 31 set. The shift must therefore be logical:
    ;an arithmetic shift would sign-extend and corrupt the result for
    ;|sum| >= 46341.
    vmull.s32       q11, d0, d0                 ;d22 = sum * sum (q11 is free after the loop)
    vshr.u32        d22, d22, #8                ;(sum * sum) >> 8
    vsub.u32        d0, d1, d22                 ;sse - ((sum * sum) >> 8)

    vmov.32         r0, d0[0]                   ;return result
    bx              lr

    ENDP
;=============================
; vp8_get4x4sse_cs_neon
;
; Sum of squared differences between a 4x4 block of source pixels and a 4x4
; block of reference pixels.
;
; In:
;   r0      unsigned char  *src_ptr
;   r1      int             source_stride
;   r2      unsigned char  *ref_ptr
;   r3      int             recon_stride
; Out:
;   r0      sum of squared differences
; Clobbers:
;   q0-q3, q8-q15.
;   Only caller-saved NEON registers are used: d8-d15 (q4-q7) are callee-saved
;   under the AAPCS and are deliberately left untouched.
;
; Note: each row is loaded as 8 bytes, but only the first 4 pixels of every
; row contribute to the result. Both buffers must therefore be readable for
; 8 bytes from the start of each of the 4 rows.
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1              ;source row 0
    vld1.8          {d4}, [r2], r3              ;reference row 0
    vld1.8          {d1}, [r0], r1              ;source row 1
    vld1.8          {d5}, [r2], r3              ;reference row 1
    vld1.8          {d2}, [r0], r1              ;source row 2
    vld1.8          {d6}, [r2], r3              ;reference row 2
    vld1.8          {d3}, [r0], r1              ;source row 3
    vld1.8          {d7}, [r2], r3              ;reference row 3

    vsubl.u8        q11, d0, d4                 ;widen to 16 bit: src - ref
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q15, d22, d22               ;square the low 4 differences of each row
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q15, q15, q8                ;add the four rows together
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q15, q9

    vpaddl.u32      q1, q9                      ;horizontal add of the 4 lanes
    vadd.u64        d0, d2, d3

    vmov.32         r0, d0[0]                   ;return result
    bx              lr

    ENDP