Define RDCOST only once
[libvpx.git] / vp8 / encoder / arm / neon / vp8_subpixelvariance8x8_neon.asm
blob38b58780a266ed7b3cbff9b40495db9a330fbd92
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; Make the routine's symbol visible to the linker so code outside this file can call it.
12 EXPORT |vp8_sub_pixel_variance8x8_neon|
; Assemble what follows as ARM (32-bit) instructions rather than Thumb.
13 ARM
; REQUIRE8 declares that this code expects an 8-byte aligned stack on entry.
14 REQUIRE8
; PRESERVE8 declares that this code keeps the stack 8-byte aligned for anything it calls.
15 PRESERVE8
; Read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment.
17 AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; vp8_sub_pixel_variance8x8_neon
;
; Bilinear-filters an 8x8 block of source pixels (horizontal pass, then
; vertical pass) and computes its variance against an 8x8 destination
; block.
;
; In:
;   r0          unsigned char *src_ptr
;   r1          int            src_pixels_per_line
;   r2          int            xoffset  (0 skips the horizontal pass)
;   r3          int            yoffset  (0 skips the vertical pass)
;   [sp, #0]    unsigned char *dst_ptr
;   [sp, #4]    int            dst_pixels_per_line
;   [sp, #8]    unsigned int  *sse
;
;   xoffset/yoffset select a {tap0, tap1} pair from bilinear_taps_coeff
;   (offset * 8 bytes). The table has 8 pairs, so offsets are presumably
;   limited to 0..7 by the caller -- TODO confirm.
;
; Out:
;   *sse        sum of squared differences over the 64 pixels
;   r0          *sse - ((sum * sum) >> 6), sum = sum of the differences
;
; Memory reads:
;   9 source rows are always loaded (the vertical filter needs row 8).
;   With xoffset != 0 each row load is 16 bytes wide, otherwise 8 bytes.
;
; Register roles:
;   d22-d30     the 9 (first-pass) rows; after the second pass d22-d29
;               hold the 8 predicted rows
;   d0/d1       current filter taps replicated to all byte lanes
;   q8          running sum of differences, q9/q10 running sse
;
; Clobbers:  r0, r2, r3, r12, flags, q0-q3, q8-q15
; Preserves: r4, r5, d8-d15 (q4-q7 are used as scratch, so they are
;            saved and restored here as the AAPCS requires)
;
;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
;-----------------------------------------------------------------------
|vp8_sub_pixel_variance8x8_neon| PROC
    push            {r4-r5, lr}

    ldr             r12, _BilinearTaps_coeff_   ;r12 = &bilinear_taps_coeff
    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
    ldr             lr, [sp, #20]           ;load *sse from stack

    ;q4-q7 (d8-d15) are callee-saved. Saved only after the stack
    ;arguments were fetched so the offsets above stay relative to the
    ;core-register push.
    vpush           {d8-d15}

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;calculate filter location

    vld1.u8         {q1}, [r0], r1          ;load src data
    vld1.u32        {d31}, [r2]             ;load first_pass filter
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src data
    vqrshrn.u16     d22, q6, #7             ;shift/round/saturate to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16     d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16     d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16     d25, q9, #7

    ;first_pass filtering on the remaining 5 lines of data
    vld1.u8         {q5}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16     d26, q6, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d27, q7, #7
    vqrshrn.u16     d28, q8, #7
    vqrshrn.u16     d29, q9, #7
    vqrshrn.u16     d30, q10, #7

;Second pass: 8x8
;On entry d22-d30 hold the 9 rows produced (or loaded) by the first pass.
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    ;skip_secondpass_filter
    beq             sub_pixel_variance8x8_neon

    add             r3, r12, r3, lsl #3     ;calculate filter location

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16     d22, q1, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d23, q2, #7
    vqrshrn.u16     d24, q3, #7
    vqrshrn.u16     d25, q4, #7
    vqrshrn.u16     d26, q5, #7
    vqrshrn.u16     d27, q6, #7
    vqrshrn.u16     d28, q7, #7
    vqrshrn.u16     d29, q8, #7

    b               sub_pixel_variance8x8_neon

;--------------------
;xoffset == 0: no horizontal filtering, load the 9 source rows as they are.
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;----------------------
;vp8_variance8x8_neon
;On entry d22-d29 hold the 8 predicted rows.
sub_pixel_variance8x8_neon
    vmov.i8         q8, #0                  ;q8 - sum
    vmov.i8         q9, #0                  ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2                 ;2 iterations x 4 rows = 8 rows

sub_pixel_variance8x8_neon_loop
    vld1.8          {d0}, [r4], r5          ;load dst data
    subs            r12, r12, #1            ;flags are consumed by the bne below
    vld1.8          {d1}, [r4], r5
    vld1.8          {d2}, [r4], r5
    vsubl.u8        q4, d22, d0             ;calculate diff
    vld1.8          {d3}, [r4], r5

    vsubl.u8        q5, d23, d1
    vsubl.u8        q6, d24, d2

    vpadal.s16      q8, q4                  ;sum
    vmlal.s16       q9, d8, d8              ;sse
    vmlal.s16       q10, d9, d9

    vsubl.u8        q7, d25, d3

    vpadal.s16      q8, q5
    vmlal.s16       q9, d10, d10
    vmlal.s16       q10, d11, d11

    vmov            q11, q13                ;rows 4,5 -> d22,d23 for the next iteration

    vpadal.s16      q8, q6
    vmlal.s16       q9, d12, d12
    vmlal.s16       q10, d13, d13

    vmov            q12, q14                ;rows 6,7 -> d24,d25 for the next iteration

    vpadal.s16      q8, q7
    vmlal.s16       q9, d14, d14
    vmlal.s16       q10, d15, d15

    bne             sub_pixel_variance8x8_neon_loop

    vadd.u32        q10, q9, q10            ;accumulate sse
    vpaddl.s32      q0, q8                  ;accumulate sum

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1              ;d0 = total sum
    vadd.u64        d1, d2, d3              ;d1 = total sse

    vmull.s32       q5, d0, d0              ;d10 = sum * sum
    vst1.32         {d1[0]}, [lr]           ;store sse
    vshr.s32        d10, d10, #6            ;(sum * sum) / 64 pixels
    vsub.s32        d0, d1, d10             ;sse - (sum * sum) / 64

    vmov.32         r0, d0[0]               ;return

    vpop            {d8-d15}                ;restore callee-saved NEON registers
    pop             {r4-r5, pc}

    ENDP
217 ;-----------------
; Literal word holding the address of the tap table; the routine above
; fetches it with a PC-relative "ldr r12, _BilinearTaps_coeff_".
_BilinearTaps_coeff_
    DCD     bilinear_taps_coeff

; Bilinear filter taps stored as eight {tap0, tap1} pairs of 32-bit
; words (8 bytes per pair, matching the "offset, lsl #3" indexing in the
; routine above). Every pair sums to 128, which the filter passes remove
; again with their rounding shift by 7.
bilinear_taps_coeff
    DCD     128,   0                        ;offset 0
    DCD     112,  16                        ;offset 1
    DCD      96,  32                        ;offset 2
    DCD      80,  48                        ;offset 3
    DCD      64,  64                        ;offset 4
    DCD      48,  80                        ;offset 5
    DCD      32,  96                        ;offset 6
    DCD      16, 112                        ;offset 7