; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
    EXPORT  |vp8_sub_pixel_variance8x8_neon|

    AREA ||.text||, CODE, READONLY, ALIGN=2
; r0        unsigned char  *src_ptr,
; r1        int             src_pixels_per_line,
; stack(r4) unsigned char  *dst_ptr,
; stack(r5) int             dst_pixels_per_line,
; stack(r6) unsigned int   *sse
; note: most of the code is copied from bilinear_predict8x8_neon and
;       vp8_variance8x8_neon.
27 |vp8_sub_pixel_variance8x8_neon|
PROC
30 ldr r12
, _BilinearTaps_coeff_
31 ldr r4
, [sp, #
12] ;load *dst_ptr from stack
32 ldr r5
, [sp, #
16] ;load dst_pixels_per_line from stack
33 ldr lr
, [sp, #
20] ;load *sse from stack
35 cmp r2
, #
0 ;skip first_pass filter if xoffset=0
36 beq skip_firstpass_filter
38 ;First pass: output_height lines x output_width columns (9x8)
39 add r2
, r12
, r2
, lsl #
3 ;calculate filter location
41 vld1.u8
{q1}, [r0
], r1
;load src data
42 vld1.u32
{d31}, [r2
] ;load first_pass filter
43 vld1.u8
{q2}, [r0
], r1
44 vdup
.8 d0
, d31
[0] ;first_pass filter (d0 d1)
45 vld1.u8
{q3}, [r0
], r1
47 vld1.u8
{q4}, [r0
], r1
49 vmull.u8 q6
, d2
, d0
;(src_ptr[0] * Filter[0])
54 vext
.8 d3
, d2
, d3
, #
1 ;construct src_ptr[-1]
59 vmlal.u8 q6
, d3
, d1
;(src_ptr[1] * Filter[1])
64 vld1.u8
{q1}, [r0
], r1
;load src data
65 vqrshrn.u16 d22
, q6
, #
7 ;shift/round/saturate to u8
66 vld1.u8
{q2}, [r0
], r1
67 vqrshrn.u16 d23
, q7
, #
7
68 vld1.u8
{q3}, [r0
], r1
69 vqrshrn.u16 d24
, q8
, #
7
70 vld1.u8
{q4}, [r0
], r1
71 vqrshrn.u16 d25
, q9
, #
7
73 ;first_pass filtering on the rest 5-line data
74 vld1.u8
{q5}, [r0
], r1
76 vmull.u8 q6
, d2
, d0
;(src_ptr[0] * Filter[0])
82 vext
.8 d3
, d2
, d3
, #
1 ;construct src_ptr[-1]
86 vext
.8 d11
, d10
, d11
, #
1
88 vmlal.u8 q6
, d3
, d1
;(src_ptr[1] * Filter[1])
94 vqrshrn.u16 d26
, q6
, #
7 ;shift/round/saturate to u8
95 vqrshrn.u16 d27
, q7
, #
7
96 vqrshrn.u16 d28
, q8
, #
7
97 vqrshrn.u16 d29
, q9
, #
7
98 vqrshrn.u16 d30
, q10
, #
7
102 cmp r3
, #
0 ;skip second_pass filter if yoffset=0
103 ;skip_secondpass_filter
104 beq sub_pixel_variance8x8_neon
106 add r3
, r12
, r3
, lsl #
3
108 vld1.u32
{d31}, [r3
] ;load second_pass filter
110 vdup
.8 d0
, d31
[0] ;second_pass filter parameters (d0 d1)
113 vmull.u8 q1
, d22
, d0
;(src_ptr[0] * Filter[0])
122 vmlal.u8 q1
, d23
, d1
;(src_ptr[pixel_step] * Filter[1])
131 vqrshrn.u16 d22
, q1
, #
7 ;shift/round/saturate to u8
132 vqrshrn.u16 d23
, q2
, #
7
133 vqrshrn.u16 d24
, q3
, #
7
134 vqrshrn.u16 d25
, q4
, #
7
135 vqrshrn.u16 d26
, q5
, #
7
136 vqrshrn.u16 d27
, q6
, #
7
137 vqrshrn.u16 d28
, q7
, #
7
138 vqrshrn.u16 d29
, q8
, #
7
140 b sub_pixel_variance8x8_neon
142 ;--------------------
143 skip_firstpass_filter
144 vld1.u8
{d22}, [r0
], r1
;load src data
145 vld1.u8
{d23}, [r0
], r1
146 vld1.u8
{d24}, [r0
], r1
147 vld1.u8
{d25}, [r0
], r1
148 vld1.u8
{d26}, [r0
], r1
149 vld1.u8
{d27}, [r0
], r1
150 vld1.u8
{d28}, [r0
], r1
151 vld1.u8
{d29}, [r0
], r1
152 vld1.u8
{d30}, [r0
], r1
156 ;----------------------
157 ;vp8_variance8x8_neon
158 sub_pixel_variance8x8_neon
159 vmov.i8 q8
, #
0 ;q8 - sum
160 vmov.i8 q9
, #
0 ;q9, q10 - sse
165 sub_pixel_variance8x8_neon_loop
166 vld1.8
{d0}, [r4
], r5
;load dst data
168 vld1.8
{d1}, [r4
], r5
169 vld1.8
{d2}, [r4
], r5
170 vsubl.u8 q4
, d22
, d0
;calculate diff
171 vld1.8
{d3}, [r4
], r5
176 vpadal.s16 q8
, q4
;sum
177 vmlal.s16 q9
, d8
, d8
;sse
178 vmlal.s16 q10
, d9
, d9
183 vmlal.s16 q9
, d10
, d10
184 vmlal.s16 q10
, d11
, d11
189 vmlal.s16 q9
, d12
, d12
190 vmlal.s16 q10
, d13
, d13
195 vmlal.s16 q9
, d14
, d14
196 vmlal.s16 q10
, d15
, d15
198 bne sub_pixel_variance8x8_neon_loop
200 vadd.u32 q10
, q9
, q10
;accumulate sse
201 vpaddl.s32 q0
, q8
;accumulate sum
208 vst1.32
{d1[0]}, [lr
] ;store sse
209 vshr.s32 d10
, d10
, #
6
212 vmov
.32 r0
, d0
[0] ;return
220 DCD bilinear_taps_coeff
222 DCD
128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112