arm: remove duplicate functions
[libvpx.git] / vp8 / common / arm / neon / bilinearpredict8x8_neon.asm
blobf7a7d149664b4edd10ef556588e87f3bcae9b5d2
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
;-----------------------------------------------------------------------
; Assembler set-up (armasm syntax, ARM instruction set).
;   REQUIRE8 / PRESERVE8 : this file requires and preserves 8-byte
;                          stack alignment.
;   ALIGN=2              : the code area is aligned to 2^2 = 4 bytes.
;-----------------------------------------------------------------------
    EXPORT  |vp8_bilinear_predict8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA    ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_bilinear_predict8x8_neon(unsigned char *src_ptr,
;                                   int            src_pixels_per_line,
;                                   int            xoffset,
;                                   int            yoffset,
;                                   unsigned char *dst_ptr,
;                                   int            dst_pitch);
;
; Two-pass bilinear prediction of one 8x8 block.  The first pass filters
; 9 source rows horizontally (skipped when xoffset == 0), the second
; pass filters those 9 rows vertically into 8 output rows (skipped when
; yoffset == 0).  Every tap pair in bifilter8_coeff sums to 128, hence
; the rounding shift by 7.
;
; In:    r0       = src_ptr
;        r1       = src_pixels_per_line (source stride in bytes)
;        r2       = xoffset, index into bifilter8_coeff (table has 8 entries)
;        r3       = yoffset, index into bifilter8_coeff (table has 8 entries)
;        [sp]     = dst_ptr   (kept in r4 inside the routine)
;        [sp, #4] = dst_pitch (kept in lr inside the routine)
; Out:   8 rows of 8 bytes written at dst_ptr, dst_pitch bytes apart
; Saved: r4, d8-d15 (callee-saved under AAPCS; q4-q7 are used as scratch)
; Clobb: r0-r3, r12, q0-q3, q8-q15, flags
; Note:  when xoffset != 0 each of the 9 source rows is read as 16 bytes,
;        although only the first 9 bytes of a row affect the result.
;-----------------------------------------------------------------------
|vp8_bilinear_predict8x8_neon| PROC
    push            {r4, lr}
    vpush           {d8-d15}                ;q4-q7 are overwritten below and
                                            ;must be preserved for the caller

    adr             r12, bifilter8_coeff    ;table lives in this code area
    ldr             r4, [sp, #72]           ;dst_ptr:   8 (push) + 64 (vpush) + 0
    ldr             lr, [sp, #76]           ;dst_pitch: 8 (push) + 64 (vpush) + 4

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             skip_firstpass_filter

;First pass: output_height lines x output_width columns (9x8)
    add             r2, r12, r2, lsl #3     ;filter location: 8 bytes per tap pair

    vld1.u8         {q1}, [r0], r1          ;load src rows 0-3 (16 bytes each)
    vld1.u32        {d31}, [r2]             ;load first_pass filter (two words)
    vld1.u8         {q2}, [r0], r1
    vdup.8          d0, d31[0]              ;d0 = tap0 in every lane (low byte of word 0)
    vld1.u8         {q3}, [r0], r1
    vdup.8          d1, d31[4]              ;d1 = tap1 in every lane (low byte of word 1)
    vld1.u8         {q4}, [r0], r1

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1] (row shifted by one pixel)
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1

    vld1.u8         {q1}, [r0], r1          ;load src rows 4-7, interleaved with
    vqrshrn.u16     d22, q6, #7             ;shift/round/saturate rows 0-3 to u8
    vld1.u8         {q2}, [r0], r1
    vqrshrn.u16     d23, q7, #7
    vld1.u8         {q3}, [r0], r1
    vqrshrn.u16     d24, q8, #7
    vld1.u8         {q4}, [r0], r1
    vqrshrn.u16     d25, q9, #7

    ;first_pass filtering on the remaining 5 rows (4-8)
    vld1.u8         {q5}, [r0], r1          ;src row 8

    vmull.u8        q6, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q7, d4, d0
    vmull.u8        q8, d6, d0
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0

    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d4, d5, #1
    vext.8          d7, d6, d7, #1
    vext.8          d9, d8, d9, #1
    vext.8          d11, d10, d11, #1

    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q7, d5, d1
    vmlal.u8        q8, d7, d1
    vmlal.u8        q9, d9, d1
    vmlal.u8        q10, d11, d1

    vqrshrn.u16     d26, q6, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d27, q7, #7
    vqrshrn.u16     d28, q8, #7
    vqrshrn.u16     d29, q9, #7
    vqrshrn.u16     d30, q10, #7

;Second pass: 8x8
;On entry d22-d30 hold the 9 intermediate rows (filtered or raw).
secondpass_filter
    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
    beq             skip_secondpass_filter

    add             r3, r12, r3, lsl #3     ;filter location: 8 bytes per tap pair
    add             r0, r4, lr              ;r0 = address of dst row 1

    vld1.u32        {d31}, [r3]             ;load second_pass filter
    add             r1, r0, lr              ;r1 = address of dst row 2

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q2, d23, d0
    vmull.u8        q3, d24, d0
    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8        q2, d24, d1
    vmlal.u8        q3, d25, d1
    vmlal.u8        q4, d26, d1
    vmlal.u8        q5, d27, d1
    vmlal.u8        q6, d28, d1
    vmlal.u8        q7, d29, d1
    vmlal.u8        q8, d30, d1

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2}, [r4]              ;store dst row 0
    vst1.u8         {d3}, [r0]              ;store dst row 1
    vst1.u8         {d4}, [r1], lr          ;store dst rows 2-7
    vst1.u8         {d5}, [r1], lr
    vst1.u8         {d6}, [r1], lr
    vst1.u8         {d7}, [r1], lr
    vst1.u8         {d8}, [r1], lr
    vst1.u8         {d9}, [r1], lr

    vpop            {d8-d15}
    pop             {r4, pc}

;--------------------
;xoffset == 0: no horizontal filtering, take 9 rows of 8 pixels as they are.
skip_firstpass_filter
    vld1.u8         {d22}, [r0], r1         ;load src data
    vld1.u8         {d23}, [r0], r1
    vld1.u8         {d24}, [r0], r1
    vld1.u8         {d25}, [r0], r1
    vld1.u8         {d26}, [r0], r1
    vld1.u8         {d27}, [r0], r1
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

    b               secondpass_filter

;---------------------
;yoffset == 0: no vertical filtering, write the first 8 intermediate rows.
skip_secondpass_filter
    vst1.u8         {d22}, [r4], lr         ;store result
    vst1.u8         {d23}, [r4], lr
    vst1.u8         {d24}, [r4], lr
    vst1.u8         {d25}, [r4], lr
    vst1.u8         {d26}, [r4], lr
    vst1.u8         {d27}, [r4], lr
    vst1.u8         {d28}, [r4], lr
    vst1.u8         {d29}, [r4], lr

    vpop            {d8-d15}
    pop             {r4, pc}

    ENDP

;-----------------
;Bilinear filter taps: 8 entries of two 32-bit words (tap0, tap1), indexed
;by xoffset / yoffset (entry address = bifilter8_coeff + offset * 8).
;The table is constant and is kept in the code area so that it can be
;addressed PC-relatively with ADR, without a pointer word or relocation.
bifilter8_coeff
    DCD     128,   0, 112,  16,  96,  32,  80,  48
    DCD      64,  64,  48,  80,  32,  96,  16, 112