Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
[libvpx.git] / vpx_scale / arm / neon / vp8_vpxyv12_extendframeborders_neon.asm
blob8444b8e03719630d8dcee556740b6a2f9120cc2a
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
; Make the routine's symbol visible outside this object file.
12 EXPORT |vp8_yv12_extend_frame_borders_neon|
; Assemble the following instructions as ARM (not Thumb) code.
13 ARM
; Declare that this code requires, and itself preserves, 8-byte stack
; alignment.
14 REQUIRE8
15 PRESERVE8
; Presumably supplies the yv12_buffer_config_* field offsets that the
; routine below uses in its ldr instructions - confirm against that file.
17 INCLUDE asm_com_offsets.asm
; Read-only code section named .text; ALIGN=2 requests 2^2 = 4-byte alignment.
19 AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf);
;
; Extends the Y, U and V planes of *ybf into their surrounding borders:
;   1. every row's first and last pixel is replicated into the left and
;      right border of that row;
;   2. the (already widened) first and last rows are replicated into the
;      rows above and below the plane.
;
; In:     r0 = ybf
; Out:    nothing
; Saved:  r4-r10, lr and d8-d15 are pushed on entry and restored on exit.
; Scratch: r1-r3, r12, q0-q15, flags.
;
; Assumptions (not checked by this code):
;   - The Y border is 32 pixels and the U/V border is 16 pixels, i.e. we
;     depend on VP8BORDERINPIXELS being 32.
;   - y_height is a non-zero multiple of 4 and uv_height a non-zero
;     multiple of 8; the column loops are unrolled by those factors and
;     are entered without testing the row count.
;   - The top/bottom copy moves plane_stride bytes per row in 16-byte
;     (Y) or 8-byte (U/V) chunks; any trailing stride % 16 (Y) or
;     stride % 8 (U/V) bytes are not written.
;
; Strides smaller than one full unrolled block (128 bytes for Y, 64 bytes
; for U/V) are handled by skipping the block loop and going straight to
; the chunked remainder copy.
;-----------------------------------------------------------------------
|vp8_yv12_extend_frame_borders_neon| PROC
    push            {r4 - r10, lr}
    vpush           {d8 - d15}              ; q4-q7 are used as scratch below

    ; ------------------------------------------------------------------
    ; Y plane, Border = 32
    ; ------------------------------------------------------------------
    ldr             r3, [r0, #yv12_buffer_config_y_width]   ; plane_width
    ldr             r1, [r0, #yv12_buffer_config_y_buffer]  ; src_ptr1
    ldr             r4, [r0, #yv12_buffer_config_y_height]  ; plane_height
    ldr             lr, [r0, #yv12_buffer_config_y_stride]  ; plane_stride

    ; Copy the left and right most columns out.
    add             r6, r1, r3              ; dest_ptr2 = src_ptr1 + plane_width (= src_ptr2 + 1)
    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1
    sub             r5, r1, #32             ; dest_ptr1 = src_ptr1 - Border

    mov             r12, r4, lsr #2         ; plane_height / 4 (four rows per iteration)

copy_left_right_y
    ; Load one edge pixel per row and replicate it across a q register.
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    ; Duplicate each q register so that a register pair covers 32 bytes.
    vmov            q1, q0
    vmov            q3, q2
    vmov            q5, q4
    vmov            q7, q6
    vmov            q9, q8
    vmov            q11, q10
    vmov            q13, q12
    vmov            q15, q14

    subs            r12, r12, #1            ; flags are consumed by the bne below

    vst1.8          {q0, q1}, [r5], lr
    vst1.8          {q2, q3}, [r6], lr
    vst1.8          {q4, q5}, [r5], lr
    vst1.8          {q6, q7}, [r6], lr
    vst1.8          {q8, q9}, [r5], lr
    vst1.8          {q10, q11}, [r6], lr
    vst1.8          {q12, q13}, [r5], lr
    vst1.8          {q14, q15}, [r6], lr

    bne             copy_left_right_y

    ; Now copy the top and bottom source lines into each line of the
    ; respective borders. The copy width is plane_stride.
    ldr             r1, [r0, #yv12_buffer_config_y_buffer]  ; y_buffer
    mul             r8, r4, lr              ; plane_height * plane_stride

    sub             r1, r1, #32             ; src_ptr1 = y_buffer - Border
    add             r6, r1, r8              ; dest_ptr2 = src_ptr1 + (plane_height * plane_stride) (= src_ptr2 + plane_stride)
    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
    sub             r5, r1, lr, asl #5      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)

    movs            r12, lr, lsr #7         ; plane_stride / 128
    beq             extra_y_copy_needed     ; stride < 128: no full block, avoid counter underflow

copy_top_bottom_y
    ; Load 128 bytes of the top row (r1) and of the bottom row (r2).
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q4, q5}, [r1]!
    vld1.8          {q12, q13}, [r2]!
    vld1.8          {q6, q7}, [r1]!
    vld1.8          {q14, q15}, [r2]!

    mov             r7, #32                 ; Border rows to fill

top_bottom_32
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!
    vst1.8          {q4, q5}, [r5]!
    vst1.8          {q12, q13}, [r6]!
    vst1.8          {q6, q7}, [r5]!
    vst1.8          {q14, q15}, [r6]!

    add             r5, r5, lr              ; dest_ptr1 += plane_stride
    sub             r5, r5, #128            ; dest_ptr1 -= 128 (undo the post-increments)
    add             r6, r6, lr              ; dest_ptr2 += plane_stride
    sub             r6, r6, #128            ; dest_ptr2 -= 128 (undo the post-increments)

    bne             top_bottom_32

    ; Rewind the destinations to the first border row of the next block.
    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; src_ptr2 + plane_stride

    subs            r12, r12, #1
    bne             copy_top_bottom_y

extra_y_copy_needed
    mov             r7, lr, lsr #4          ; check to see if extra copy is needed
    ands            r7, r7, #0x7            ; r7 = remaining 16-byte chunks (0..7)
    bne             extra_top_bottom_y
end_of_border_copy_y

    ; ------------------------------------------------------------------
    ; U and V planes, Border = 16
    ; ------------------------------------------------------------------
    ldr             r7, [r0, #yv12_buffer_config_u_buffer]  ; src_ptr1
    ldr             lr, [r0, #yv12_buffer_config_uv_stride] ; plane_stride
    ldr             r3, [r0, #yv12_buffer_config_uv_width]  ; plane_width
    ldr             r4, [r0, #yv12_buffer_config_uv_height] ; plane_height

    mov             r10, #2                 ; two passes: U first, then V

    ; Copy the left and right most columns out.
border_copy_uv
    mov             r1, r7                  ; src_ptr1 needs to be saved for second half of loop
    sub             r5, r1, #16             ; dest_ptr1 = src_ptr1 - Border
    add             r6, r1, r3              ; dest_ptr2 = src_ptr1 + plane_width (= src_ptr2 + 1)
    sub             r2, r6, #1              ; src_ptr2 = src_ptr1 + plane_width - 1

    mov             r12, r4, lsr #3         ; plane_height / 8 (eight rows per iteration)

copy_left_right_uv
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d2[], d3[]}, [r2], lr
    vld1.8          {d4[], d5[]}, [r1], lr
    vld1.8          {d6[], d7[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d10[], d11[]}, [r2], lr
    vld1.8          {d12[], d13[]}, [r1], lr
    vld1.8          {d14[], d15[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d18[], d19[]}, [r2], lr
    vld1.8          {d20[], d21[]}, [r1], lr
    vld1.8          {d22[], d23[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d26[], d27[]}, [r2], lr
    vld1.8          {d28[], d29[]}, [r1], lr
    vld1.8          {d30[], d31[]}, [r2], lr

    subs            r12, r12, #1            ; flags are consumed by the bne below

    vst1.8          {q0}, [r5], lr
    vst1.8          {q1}, [r6], lr
    vst1.8          {q2}, [r5], lr
    vst1.8          {q3}, [r6], lr
    vst1.8          {q4}, [r5], lr
    vst1.8          {q5}, [r6], lr
    vst1.8          {q6}, [r5], lr
    vst1.8          {q7}, [r6], lr
    vst1.8          {q8}, [r5], lr
    vst1.8          {q9}, [r6], lr
    vst1.8          {q10}, [r5], lr
    vst1.8          {q11}, [r6], lr
    vst1.8          {q12}, [r5], lr
    vst1.8          {q13}, [r6], lr
    vst1.8          {q14}, [r5], lr
    vst1.8          {q15}, [r6], lr

    bne             copy_left_right_uv

    ; Now copy the top and bottom source lines into each line of the
    ; respective borders.
    mov             r1, r7
    mul             r8, r4, lr              ; plane_height * plane_stride

    sub             r1, r1, #16             ; src_ptr1 = plane buffer - Border
    add             r6, r1, r8              ; dest_ptr2 = src_ptr1 + (plane_height * plane_stride) (= src_ptr2 + plane_stride)
    sub             r2, r6, lr              ; src_ptr2 = src_ptr1 + (plane_height * plane_stride) - plane_stride
    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)

    movs            r12, lr, lsr #6         ; plane_stride / 64
    beq             extra_uv_copy_needed    ; stride < 64: no full block, avoid counter underflow

copy_top_bottom_uv
    ; Load 64 bytes of the top row (r1) and of the bottom row (r2).
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!

    mov             r7, #16                 ; Border rows to fill

top_bottom_16
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!

    add             r5, r5, lr              ; dest_ptr1 += plane_stride
    sub             r5, r5, #64             ; dest_ptr1 -= 64 (undo the post-increments)
    add             r6, r6, lr              ; dest_ptr2 += plane_stride
    sub             r6, r6, #64             ; dest_ptr2 -= 64 (undo the post-increments)

    bne             top_bottom_16

    ; Rewind the destinations to the first border row of the next block.
    sub             r5, r1, lr, asl #4      ; dest_ptr1 = src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; dest_ptr2 = src_ptr2 + plane_stride

    subs            r12, r12, #1
    bne             copy_top_bottom_uv

extra_uv_copy_needed
    mov             r7, lr, lsr #3          ; check to see if extra copy is needed
    ands            r7, r7, #0x7            ; r7 = remaining 8-byte chunks (0..7)
    bne             extra_top_bottom_uv

end_of_border_copy_uv
    subs            r10, r10, #1
    ldrne           r7, [r0, #yv12_buffer_config_v_buffer]  ; src_ptr1 for the V pass
    bne             border_copy_uv

    vpop            {d8 - d15}
    pop             {r4 - r10, pc}

;;;;;;;;;;;;;;;;;;;;;;
; Remainder of the Y top/bottom copy: one 16-byte column chunk per outer
; iteration, written to all 32 border rows (4 iterations x 8 rows).
; On entry r7 = number of 16-byte chunks left.
extra_top_bottom_y
    vld1.8          {q0}, [r1]!
    vld1.8          {q2}, [r2]!

    mov             r9, #4                  ; 32 >> 3

extra_top_bottom_32
    subs            r9, r9, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    bne             extra_top_bottom_32

    sub             r5, r1, lr, asl #5      ; src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; src_ptr2 + plane_stride
    subs            r7, r7, #1
    bne             extra_top_bottom_y

    b               end_of_border_copy_y

; Remainder of the U/V top/bottom copy: one 8-byte column chunk per outer
; iteration, written to all 16 border rows (2 iterations x 8 rows).
; On entry r7 = number of 8-byte chunks left.
extra_top_bottom_uv
    vld1.8          {d0}, [r1]!
    vld1.8          {d8}, [r2]!

    mov             r9, #2                  ; 16 >> 3

extra_top_bottom_16
    subs            r9, r9, #1

    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    vst1.8          {d0}, [r5], lr
    vst1.8          {d8}, [r6], lr
    bne             extra_top_bottom_16

    sub             r5, r1, lr, asl #4      ; src_ptr1 - (Border * plane_stride)
    add             r6, r2, lr              ; src_ptr2 + plane_stride
    subs            r7, r7, #1
    bne             extra_top_bottom_uv

    b               end_of_border_copy_uv

    ENDP