Merge "respect alignment in arm asm files"
[libvpx.git] / vpx_scale / arm / neon / vp8_vpxyv12_copyframeyonly_neon.asm
blobfebccc2d8469389e3238a77138ab59ed5bc81bf1
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 EXPORT |vp8_yv12_copy_frame_yonly_neon|
13 EXPORT |vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon|
15 ARM
16 REQUIRE8
17 PRESERVE8
19 INCLUDE asm_com_offsets.asm
21 AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_yv12_copy_frame_yonly_neon(YV12_BUFFER_CONFIG *src_ybc,
;                                     YV12_BUFFER_CONFIG *dst_ybc);
;
; Copies the Y plane of src_ybc into dst_ybc, then extends the borders of
; the destination Y plane (left/right columns, then top/bottom rows).
;
; In:     r0 = src_ybc, r1 = dst_ybc
; Saved:  r4-r11, lr and d8-d15 are pushed on entry and restored on exit.
;
; Note: this is a VP8 function, which uses border = 32 or 16. Internal
; y_width and y_height are always multiples of 16.
;
; What the loops below rely on:
;   - y_width and y_stride are multiples of 16: data moves in 128-byte
;     chunks, followed by a remainder handled 16 bytes at a time.
;   - y_height is a non-zero multiple of 4: rows are processed two (copy)
;     and four (left/right border) at a time.
;   - a border other than 16 is treated as 32.
;
; Rows narrower than 128 bytes contain no full chunk; the 128-byte loops
; are skipped for them so their counters never start at zero.
;-----------------------------------------------------------------------
|vp8_yv12_copy_frame_yonly_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}              ; q4-q7 are used as copy buffers

    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]  ;src stride
    ldr             r7, [r1, #yv12_buffer_config_y_stride]  ;dst stride
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]  ;dstptr1

    ; y_width < 128: no full 128-byte chunk exists. Entering the loop
    ; below with a zero count would wrap the counter, so go directly to
    ; the 16-byte remainder copy.
    movs            r12, r5, lsr #7
    beq             cp_src_to_dst_remainder

    ; copy two rows at one time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_loop
    mov             r8, r2                  ; src, row n
    mov             r9, r3                  ; dst, row n
    add             r10, r2, r6             ; src, row n+1
    add             r11, r3, r7             ; dst, row n+1
    mov             r12, r5, lsr #7         ; 128-byte chunks per row (>= 1)

cp_src_to_dst_width_loop
    vld1.8          {q0, q1}, [r8]!
    vld1.8          {q8, q9}, [r10]!
    vld1.8          {q2, q3}, [r8]!
    vld1.8          {q10, q11}, [r10]!
    vld1.8          {q4, q5}, [r8]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q6, q7}, [r8]!
    vld1.8          {q14, q15}, [r10]!

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r9]!
    vst1.8          {q8, q9}, [r11]!
    vst1.8          {q2, q3}, [r9]!
    vst1.8          {q10, q11}, [r11]!
    vst1.8          {q4, q5}, [r9]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q6, q7}, [r9]!
    vst1.8          {q14, q15}, [r11]!

    bne             cp_src_to_dst_width_loop

    subs            lr, lr, #1
    add             r2, r2, r6, lsl #1      ; advance src by two rows
    add             r3, r3, r7, lsl #1      ; advance dst by two rows

    bne             cp_src_to_dst_height_loop

cp_src_to_dst_remainder
    ands            r10, r5, #0x7f          ; r10 = bytes per row still to copy
    sub             r11, r5, r10            ; r11 = bytes per row already copied
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]  ;dstptr1
    bne             extra_cp_src_to_dst_width   ; flags still from the ands
end_of_cp_src_to_dst

    ;vpxyv12_extend_frame_borders_yonly
    mov             r0, r1                  ; work on the destination buffer
    ;Not need to load y_width, since: y_width = y_stride - 2*border
    ldr             r3, [r0, #yv12_buffer_config_border]
    ldr             r1, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             lr, [r0, #yv12_buffer_config_y_stride]

    cmp             r3, #16
    beq             b16_extend_frame_borders

;=======================
b32_extend_frame_borders
;border = 32
;=======================
    ;Border copy for Y plane
    ;copy the left and right most columns out
    sub             r5, r1, r3              ;destptr1 (left border)
    add             r6, r1, lr
    sub             r6, r6, r3, lsl #1      ;destptr2 (right border)
    sub             r2, r6, #1              ;srcptr2 (last pixel of the row)

    ;Do four rows at one time
    mov             r12, r4, lsr #2

copy_left_right_y
    ; each load replicates one edge pixel across a whole q register
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    ; duplicate so that 32 bytes can be stored per row
    vmov            q1, q0
    vmov            q3, q2
    vmov            q5, q4
    vmov            q7, q6
    vmov            q9, q8
    vmov            q11, q10
    vmov            q13, q12
    vmov            q15, q14

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r5], lr
    vst1.8          {q2, q3}, [r6], lr
    vst1.8          {q4, q5}, [r5], lr
    vst1.8          {q6, q7}, [r6], lr
    vst1.8          {q8, q9}, [r5], lr
    vst1.8          {q10, q11}, [r6], lr
    vst1.8          {q12, q13}, [r5], lr
    vst1.8          {q14, q15}, [r6], lr

    bne             copy_left_right_y

    ;Now copy the top and bottom source lines into each line of the respective borders
    ldr             r7, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    mul             r8, r3, lr              ; r8 = border * stride

    sub             r6, r1, r3              ;destptr2
    sub             r2, r6, lr              ;srcptr2
    sub             r1, r7, r3              ;srcptr1
    sub             r5, r1, r8              ;destptr1

    ; y_stride < 128: skip the 128-byte loop (its counter would wrap) and
    ; let the 16-byte column code handle the whole row.
    movs            r12, lr, lsr #7
    beq             top_bottom_remainder_y

copy_top_bottom_y
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q4, q5}, [r1]!
    vld1.8          {q12, q13}, [r2]!
    vld1.8          {q6, q7}, [r1]!
    vld1.8          {q14, q15}, [r2]!

    mov             r7, r3                  ; one pass per border row

top_bottom_32
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!
    vst1.8          {q4, q5}, [r5]!
    vst1.8          {q12, q13}, [r6]!
    vst1.8          {q6, q7}, [r5]!
    vst1.8          {q14, q15}, [r6]!

    ; step to the same 128-byte column in the next row
    add             r5, r5, lr
    sub             r5, r5, #128
    add             r6, r6, lr
    sub             r6, r6, #128

    bne             top_bottom_32

    ; rewind both destinations to the first border row, next column
    sub             r5, r1, r8
    add             r6, r2, lr

    subs            r12, r12, #1
    bne             copy_top_bottom_y

top_bottom_remainder_y
    mov             r7, lr, lsr #4          ;check to see if extra copy is needed
    ands            r7, r7, #0x7            ; r7 = leftover 16-byte columns
    bne             extra_top_bottom_y
end_of_border_copy_y

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

;=====================
;extra copy part for Y
extra_top_bottom_y
    vld1.8          {q0}, [r1]!
    vld1.8          {q2}, [r2]!

    mov             r9, r3, lsr #3          ; eight rows are stored per pass

extra_top_bottom_32
    subs            r9, r9, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    bne             extra_top_bottom_32

    sub             r5, r1, r8
    add             r6, r2, lr
    subs            r7, r7, #1
    bne             extra_top_bottom_y

    b               end_of_border_copy_y


;=======================
b16_extend_frame_borders
;border = 16
;=======================
    ;Border copy for Y plane
    ;copy the left and right most columns out
    sub             r5, r1, r3              ;destptr1 (left border)
    add             r6, r1, lr
    sub             r6, r6, r3, lsl #1      ;destptr2 (right border)
    sub             r2, r6, #1              ;srcptr2 (last pixel of the row)

    ;Do four rows at one time
    mov             r12, r4, lsr #2

copy_left_right_y_b16
    ; each load replicates one edge pixel across a whole q register
    vld1.8          {d0[], d1[]}, [r1], lr
    vld1.8          {d4[], d5[]}, [r2], lr
    vld1.8          {d8[], d9[]}, [r1], lr
    vld1.8          {d12[], d13[]}, [r2], lr
    vld1.8          {d16[], d17[]}, [r1], lr
    vld1.8          {d20[], d21[]}, [r2], lr
    vld1.8          {d24[], d25[]}, [r1], lr
    vld1.8          {d28[], d29[]}, [r2], lr

    subs            r12, r12, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q4}, [r5], lr
    vst1.8          {q6}, [r6], lr
    vst1.8          {q8}, [r5], lr
    vst1.8          {q10}, [r6], lr
    vst1.8          {q12}, [r5], lr
    vst1.8          {q14}, [r6], lr

    bne             copy_left_right_y_b16

    ;Now copy the top and bottom source lines into each line of the respective borders
    ldr             r7, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    mul             r8, r3, lr              ; r8 = border * stride

    sub             r6, r1, r3              ;destptr2
    sub             r2, r6, lr              ;srcptr2
    sub             r1, r7, r3              ;srcptr1
    sub             r5, r1, r8              ;destptr1

    ; y_stride < 128: skip the 128-byte loop (its counter would wrap) and
    ; let the 16-byte column code handle the whole row.
    movs            r12, lr, lsr #7
    beq             top_bottom_remainder_y_b16

copy_top_bottom_y_b16
    vld1.8          {q0, q1}, [r1]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q2, q3}, [r1]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q4, q5}, [r1]!
    vld1.8          {q12, q13}, [r2]!
    vld1.8          {q6, q7}, [r1]!
    vld1.8          {q14, q15}, [r2]!

    mov             r7, r3                  ; one pass per border row

top_bottom_16_b16
    subs            r7, r7, #1

    vst1.8          {q0, q1}, [r5]!
    vst1.8          {q8, q9}, [r6]!
    vst1.8          {q2, q3}, [r5]!
    vst1.8          {q10, q11}, [r6]!
    vst1.8          {q4, q5}, [r5]!
    vst1.8          {q12, q13}, [r6]!
    vst1.8          {q6, q7}, [r5]!
    vst1.8          {q14, q15}, [r6]!

    ; step to the same 128-byte column in the next row
    add             r5, r5, lr
    sub             r5, r5, #128
    add             r6, r6, lr
    sub             r6, r6, #128

    bne             top_bottom_16_b16

    ; rewind both destinations to the first border row, next column
    sub             r5, r1, r8
    add             r6, r2, lr

    subs            r12, r12, #1
    bne             copy_top_bottom_y_b16

top_bottom_remainder_y_b16
    mov             r7, lr, lsr #4          ;check to see if extra copy is needed
    ands            r7, r7, #0x7            ; r7 = leftover 16-byte columns
    bne             extra_top_bottom_y_b16
end_of_border_copy_y_b16

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

;=====================
;extra copy part for Y
extra_top_bottom_y_b16
    vld1.8          {q0}, [r1]!
    vld1.8          {q2}, [r2]!

    mov             r9, r3, lsr #3          ; eight rows are stored per pass

extra_top_bottom_16_b16
    subs            r9, r9, #1

    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    vst1.8          {q0}, [r5], lr
    vst1.8          {q2}, [r6], lr
    bne             extra_top_bottom_16_b16

    sub             r5, r1, r8
    add             r6, r2, lr
    subs            r7, r7, #1
    bne             extra_top_bottom_y_b16

    b               end_of_border_copy_y_b16

;=============================
; Copy the last (y_width & 0x7f) bytes of every row, 16 bytes at a time.
; In: r2/r3 = src/dst plane start, r10 = leftover bytes per row,
;     r11 = bytes per row already copied, r4 = height, r6/r7 = strides.
; r0 is used as a scratch row pointer here; it is reloaded from r1 at
; end_of_cp_src_to_dst.
extra_cp_src_to_dst_width
    add             r2, r2, r11
    add             r3, r3, r11

    mov             lr, r4, lsr #1          ; two rows per pass
extra_cp_src_to_dst_height_loop
    mov             r8, r2                  ; src, row n
    mov             r9, r3                  ; dst, row n
    add             r0, r8, r6              ; src, row n+1
    add             r11, r9, r7             ; dst, row n+1

    mov             r12, r10

extra_cp_src_to_dst_width_loop
    vld1.8          {q0}, [r8]!
    vld1.8          {q1}, [r0]!

    subs            r12, r12, #16

    vst1.8          {q0}, [r9]!
    vst1.8          {q1}, [r11]!
    bne             extra_cp_src_to_dst_width_loop

    subs            lr, lr, #1

    add             r2, r2, r6, lsl #1
    add             r3, r3, r7, lsl #1

    bne             extra_cp_src_to_dst_height_loop

    b               end_of_cp_src_to_dst

    ENDP
;===========================================================
; void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(
;          YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
;
; Copies the Y plane of src_ybc into dst_ybc and leaves the destination
; borders untouched. vp8cx_pick_filter_level() uses this variant to copy
; the Y plane without extending the frame borders.
;
; In:     r0 = src_ybc, r1 = dst_ybc
; Saved:  r4-r11, lr and d8-d15 are pushed on entry and restored on exit.
;
; What the loops below rely on:
;   - y_width is a multiple of 16: rows are copied in 128-byte chunks
;     followed by a remainder handled 16 bytes at a time.
;   - y_height is a non-zero even number: two rows are copied per pass.
;
; Rows narrower than 128 bytes contain no full chunk; the 128-byte loop
; is skipped for them so its counter never starts at zero.
;===========================================================
|vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}              ; q4-q7 are used as copy buffers

    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]  ;src stride
    ldr             r7, [r1, #yv12_buffer_config_y_stride]  ;dst stride
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]  ;dstptr1

    ; y_width < 128: no full 128-byte chunk exists. Entering the loop
    ; below with a zero count would wrap the counter, so go directly to
    ; the 16-byte remainder copy.
    movs            r12, r5, lsr #7
    beq             cp_src_to_dst_remainder1

    ; copy two rows at one time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_loop1
    mov             r8, r2                  ; src, row n
    mov             r9, r3                  ; dst, row n
    add             r10, r2, r6             ; src, row n+1
    add             r11, r3, r7             ; dst, row n+1
    mov             r12, r5, lsr #7         ; 128-byte chunks per row (>= 1)

cp_src_to_dst_width_loop1
    vld1.8          {q0, q1}, [r8]!
    vld1.8          {q8, q9}, [r10]!
    vld1.8          {q2, q3}, [r8]!
    vld1.8          {q10, q11}, [r10]!
    vld1.8          {q4, q5}, [r8]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q6, q7}, [r8]!
    vld1.8          {q14, q15}, [r10]!

    subs            r12, r12, #1

    vst1.8          {q0, q1}, [r9]!
    vst1.8          {q8, q9}, [r11]!
    vst1.8          {q2, q3}, [r9]!
    vst1.8          {q10, q11}, [r11]!
    vst1.8          {q4, q5}, [r9]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q6, q7}, [r9]!
    vst1.8          {q14, q15}, [r11]!

    bne             cp_src_to_dst_width_loop1

    subs            lr, lr, #1
    add             r2, r2, r6, lsl #1      ; advance src by two rows
    add             r3, r3, r7, lsl #1      ; advance dst by two rows

    bne             cp_src_to_dst_height_loop1

cp_src_to_dst_remainder1
    ands            r10, r5, #0x7f          ; r10 = bytes per row still to copy
    sub             r11, r5, r10            ; r11 = bytes per row already copied
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]  ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]  ;dstptr1
    bne             extra_cp_src_to_dst_width1  ; flags still from the ands
end_of_cp_src_to_dst1

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

;=============================
; Copy the last (y_width & 0x7f) bytes of every row, 16 bytes at a time.
; In: r2/r3 = src/dst plane start, r10 = leftover bytes per row,
;     r11 = bytes per row already copied, r4 = height, r6/r7 = strides.
; r0 is used as a scratch row pointer from here on.
extra_cp_src_to_dst_width1
    add             r2, r2, r11
    add             r3, r3, r11

    mov             lr, r4, lsr #1          ; two rows per pass
extra_cp_src_to_dst_height_loop1
    mov             r8, r2                  ; src, row n
    mov             r9, r3                  ; dst, row n
    add             r0, r8, r6              ; src, row n+1
    add             r11, r9, r7             ; dst, row n+1

    mov             r12, r10

extra_cp_src_to_dst_width_loop1
    vld1.8          {q0}, [r8]!
    vld1.8          {q1}, [r0]!

    subs            r12, r12, #16

    vst1.8          {q0}, [r9]!
    vst1.8          {q1}, [r11]!
    bne             extra_cp_src_to_dst_width_loop1

    subs            lr, lr, #1

    add             r2, r2, r6, lsl #1
    add             r3, r3, r7, lsl #1

    bne             extra_cp_src_to_dst_height_loop1

    b               end_of_cp_src_to_dst1

    ENDP