; Merge "respect alignment in arm asm files"
; [libvpx.git] / vpx_scale / arm / neon / vp8_vpxyv12_extendframeborders_neon.asm
; blob b0a3b93a24eb326468fc9d1284a73c4b9cb0df77
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
12 EXPORT |vp8_yv12_extend_frame_borders_neon|
13 ARM
14 REQUIRE8
15 PRESERVE8
17 INCLUDE asm_com_offsets.asm
19 AREA ||.text||, CODE, READONLY, ALIGN=2
;-----------------------------------------------------------------------
; void vp8_yv12_extend_frame_borders_neon(YV12_BUFFER_CONFIG *ybf);
;
; Replicates the edge pixels of the Y, U and V planes into the border
; area that surrounds each plane: first the left/right borders of every
; row, then the top/bottom borders (whole rows, corners included).
;
; In:     r0 = ybf
; Out:    nothing
; Saved:  r4-r10, lr and d8-d15 are pushed on entry and restored on exit.
;
; Note: this is a VP8 function, which has border = 32 or 16. A border of
; 16 takes the b16 path; every other value takes the b32 path. Internal
; y_width and y_height are always multiples of 16. The code relies on
;   y_width   = y_stride - 2*border
;   uv_stride = y_stride / 2, uv_height = y_height / 2, uv border = border / 2
;
; Register roles (both paths):
;   r1  srcptr1  (left column / top row source)
;   r2  srcptr2  (right column / bottom row source)
;   r3  border   (halved for the U and V planes)
;   r4  height   (halved for the U and V planes)
;   r5  destptr1 (left / top border)
;   r6  destptr2 (right / bottom border)
;   r7  scratch: plane base, row counter, remainder chunk counter
;   r8  border * stride (byte size of the top border area)
;   r9  row-group counter in the remainder ("extra") loops
;   r10 number of chroma planes left to process
;   r12 loop counter
;   lr  stride   (halved for the U and V planes)
;-----------------------------------------------------------------------
|vp8_yv12_extend_frame_borders_neon| PROC
    push        {r4 - r10, lr}
    vpush       {d8 - d15}

    ;No need to load y_width, since: y_width = y_stride - 2*border
    ldr         r3, [r0, #yv12_buffer_config_border]
    ldr         r1, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
    ldr         r4, [r0, #yv12_buffer_config_y_height]
    ldr         lr, [r0, #yv12_buffer_config_y_stride]

    cmp         r3, #16
    beq         b16_extend_frame_borders

;=======================
b32_extend_frame_borders
;border = 32
;=======================
;Border copy for Y plane
;copy the left and right most columns out
    sub         r5, r1, r3                  ;destptr1
    add         r6, r1, lr
    sub         r6, r6, r3, lsl #1          ;destptr2 (= row start + y_width)
    sub         r2, r6, #1                  ;srcptr2 (last pixel of the row)

    ;Do four rows at one time
    mov         r12, r4, lsr #2

copy_left_right_y
    ; Each load reads one byte and replicates it across a whole q register.
    vld1.8      {d0[], d1[]}, [r1], lr
    vld1.8      {d4[], d5[]}, [r2], lr
    vld1.8      {d8[], d9[]}, [r1], lr
    vld1.8      {d12[], d13[]}, [r2], lr
    vld1.8      {d16[], d17[]}, [r1], lr
    vld1.8      {d20[], d21[]}, [r2], lr
    vld1.8      {d24[], d25[]}, [r1], lr
    vld1.8      {d28[], d29[]}, [r2], lr

    ; Duplicate each q register so that 32 border bytes can be stored per row.
    vmov        q1, q0
    vmov        q3, q2
    vmov        q5, q4
    vmov        q7, q6
    vmov        q9, q8
    vmov        q11, q10
    vmov        q13, q12
    vmov        q15, q14

    subs        r12, r12, #1

    vst1.8      {q0, q1}, [r5], lr
    vst1.8      {q2, q3}, [r6], lr
    vst1.8      {q4, q5}, [r5], lr
    vst1.8      {q6, q7}, [r6], lr
    vst1.8      {q8, q9}, [r5], lr
    vst1.8      {q10, q11}, [r6], lr
    vst1.8      {q12, q13}, [r5], lr
    vst1.8      {q14, q15}, [r6], lr

    bne         copy_left_right_y

;Now copy the top and bottom source lines into each line of the respective borders
    ldr         r7, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
    mul         r8, r3, lr                  ;bytes in the top border area

    mov         r12, lr, lsr #7             ;number of full 128-byte groups per row

    ; r1 now points one row past the last row of the plane.
    sub         r6, r1, r3                  ;destptr2
    sub         r2, r6, lr                  ;srcptr2
    sub         r1, r7, r3                  ;srcptr1
    sub         r5, r1, r8                  ;destptr1

    ; The loop below always runs at least once. With a stride below 128
    ; there is no full group, so go straight to the remainder check instead
    ; of writing past the row and wrapping the counter.
    cmp         r12, #0
    beq         check_extra_top_bottom_y

copy_top_bottom_y
    vld1.8      {q0, q1}, [r1]!
    vld1.8      {q8, q9}, [r2]!
    vld1.8      {q2, q3}, [r1]!
    vld1.8      {q10, q11}, [r2]!
    vld1.8      {q4, q5}, [r1]!
    vld1.8      {q12, q13}, [r2]!
    vld1.8      {q6, q7}, [r1]!
    vld1.8      {q14, q15}, [r2]!

    mov         r7, r3                      ;one pass per border row

top_bottom_32
    subs        r7, r7, #1

    vst1.8      {q0, q1}, [r5]!
    vst1.8      {q8, q9}, [r6]!
    vst1.8      {q2, q3}, [r5]!
    vst1.8      {q10, q11}, [r6]!
    vst1.8      {q4, q5}, [r5]!
    vst1.8      {q12, q13}, [r6]!
    vst1.8      {q6, q7}, [r5]!
    vst1.8      {q14, q15}, [r6]!

    ; Step to the same 128-byte group in the next border row.
    add         r5, r5, lr
    sub         r5, r5, #128
    add         r6, r6, lr
    sub         r6, r6, #128

    bne         top_bottom_32

    ; r1/r2 were advanced by the loads; rebase the destinations on them.
    sub         r5, r1, r8
    add         r6, r2, lr

    subs        r12, r12, #1
    bne         copy_top_bottom_y

check_extra_top_bottom_y
    mov         r7, lr, lsr #4              ;check to see if extra copy is needed
    ands        r7, r7, #0x7                ;leftover 16-byte chunks
    bne         extra_top_bottom_y
end_of_border_copy_y

;Border copy for U, V planes
    ldr         r1, [r0, #yv12_buffer_config_u_buffer]      ;srcptr1
    mov         lr, lr, lsr #1              ;uv_stride
    mov         r3, r3, lsr #1              ;border
    mov         r4, r4, lsr #1              ;uv_height
    mov         r8, r8, lsr #2              ;uv border * uv_stride

    mov         r10, #2                     ;two planes: U, then V

;copy the left and right most columns out
border_copy_uv
    sub         r5, r1, r3                  ;destptr1
    add         r6, r1, lr
    sub         r6, r6, r3, lsl #1          ;destptr2
    sub         r2, r6, #1                  ;srcptr2

    mov         r7, r1                      ;keep the plane base for the top/bottom pass

    ;Do eight rows at one time
    mov         r12, r4, lsr #3

copy_left_right_uv
    vld1.8      {d0[], d1[]}, [r1], lr
    vld1.8      {d2[], d3[]}, [r2], lr
    vld1.8      {d4[], d5[]}, [r1], lr
    vld1.8      {d6[], d7[]}, [r2], lr
    vld1.8      {d8[], d9[]}, [r1], lr
    vld1.8      {d10[], d11[]}, [r2], lr
    vld1.8      {d12[], d13[]}, [r1], lr
    vld1.8      {d14[], d15[]}, [r2], lr
    vld1.8      {d16[], d17[]}, [r1], lr
    vld1.8      {d18[], d19[]}, [r2], lr
    vld1.8      {d20[], d21[]}, [r1], lr
    vld1.8      {d22[], d23[]}, [r2], lr
    vld1.8      {d24[], d25[]}, [r1], lr
    vld1.8      {d26[], d27[]}, [r2], lr
    vld1.8      {d28[], d29[]}, [r1], lr
    vld1.8      {d30[], d31[]}, [r2], lr

    subs        r12, r12, #1

    vst1.8      {q0}, [r5], lr
    vst1.8      {q1}, [r6], lr
    vst1.8      {q2}, [r5], lr
    vst1.8      {q3}, [r6], lr
    vst1.8      {q4}, [r5], lr
    vst1.8      {q5}, [r6], lr
    vst1.8      {q6}, [r5], lr
    vst1.8      {q7}, [r6], lr
    vst1.8      {q8}, [r5], lr
    vst1.8      {q9}, [r6], lr
    vst1.8      {q10}, [r5], lr
    vst1.8      {q11}, [r6], lr
    vst1.8      {q12}, [r5], lr
    vst1.8      {q13}, [r6], lr
    vst1.8      {q14}, [r5], lr
    vst1.8      {q15}, [r6], lr

    bne         copy_left_right_uv

;Now copy the top and bottom source lines into each line of the respective borders
    mov         r12, lr, lsr #6             ;number of full 64-byte groups per row

    sub         r6, r1, r3                  ;destptr2
    sub         r2, r6, lr                  ;srcptr2
    sub         r1, r7, r3                  ;srcptr1
    sub         r5, r1, r8                  ;destptr1

    ; Same guard as for Y: skip the 64-byte loop when uv_stride < 64.
    cmp         r12, #0
    beq         check_extra_top_bottom_uv

copy_top_bottom_uv
    vld1.8      {q0, q1}, [r1]!
    vld1.8      {q8, q9}, [r2]!
    vld1.8      {q2, q3}, [r1]!
    vld1.8      {q10, q11}, [r2]!

    mov         r7, r3

top_bottom_16
    subs        r7, r7, #1

    vst1.8      {q0, q1}, [r5]!
    vst1.8      {q8, q9}, [r6]!
    vst1.8      {q2, q3}, [r5]!
    vst1.8      {q10, q11}, [r6]!

    add         r5, r5, lr
    sub         r5, r5, #64
    add         r6, r6, lr
    sub         r6, r6, #64

    bne         top_bottom_16

    sub         r5, r1, r8
    add         r6, r2, lr

    subs        r12, r12, #1
    bne         copy_top_bottom_uv

check_extra_top_bottom_uv
    mov         r7, lr, lsr #3              ;check to see if extra copy is needed
    ands        r7, r7, #0x7                ;leftover 8-byte chunks
    bne         extra_top_bottom_uv

end_of_border_copy_uv
    subs        r10, r10, #1
    ldrne       r1, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
    bne         border_copy_uv

    vpop        {d8 - d15}
    pop         {r4 - r10, pc}

;;;;;;;;;;;;;;;;;;;;;;
;extra copy part for Y
; Copies one leftover 16-byte chunk of the top and bottom rows per pass;
; r7 holds the number of chunks left.
extra_top_bottom_y
    vld1.8      {q0}, [r1]!
    vld1.8      {q2}, [r2]!

    mov         r9, r3, lsr #3              ;eight border rows per inner pass

extra_top_bottom_32
    subs        r9, r9, #1

    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    bne         extra_top_bottom_32

    sub         r5, r1, r8
    add         r6, r2, lr
    subs        r7, r7, #1
    bne         extra_top_bottom_y

    b           end_of_border_copy_y

;extra copy part for UV
; Same as above with 8-byte chunks.
extra_top_bottom_uv
    vld1.8      {d0}, [r1]!
    vld1.8      {d8}, [r2]!

    mov         r9, r3, lsr #3

extra_top_bottom_16
    subs        r9, r9, #1

    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    bne         extra_top_bottom_16

    sub         r5, r1, r8
    add         r6, r2, lr
    subs        r7, r7, #1
    bne         extra_top_bottom_uv

    b           end_of_border_copy_uv


;=======================
b16_extend_frame_borders
;border = 16
;=======================
;Border copy for Y plane
;copy the left and right most columns out
    sub         r5, r1, r3                  ;destptr1
    add         r6, r1, lr
    sub         r6, r6, r3, lsl #1          ;destptr2 (= row start + y_width)
    sub         r2, r6, #1                  ;srcptr2 (last pixel of the row)

    ;Do four rows at one time
    mov         r12, r4, lsr #2

copy_left_right_y_b16
    ; One replicated q register (16 bytes) per row is enough for a 16-pixel border.
    vld1.8      {d0[], d1[]}, [r1], lr
    vld1.8      {d4[], d5[]}, [r2], lr
    vld1.8      {d8[], d9[]}, [r1], lr
    vld1.8      {d12[], d13[]}, [r2], lr
    vld1.8      {d16[], d17[]}, [r1], lr
    vld1.8      {d20[], d21[]}, [r2], lr
    vld1.8      {d24[], d25[]}, [r1], lr
    vld1.8      {d28[], d29[]}, [r2], lr

    subs        r12, r12, #1

    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q4}, [r5], lr
    vst1.8      {q6}, [r6], lr
    vst1.8      {q8}, [r5], lr
    vst1.8      {q10}, [r6], lr
    vst1.8      {q12}, [r5], lr
    vst1.8      {q14}, [r6], lr

    bne         copy_left_right_y_b16

;Now copy the top and bottom source lines into each line of the respective borders
    ldr         r7, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
    mul         r8, r3, lr                  ;bytes in the top border area

    mov         r12, lr, lsr #7             ;number of full 128-byte groups per row

    sub         r6, r1, r3                  ;destptr2
    sub         r2, r6, lr                  ;srcptr2
    sub         r1, r7, r3                  ;srcptr1
    sub         r5, r1, r8                  ;destptr1

    ; Skip the 128-byte loop when the stride is below 128 (the loop body
    ; always runs at least once and would wrap its counter).
    cmp         r12, #0
    beq         check_extra_top_bottom_y_b16

copy_top_bottom_y_b16
    vld1.8      {q0, q1}, [r1]!
    vld1.8      {q8, q9}, [r2]!
    vld1.8      {q2, q3}, [r1]!
    vld1.8      {q10, q11}, [r2]!
    vld1.8      {q4, q5}, [r1]!
    vld1.8      {q12, q13}, [r2]!
    vld1.8      {q6, q7}, [r1]!
    vld1.8      {q14, q15}, [r2]!

    mov         r7, r3                      ;one pass per border row

top_bottom_16_b16
    subs        r7, r7, #1

    vst1.8      {q0, q1}, [r5]!
    vst1.8      {q8, q9}, [r6]!
    vst1.8      {q2, q3}, [r5]!
    vst1.8      {q10, q11}, [r6]!
    vst1.8      {q4, q5}, [r5]!
    vst1.8      {q12, q13}, [r6]!
    vst1.8      {q6, q7}, [r5]!
    vst1.8      {q14, q15}, [r6]!

    add         r5, r5, lr
    sub         r5, r5, #128
    add         r6, r6, lr
    sub         r6, r6, #128

    bne         top_bottom_16_b16

    sub         r5, r1, r8
    add         r6, r2, lr

    subs        r12, r12, #1
    bne         copy_top_bottom_y_b16

check_extra_top_bottom_y_b16
    mov         r7, lr, lsr #4              ;check to see if extra copy is needed
    ands        r7, r7, #0x7                ;leftover 16-byte chunks
    bne         extra_top_bottom_y_b16
end_of_border_copy_y_b16

;Border copy for U, V planes
    ldr         r1, [r0, #yv12_buffer_config_u_buffer]      ;srcptr1
    mov         lr, lr, lsr #1              ;uv_stride
    mov         r3, r3, lsr #1              ;border
    mov         r4, r4, lsr #1              ;uv_height
    mov         r8, r8, lsr #2              ;uv border * uv_stride

    mov         r10, #2                     ;two planes: U, then V

;copy the left and right most columns out
border_copy_uv_b16
    sub         r5, r1, r3                  ;destptr1
    add         r6, r1, lr
    sub         r6, r6, r3, lsl #1          ;destptr2
    sub         r2, r6, #1                  ;srcptr2

    mov         r7, r1                      ;keep the plane base for the top/bottom pass

    ;Do eight rows at one time
    mov         r12, r4, lsr #3

copy_left_right_uv_b16
    ; 8-byte chroma border: one replicated d register per row.
    vld1.8      {d0[]}, [r1], lr
    vld1.8      {d2[]}, [r2], lr
    vld1.8      {d4[]}, [r1], lr
    vld1.8      {d6[]}, [r2], lr
    vld1.8      {d8[]}, [r1], lr
    vld1.8      {d10[]}, [r2], lr
    vld1.8      {d12[]}, [r1], lr
    vld1.8      {d14[]}, [r2], lr
    vld1.8      {d16[]}, [r1], lr
    vld1.8      {d18[]}, [r2], lr
    vld1.8      {d20[]}, [r1], lr
    vld1.8      {d22[]}, [r2], lr
    vld1.8      {d24[]}, [r1], lr
    vld1.8      {d26[]}, [r2], lr
    vld1.8      {d28[]}, [r1], lr
    vld1.8      {d30[]}, [r2], lr

    subs        r12, r12, #1

    vst1.8      {d0}, [r5], lr
    vst1.8      {d2}, [r6], lr
    vst1.8      {d4}, [r5], lr
    vst1.8      {d6}, [r6], lr
    vst1.8      {d8}, [r5], lr
    vst1.8      {d10}, [r6], lr
    vst1.8      {d12}, [r5], lr
    vst1.8      {d14}, [r6], lr
    vst1.8      {d16}, [r5], lr
    vst1.8      {d18}, [r6], lr
    vst1.8      {d20}, [r5], lr
    vst1.8      {d22}, [r6], lr
    vst1.8      {d24}, [r5], lr
    vst1.8      {d26}, [r6], lr
    vst1.8      {d28}, [r5], lr
    vst1.8      {d30}, [r6], lr

    bne         copy_left_right_uv_b16

;Now copy the top and bottom source lines into each line of the respective borders
    mov         r12, lr, lsr #6             ;number of full 64-byte groups per row

    sub         r6, r1, r3                  ;destptr2
    sub         r2, r6, lr                  ;srcptr2
    sub         r1, r7, r3                  ;srcptr1
    sub         r5, r1, r8                  ;destptr1

    ; Skip the 64-byte loop when uv_stride < 64.
    cmp         r12, #0
    beq         check_extra_top_bottom_uv_b16

copy_top_bottom_uv_b16
    vld1.8      {q0, q1}, [r1]!
    vld1.8      {q8, q9}, [r2]!
    vld1.8      {q2, q3}, [r1]!
    vld1.8      {q10, q11}, [r2]!

    mov         r7, r3

top_bottom_8_b16
    subs        r7, r7, #1

    vst1.8      {q0, q1}, [r5]!
    vst1.8      {q8, q9}, [r6]!
    vst1.8      {q2, q3}, [r5]!
    vst1.8      {q10, q11}, [r6]!

    add         r5, r5, lr
    sub         r5, r5, #64
    add         r6, r6, lr
    sub         r6, r6, #64

    bne         top_bottom_8_b16

    sub         r5, r1, r8
    add         r6, r2, lr

    subs        r12, r12, #1
    bne         copy_top_bottom_uv_b16

check_extra_top_bottom_uv_b16
    mov         r7, lr, lsr #3              ;check to see if extra copy is needed
    ands        r7, r7, #0x7                ;leftover 8-byte chunks
    bne         extra_top_bottom_uv_b16

end_of_border_copy_uv_b16
    subs        r10, r10, #1
    ldrne       r1, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
    bne         border_copy_uv_b16

    vpop        {d8 - d15}
    pop         {r4 - r10, pc}

;;;;;;;;;;;;;;;;;;;;;;
;extra copy part for Y
; Copies one leftover 16-byte chunk of the top and bottom rows per pass;
; r7 holds the number of chunks left.
extra_top_bottom_y_b16
    vld1.8      {q0}, [r1]!
    vld1.8      {q2}, [r2]!

    mov         r9, r3, lsr #3              ;eight border rows per inner pass

extra_top_bottom_16_b16
    subs        r9, r9, #1

    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    vst1.8      {q0}, [r5], lr
    vst1.8      {q2}, [r6], lr
    bne         extra_top_bottom_16_b16

    sub         r5, r1, r8
    add         r6, r2, lr
    subs        r7, r7, #1
    bne         extra_top_bottom_y_b16

    b           end_of_border_copy_y_b16

;extra copy part for UV
; Same as above with 8-byte chunks.
extra_top_bottom_uv_b16
    vld1.8      {d0}, [r1]!
    vld1.8      {d8}, [r2]!

    mov         r9, r3, lsr #3

extra_top_bottom_8_b16
    subs        r9, r9, #1

    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    vst1.8      {d0}, [r5], lr
    vst1.8      {d8}, [r6], lr
    bne         extra_top_bottom_8_b16

    sub         r5, r1, r8
    add         r6, r2, lr
    subs        r7, r7, #1
    bne         extra_top_bottom_uv_b16

    b           end_of_border_copy_uv_b16

    ENDP