Merge "respect alignment in arm asm files"
[libvpx.git] / vpx_scale / arm / neon / vp8_vpxyv12_copysrcframe_func_neon.asm
blobec64dbc4ea4723e06ad0e723259b6aa6a47da60e
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
    EXPORT  |vp8_yv12_copy_src_frame_func_neon|
    ARM
    REQUIRE8
    PRESERVE8

    INCLUDE asm_com_offsets.asm

    AREA ||.text||, CODE, READONLY, ALIGN=2
;Note: This function is used to copy source data in src_buffer[i] at the
;beginning of encoding. The buffer has a width and height of cpi->oxcf.Width
;and cpi->oxcf.Height, which can be ANY numbers (NOT always multiples of 16
;or 4).

;void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
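
;For reference, a rough C sketch of what this routine does. This is an
;illustration only, not the libvpx C fallback; the field names follow the
;YV12_BUFFER_CONFIG offsets pulled in from asm_com_offsets.asm:
;
;    for (i = 0; i < src->y_height; i++)
;        memcpy(dst->y_buffer + i * dst->y_stride,
;               src->y_buffer + i * src->y_stride, src->y_width);
;
;    /* then the same row-by-row copy for u_buffer and v_buffer, */
;    /* over uv_height rows of uv_width bytes at uv_stride.      */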
|vp8_yv12_copy_src_frame_func_neon| PROC
    push            {r4 - r11, lr}
    vpush           {d8 - d15}
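    ;q4-q7 (d8-d15) are callee-saved under the AAPCS, so they must be
    ;preserved before the wide-copy loops below clobber them.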

    ;Copy Y plane
    ldr             r4, [r0, #yv12_buffer_config_y_height]
    ldr             r5, [r0, #yv12_buffer_config_y_width]
    ldr             r6, [r0, #yv12_buffer_config_y_stride]
    ldr             r7, [r1, #yv12_buffer_config_y_stride]
    ldr             r2, [r0, #yv12_buffer_config_y_buffer]      ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_y_buffer]      ;dstptr1

    add             r10, r2, r6                 ;second row src
    add             r11, r3, r7                 ;second row dst
    mov             r6, r6, lsl #1
    mov             r7, r7, lsl #1
    sub             r6, r6, r5                  ;adjust stride
    sub             r7, r7, r5
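    ;r6/r7 now hold 2*stride - width: after a row pair is copied the data
    ;pointers have already advanced by width, so adding r6/r7 lands them
    ;exactly on the next pair of rows.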

    ; copy two rows at a time
    mov             lr, r4, lsr #1              ;lr = y_height / 2 row pairs

cp_src_to_dst_height_loop
    mov             r12, r5                     ;r12 = bytes left in this row pair

cp_width_128_loop
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q4, q5}, [r10]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q6, q7}, [r10]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q12, q13}, [r10]!
    vld1.8          {q10, q11}, [r2]!
    vld1.8          {q14, q15}, [r10]!
    sub             r12, r12, #128
    cmp             r12, #128
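    ;the counter update and compare sit between the loads and stores so the
    ;flags are ready by the time the branch below is reached, overlapping
    ;scalar work with the NEON memory operations.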
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q4, q5}, [r11]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q6, q7}, [r11]!
    vst1.8          {q8, q9}, [r3]!
    vst1.8          {q12, q13}, [r11]!
    vst1.8          {q10, q11}, [r3]!
    vst1.8          {q14, q15}, [r11]!
    bhs             cp_width_128_loop

    cmp             r12, #0
    beq             cp_width_done
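    ;fewer than 128 bytes remain: mop up 8 bytes at a time, then byte by byte.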

cp_width_8_loop
    vld1.8          {d0}, [r2]!
    vld1.8          {d1}, [r10]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    vst1.8          {d1}, [r11]!
    bhs             cp_width_8_loop

    cmp             r12, #0
    beq             cp_width_done

cp_width_1_loop
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    ldrb            r8, [r10], #1
    strb            r8, [r11], #1
    bne             cp_width_1_loop

cp_width_done
    subs            lr, lr, #1
    add             r2, r2, r6
    add             r3, r3, r7
    add             r10, r10, r6
    add             r11, r11, r7
    bne             cp_src_to_dst_height_loop

    ;copy the last line for Y if y_height is odd
    tst             r4, #1
    beq             cp_width_done_1
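    ;r2/r3 already point at the final row here: the pair loop above advanced
    ;them by 2*stride per iteration, past all y_height/2 row pairs.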
    mov             r12, r5

cp_width_128_loop_1
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q8, q9}, [r2]!
    vld1.8          {q10, q11}, [r2]!
    sub             r12, r12, #128
    cmp             r12, #128
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q8, q9}, [r3]!
    vst1.8          {q10, q11}, [r3]!
    bhs             cp_width_128_loop_1

    cmp             r12, #0
    beq             cp_width_done_1

cp_width_8_loop_1
    vld1.8          {d0}, [r2]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    bhs             cp_width_8_loop_1

    cmp             r12, #0
    beq             cp_width_done_1

cp_width_1_loop_1
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    bne             cp_width_1_loop_1

cp_width_done_1

    ;Copy U & V planes
    ldr             r4, [r0, #yv12_buffer_config_uv_height]
    ldr             r5, [r0, #yv12_buffer_config_uv_width]
    ldr             r6, [r0, #yv12_buffer_config_uv_stride]
    ldr             r7, [r1, #yv12_buffer_config_uv_stride]
    ldr             r2, [r0, #yv12_buffer_config_u_buffer]      ;srcptr1
    ldr             r3, [r1, #yv12_buffer_config_u_buffer]      ;dstptr1

    add             r10, r2, r6                 ;second row src
    add             r11, r3, r7                 ;second row dst
    mov             r6, r6, lsl #1
    mov             r7, r7, lsl #1
    sub             r6, r6, r5                  ;adjust stride
    sub             r7, r7, r5

    mov             r9, #2
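    ;r9 counts the chroma planes: the first pass copies U, the second pass
    ;reloads the pointers at the bottom of cp_uv_loop and copies V.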

cp_uv_loop
    ;copy two rows at a time
    mov             lr, r4, lsr #1

cp_src_to_dst_height_uv_loop
    mov             r12, r5
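    ;chroma rows are half the luma width in YV12, so the wide loop below
    ;moves 64 bytes per iteration instead of 128.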

cp_width_uv_64_loop
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q4, q5}, [r10]!
    vld1.8          {q2, q3}, [r2]!
    vld1.8          {q6, q7}, [r10]!
    sub             r12, r12, #64
    cmp             r12, #64
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q4, q5}, [r11]!
    vst1.8          {q2, q3}, [r3]!
    vst1.8          {q6, q7}, [r11]!
    bhs             cp_width_uv_64_loop

    cmp             r12, #0
    beq             cp_width_uv_done

cp_width_uv_8_loop
    vld1.8          {d0}, [r2]!
    vld1.8          {d1}, [r10]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    vst1.8          {d1}, [r11]!
    bhs             cp_width_uv_8_loop

    cmp             r12, #0
    beq             cp_width_uv_done

cp_width_uv_1_loop
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    ldrb            r8, [r10], #1
    strb            r8, [r11], #1
    bne             cp_width_uv_1_loop

cp_width_uv_done
    subs            lr, lr, #1
    add             r2, r2, r6
    add             r3, r3, r7
    add             r10, r10, r6
    add             r11, r11, r7
    bne             cp_src_to_dst_height_uv_loop

    ;copy the last line for U & V if uv_height is odd
    tst             r4, #1
    beq             cp_width_uv_done_1
    mov             r12, r5

cp_width_uv_64_loop_1
    vld1.8          {q0, q1}, [r2]!
    vld1.8          {q2, q3}, [r2]!
    sub             r12, r12, #64
    cmp             r12, #64
    vst1.8          {q0, q1}, [r3]!
    vst1.8          {q2, q3}, [r3]!
    bhs             cp_width_uv_64_loop_1

    cmp             r12, #0
    beq             cp_width_uv_done_1

cp_width_uv_8_loop_1
    vld1.8          {d0}, [r2]!
    sub             r12, r12, #8
    cmp             r12, #8
    vst1.8          {d0}, [r3]!
    bhs             cp_width_uv_8_loop_1

    cmp             r12, #0
    beq             cp_width_uv_done_1

cp_width_uv_1_loop_1
    ldrb            r8, [r2], #1
    subs            r12, r12, #1
    strb            r8, [r3], #1
    bne             cp_width_uv_1_loop_1

cp_width_uv_done_1

    subs            r9, r9, #1
    ldrne           r2, [r0, #yv12_buffer_config_v_buffer]      ;srcptr1
    ldrne           r3, [r1, #yv12_buffer_config_v_buffer]      ;dstptr1
    ldrne           r10, [r0, #yv12_buffer_config_uv_stride]
    ldrne           r11, [r1, #yv12_buffer_config_uv_stride]

    addne           r10, r2, r10                ;second row src
    addne           r11, r3, r11                ;second row dst
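    ;r6/r7 (the adjusted 2*stride - width increments) and r4/r5 carry over
    ;unchanged from the U pass, since U and V share uv_stride, uv_width, and
    ;uv_height.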

    bne             cp_uv_loop

    vpop            {d8 - d15}
    pop             {r4 - r11, pc}

    ENDP

    END