; vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
;
;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;

    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2
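
; Build the 16x16 luma (Y) intra predictor for one macroblock. r3 selects
; the prediction mode: DC (0), vertical (1), horizontal (2) or TrueMotion
; (3); the Up/Left stack arguments flag whether the above row and left
; column are available to the DC average.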
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_neon_func| PROC
    push        {r4-r8, lr}

    cmp         r3, #0
    beq         case_dc_pred
    cmp         r3, #1
    beq         case_v_pred
    cmp         r3, #2
    beq         case_h_pred
    cmp         r3, #3
    beq         case_tm_pred

case_dc_pred
    ldr         r4, [sp, #24]       ; Up
    ldr         r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov         r12, #128
    vdup.u8     q0, r12

    ; Zero out running sum
    mov         r12, #0

    ; compute shift and jump
    adds        r7, r4, r5
    beq         skip_dc_pred_up_left

    ; Load above row, if it exists
    cmp         r4, #0
    beq         skip_dc_pred_up

    sub         r6, r0, r2
    vld1.8      {q1}, [r6]
    vpaddl.u8   q2, q1
    vpaddl.u16  q3, q2
    vpaddl.u32  q4, q3
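
    ; The three pairwise add-long steps reduce the 16 above-row bytes
    ; 16xu8 -> 8xu16 -> 4xu32 -> 2xu64.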

    ; Move back to integer registers
    vmov.32     r4, d8[0]
    vmov.32     r6, d9[0]

    add         r12, r4, r6

skip_dc_pred_up

    cmp         r5, #0
    beq         skip_dc_pred_left

    sub         r0, r0, #1

    ; Load left column, if it exists
    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0]

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

skip_dc_pred_left
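    ; Rounded average: Shift (r7) is 4 when only one neighbour set was
    ; available (16 pixels) or 5 when both were (32 pixels), so the
    ; sequence below computes (sum + (1 << (Shift - 1))) >> Shift.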
    add         r7, r7, #3          ; Shift
    sub         r4, r7, #1
    mov         r5, #1
    add         r12, r12, r5, lsl r4
    mov         r5, r12, lsr r7     ; expected_dc

    vdup.u8     q0, r5

skip_dc_pred_up_left
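    ; Fill all 16 rows of the contiguous 16x16 predictor with the DC
    ; value; each 16-byte store post-increments r1 to the next row.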
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!

    pop         {r4-r8,pc}

case_v_pred
    ; Copy down above row
    sub         r6, r0, r2
    vld1.8      {q0}, [r6]
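
    ; Vertical prediction replicates the above row into all 16 output rows.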
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q0}, [r1]!

    pop         {r4-r8,pc}

case_h_pred
    ; Load 4x yleft_col
    sub         r0, r0, #1
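
    ; Horizontal prediction fills each output row with that row's
    ; left-column pixel, four rows per group below.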

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q1}, [r1]!
    vst1.u8     {q2}, [r1]!
    vst1.u8     {q3}, [r1]!

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q1}, [r1]!
    vst1.u8     {q2}, [r1]!
    vst1.u8     {q3}, [r1]!

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q1}, [r1]!
    vst1.u8     {q2}, [r1]!
    vst1.u8     {q3}, [r1]!

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1]!
    vst1.u8     {q1}, [r1]!
    vst1.u8     {q2}, [r1]!
    vst1.u8     {q3}, [r1]!

    pop         {r4-r8,pc}

case_tm_pred
    ; Load yabove_row
    sub         r3, r0, r2
    vld1.8      {q8}, [r3]

    ; Load ytop_left
    sub         r3, r3, #1
    ldrb        r7, [r3]

    vdup.u16    q7, r7

    ; Compute yabove_row - ytop_left
    mov         r3, #1
    vdup.u8     q0, r3

    vmull.u8    q4, d16, d0
    vmull.u8    q5, d17, d0

    vsub.s16    q4, q4, q7
    vsub.s16    q5, q5, q7
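
    ; TrueMotion: pred[row][col] = clamp(left[row] + above[col] - top_left).
    ; The multiply-long by 1 above just widens the above row to 16 bits, so
    ; q4/q5 now hold (above[col] - top_left) as signed 16-bit values.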

    ; Load 4x yleft_col
    sub         r0, r0, #1
    mov         r12, #4

case_tm_pred_loop
    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u16    q0, r3
    vdup.u16    q1, r4
    vdup.u16    q2, r5
    vdup.u16    q3, r6

    vqadd.s16   q8, q0, q4
    vqadd.s16   q9, q0, q5

    vqadd.s16   q10, q1, q4
    vqadd.s16   q11, q1, q5

    vqadd.s16   q12, q2, q4
    vqadd.s16   q13, q2, q5

    vqadd.s16   q14, q3, q4
    vqadd.s16   q15, q3, q5

    vqshrun.s16 d0, q8, #0
    vqshrun.s16 d1, q9, #0

    vqshrun.s16 d2, q10, #0
    vqshrun.s16 d3, q11, #0

    vqshrun.s16 d4, q12, #0
    vqshrun.s16 d5, q13, #0

    vqshrun.s16 d6, q14, #0
    vqshrun.s16 d7, q15, #0
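
    ; vqshrun with a zero shift narrows the saturated 16-bit sums back to
    ; unsigned bytes, clamping each result to [0, 255].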

    vst1.u8     {q0}, [r1]!
    vst1.u8     {q1}, [r1]!
    vst1.u8     {q2}, [r1]!
    vst1.u8     {q3}, [r1]!

    subs        r12, r12, #1
    bne         case_tm_pred_loop

    pop         {r4-r8,pc}

    ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_s_neon_func| PROC
    push        {r4-r8, lr}

    mov         r1, r0      ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
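
    ; The _s variant writes the prediction straight into the frame buffer,
    ; so every store below advances by y_stride (r2) instead of stepping
    ; through a contiguous predictor block.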

    cmp         r3, #0
    beq         case_dc_pred_s
    cmp         r3, #1
    beq         case_v_pred_s
    cmp         r3, #2
    beq         case_h_pred_s
    cmp         r3, #3
    beq         case_tm_pred_s

case_dc_pred_s
    ldr         r4, [sp, #24]       ; Up
    ldr         r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov         r12, #128
    vdup.u8     q0, r12

    ; Zero out running sum
    mov         r12, #0

    ; compute shift and jump
    adds        r7, r4, r5
    beq         skip_dc_pred_up_left_s

    ; Load above row, if it exists
    cmp         r4, #0
    beq         skip_dc_pred_up_s

    sub         r6, r0, r2
    vld1.8      {q1}, [r6]
    vpaddl.u8   q2, q1
    vpaddl.u16  q3, q2
    vpaddl.u32  q4, q3

    ; Move back to integer registers
    vmov.32     r4, d8[0]
    vmov.32     r6, d9[0]

    add         r12, r4, r6

skip_dc_pred_up_s

    cmp         r5, #0
    beq         skip_dc_pred_left_s

    sub         r0, r0, #1

    ; Load left column, if it exists
    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0]

    add         r12, r12, r3
    add         r12, r12, r4
    add         r12, r12, r5
    add         r12, r12, r6

skip_dc_pred_left_s
    add         r7, r7, #3          ; Shift
    sub         r4, r7, #1
    mov         r5, #1
    add         r12, r12, r5, lsl r4
    mov         r5, r12, lsr r7     ; expected_dc

    vdup.u8     q0, r5

skip_dc_pred_up_left_s
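    ; Same DC fill as the non-_s path above, but each store advances by
    ; the stride because the prediction is written in place.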
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2

    pop         {r4-r8,pc}

case_v_pred_s
    ; Copy down above row
    sub         r6, r0, r2
    vld1.8      {q0}, [r6]

    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q0}, [r1], r2

    pop         {r4-r8,pc}

case_h_pred_s
    ; Load 4x yleft_col
    sub         r0, r0, #1

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q1}, [r1], r2
    vst1.u8     {q2}, [r1], r2
    vst1.u8     {q3}, [r1], r2

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q1}, [r1], r2
    vst1.u8     {q2}, [r1], r2
    vst1.u8     {q3}, [r1], r2

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q1}, [r1], r2
    vst1.u8     {q2}, [r1], r2
    vst1.u8     {q3}, [r1], r2

    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u8     q0, r3
    vdup.u8     q1, r4
    vdup.u8     q2, r5
    vdup.u8     q3, r6
    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q1}, [r1], r2
    vst1.u8     {q2}, [r1], r2
    vst1.u8     {q3}, [r1], r2

    pop         {r4-r8,pc}

case_tm_pred_s
    ; Load yabove_row
    sub         r3, r0, r2
    vld1.8      {q8}, [r3]

    ; Load ytop_left
    sub         r3, r3, #1
    ldrb        r7, [r3]

    vdup.u16    q7, r7

    ; Compute yabove_row - ytop_left
    mov         r3, #1
    vdup.u8     q0, r3

    vmull.u8    q4, d16, d0
    vmull.u8    q5, d17, d0

    vsub.s16    q4, q4, q7
    vsub.s16    q5, q5, q7

    ; Load 4x yleft_col
    sub         r0, r0, #1
    mov         r12, #4

case_tm_pred_loop_s
    ldrb        r3, [r0], r2
    ldrb        r4, [r0], r2
    ldrb        r5, [r0], r2
    ldrb        r6, [r0], r2
    vdup.u16    q0, r3
    vdup.u16    q1, r4
    vdup.u16    q2, r5
    vdup.u16    q3, r6

    vqadd.s16   q8, q0, q4
    vqadd.s16   q9, q0, q5

    vqadd.s16   q10, q1, q4
    vqadd.s16   q11, q1, q5

    vqadd.s16   q12, q2, q4
    vqadd.s16   q13, q2, q5

    vqadd.s16   q14, q3, q4
    vqadd.s16   q15, q3, q5

    vqshrun.s16 d0, q8, #0
    vqshrun.s16 d1, q9, #0

    vqshrun.s16 d2, q10, #0
    vqshrun.s16 d3, q11, #0

    vqshrun.s16 d4, q12, #0
    vqshrun.s16 d5, q13, #0

    vqshrun.s16 d6, q14, #0
    vqshrun.s16 d7, q15, #0

    vst1.u8     {q0}, [r1], r2
    vst1.u8     {q1}, [r1], r2
    vst1.u8     {q2}, [r1], r2
    vst1.u8     {q3}, [r1], r2

    subs        r12, r12, #1
    bne         case_tm_pred_loop_s

    pop         {r4-r8,pc}

    ENDP

    END