arm: remove duplicate functions
[libvpx.git] / vp8 / common / arm / neon / buildintrapredictorsmby_neon.asm
blob e3ea91fe6c0493dc8e5c510f34fe7ce77e598b53
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

    EXPORT  |vp8_build_intra_predictors_mby_neon_func|
    EXPORT  |vp8_build_intra_predictors_mby_s_neon_func|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left
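
; C-level prototype, as assumed from the argument list above (the exact
; declaration lives in the vp8 headers; this is a sketch, not a quote):
;
;   void vp8_build_intra_predictors_mby_neon_func(unsigned char *y_buffer,
;                                                 unsigned char *ypred_ptr,
;                                                 int y_stride, int mode,
;                                                 int Up, int Left);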

|vp8_build_intra_predictors_mby_neon_func| PROC
    push            {r4-r8, lr}

    cmp             r3, #0
    beq             case_dc_pred
    cmp             r3, #1
    beq             case_v_pred
    cmp             r3, #2
    beq             case_h_pred
    cmp             r3, #3
    beq             case_tm_pred
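
; The four cases below mirror a C-level mode switch; a rough sketch of
; the dispatch (case comments are illustrative, not quoted from source):
;
;   switch (mode)
;   {
;       case 0: /* DC: fill with average of above row / left column  */
;       case 1: /* V:  copy the above row into every row             */
;       case 2: /* H:  replicate each left pixel across its row      */
;       case 3: /* TM: left + above - top_left, clamped to 0..255    */
;   }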

case_dc_pred
    ldr             r4, [sp, #24]       ; Up
    ldr             r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; Compute shift and jump
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6
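
; The vpaddl chain halves the element count at each step (u8 pairs ->
; u16 pairs -> u32 pairs), leaving two 32-bit partial sums that are
; combined above; a C sketch of the same reduction (assumed equivalent):
;
;   unsigned int sum = 0;
;   for (i = 0; i < 16; i++)
;       sum += above_row[i];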

    ; Move back to integer registers

skip_dc_pred_up

    cmp             r5, #0
    beq             skip_dc_pred_left

    sub             r0, r0, #1

    ; Load left column, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left
    add             r7, r7, #3          ; Shift
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4
    mov             r5, r12, lsr r7     ; expected_dc

    vdup.u8         q0, r5
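
; A minimal C sketch of the rounding average just computed (Up and Left
; are 0 or 1, and at least one is set on this path; illustrative only):
;
;   int shift = 3 + Up + Left;  /* 16 or 32 contributing pixels */
;   int expected_dc = (sum + (1 << (shift - 1))) >> shift;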

skip_dc_pred_up_left
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!

    pop             {r4-r8,pc}

case_v_pred
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]
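
; V prediction writes the saved above row into all 16 output rows; a C
; sketch of the effect (predictor rows here are packed 16 bytes apart):
;
;   for (r = 0; r < 16; r++)
;       memcpy(ypred_ptr + r * 16, above_row, 16);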

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q0}, [r1]!
    pop             {r4-r8,pc}

case_h_pred
    ; Load 4x yleft_col
    sub             r0, r0, #1
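
; H prediction replicates each left-column pixel across its row; a C
; sketch of what the four unrolled blocks below compute (illustrative):
;
;   for (r = 0; r < 16; r++)
;       memset(ypred_ptr + r * 16, yleft_col[r], 16);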
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    pop             {r4-r8,pc}

case_tm_pred
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0
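
    ; Multiplying by a vector of ones zero-extends the above row to
    ; 16 bits, so the signed subtract below cannot wrap.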
    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4
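
; TM prediction, four rows per loop pass; a C sketch of what the loop
; computes, where clamp() saturates to 0..255 (names are illustrative):
;
;   for (r = 0; r < 16; r++)
;       for (c = 0; c < 16; c++)
;           ypred_ptr[r * 16 + c] =
;               clamp(yleft_col[r] + above_row[c] - top_left);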

case_tm_pred_loop
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0
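
    ; A shift of #0 means vqshrun only performs the unsigned saturating
    ; narrow back to bytes, which supplies the 0..255 clamp.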

    vst1.u8         {q0}, [r1]!
    vst1.u8         {q1}, [r1]!
    vst1.u8         {q2}, [r1]!
    vst1.u8         {q3}, [r1]!

    subs            r12, r12, #1
    bne             case_tm_pred_loop

    pop             {r4-r8,pc}

    ENDP

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; r0    unsigned char *y_buffer
; r1    unsigned char *ypred_ptr
; r2    int y_stride
; r3    int mode
; stack int Up
; stack int Left

|vp8_build_intra_predictors_mby_s_neon_func| PROC
    push            {r4-r8, lr}

    mov             r1, r0      ; unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
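
; The _s variant predicts straight into the destination frame, so all
; stores below step by y_stride (r2) rather than packed 16-byte rows.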

    cmp             r3, #0
    beq             case_dc_pred_s
    cmp             r3, #1
    beq             case_v_pred_s
    cmp             r3, #2
    beq             case_h_pred_s
    cmp             r3, #3
    beq             case_tm_pred_s

case_dc_pred_s
    ldr             r4, [sp, #24]       ; Up
    ldr             r5, [sp, #28]       ; Left

    ; Default the DC average to 128
    mov             r12, #128
    vdup.u8         q0, r12

    ; Zero out running sum
    mov             r12, #0

    ; Compute shift and jump
    adds            r7, r4, r5
    beq             skip_dc_pred_up_left_s

    ; Load above row, if it exists
    cmp             r4, #0
    beq             skip_dc_pred_up_s

    sub             r6, r0, r2
    vld1.8          {q1}, [r6]
    vpaddl.u8       q2, q1
    vpaddl.u16      q3, q2
    vpaddl.u32      q4, q3

    vmov.32         r4, d8[0]
    vmov.32         r6, d9[0]

    add             r12, r4, r6

    ; Move back to integer registers

skip_dc_pred_up_s

    cmp             r5, #0
    beq             skip_dc_pred_left_s

    sub             r0, r0, #1

    ; Load left column, if it exists
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0]

    add             r12, r12, r3
    add             r12, r12, r4
    add             r12, r12, r5
    add             r12, r12, r6

skip_dc_pred_left_s
    add             r7, r7, #3          ; Shift
    sub             r4, r7, #1
    mov             r5, #1
    add             r12, r12, r5, lsl r4
    mov             r5, r12, lsr r7     ; expected_dc

    vdup.u8         q0, r5

skip_dc_pred_up_left_s
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2

    pop             {r4-r8,pc}

case_v_pred_s
    ; Copy down above row
    sub             r6, r0, r2
    vld1.8          {q0}, [r6]

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q0}, [r1], r2
    pop             {r4-r8,pc}

case_h_pred_s
    ; Load 4x yleft_col
    sub             r0, r0, #1

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u8         q0, r3
    vdup.u8         q1, r4
    vdup.u8         q2, r5
    vdup.u8         q3, r6
    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    pop             {r4-r8,pc}

case_tm_pred_s
    ; Load yabove_row
    sub             r3, r0, r2
    vld1.8          {q8}, [r3]

    ; Load ytop_left
    sub             r3, r3, #1
    ldrb            r7, [r3]

    vdup.u16        q7, r7

    ; Compute yabove_row - ytop_left
    mov             r3, #1
    vdup.u8         q0, r3

    vmull.u8        q4, d16, d0
    vmull.u8        q5, d17, d0

    vsub.s16        q4, q4, q7
    vsub.s16        q5, q5, q7

    ; Load 4x yleft_col
    sub             r0, r0, #1
    mov             r12, #4

case_tm_pred_loop_s
    ldrb            r3, [r0], r2
    ldrb            r4, [r0], r2
    ldrb            r5, [r0], r2
    ldrb            r6, [r0], r2
    vdup.u16        q0, r3
    vdup.u16        q1, r4
    vdup.u16        q2, r5
    vdup.u16        q3, r6

    vqadd.s16       q8, q0, q4
    vqadd.s16       q9, q0, q5

    vqadd.s16       q10, q1, q4
    vqadd.s16       q11, q1, q5

    vqadd.s16       q12, q2, q4
    vqadd.s16       q13, q2, q5

    vqadd.s16       q14, q3, q4
    vqadd.s16       q15, q3, q5

    vqshrun.s16     d0, q8, #0
    vqshrun.s16     d1, q9, #0

    vqshrun.s16     d2, q10, #0
    vqshrun.s16     d3, q11, #0

    vqshrun.s16     d4, q12, #0
    vqshrun.s16     d5, q13, #0

    vqshrun.s16     d6, q14, #0
    vqshrun.s16     d7, q15, #0

    vst1.u8         {q0}, [r1], r2
    vst1.u8         {q1}, [r1], r2
    vst1.u8         {q2}, [r1], r2
    vst1.u8         {q3}, [r1], r2

    subs            r12, r12, #1
    bne             case_tm_pred_loop_s

    pop             {r4-r8,pc}

    ENDP

    END