2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
@ NOTE(review): only the macro signatures survive in this extract; the macro
@ bodies and their .endm terminators are not visible.  Presumably these build
@ NEON register-matrix transposes from vtrn/vswp/vext - confirm against the
@ complete file before editing.
25 .macro transpose_8x8 r0 r1 r2 r3 r4 r5 r6 r7
40 .macro transpose_4x4 r0 r1 r2 r3
47 .macro swap4 r0 r1 r2 r3 r4 r5 r6 r7
54 .macro transpose16_4x4 r0 r1 r2 r3 r4 r5 r6 r7
@ NOTE(review): incomplete extract - labels, branches, weight setup and .endm
@ are missing between the numbered lines.  Do not read the instruction
@ sequence below as contiguous.
@ 8x8 H.264 chroma motion compensation for \type in {put, avg}.  The
@ vld1 from [lr] + vrhadd path is presumably the "avg" variant (rounding
@ average with existing destination pixels) - confirm in the full file.
65 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
66 .macro h264_chroma_mc8 type
67 function ff_\type\()_h264_chroma_mc8_neon, export=1
@ Weight derivation from x (presumably r4) and y (presumably r5):
@ the lsl #3 terms look like the standard bilinear factors 8*x and 8*y
@ used to form A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy - TODO confirm,
@ the vmull/vmlal lines that consume them are not visible here.
77 rsb r6, r7, r5, lsl #3
78 rsb ip, r7, r4, lsl #3
79 sub r4, r7, r4, lsl #3
80 sub r4, r4, r5, lsl #3
@ Main 2D-filter path: load two source rows per iteration.
90 vld1.64 {d4, d5}, [r1], r4
92 vld1.64 {d6, d7}, [r5], r4
101 vld1.64 {d4, d5}, [r1], r4
@ vext.8 ..., #1 forms the row shifted right by one pixel (src[x+1]).
103 vext.8 d5, d4, d5, #1
@ vrshrn.u16 #6 = rounding narrow by 64, matching the sum of the four
@ bilinear weights (A+B+C+D = 64).
110 vrshrn.u16 d16, q8, #6
111 vld1.64 {d6, d7}, [r5], r4
113 vrshrn.u16 d17, q9, #6
115 vld1.64 {d20}, [lr,:64], r2
116 vld1.64 {d21}, [lr,:64], r2
117 vrhadd.u8 q8, q8, q10
119 vext.8 d7, d6, d7, #1
120 vst1.64 {d16}, [r0,:64], r2
121 vst1.64 {d17}, [r0,:64], r2
@ Secondary path (presumably x==0 or y==0: 1D filter, no horizontal vext).
135 vld1.64 {d4}, [r1], r4
136 vld1.64 {d6}, [r5], r4
141 vld1.64 {d4}, [r1], r4
144 vld1.64 {d6}, [r5], r4
145 vrshrn.u16 d16, q8, #6
146 vrshrn.u16 d17, q9, #6
148 vld1.64 {d20}, [lr,:64], r2
149 vld1.64 {d21}, [lr,:64], r2
150 vrhadd.u8 q8, q8, q10
154 vst1.64 {d16}, [r0,:64], r2
155 vst1.64 {d17}, [r0,:64], r2
@ Label 4: horizontal-only path (strides by r2, the pixel stride).
160 4: vld1.64 {d4, d5}, [r1], r2
161 vld1.64 {d6, d7}, [r1], r2
162 vext.8 d5, d4, d5, #1
163 vext.8 d7, d6, d7, #1
169 vld1.64 {d4, d5}, [r1], r2
173 vext.8 d5, d4, d5, #1
174 vrshrn.u16 d16, q8, #6
175 vrshrn.u16 d17, q9, #6
177 vld1.64 {d20}, [lr,:64], r2
178 vld1.64 {d21}, [lr,:64], r2
179 vrhadd.u8 q8, q8, q10
181 vld1.64 {d6, d7}, [r1], r2
182 vext.8 d7, d6, d7, #1
183 vst1.64 {d16}, [r0,:64], r2
184 vst1.64 {d17}, [r0,:64], r2
@ NOTE(review): incomplete extract - labels, branches, multiply lines and
@ .endm are missing between the numbered lines below.
@ 4x4-wide H.264 chroma MC for \type in {put, avg}; same structure as
@ h264_chroma_mc8 but works on 32-bit (4-pixel) lanes, hence the
@ vld1.32/vst1.32 {dN[0]}/{dN[1]} two-rows-per-d-register layout.
191 /* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
192 .macro h264_chroma_mc4 type
193 function ff_\type\()_h264_chroma_mc4_neon, export=1
@ Bilinear weight setup - presumably 8*x / 8*y factors as in mc8; confirm.
203 rsb r6, r7, r5, lsl #3
204 rsb ip, r7, r4, lsl #3
205 sub r4, r7, r4, lsl #3
206 sub r4, r4, r5, lsl #3
@ Full 2D path: load rows, build the +1-pixel-shifted copies with vext.
216 vld1.64 {d4}, [r1], r4
218 vld1.64 {d6}, [r5], r4
221 vext.8 d5, d4, d5, #1
222 vext.8 d7, d6, d7, #1
232 vld1.64 {d4}, [r1], r4
233 vext.8 d5, d4, d5, #1
237 vld1.64 {d6}, [r5], r4
@ Pairwise combine of partial products before the /64 rounding narrow.
238 vadd.i16 d16, d16, d17
239 vadd.i16 d17, d18, d19
240 vrshrn.u16 d16, q8, #6
@ avg variant: rounding-average with destination rows loaded via lr.
244 vld1.32 {d20[0]}, [lr,:32], r2
245 vld1.32 {d20[1]}, [lr,:32], r2
246 vrhadd.u8 d16, d16, d20
248 vext.8 d7, d6, d7, #1
250 vst1.32 {d16[0]}, [r0,:32], r2
251 vst1.32 {d16[1]}, [r0,:32], r2
@ 1D path (x==0 or y==0, presumably): two 4-pixel rows packed per d reg.
264 vext.32 d1, d0, d1, #1
267 vld1.32 {d4[0]}, [r1], r4
268 vld1.32 {d4[1]}, [r5], r4
272 vld1.32 {d4[0]}, [r1], r4
274 vld1.32 {d4[1]}, [r5], r4
275 vadd.i16 d16, d16, d17
276 vadd.i16 d17, d18, d19
277 vrshrn.u16 d16, q8, #6
279 vld1.32 {d20[0]}, [lr,:32], r2
280 vld1.32 {d20[1]}, [lr,:32], r2
281 vrhadd.u8 d16, d16, d20
285 vst1.32 {d16[0]}, [r0,:32], r2
286 vst1.32 {d16[1]}, [r0,:32], r2
@ Label 4/5: horizontal-only path striding by r2.
291 4: vld1.64 {d4}, [r1], r2
292 vld1.64 {d6}, [r1], r2
293 vext.8 d5, d4, d5, #1
294 vext.8 d7, d6, d7, #1
298 5: vmull.u8 q8, d4, d0
301 vld1.64 {d4}, [r1], r2
302 vext.8 d5, d4, d5, #1
304 vadd.i16 d16, d16, d17
305 vadd.i16 d17, d18, d19
307 vrshrn.u16 d16, q8, #6
309 vld1.32 {d20[0]}, [lr,:32], r2
310 vld1.32 {d20[1]}, [lr,:32], r2
311 vrhadd.u8 d16, d16, d20
313 vld1.64 {d6}, [r1], r2
314 vext.8 d7, d6, d7, #1
317 vst1.32 {d16[0]}, [r0,:32], r2
318 vst1.32 {d16[1]}, [r0,:32], r2
333 /* H.264 loop filter */
@ NOTE(review): fragment - the lines that load ip (presumably the packed
@ tc0 values / filter flags) and the conditional early-return are missing.
@ The and/ands fold the per-edge flags together; "ands" sets the condition
@ codes for a skip branch not visible in this extract.
335 .macro h264_loop_filter_start
341 and ip, ip, ip, lsl #16
343 ands ip, ip, ip, lsl #8
@ Spill/restore of d8-d15, the NEON registers the AAPCS makes callee-saved,
@ to a 128-bit-aligned stack area.  NOTE(review): the sp-adjustment lines
@ between the stores (and the .endm lines) are missing from this extract;
@ the pop side reloads with post-increment and a final ip-sized adjust.
347 .macro align_push_regs
351 vst1.64 {d12-d15}, [sp,:128]
353 vst1.64 {d8-d11}, [sp,:128]
356 .macro align_pop_regs
357 vld1.64 {d8-d11}, [sp,:128]!
358 vld1.64 {d12-d15}, [sp,:128], ip
@ H.264 luma deblocking filter core (shared by the v/h entry points).
@ Implements the standard filter condition
@   |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
@ plus the |p2-p0| / |q2-q0| < beta tests that enable the stronger
@ p1/q1 adjustment.  Register layout on entry (from the callers below):
@ q8=p0, q9=p1, q10=p2, q0=q0, q1=q1, q2=q2; r2=alpha, r3=beta.
@ NOTE(review): incomplete extract - the tc0 handling, masking and final
@ clamp/apply lines are missing between the numbered lines.
361 .macro h264_loop_filter_luma
362 vdup.8 q11, r2 @ alpha
364 vabd.u8 q6, q8, q0 @ abs(p0 - q0)
366 vabd.u8 q14, q9, q8 @ abs(p1 - p0)
368 vabd.u8 q15, q1, q0 @ abs(q1 - q0)
@ Broadcast the packed tc0 bytes across lanes (presumably; the preceding
@ vdup/vmov of q12 is not visible) - TODO confirm in the full file.
369 vsli.32 q12, q12, #16
370 vclt.u8 q6, q6, q11 @ < alpha
371 vdup.8 q11, r3 @ beta
373 vclt.u8 q14, q14, q11 @ < beta
374 vclt.u8 q15, q15, q11 @ < beta
376 vabd.u8 q4, q10, q8 @ abs(p2 - p0)
378 vabd.u8 q5, q2, q0 @ abs(q2 - q0)
379 vclt.u8 q4, q4, q11 @ < beta
381 vclt.u8 q5, q5, q11 @ < beta
@ (p0+q0+1)>>1 then halving-adds to form the p1'/q1' candidates.
385 vrhadd.u8 q14, q8, q0
388 vhadd.u8 q10, q10, q14
390 vhadd.u8 q14, q2, q14
@ Clip p1'/q1' to +-tc0 via saturating sub/max (pair with a vmin not shown).
392 vqsub.u8 q11, q9, q12
395 vqsub.u8 q11, q1, q12
398 vmax.u8 q14, q14, q11
@ delta = ((q0-p0)<<2 + (p1-q1) + 4) >> 3, built in 16-bit lanes.
401 vsubw.u8 q10, q10, d17
403 vshl.i16 q10, q10, #2
405 vaddw.u8 q10, q10, d19
407 vsubw.u8 q10, q10, d3
408 vrshrn.i16 d4, q2, #3
409 vrshrn.i16 d5, q10, #3
@ Apply the (clipped) signed delta to p0/q0.
419 vaddw.s8 q14, q14, d4
421 vsubw.s8 q11, q11, d4
422 vsubw.s8 q12, q12, d5
@ Vertical-edge luma deblock: rows are contiguous, so plain 128-bit row
@ loads/stores suffice (no transpose).  r0=pix, r1=stride; walks down
@ three rows, rewinds r0 by 6 rows (lsl#2 + lsl#1), then loads the three
@ rows above the edge.  NOTE(review): prologue/epilogue lines (push/pop,
@ align_push_regs, early-exit branch) are missing from this extract.
429 function ff_h264_v_loop_filter_luma_neon, export=1
430 h264_loop_filter_start
432 vld1.64 {d0, d1}, [r0,:128], r1
433 vld1.64 {d2, d3}, [r0,:128], r1
434 vld1.64 {d4, d5}, [r0,:128], r1
435 sub r0, r0, r1, lsl #2
436 sub r0, r0, r1, lsl #1
437 vld1.64 {d20,d21}, [r0,:128], r1
438 vld1.64 {d18,d19}, [r0,:128], r1
439 vld1.64 {d16,d17}, [r0,:128], r1
443 h264_loop_filter_luma
@ Write back the four filtered rows (q4,q8,q0,q5 per the filter macro).
445 sub r0, r0, r1, lsl #1
446 vst1.64 {d8, d9}, [r0,:128], r1
447 vst1.64 {d16,d17}, [r0,:128], r1
448 vst1.64 {d0, d1}, [r0,:128], r1
449 vst1.64 {d10,d11}, [r0,:128]
@ Horizontal-edge luma deblock: the edge runs vertically in memory, so 16
@ 8-byte columns are loaded, transposed into the register layout the
@ shared filter macro expects, filtered, transposed back (4x4, since only
@ p1..q1 change) and stored as 32-bit lane slices.  NOTE(review):
@ prologue (sub r0, push) lines are missing from this extract.
455 function ff_h264_h_loop_filter_luma_neon, export=1
456 h264_loop_filter_start
459 vld1.64 {d6}, [r0], r1
460 vld1.64 {d20}, [r0], r1
461 vld1.64 {d18}, [r0], r1
462 vld1.64 {d16}, [r0], r1
463 vld1.64 {d0}, [r0], r1
464 vld1.64 {d2}, [r0], r1
465 vld1.64 {d4}, [r0], r1
466 vld1.64 {d26}, [r0], r1
467 vld1.64 {d7}, [r0], r1
468 vld1.64 {d21}, [r0], r1
469 vld1.64 {d19}, [r0], r1
470 vld1.64 {d17}, [r0], r1
471 vld1.64 {d1}, [r0], r1
472 vld1.64 {d3}, [r0], r1
473 vld1.64 {d5}, [r0], r1
474 vld1.64 {d27}, [r0], r1
476 transpose_8x8 q3, q10, q9, q8, q0, q1, q2, q13
480 h264_loop_filter_luma
@ Only 4 pixels either side of the edge were modified -> 4x4 transpose.
482 transpose_4x4 q4, q8, q0, q5
@ Rewind the 16 rows consumed above, then store 4 bytes per row.
484 sub r0, r0, r1, lsl #4
486 vst1.32 {d8[0]}, [r0], r1
487 vst1.32 {d16[0]}, [r0], r1
488 vst1.32 {d0[0]}, [r0], r1
489 vst1.32 {d10[0]}, [r0], r1
490 vst1.32 {d8[1]}, [r0], r1
491 vst1.32 {d16[1]}, [r0], r1
492 vst1.32 {d0[1]}, [r0], r1
493 vst1.32 {d10[1]}, [r0], r1
494 vst1.32 {d9[0]}, [r0], r1
495 vst1.32 {d17[0]}, [r0], r1
496 vst1.32 {d1[0]}, [r0], r1
497 vst1.32 {d11[0]}, [r0], r1
498 vst1.32 {d9[1]}, [r0], r1
499 vst1.32 {d17[1]}, [r0], r1
500 vst1.32 {d1[1]}, [r0], r1
501 vst1.32 {d11[1]}, [r0], r1
@ H.264 chroma deblocking core: same alpha/beta gating as luma but on
@ 64-bit (8-pixel) d registers and without the p2/q2 strong path.
@ d18=p1, d16=p0, d0=q0, d2=q1; r2=alpha, r3=beta.  NOTE(review):
@ incomplete extract - the tc handling, delta computation feeding q2,
@ the mask application and .endm are missing.
507 .macro h264_loop_filter_chroma
508 vdup.8 d22, r2 @ alpha
510 vabd.u8 d26, d16, d0 @ abs(p0 - q0)
512 vabd.u8 d28, d18, d16 @ abs(p1 - p0)
516 vabd.u8 d30, d2, d0 @ abs(q1 - q0)
518 vclt.u8 d26, d26, d22 @ < alpha
520 vdup.8 d22, r3 @ beta
@ delta = (...+4)>>3, rounded narrow from the 16-bit accumulator q2.
522 vrshrn.i16 d4, q2, #3
523 vclt.u8 d28, d28, d22 @ < beta
525 vclt.u8 d30, d30, d22 @ < beta
@ p0 += delta, q0 -= delta (signed widening add/sub).
534 vaddw.s8 q14, q14, d4
535 vsubw.s8 q11, q11, d4
@ Vertical-edge chroma deblock: rewind two rows, load p1,p0,q0,q1 as whole
@ rows, filter, store p0/q0 back.  NOTE(review): prologue/epilogue and the
@ early-exit path are missing from this extract.
540 function ff_h264_v_loop_filter_chroma_neon, export=1
541 h264_loop_filter_start
543 sub r0, r0, r1, lsl #1
544 vld1.64 {d18}, [r0,:64], r1
545 vld1.64 {d16}, [r0,:64], r1
546 vld1.64 {d0}, [r0,:64], r1
547 vld1.64 {d2}, [r0,:64]
549 h264_loop_filter_chroma
551 sub r0, r0, r1, lsl #1
552 vst1.64 {d16}, [r0,:64], r1
553 vst1.64 {d0}, [r0,:64], r1
@ Horizontal-edge chroma deblock: gather eight 4-byte columns into lane
@ halves, (transpose - not visible in this extract), filter, rewind the
@ eight rows (lsl #3) and scatter the 4-byte results back.
558 function ff_h264_h_loop_filter_chroma_neon, export=1
559 h264_loop_filter_start
562 vld1.32 {d18[0]}, [r0], r1
563 vld1.32 {d16[0]}, [r0], r1
564 vld1.32 {d0[0]}, [r0], r1
565 vld1.32 {d2[0]}, [r0], r1
566 vld1.32 {d18[1]}, [r0], r1
567 vld1.32 {d16[1]}, [r0], r1
568 vld1.32 {d0[1]}, [r0], r1
569 vld1.32 {d2[1]}, [r0], r1
576 h264_loop_filter_chroma
583 sub r0, r0, r1, lsl #3
584 vst1.32 {d18[0]}, [r0], r1
585 vst1.32 {d16[0]}, [r0], r1
586 vst1.32 {d0[0]}, [r0], r1
587 vst1.32 {d2[0]}, [r0], r1
588 vst1.32 {d18[1]}, [r0], r1
589 vst1.32 {d16[1]}, [r0], r1
590 vst1.32 {d0[1]}, [r0], r1
591 vst1.32 {d2[1]}, [r0], r1
@ H.264 6-tap quarter-pel lowpass filter (1, -5, 20, 20, -5, 1)/32 built
@ from vext-shifted copies of the source row:
@   out = (s0+s5) + 20*(s2+s3) - 5*(s1+s4), then vqrshrun >> 5.
@ The 20/5 coefficients presumably live in d6 lanes, set up by
@ lowpass_const (body not visible) - TODO confirm.  NOTE(review):
@ incomplete extract - the vaddl lines pairing d2/d3 and d4/d5, the
@ narrow=0 alternative tail, and the .endm lines are missing.
598 .macro lowpass_const r
604 .macro lowpass_8 r0, r1, r2, r3, d0, d1, narrow=1
@ Shifted taps for the first row (\r0:\r1).
612 vext.8 d2, \r0, \r1, #2
613 vext.8 d3, \r0, \r1, #3
615 vext.8 d4, \r0, \r1, #1
616 vext.8 d5, \r0, \r1, #4
618 vext.8 d30, \r0, \r1, #5
619 vaddl.u8 t0, \r0, d30
@ Second row (\r2:\r3), interleaved with the first row's MACs.
620 vext.8 d18, \r2, \r3, #2
621 vmla.i16 t0, q1, d6[1]
622 vext.8 d19, \r2, \r3, #3
623 vaddl.u8 q9, d18, d19
624 vext.8 d20, \r2, \r3, #1
625 vmls.i16 t0, q2, d6[0]
626 vext.8 d21, \r2, \r3, #4
627 vaddl.u8 q10, d20, d21
628 vext.8 d31, \r2, \r3, #5
629 vaddl.u8 t1, \r2, d31
630 vmla.i16 t1, q9, d6[1]
631 vmls.i16 t1, q10, d6[0]
@ Saturating rounded narrow by 32 back to u8 (narrow=1 case).
633 vqrshrun.s16 \d0, t0, #5
634 vqrshrun.s16 \d1, t1, #5
@ Single-row variant of the same 6-tap filter.
640 .macro lowpass_8_1 r0, r1, d0, narrow=1
646 vext.8 d2, \r0, \r1, #2
647 vext.8 d3, \r0, \r1, #3
649 vext.8 d4, \r0, \r1, #1
650 vext.8 d5, \r0, \r1, #4
652 vext.8 d30, \r0, \r1, #5
653 vaddl.u8 t0, \r0, d30
654 vmla.i16 t0, q1, d6[1]
655 vmls.i16 t0, q2, d6[0]
657 vqrshrun.s16 \d0, t0, #5
@ 16-bit-input variant of the 6-tap lowpass, used for the second pass of
@ the hv (2D) filter: operates on q registers of s16 intermediates,
@ widens to s32 and rounds-narrows by 10 (5+5 for the two passes).
@ NOTE(review): incomplete extract - the coefficient MAC lines and the
@ final vqmovun/.endm are missing between the numbered lines.
662 .macro lowpass_8.16 r0, r1, l0, h0, l1, h1, d
663 vext.16 q1, \r0, \r1, #2
664 vext.16 q0, \r0, \r1, #3
666 vext.16 q2, \r0, \r1, #1
668 vext.16 q3, \r0, \r1, #4
669 vaddl.s16 q10, d4, d6
670 vext.16 \r1, \r0, \r1, #5
672 vaddl.s16 q0, \h0, \h1
673 vaddl.s16 q8, \l0, \l1
@ 20*x as x + (x<<2)*4-style shift/adds (q15 = 4*q10; q10 += q15 => 5x,
@ remaining factor presumably applied in missing lines) - TODO confirm.
677 vshl.i32 q15, q10, #2
679 vadd.i32 q10, q10, q15
693 vrshrn.s32 d18, q9, #10
694 vrshrn.s32 d19, q1, #10
@ Horizontal qpel lowpass drivers.  The 16-wide versions process the
@ block as four 8x8 calls into put_h264_qpel8_h_lowpass_neon, rewinding
@ src (r1) / dst (r0) by 16 rows (lsl #4) between column halves; the
@ final call is a tail-jump (b) so the helper's ret returns to our
@ caller.  NOTE(review): loop counters, pointer advances and endfunc
@ lines are missing from this extract.
698 function put_h264_qpel16_h_lowpass_neon_packed
703 bl put_h264_qpel8_h_lowpass_neon
704 sub r1, r1, r2, lsl #4
708 b put_h264_qpel8_h_lowpass_neon
711 function put_h264_qpel16_h_lowpass_neon
714 bl put_h264_qpel8_h_lowpass_neon
715 sub r0, r0, r3, lsl #4
716 sub r1, r1, r2, lsl #4
@ 8-wide worker: two source rows per iteration through the 6-tap filter.
@ r1=src (stride r2), r0=dst (stride r3).
723 function put_h264_qpel8_h_lowpass_neon
724 1: vld1.64 {d0, d1}, [r1], r2
725 vld1.64 {d16,d17}, [r1], r2
727 lowpass_8 d0, d1, d16, d17, d0, d16
728 vst1.64 {d0}, [r0,:64], r3
729 vst1.64 {d16}, [r0,:64], r3
@ "l2" variants: lowpass result is rounding-averaged (vrhadd) with a
@ second source read via r3 - used for the quarter-pel positions that
@ average the half-pel filter output with a neighbouring full-pel block.
@ NOTE(review): loop control and endfunc lines are missing.
734 function put_h264_qpel16_h_lowpass_l2_neon
737 bl put_h264_qpel8_h_lowpass_l2_neon
738 sub r0, r0, r2, lsl #4
739 sub r1, r1, r2, lsl #4
740 sub r3, r3, r2, lsl #4
748 function put_h264_qpel8_h_lowpass_l2_neon
749 1: vld1.64 {d0, d1}, [r1], r2
750 vld1.64 {d16,d17}, [r1], r2
751 vld1.64 {d28}, [r3], r2
752 vld1.64 {d29}, [r3], r2
754 lowpass_8 d0, d1, d16, d17, d0, d1
@ Average filter output with the second reference rows.
755 vrhadd.u8 q0, q0, q14
756 vst1.64 {d0}, [r0,:64], r2
757 vst1.64 {d1}, [r0,:64], r2
@ Vertical qpel lowpass for 16x16: four 8x8 sub-blocks via
@ put_h264_qpel8_v_lowpass_neon.  Between vertically adjacent sub-blocks
@ src rewinds 4 rows (lsl #2) so the 6-tap filter's context overlaps;
@ moving to the right half rewinds the full 16+4 rows (lsl #4 + lsl #2).
@ NOTE(review): pointer-advance and endfunc lines are missing.
762 function put_h264_qpel16_v_lowpass_neon_packed
765 bl put_h264_qpel8_v_lowpass_neon
766 sub r1, r1, r3, lsl #2
767 bl put_h264_qpel8_v_lowpass_neon
768 sub r1, r1, r3, lsl #4
769 sub r1, r1, r3, lsl #2
771 bl put_h264_qpel8_v_lowpass_neon
772 sub r1, r1, r3, lsl #2
774 b put_h264_qpel8_v_lowpass_neon
777 function put_h264_qpel16_v_lowpass_neon
779 bl put_h264_qpel8_v_lowpass_neon
780 sub r1, r1, r3, lsl #2
781 bl put_h264_qpel8_v_lowpass_neon
782 sub r0, r0, r2, lsl #4
784 sub r1, r1, r3, lsl #4
785 sub r1, r1, r3, lsl #2
787 bl put_h264_qpel8_v_lowpass_neon
788 sub r1, r1, r3, lsl #2
@ 8-wide vertical lowpass: loads 8+5 source rows (6-tap needs 5 extra),
@ transposes so columns become rows, runs the same horizontal lowpass_8
@ macro, transposes back and stores 8 output rows.
@ r1=src (stride r3), r0=dst (stride r2).  Uses callee-saved d8-d15,
@ which the callers presumably preserve via align_push_regs - confirm.
792 function put_h264_qpel8_v_lowpass_neon
793 vld1.64 {d8}, [r1], r3
794 vld1.64 {d10}, [r1], r3
795 vld1.64 {d12}, [r1], r3
796 vld1.64 {d14}, [r1], r3
797 vld1.64 {d22}, [r1], r3
798 vld1.64 {d24}, [r1], r3
799 vld1.64 {d26}, [r1], r3
800 vld1.64 {d28}, [r1], r3
801 vld1.64 {d9}, [r1], r3
802 vld1.64 {d11}, [r1], r3
803 vld1.64 {d13}, [r1], r3
804 vld1.64 {d15}, [r1], r3
807 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
808 lowpass_8 d8, d9, d10, d11, d8, d10
809 lowpass_8 d12, d13, d14, d15, d12, d14
810 lowpass_8 d22, d23, d24, d25, d22, d24
811 lowpass_8 d26, d27, d28, d29, d26, d28
812 transpose_8x8 d8, d10, d12, d14, d22, d24, d26, d28
814 vst1.64 {d8}, [r0,:64], r2
815 vst1.64 {d10}, [r0,:64], r2
816 vst1.64 {d12}, [r0,:64], r2
817 vst1.64 {d14}, [r0,:64], r2
818 vst1.64 {d22}, [r0,:64], r2
819 vst1.64 {d24}, [r0,:64], r2
820 vst1.64 {d26}, [r0,:64], r2
821 vst1.64 {d28}, [r0,:64], r2
@ 16x16 vertical lowpass with second-reference averaging: four 8x8 calls
@ into put_h264_qpel8_v_lowpass_l2_neon, with the same overlap rewinds as
@ the non-l2 16x16 version; ip tracks the second reference pointer.
@ NOTE(review): pointer advances/endfunc are missing from this extract.
826 function put_h264_qpel16_v_lowpass_l2_neon
828 bl put_h264_qpel8_v_lowpass_l2_neon
829 sub r1, r1, r3, lsl #2
830 bl put_h264_qpel8_v_lowpass_l2_neon
831 sub r0, r0, r3, lsl #4
832 sub ip, ip, r2, lsl #4
835 sub r1, r1, r3, lsl #4
836 sub r1, r1, r3, lsl #2
838 bl put_h264_qpel8_v_lowpass_l2_neon
839 sub r1, r1, r3, lsl #2
@ 8-wide vertical lowpass + rounding average with a second reference
@ block read via ip.  Same load/transpose/filter/transpose structure as
@ put_h264_qpel8_v_lowpass_neon, but results are vrhadd'ed against the
@ rows loaded from [ip] before storing.  NOTE(review): some vrhadd lines
@ for q0/q1 are not visible in this extract.
843 function put_h264_qpel8_v_lowpass_l2_neon
844 vld1.64 {d8}, [r1], r3
845 vld1.64 {d10}, [r1], r3
846 vld1.64 {d12}, [r1], r3
847 vld1.64 {d14}, [r1], r3
848 vld1.64 {d22}, [r1], r3
849 vld1.64 {d24}, [r1], r3
850 vld1.64 {d26}, [r1], r3
851 vld1.64 {d28}, [r1], r3
852 vld1.64 {d9}, [r1], r3
853 vld1.64 {d11}, [r1], r3
854 vld1.64 {d13}, [r1], r3
855 vld1.64 {d15}, [r1], r3
858 transpose_8x8 q4, q5, q6, q7, q11, q12, q13, q14
859 lowpass_8 d8, d9, d10, d11, d8, d9
860 lowpass_8 d12, d13, d14, d15, d12, d13
861 lowpass_8 d22, d23, d24, d25, d22, d23
862 lowpass_8 d26, d27, d28, d29, d26, d27
863 transpose_8x8 d8, d9, d12, d13, d22, d23, d26, d27
@ Second reference rows come from ip (stride r2).
865 vld1.64 {d0}, [ip], r2
866 vld1.64 {d1}, [ip], r2
867 vld1.64 {d2}, [ip], r2
868 vld1.64 {d3}, [ip], r2
869 vld1.64 {d4}, [ip], r2
871 vld1.64 {d5}, [ip], r2
873 vld1.64 {d10}, [ip], r2
874 vrhadd.u8 q2, q2, q11
875 vld1.64 {d11}, [ip], r2
877 vst1.64 {d0}, [r0,:64], r3
878 vst1.64 {d1}, [r0,:64], r3
879 vrhadd.u8 q5, q5, q13
880 vst1.64 {d2}, [r0,:64], r3
881 vst1.64 {d3}, [r0,:64], r3
882 vst1.64 {d4}, [r0,:64], r3
883 vst1.64 {d5}, [r0,:64], r3
884 vst1.64 {d10}, [r0,:64], r3
885 vst1.64 {d11}, [r0,:64], r3
@ 2D (horizontal+vertical) qpel lowpass, shared top half: first pass runs
@ the horizontal 6-tap filter with narrow=0, storing 16-bit intermediates
@ to a scratch buffer at r4; the intermediates are then reloaded,
@ transposed in 16-bit lanes (swap4/transpose16_4x4), and the vertical
@ pass (lowpass_8.16, >>10 total) produces the final rows in d8-d15.
@ NOTE(review): loop control, ip stride setup and endfunc are missing
@ from this extract; register-flow comments below are best-effort.
890 function put_h264_qpel8_hv_lowpass_neon_top
893 1: vld1.64 {d0, d1}, [r1], r3
894 vld1.64 {d16,d17}, [r1], r3
896 lowpass_8 d0, d1, d16, d17, q11, q12, narrow=0
897 vst1.64 {d22-d25}, [r4,:128]!
@ Final (13th) source row for the vertical filter context.
900 vld1.64 {d0, d1}, [r1]
901 lowpass_8_1 d0, d1, q12, narrow=0
@ Reload the 16-bit intermediates bottom-up (negative stride ip).
905 vld1.64 {d30,d31}, [r4,:128], ip
906 vld1.64 {d20,d21}, [r4,:128], ip
907 vld1.64 {d18,d19}, [r4,:128], ip
908 vld1.64 {d16,d17}, [r4,:128], ip
909 vld1.64 {d14,d15}, [r4,:128], ip
910 vld1.64 {d12,d13}, [r4,:128], ip
911 vld1.64 {d10,d11}, [r4,:128], ip
912 vld1.64 {d8, d9}, [r4,:128], ip
913 vld1.64 {d6, d7}, [r4,:128], ip
914 vld1.64 {d4, d5}, [r4,:128], ip
915 vld1.64 {d2, d3}, [r4,:128], ip
916 vld1.64 {d0, d1}, [r4,:128]
918 swap4 d1, d3, d5, d7, d8, d10, d12, d14
919 transpose16_4x4 q0, q1, q2, q3, q4, q5, q6, q7
921 swap4 d17, d19, d21, d31, d24, d26, d28, d22
922 transpose16_4x4 q8, q9, q10, q15, q12, q13, q14, q11
@ Spill the transposed upper half back to scratch for the second batch.
924 vst1.64 {d30,d31}, [r4,:128]!
925 vst1.64 {d6, d7}, [r4,:128]!
926 vst1.64 {d20,d21}, [r4,:128]!
927 vst1.64 {d4, d5}, [r4,:128]!
928 vst1.64 {d18,d19}, [r4,:128]!
929 vst1.64 {d2, d3}, [r4,:128]!
930 vst1.64 {d16,d17}, [r4,:128]!
931 vst1.64 {d0, d1}, [r4,:128]
@ Vertical 6-tap pass over the 16-bit columns; results land in d8-d15.
933 lowpass_8.16 q4, q12, d8, d9, d24, d25, d8
934 lowpass_8.16 q5, q13, d10, d11, d26, d27, d9
935 lowpass_8.16 q6, q14, d12, d13, d28, d29, d10
936 lowpass_8.16 q7, q11, d14, d15, d22, d23, d11
938 vld1.64 {d16,d17}, [r4,:128], ip
939 vld1.64 {d30,d31}, [r4,:128], ip
940 lowpass_8.16 q8, q15, d16, d17, d30, d31, d12
941 vld1.64 {d16,d17}, [r4,:128], ip
942 vld1.64 {d30,d31}, [r4,:128], ip
943 lowpass_8.16 q8, q15, d16, d17, d30, d31, d13
944 vld1.64 {d16,d17}, [r4,:128], ip
945 vld1.64 {d30,d31}, [r4,:128], ip
946 lowpass_8.16 q8, q15, d16, d17, d30, d31, d14
947 vld1.64 {d16,d17}, [r4,:128], ip
948 vld1.64 {d30,d31}, [r4,:128]
949 lowpass_8.16 q8, q15, d16, d17, d30, d31, d15
@ Back to row-major layout for the callers' stores.
951 transpose_8x8 d12, d13, d14, d15, d8, d9, d10, d11
@ 8x8 2D lowpass: compute via the shared _top helper, then store d12-d15
@ and d8-d11 (the helper's output order after its final transpose).
@ NOTE(review): prologue (lr save, r4 scratch setup) is missing.
956 function put_h264_qpel8_hv_lowpass_neon
958 bl put_h264_qpel8_hv_lowpass_neon_top
959 vst1.64 {d12}, [r0,:64], r2
960 vst1.64 {d13}, [r0,:64], r2
961 vst1.64 {d14}, [r0,:64], r2
962 vst1.64 {d15}, [r0,:64], r2
963 vst1.64 {d8}, [r0,:64], r2
964 vst1.64 {d9}, [r0,:64], r2
965 vst1.64 {d10}, [r0,:64], r2
966 vst1.64 {d11}, [r0,:64], r2
@ l2 variant: average the 2D-filtered block with a second reference
@ block read from r2 before storing (vrhadd lines not visible here).
972 function put_h264_qpel8_hv_lowpass_l2_neon
974 bl put_h264_qpel8_hv_lowpass_neon_top
976 vld1.64 {d0, d1}, [r2,:128]!
977 vld1.64 {d2, d3}, [r2,:128]!
979 vld1.64 {d4, d5}, [r2,:128]!
981 vld1.64 {d6, d7}, [r2,:128]!
984 vst1.64 {d0}, [r0,:64], r3
986 vst1.64 {d1}, [r0,:64], r3
987 vst1.64 {d2}, [r0,:64], r3
988 vst1.64 {d3}, [r0,:64], r3
989 vst1.64 {d4}, [r0,:64], r3
990 vst1.64 {d5}, [r0,:64], r3
991 vst1.64 {d6}, [r0,:64], r3
992 vst1.64 {d7}, [r0,:64], r3
@ 16x16 2D lowpass (plain and l2): four 8x8 calls with the usual 4-row
@ overlap rewinds for the 6-tap vertical context; last call is a
@ tail-jump.  NOTE(review): pointer advances and endfunc are missing.
998 function put_h264_qpel16_hv_lowpass_neon
1000 bl put_h264_qpel8_hv_lowpass_neon
1001 sub r1, r1, r3, lsl #2
1002 bl put_h264_qpel8_hv_lowpass_neon
1003 sub r1, r1, r3, lsl #4
1004 sub r1, r1, r3, lsl #2
1006 sub r0, r0, r2, lsl #4
1008 bl put_h264_qpel8_hv_lowpass_neon
1009 sub r1, r1, r3, lsl #2
1011 b put_h264_qpel8_hv_lowpass_neon
1014 function put_h264_qpel16_hv_lowpass_l2_neon
1017 bl put_h264_qpel8_hv_lowpass_l2_neon
1018 sub r1, r1, r3, lsl #2
1019 bl put_h264_qpel8_hv_lowpass_l2_neon
1020 sub r1, r1, r3, lsl #4
1021 sub r1, r1, r3, lsl #2
1023 sub r0, r0, r3, lsl #4
1025 bl put_h264_qpel8_hv_lowpass_l2_neon
1026 sub r1, r1, r3, lsl #2
1028 b put_h264_qpel8_hv_lowpass_l2_neon
@ Exported 8x8 quarter-pel MC entry points, one per (x,y) sub-pixel
@ position mcXY.  Each sets up pointers/scratch (setup lines mostly
@ missing from this extract) and dispatches into the lowpass workers:
@ column 0/2 rows use h/v filters directly, odd positions use the _l2
@ averaging forms, and the mcX1/mcX2/mcX3 diagonal cases chain h, v and
@ hv passes through stack scratch buffers.  Several entries share a
@ local label (put_h264_qpel8_mcXY:) so e.g. mc31/mc13/mc33 reuse the
@ mc11 body with different pre-adjusted source pointers.
1031 function ff_put_h264_qpel8_mc10_neon, export=1
1036 b put_h264_qpel8_h_lowpass_l2_neon
1039 function ff_put_h264_qpel8_mc20_neon, export=1
1044 b put_h264_qpel8_h_lowpass_neon
1047 function ff_put_h264_qpel8_mc30_neon, export=1
1052 b put_h264_qpel8_h_lowpass_l2_neon
1055 function ff_put_h264_qpel8_mc01_neon, export=1
1058 put_h264_qpel8_mc01:
@ Back up two rows so the 6-tap vertical filter has its top context.
1061 sub r1, r1, r2, lsl #1
1063 bl put_h264_qpel8_v_lowpass_l2_neon
1068 function ff_put_h264_qpel8_mc11_neon, export=1
1069 push {r0, r1, r2, lr}
1070 put_h264_qpel8_mc11:
1078 bl put_h264_qpel8_h_lowpass_neon
1082 sub r1, r1, r2, lsl #1
1084 bl put_h264_qpel8_v_lowpass_l2_neon
1090 function ff_put_h264_qpel8_mc21_neon, export=1
1091 push {r0, r1, r4, r10, r11, lr}
1092 put_h264_qpel8_mc21:
@ Scratch for an 8x8 half-pel block plus the hv 16-bit intermediates.
1096 sub sp, sp, #(8*8+16*12)
1102 bl put_h264_qpel8_h_lowpass_neon
1105 sub r1, r1, r2, lsl #1
1109 bl put_h264_qpel8_hv_lowpass_l2_neon
1112 pop {r4, r10, r11, pc}
1115 function ff_put_h264_qpel8_mc31_neon, export=1
1117 push {r0, r1, r2, lr}
1119 b put_h264_qpel8_mc11
1122 function ff_put_h264_qpel8_mc02_neon, export=1
1125 sub r1, r1, r2, lsl #1
1128 bl put_h264_qpel8_v_lowpass_neon
1133 function ff_put_h264_qpel8_mc12_neon, export=1
1134 push {r0, r1, r4, r10, r11, lr}
1135 put_h264_qpel8_mc12:
1139 sub sp, sp, #(8*8+16*12)
1140 sub r1, r1, r2, lsl #1
1145 bl put_h264_qpel8_v_lowpass_neon
1148 sub r1, r1, r3, lsl #1
1151 bl put_h264_qpel8_hv_lowpass_l2_neon
1154 pop {r4, r10, r11, pc}
1157 function ff_put_h264_qpel8_mc22_neon, export=1
1158 push {r4, r10, r11, lr}
1161 sub r1, r1, r2, lsl #1
1164 sub sp, sp, #(16*12)
1167 bl put_h264_qpel8_hv_lowpass_neon
1170 pop {r4, r10, r11, pc}
1173 function ff_put_h264_qpel8_mc32_neon, export=1
1174 push {r0, r1, r4, r10, r11, lr}
1176 b put_h264_qpel8_mc12
1179 function ff_put_h264_qpel8_mc03_neon, export=1
1182 b put_h264_qpel8_mc01
1185 function ff_put_h264_qpel8_mc13_neon, export=1
1186 push {r0, r1, r2, lr}
1188 b put_h264_qpel8_mc11
1191 function ff_put_h264_qpel8_mc23_neon, export=1
1192 push {r0, r1, r4, r10, r11, lr}
1194 b put_h264_qpel8_mc21
1197 function ff_put_h264_qpel8_mc33_neon, export=1
1199 push {r0, r1, r2, lr}
1202 b put_h264_qpel8_mc11
@ Exported 16x16 quarter-pel MC entry points - same dispatch structure
@ as the 8x8 set above, routed through the 16-wide lowpass workers and
@ with proportionally larger stack scratch (16*16 half-pel block plus
@ 16-bit hv intermediates).  NOTE(review): most setup lines (pointer
@ adjustment, scratch-pointer loads, endfunc) are missing from this
@ extract; the visible lines show only the dispatch skeleton.
1205 function ff_put_h264_qpel16_mc10_neon, export=1
1209 b put_h264_qpel16_h_lowpass_l2_neon
1212 function ff_put_h264_qpel16_mc20_neon, export=1
1216 b put_h264_qpel16_h_lowpass_neon
1219 function ff_put_h264_qpel16_mc30_neon, export=1
1223 b put_h264_qpel16_h_lowpass_l2_neon
1226 function ff_put_h264_qpel16_mc01_neon, export=1
1229 put_h264_qpel16_mc01:
1232 sub r1, r1, r2, lsl #1
1234 bl put_h264_qpel16_v_lowpass_l2_neon
1239 function ff_put_h264_qpel16_mc11_neon, export=1
1240 push {r0, r1, r4, lr}
1241 put_h264_qpel16_mc11:
1248 bl put_h264_qpel16_h_lowpass_neon
1253 sub r1, r1, r2, lsl #1
1255 bl put_h264_qpel16_v_lowpass_l2_neon
@ Release the 16x16 scratch block (+8 alignment pad).
1257 add sp, sp, #(256+8)
1261 function ff_put_h264_qpel16_mc21_neon, export=1
1262 push {r0, r1, r4-r5, r9-r11, lr}
1263 put_h264_qpel16_mc21:
1267 sub sp, sp, #(16*16+16*12)
1271 bl put_h264_qpel16_h_lowpass_neon_packed
1274 sub r1, r1, r2, lsl #1
1277 bl put_h264_qpel16_hv_lowpass_l2_neon
1280 pop {r4-r5, r9-r11, pc}
1283 function ff_put_h264_qpel16_mc31_neon, export=1
1285 push {r0, r1, r4, lr}
1287 b put_h264_qpel16_mc11
1290 function ff_put_h264_qpel16_mc02_neon, export=1
1293 sub r1, r1, r2, lsl #1
1296 bl put_h264_qpel16_v_lowpass_neon
1301 function ff_put_h264_qpel16_mc12_neon, export=1
1302 push {r0, r1, r4-r5, r9-r11, lr}
1303 put_h264_qpel16_mc12:
1307 sub sp, sp, #(16*16+16*12)
1308 sub r1, r1, r2, lsl #1
1312 bl put_h264_qpel16_v_lowpass_neon_packed
1315 sub r1, r1, r3, lsl #1
1318 bl put_h264_qpel16_hv_lowpass_l2_neon
1321 pop {r4-r5, r9-r11, pc}
1324 function ff_put_h264_qpel16_mc22_neon, export=1
1325 push {r4, r9-r11, lr}
1329 sub r1, r1, r2, lsl #1
1332 sub sp, sp, #(16*12)
1335 bl put_h264_qpel16_hv_lowpass_neon
1338 pop {r4, r9-r11, pc}
1341 function ff_put_h264_qpel16_mc32_neon, export=1
1342 push {r0, r1, r4-r5, r9-r11, lr}
1344 b put_h264_qpel16_mc12
1347 function ff_put_h264_qpel16_mc03_neon, export=1
1350 b put_h264_qpel16_mc01
1353 function ff_put_h264_qpel16_mc13_neon, export=1
1354 push {r0, r1, r4, lr}
1356 b put_h264_qpel16_mc11
1359 function ff_put_h264_qpel16_mc23_neon, export=1
1360 push {r0, r1, r4-r5, r9-r11, lr}
1362 b put_h264_qpel16_mc21
1365 function ff_put_h264_qpel16_mc33_neon, export=1
1367 push {r0, r1, r4, lr}
1370 b put_h264_qpel16_mc11
1373 @ Biweighted prediction
@ 16-wide biweight inner body: dst = clip(((a*src0 +/- b*src1) >> logWD)
@ + offset) per H.264 weighted bipred.  \macs/\macd select vmlal/vmlsl
@ so negative weights are handled by the caller picking the right MAC
@ form.  NOTE(review): incomplete extract - the MAC lines using \macs
@ and \macd, loop control and .endm are missing; d0/d1 presumably hold
@ the weights and q9 the (negated) shift - confirm in the full file.
1375 .macro biweight_16 macs, macd
1381 vld1.8 {d20-d21},[r0,:128], r2
1385 vld1.8 {d22-d23},[r1,:128], r2
1390 vld1.8 {d28-d29},[r0,:128], r2
1395 vld1.8 {d30-d31},[r1,:128], r2
@ Arithmetic shift by the (negative) logWD value in q9, then saturate.
1403 vshl.s16 q12, q12, q9
1404 vshl.s16 q13, q13, q9
1405 vqmovun.s16 d24, q12
1406 vqmovun.s16 d25, q13
1408 vst1.8 {d4- d5}, [r6,:128], r2
1410 vst1.8 {d24-d25},[r6,:128], r2
@ 8-wide biweight body; same scheme as biweight_16 on d registers.
@ NOTE(review): MAC, narrow and loop lines are missing from this extract.
1415 .macro biweight_8 macs, macd
1421 vld1.8 {d4},[r0,:64], r2
1424 vld1.8 {d5},[r1,:64], r2
1427 vld1.8 {d6},[r0,:64], r2
1430 vld1.8 {d7},[r1,:64], r2
1435 vshl.s16 q10, q10, q9
1438 vst1.8 {d2},[r6,:64], r2
1440 vst1.8 {d4},[r6,:64], r2
@ 4-wide biweight body: two 4-pixel rows packed per d register via
@ 32-bit lane loads/stores.  Label 2 is a short-tail path (presumably
@ for odd remaining height) that shifts/stores a single register pair.
@ NOTE(review): MAC, narrow, branch and .endm lines are missing.
1445 .macro biweight_4 macs, macd
1451 vld1.32 {d4[0]},[r0,:32], r2
1452 vld1.32 {d4[1]},[r0,:32], r2
1455 vld1.32 {d5[0]},[r1,:32], r2
1456 vld1.32 {d5[1]},[r1,:32], r2
1460 vld1.32 {d6[0]},[r0,:32], r2
1461 vld1.32 {d6[1]},[r0,:32], r2
1464 vld1.32 {d7[0]},[r1,:32], r2
1465 vld1.32 {d7[1]},[r1,:32], r2
1470 vshl.s16 q10, q10, q9
1473 vst1.32 {d2[0]},[r6,:32], r2
1474 vst1.32 {d2[1]},[r6,:32], r2
1476 vst1.32 {d4[0]},[r6,:32], r2
1477 vst1.32 {d4[1]},[r6,:32], r2
1480 2: vshl.s16 q1, q1, q9
1482 vst1.32 {d2[0]},[r6,:32], r2
1483 vst1.32 {d2[1]},[r6,:32], r2
@ Biweight driver: tests the sign bits of the two weights (r5 holds the
@ packed weights; the eors/lsr #30 extracts their sign combination) and
@ dispatches to one of four MAC-form specializations (add/add, add/sub,
@ sub/sub, sub/add) of biweight_\w.  biweight_entry declares the
@ exported per-size wrappers; b=0 entries fall through to the shared
@ body instead of branching.  NOTE(review): argument marshalling,
@ branch table and .endm/endfunc lines are missing from this extract.
1487 .macro biweight_func w
1488 function biweight_h264_pixels_\w\()_neon
1494 eors lr, lr, r5, lsr #30
1507 10: biweight_\w vmlal.u8, vmlal.u8
1509 biweight_\w vmlal.u8, vmlsl.u8
1512 biweight_\w vmlsl.u8, vmlsl.u8
1514 biweight_\w vmlsl.u8, vmlal.u8
1518 .macro biweight_entry w, h, b=1
1519 function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
1522 b biweight_h264_pixels_\w\()_neon
1527 biweight_entry 16, 8
1528 biweight_entry 16, 16, b=0
1531 biweight_entry 8, 16
1533 biweight_entry 8, 8, b=0
1538 biweight_entry 4, 4, b=0
1541 @ Weighted prediction
@ 16-wide unidirectional weight body: dst = clip((w*src) >>(rounded)
@ logWD + offset).  d0 holds the weight; vrshl.s16 by q9 performs the
@ rounded shift; \add (not visible below) presumably applies the offset.
@ NOTE(review): offset-add, loop-control and .endm lines are missing.
1543 .macro weight_16 add
1546 vld1.8 {d20-d21},[r0,:128], r1
1547 vmull.u8 q2, d0, d20
1549 vmull.u8 q3, d0, d21
1550 vld1.8 {d28-d29},[r0,:128], r1
1551 vmull.u8 q12, d0, d28
1553 vmull.u8 q13, d0, d29
1555 vrshl.s16 q2, q2, q9
1557 vrshl.s16 q3, q3, q9
1561 vrshl.s16 q12, q12, q9
1563 vrshl.s16 q13, q13, q9
1564 vqmovun.s16 d24, q12
1565 vqmovun.s16 d25, q13
1566 vst1.8 {d4- d5}, [r4,:128], r1
1567 vst1.8 {d24-d25},[r4,:128], r1
@ NOTE(review): the ".macro weight_8" / ".macro weight_4" header lines are
@ missing from this extract - the instructions below are interior
@ fragments of those two macros (8-wide, then 4-wide with packed 32-bit
@ lanes and a short-tail path at label 2), mirroring weight_16 above.
1575 vld1.8 {d4},[r0,:64], r1
1578 vld1.8 {d6},[r0,:64], r1
1579 vmull.u8 q10, d0, d6
1582 vrshl.s16 q1, q1, q9
1585 vrshl.s16 q10, q10, q9
1587 vst1.8 {d2},[r4,:64], r1
1588 vst1.8 {d4},[r4,:64], r1
@ 4-wide variant: two rows per d register via 32-bit lane accesses.
1598 vld1.32 {d4[0]},[r0,:32], r1
1599 vld1.32 {d4[1]},[r0,:32], r1
1603 vld1.32 {d6[0]},[r0,:32], r1
1604 vld1.32 {d6[1]},[r0,:32], r1
1605 vmull.u8 q10, d0, d6
1608 vrshl.s16 q1, q1, q9
1611 vrshl.s16 q10, q10, q9
1614 vst1.32 {d2[0]},[r4,:32], r1
1615 vst1.32 {d2[1]},[r4,:32], r1
1617 vst1.32 {d4[0]},[r4,:32], r1
1618 vst1.32 {d4[1]},[r4,:32], r1
@ Short tail (presumably odd remaining height).
1622 vrshl.s16 q1, q1, q9
1624 vst1.32 {d2[0]},[r4,:32], r1
1625 vst1.32 {d2[1]},[r4,:32], r1
@ Weight driver and exported per-size entry points, parallel to the
@ biweight_func/biweight_entry pair above; b=0 entries fall through to
@ the shared \w-wide body.  NOTE(review): the driver body (weight-sign
@ dispatch, add/sub specializations) and several weight_entry
@ instantiations are missing from this extract.
1629 .macro weight_func w
1630 function weight_h264_pixels_\w\()_neon
1655 .macro weight_entry w, h, b=1
1656 function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
1659 b weight_h264_pixels_\w\()_neon
1665 weight_entry 16, 16, b=0
1670 weight_entry 8, 8, b=0
1675 weight_entry 4, 4, b=0