2 * ARM NEON optimised DSP functions
3 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 1: vld1.64 {d0, d1}, [r1], r2
33 vld1.64 {d2, d3}, [r1], r2
34 vld1.64 {d4, d5}, [r1], r2
36 vld1.64 {d6, d7}, [r1], r2
41 vld1.64 {d16,d17}, [ip,:128], r2
43 vld1.64 {d18,d19}, [ip,:128], r2
45 vld1.64 {d20,d21}, [ip,:128], r2
47 vld1.64 {d22,d23}, [ip,:128], r2
51 vst1.64 {d0, d1}, [r0,:128], r2
52 vst1.64 {d2, d3}, [r0,:128], r2
53 vst1.64 {d4, d5}, [r0,:128], r2
54 vst1.64 {d6, d7}, [r0,:128], r2
59 .macro pixels16_x2 vhadd=vrhadd.u8
60 1: vld1.64 {d0-d2}, [r1], r2
61 vld1.64 {d4-d6}, [r1], r2
69 vst1.64 {d0, d1}, [r0,:128], r2
70 vst1.64 {d4, d5}, [r0,:128], r2
75 .macro pixels16_y2 vhadd=vrhadd.u8
76 vld1.64 {d0, d1}, [r1], r2
77 vld1.64 {d2, d3}, [r1], r2
80 vld1.64 {d0, d1}, [r1], r2
82 vld1.64 {d2, d3}, [r1], r2
85 vst1.64 {d4, d5}, [r0,:128], r2
86 vst1.64 {d6, d7}, [r0,:128], r2
91 .macro pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
92 vld1.64 {d0-d2}, [r1], r2
93 vld1.64 {d4-d6}, [r1], r2
100 vext.8 q3, q2, q3, #1
106 vld1.64 {d0-d2}, [r1], r2
110 vadd.u16 q12, q12, q13
112 vext.8 q15, q0, q1, #1
113 vadd.u16 q1 , q10, q11
120 vld1.64 {d2-d4}, [r1], r2
121 vaddl.u8 q10, d1, d31
122 vst1.64 {d28,d29}, [r0,:128], r2
126 vadd.u16 q12, q12, q13
128 vext.8 q2, q1, q2, #1
129 vadd.u16 q0, q10, q11
137 vst1.64 {d30,d31}, [r0,:128], r2
143 1: vld1.64 {d0}, [r1], r2
144 vld1.64 {d1}, [r1], r2
145 vld1.64 {d2}, [r1], r2
147 vld1.64 {d3}, [r1], r2
152 vst1.64 {d0}, [r0,:64], r2
153 vst1.64 {d1}, [r0,:64], r2
154 vst1.64 {d2}, [r0,:64], r2
155 vst1.64 {d3}, [r0,:64], r2
160 .macro pixels8_x2 vhadd=vrhadd.u8
161 1: vld1.64 {d0, d1}, [r1], r2
162 vext.8 d1, d0, d1, #1
163 vld1.64 {d2, d3}, [r1], r2
164 vext.8 d3, d2, d3, #1
170 vst1.64 {d0}, [r0,:64], r2
171 vst1.64 {d1}, [r0,:64], r2
176 .macro pixels8_y2 vhadd=vrhadd.u8
177 vld1.64 {d0}, [r1], r2
178 vld1.64 {d1}, [r1], r2
181 vld1.64 {d0}, [r1], r2
183 vld1.64 {d1}, [r1], r2
186 vst1.64 {d4}, [r0,:64], r2
187 vst1.64 {d5}, [r0,:64], r2
192 .macro pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
193 vld1.64 {d0, d1}, [r1], r2
194 vld1.64 {d2, d3}, [r1], r2
200 vext.8 d4, d0, d1, #1
201 vext.8 d6, d2, d3, #1
205 vld1.64 {d0, d1}, [r1], r2
208 vext.8 d4, d0, d1, #1
210 vadd.u16 q10, q10, q11
214 vld1.64 {d2, d3}, [r1], r2
218 vadd.u16 q10, q10, q11
220 vst1.64 {d5}, [r0,:64], r2
222 vext.8 d6, d2, d3, #1
224 vst1.64 {d7}, [r0,:64], r2
229 .macro pixfunc pfx name suf rnd_op args:vararg
230 function ff_\pfx\name\suf\()_neon, export=1
235 .macro pixfunc2 pfx name args:vararg
237 pixfunc \pfx \name \args
240 function ff_put_h264_qpel16_mc00_neon, export=1
244 pixfunc put_ pixels16
245 pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8
246 pixfunc2 put_ pixels16_y2, _no_rnd, vhadd.u8
247 pixfunc2 put_ pixels16_xy2, _no_rnd, vshrn.u16, 1
249 function ff_avg_h264_qpel16_mc00_neon, export=1
253 pixfunc avg_ pixels16,, 1
255 function ff_put_h264_qpel8_mc00_neon, export=1
260 pixfunc2 put_ pixels8_x2, _no_rnd, vhadd.u8
261 pixfunc2 put_ pixels8_y2, _no_rnd, vhadd.u8
262 pixfunc2 put_ pixels8_xy2, _no_rnd, vshrn.u16, 1
264 function ff_put_pixels_clamped_neon, export=1
265 vld1.64 {d16-d19}, [r0,:128]!
267 vld1.64 {d20-d23}, [r0,:128]!
269 vld1.64 {d24-d27}, [r0,:128]!
271 vld1.64 {d28-d31}, [r0,:128]!
273 vst1.64 {d0}, [r1,:64], r2
275 vst1.64 {d1}, [r1,:64], r2
277 vst1.64 {d2}, [r1,:64], r2
279 vst1.64 {d3}, [r1,:64], r2
281 vst1.64 {d4}, [r1,:64], r2
282 vst1.64 {d5}, [r1,:64], r2
283 vst1.64 {d6}, [r1,:64], r2
284 vst1.64 {d7}, [r1,:64], r2
288 function ff_put_signed_pixels_clamped_neon, export=1
290 vld1.64 {d16-d17}, [r0,:128]!
292 vld1.64 {d18-d19}, [r0,:128]!
294 vld1.64 {d16-d17}, [r0,:128]!
296 vld1.64 {d18-d19}, [r0,:128]!
298 vld1.64 {d20-d21}, [r0,:128]!
300 vld1.64 {d22-d23}, [r0,:128]!
302 vst1.64 {d0}, [r1,:64], r2
304 vst1.64 {d1}, [r1,:64], r2
306 vst1.64 {d2}, [r1,:64], r2
308 vld1.64 {d24-d25}, [r0,:128]!
310 vld1.64 {d26-d27}, [r0,:128]!
313 vst1.64 {d3}, [r1,:64], r2
315 vst1.64 {d4}, [r1,:64], r2
317 vst1.64 {d5}, [r1,:64], r2
320 vst1.64 {d6}, [r1,:64], r2
321 vst1.64 {d7}, [r1,:64], r2
325 function ff_add_pixels_clamped_neon, export=1
327 vld1.64 {d16}, [r1,:64], r2
328 vld1.64 {d0-d1}, [r0,:128]!
330 vld1.64 {d17}, [r1,:64], r2
331 vld1.64 {d2-d3}, [r0,:128]!
333 vld1.64 {d18}, [r1,:64], r2
335 vld1.64 {d4-d5}, [r0,:128]!
337 vst1.64 {d0}, [r3,:64], r2
339 vld1.64 {d19}, [r1,:64], r2
340 vld1.64 {d6-d7}, [r0,:128]!
343 vst1.64 {d2}, [r3,:64], r2
344 vld1.64 {d16}, [r1,:64], r2
346 vld1.64 {d0-d1}, [r0,:128]!
348 vst1.64 {d4}, [r3,:64], r2
349 vld1.64 {d17}, [r1,:64], r2
350 vld1.64 {d2-d3}, [r0,:128]!
352 vst1.64 {d6}, [r3,:64], r2
354 vld1.64 {d18}, [r1,:64], r2
355 vld1.64 {d4-d5}, [r0,:128]!
357 vst1.64 {d0}, [r3,:64], r2
359 vld1.64 {d19}, [r1,:64], r2
361 vld1.64 {d6-d7}, [r0,:128]!
363 vst1.64 {d2}, [r3,:64], r2
365 vst1.64 {d4}, [r3,:64], r2
366 vst1.64 {d6}, [r3,:64], r2
370 function ff_float_to_int16_neon, export=1
372 vld1.64 {d0-d1}, [r1,:128]!
373 vcvt.s32.f32 q8, q0, #16
374 vld1.64 {d2-d3}, [r1,:128]!
375 vcvt.s32.f32 q9, q1, #16
380 vshrn.s32 d4, q8, #16
381 vld1.64 {d0-d1}, [r1,:128]!
382 vcvt.s32.f32 q0, q0, #16
383 vshrn.s32 d5, q9, #16
384 vld1.64 {d2-d3}, [r1,:128]!
385 vcvt.s32.f32 q1, q1, #16
386 vshrn.s32 d6, q0, #16
387 vst1.64 {d4-d5}, [r0,:128]!
388 vshrn.s32 d7, q1, #16
389 vld1.64 {d16-d17},[r1,:128]!
390 vcvt.s32.f32 q8, q8, #16
391 vld1.64 {d18-d19},[r1,:128]!
392 vcvt.s32.f32 q9, q9, #16
393 vst1.64 {d6-d7}, [r0,:128]!
397 2: vld1.64 {d0-d1}, [r1,:128]!
398 vshrn.s32 d4, q8, #16
399 vcvt.s32.f32 q0, q0, #16
400 vld1.64 {d2-d3}, [r1,:128]!
401 vshrn.s32 d5, q9, #16
402 vcvt.s32.f32 q1, q1, #16
403 vshrn.s32 d6, q0, #16
404 vst1.64 {d4-d5}, [r0,:128]!
405 vshrn.s32 d7, q1, #16
406 vst1.64 {d6-d7}, [r0,:128]!
408 3: vshrn.s32 d4, q8, #16
409 vshrn.s32 d5, q9, #16
410 vst1.64 {d4-d5}, [r0,:128]!
414 function ff_float_to_int16_interleave_neon, export=1
417 blt ff_float_to_int16_neon
424 vld1.64 {d0-d1}, [r3,:128]!
425 vcvt.s32.f32 q8, q0, #16
426 vld1.64 {d2-d3}, [r3,:128]!
427 vcvt.s32.f32 q9, q1, #16
428 vld1.64 {d20-d21},[r1,:128]!
429 vcvt.s32.f32 q10, q10, #16
430 vld1.64 {d22-d23},[r1,:128]!
431 vcvt.s32.f32 q11, q11, #16
436 vld1.64 {d0-d1}, [r3,:128]!
437 vcvt.s32.f32 q0, q0, #16
439 vld1.64 {d2-d3}, [r3,:128]!
440 vcvt.s32.f32 q1, q1, #16
441 vld1.64 {d24-d25},[r1,:128]!
442 vcvt.s32.f32 q12, q12, #16
443 vld1.64 {d26-d27},[r1,:128]!
445 vst1.64 {d20-d21},[r0,:128]!
446 vcvt.s32.f32 q13, q13, #16
447 vst1.64 {d22-d23},[r0,:128]!
449 vld1.64 {d16-d17},[r3,:128]!
451 vst1.64 {d24-d25},[r0,:128]!
452 vcvt.s32.f32 q8, q8, #16
453 vld1.64 {d18-d19},[r3,:128]!
454 vcvt.s32.f32 q9, q9, #16
455 vld1.64 {d20-d21},[r1,:128]!
456 vcvt.s32.f32 q10, q10, #16
457 vld1.64 {d22-d23},[r1,:128]!
458 vcvt.s32.f32 q11, q11, #16
459 vst1.64 {d26-d27},[r0,:128]!
463 2: vsri.32 q10, q8, #16
464 vld1.64 {d0-d1}, [r3,:128]!
465 vcvt.s32.f32 q0, q0, #16
466 vld1.64 {d2-d3}, [r3,:128]!
467 vcvt.s32.f32 q1, q1, #16
468 vld1.64 {d24-d25},[r1,:128]!
469 vcvt.s32.f32 q12, q12, #16
471 vld1.64 {d26-d27},[r1,:128]!
472 vcvt.s32.f32 q13, q13, #16
473 vst1.64 {d20-d21},[r0,:128]!
475 vst1.64 {d22-d23},[r0,:128]!
477 vst1.64 {d24-d27},[r0,:128]!
479 3: vsri.32 q10, q8, #16
481 vst1.64 {d20-d23},[r0,:128]!
490 5: ldmia r1!, {r4-r7}
493 vld1.64 {d16-d17},[r4,:128]!
494 vcvt.s32.f32 q8, q8, #16
495 vld1.64 {d18-d19},[r5,:128]!
496 vcvt.s32.f32 q9, q9, #16
497 vld1.64 {d20-d21},[r6,:128]!
498 vcvt.s32.f32 q10, q10, #16
499 vld1.64 {d22-d23},[r7,:128]!
500 vcvt.s32.f32 q11, q11, #16
502 vld1.64 {d0-d1}, [r4,:128]!
503 vcvt.s32.f32 q0, q0, #16
505 vld1.64 {d2-d3}, [r5,:128]!
506 vcvt.s32.f32 q1, q1, #16
507 vsri.32 q11, q10, #16
508 vld1.64 {d4-d5}, [r6,:128]!
509 vcvt.s32.f32 q2, q2, #16
511 vld1.64 {d6-d7}, [r7,:128]!
512 vcvt.s32.f32 q3, q3, #16
514 vst1.64 {d18}, [r8], ip
516 vst1.64 {d22}, [r8], ip
518 vst1.64 {d19}, [r8], ip
520 vst1.64 {d23}, [r8], ip
523 vld1.64 {d16-d17},[r4,:128]!
524 vcvt.s32.f32 q8, q8, #16
525 vst1.64 {d2}, [r8], ip
526 vld1.64 {d18-d19},[r5,:128]!
527 vcvt.s32.f32 q9, q9, #16
528 vst1.64 {d6}, [r8], ip
529 vld1.64 {d20-d21},[r6,:128]!
530 vcvt.s32.f32 q10, q10, #16
531 vst1.64 {d3}, [r8], ip
532 vld1.64 {d22-d23},[r7,:128]!
533 vcvt.s32.f32 q11, q11, #16
534 vst1.64 {d7}, [r8], ip
536 7: vst1.64 {d2}, [r8], ip
537 vst1.64 {d6}, [r8], ip
538 vst1.64 {d3}, [r8], ip
539 vst1.64 {d7}, [r8], ip
553 vld1.64 {d16-d17},[r4,:128]!
554 vcvt.s32.f32 q8, q8, #16
555 vld1.64 {d18-d19},[r5,:128]!
556 vcvt.s32.f32 q9, q9, #16
557 vld1.64 {d20-d21},[r4,:128]!
558 vcvt.s32.f32 q10, q10, #16
559 vld1.64 {d22-d23},[r5,:128]!
560 vcvt.s32.f32 q11, q11, #16
564 vsri.32 d18, d16, #16
565 vsri.32 d19, d17, #16
566 vld1.64 {d16-d17},[r4,:128]!
567 vcvt.s32.f32 q8, q8, #16
568 vst1.32 {d18[0]}, [r8], ip
569 vsri.32 d22, d20, #16
570 vst1.32 {d18[1]}, [r8], ip
571 vsri.32 d23, d21, #16
572 vst1.32 {d19[0]}, [r8], ip
573 vst1.32 {d19[1]}, [r8], ip
574 vld1.64 {d18-d19},[r5,:128]!
575 vcvt.s32.f32 q9, q9, #16
576 vst1.32 {d22[0]}, [r8], ip
577 vst1.32 {d22[1]}, [r8], ip
578 vld1.64 {d20-d21},[r4,:128]!
579 vcvt.s32.f32 q10, q10, #16
580 vst1.32 {d23[0]}, [r8], ip
581 vst1.32 {d23[1]}, [r8], ip
582 vld1.64 {d22-d23},[r5,:128]!
583 vcvt.s32.f32 q11, q11, #16
585 vld1.64 {d0-d1}, [r4,:128]!
586 vcvt.s32.f32 q0, q0, #16
587 vsri.32 d18, d16, #16
588 vld1.64 {d2-d3}, [r5,:128]!
589 vcvt.s32.f32 q1, q1, #16
590 vsri.32 d19, d17, #16
591 vld1.64 {d4-d5}, [r4,:128]!
592 vcvt.s32.f32 q2, q2, #16
593 vld1.64 {d6-d7}, [r5,:128]!
594 vcvt.s32.f32 q3, q3, #16
595 vst1.32 {d18[0]}, [r8], ip
596 vsri.32 d22, d20, #16
597 vst1.32 {d18[1]}, [r8], ip
598 vsri.32 d23, d21, #16
599 vst1.32 {d19[0]}, [r8], ip
601 vst1.32 {d19[1]}, [r8], ip
603 vst1.32 {d22[0]}, [r8], ip
605 vst1.32 {d22[1]}, [r8], ip
607 vst1.32 {d23[0]}, [r8], ip
608 vst1.32 {d23[1]}, [r8], ip
610 vld1.64 {d16-d17},[r4,:128]!
611 vcvt.s32.f32 q8, q8, #16
612 vst1.32 {d2[0]}, [r8], ip
613 vst1.32 {d2[1]}, [r8], ip
614 vld1.64 {d18-d19},[r5,:128]!
615 vcvt.s32.f32 q9, q9, #16
616 vst1.32 {d3[0]}, [r8], ip
617 vst1.32 {d3[1]}, [r8], ip
618 vld1.64 {d20-d21},[r4,:128]!
619 vcvt.s32.f32 q10, q10, #16
620 vst1.32 {d6[0]}, [r8], ip
621 vst1.32 {d6[1]}, [r8], ip
622 vld1.64 {d22-d23},[r5,:128]!
623 vcvt.s32.f32 q11, q11, #16
624 vst1.32 {d7[0]}, [r8], ip
625 vst1.32 {d7[1]}, [r8], ip
627 6: vst1.32 {d2[0]}, [r8], ip
628 vst1.32 {d2[1]}, [r8], ip
629 vst1.32 {d3[0]}, [r8], ip
630 vst1.32 {d3[1]}, [r8], ip
631 vst1.32 {d6[0]}, [r8], ip
632 vst1.32 {d6[1]}, [r8], ip
633 vst1.32 {d7[0]}, [r8], ip
634 vst1.32 {d7[1]}, [r8], ip
636 7: vsri.32 d18, d16, #16
637 vsri.32 d19, d17, #16
638 vst1.32 {d18[0]}, [r8], ip
639 vsri.32 d22, d20, #16
640 vst1.32 {d18[1]}, [r8], ip
641 vsri.32 d23, d21, #16
642 vst1.32 {d19[0]}, [r8], ip
643 vst1.32 {d19[1]}, [r8], ip
644 vst1.32 {d22[0]}, [r8], ip
645 vst1.32 {d22[1]}, [r8], ip
646 vst1.32 {d23[0]}, [r8], ip
647 vst1.32 {d23[1]}, [r8], ip
657 vld1.64 {d0-d1}, [r4,:128]!
658 vcvt.s32.f32 q0, q0, #16
659 vld1.64 {d2-d3}, [r4,:128]!
660 vcvt.s32.f32 q1, q1, #16
663 vld1.64 {d4-d5}, [r4,:128]!
664 vcvt.s32.f32 q2, q2, #16
665 vld1.64 {d6-d7}, [r4,:128]!
666 vcvt.s32.f32 q3, q3, #16
667 vst1.16 {d0[1]}, [r5,:16], ip
668 vst1.16 {d0[3]}, [r5,:16], ip
669 vst1.16 {d1[1]}, [r5,:16], ip
670 vst1.16 {d1[3]}, [r5,:16], ip
671 vst1.16 {d2[1]}, [r5,:16], ip
672 vst1.16 {d2[3]}, [r5,:16], ip
673 vst1.16 {d3[1]}, [r5,:16], ip
674 vst1.16 {d3[3]}, [r5,:16], ip
676 vld1.64 {d0-d1}, [r4,:128]!
677 vcvt.s32.f32 q0, q0, #16
678 vld1.64 {d2-d3}, [r4,:128]!
679 vcvt.s32.f32 q1, q1, #16
680 7: vst1.16 {d4[1]}, [r5,:16], ip
681 vst1.16 {d4[3]}, [r5,:16], ip
682 vst1.16 {d5[1]}, [r5,:16], ip
683 vst1.16 {d5[3]}, [r5,:16], ip
684 vst1.16 {d6[1]}, [r5,:16], ip
685 vst1.16 {d6[3]}, [r5,:16], ip
686 vst1.16 {d7[1]}, [r5,:16], ip
687 vst1.16 {d7[3]}, [r5,:16], ip
691 vst1.16 {d0[1]}, [r5,:16], ip
692 vst1.16 {d0[3]}, [r5,:16], ip
693 vst1.16 {d1[1]}, [r5,:16], ip
694 vst1.16 {d1[3]}, [r5,:16], ip
695 vst1.16 {d2[1]}, [r5,:16], ip
696 vst1.16 {d2[3]}, [r5,:16], ip
697 vst1.16 {d3[1]}, [r5,:16], ip
698 vst1.16 {d3[3]}, [r5,:16], ip
700 vld1.64 {d0-d1}, [r4,:128]!
701 vcvt.s32.f32 q0, q0, #16
702 vld1.64 {d2-d3}, [r4,:128]!
703 vcvt.s32.f32 q1, q1, #16
707 function ff_vector_fmul_neon, export=1
710 vld1.64 {d0-d3}, [r0,:128]!
711 vld1.64 {d4-d7}, [r1,:128]!
718 vld1.64 {d0-d1}, [r0,:128]!
719 vld1.64 {d4-d5}, [r1,:128]!
721 vld1.64 {d2-d3}, [r0,:128]!
722 vld1.64 {d6-d7}, [r1,:128]!
724 vst1.64 {d16-d19},[r3,:128]!
725 vld1.64 {d0-d1}, [r0,:128]!
726 vld1.64 {d4-d5}, [r1,:128]!
728 vld1.64 {d2-d3}, [r0,:128]!
729 vld1.64 {d6-d7}, [r1,:128]!
731 vst1.64 {d20-d23},[r3,:128]!
735 2: vld1.64 {d0-d1}, [r0,:128]!
736 vld1.64 {d4-d5}, [r1,:128]!
737 vst1.64 {d16-d17},[r3,:128]!
739 vld1.64 {d2-d3}, [r0,:128]!
740 vld1.64 {d6-d7}, [r1,:128]!
741 vst1.64 {d18-d19},[r3,:128]!
743 3: vst1.64 {d16-d19},[r3,:128]!
747 function ff_vector_fmul_window_neon, export=1
748 vld1.32 {d16[],d17[]}, [sp,:32]
753 add r2, r2, r5, lsl #2
754 add r4, r3, r5, lsl #3
755 add ip, r0, r5, lsl #3
757 vld1.64 {d0,d1}, [r1,:128]!
758 vld1.64 {d2,d3}, [r2,:128], r5
759 vld1.64 {d4,d5}, [r3,:128]!
760 vld1.64 {d6,d7}, [r4,:128], r5
772 vld1.64 {d0,d1}, [r1,:128]!
774 vld1.64 {d18,d19},[r2,:128], r5
776 vld1.64 {d24,d25},[r3,:128]!
778 vld1.64 {d6,d7}, [r4,:128], r5
783 vst1.64 {d20,d21},[r0,:128]!
784 vst1.64 {d22,d23},[ip,:128], r5
786 2: vmla.f32 d22, d3, d7
792 vst1.64 {d20,d21},[r0,:128]!
793 vst1.64 {d22,d23},[ip,:128], r5