libavfilter/x86/vf_removegrain.asm
;*****************************************************************************
;* x86-optimized functions for removegrain filter
;*
;* Copyright (C) 2015 James Darnley
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;*****************************************************************************

; column: -1  0 +1
; row -1:  a1 a2 a3
; row  0:  a4 c  a5
; row +1:  a6 a7 a8

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

pw_4:    times 16 dw 4
pw_8:    times 16 dw 8
pw_div9: times 16 dw ((1<<16)+4)/9
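
; Note: pw_div9 is a 16-bit fixed-point reciprocal of 9 (7282 = ((1<<16)+4)/9).
; pmulhuw by this constant computes (x*7282) >> 16, which matches x/9 for all
; sums produced by mode 20 (at most 9*255+4), so no real division is needed.
; pw_4 and pw_8 are rounding terms for the averaging modes.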

SECTION .text

;*** Preprocessor helpers

%define a1 srcq+stride_n-1
%define a2 srcq+stride_n
%define a3 srcq+stride_n+1
%define a4 srcq-1
%define c  srcq
%define a5 srcq+1
%define a6 srcq+stride_p-1
%define a7 srcq+stride_p
%define a8 srcq+stride_p+1
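
; Each function defines stride_p as +stride and stride_n as -stride, so the
; a1..a8/c macros above address the 3x3 neighbourhood of the current pixel
; exactly as drawn in the diagram at the top of the file.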

; %1 dest simd register
; %2 source memory location
; %3 zero location (simd register/memory)
%macro LOAD 3
    movh %1, %2
    punpcklbw %1, %3
%endmacro
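
; LOAD zero-extends 8 packed bytes to 8 words: movh brings in the low 64 bits
; and punpcklbw against a zero register spreads them to 16-bit lanes.  The
; word-precision modes need this so their sums and differences cannot overflow.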

%macro LOAD_SQUARE 0
    movu m1, [a1]
    movu m2, [a2]
    movu m3, [a3]
    movu m4, [a4]
    movu m0, [c]
    movu m5, [a5]
    movu m6, [a6]
    movu m7, [a7]
    movu m8, [a8]
%endmacro

; %1 zero location (simd register/memory)
%macro LOAD_SQUARE_16 1
    LOAD m1, [a1], %1
    LOAD m2, [a2], %1
    LOAD m3, [a3], %1
    LOAD m4, [a4], %1
    LOAD m0, [c], %1
    LOAD m5, [a5], %1
    LOAD m6, [a6], %1
    LOAD m7, [a7], %1
    LOAD m8, [a8], %1
%endmacro

; %1 data type
; %2 simd register; holds the smaller values on output
; %3 simd register; holds the larger values on output
; %4 temp location (simd register/memory)
%macro SORT_PAIR 4
    mova %4, %2
    pmin%1 %2, %3
    pmax%1 %3, %4
%endmacro

%macro SORT_AXIS 0
    SORT_PAIR ub, m1, m8, m9
    SORT_PAIR ub, m2, m7, m10
    SORT_PAIR ub, m3, m6, m11
    SORT_PAIR ub, m4, m5, m12
%endmacro

%macro SORT_AXIS_16 0
    SORT_PAIR sw, m1, m8, m9
    SORT_PAIR sw, m2, m7, m10
    SORT_PAIR sw, m3, m6, m11
    SORT_PAIR sw, m4, m5, m12
%endmacro
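
; After SORT_AXIS(_16) each opposite pair (a1,a8), (a2,a7), (a3,a6), (a4,a5)
; is sorted, so m1..m4 hold the pair minima and m8..m5 the pair maxima.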

; The loop doesn't need to do all the iterations.  It could stop when the right
; pixels are in the right registers.
%macro SORT_SQUARE 0
    %assign k 7
    %rep 7
        %assign i 1
        %assign j 2
        %rep k
            SORT_PAIR ub, m %+ i , m %+ j , m9
            %assign i i+1
            %assign j j+1
        %endrep
        %assign k k-1
    %endrep
%endmacro
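
; SORT_SQUARE is a plain bubble sort over m1..m8: after it runs, m1 holds the
; smallest of the eight neighbours and m8 the largest.  Modes 2-4 then clip
; the centre pixel against the Nth smallest/largest values.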

; %1 dest simd register
; %2 source (simd register/memory)
; %3 temp simd register
%macro ABS_DIFF 3
    mova %3, %2
    psubusb %3, %1
    psubusb %1, %2
    por %1, %3
%endmacro

; %1 dest simd register
; %2 source (simd register/memory)
; %3 temp simd register
%macro ABS_DIFF_W 3
    mova %3, %2
    psubusw %3, %1
    psubusw %1, %2
    por %1, %3
%endmacro
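
; Unsigned absolute difference without a dedicated instruction: the two
; saturating subtractions give (b-a) and (a-b), one of which is always zero,
; so OR-ing them yields |a-b|.  Roughly, in scalar terms: d = a > b ? a-b : b-a.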

; %1 simd register that holds the "false" values and will hold the result
; %2 simd register that holds the "true" values
; %3 location (simd register/memory) that holds the mask
%macro BLEND 3
%if cpuflag(avx2)
    vpblendvb %1, %1, %2, %3
%else
    pand %2, %3
    pandn %3, %1
    por %3, %2
    SWAP %1, %3
%endif
%endmacro
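
; Byte-wise select: the mask bytes come from pcmpeq* and are either 0x00 or
; 0xFF, so (true & mask) | (false & ~mask) picks %2 where the mask is set and
; keeps %1 elsewhere.  AVX2 can do the whole select in one vpblendvb.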

; Functions

INIT_XMM sse2
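
; Mode 1: clip the centre pixel to the [minimum, maximum] range of its eight
; neighbours.  The running maximum is kept in m0 and the running minimum in m1.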
cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    movu m0, [a1]
    mova m1, m0

    movu m2, [a2]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [a3]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [a4]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [a5]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [a6]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [a7]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [a8]
    pmaxub m0, m2
    pminub m1, m2

    movu m2, [c]
    pminub m2, m0
    pmaxub m2, m1

    movu [dstq], m2
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

%if ARCH_X86_64
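
; Modes 2, 3 and 4 fully sort the eight neighbours and clip the centre pixel
; to the 2nd, 3rd or 4th smallest/largest of them respectively.  They need
; more than 8 XMM registers, hence the x86-64-only guard.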
cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE
    SORT_SQUARE

    CLIPUB m0, m2, m7

    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE
    SORT_SQUARE

    CLIPUB m0, m3, m6

    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE
    SORT_SQUARE

    CLIPUB m0, m4, m5

    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
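
; Mode 5: "line-sensitive" clipping.  The centre is clipped against each of the
; four opposite pairs; the clipped value closest to the original centre wins.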
cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE
    SORT_AXIS

    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0

    CLIPUB m9, m1, m8
    CLIPUB m10, m2, m7
    CLIPUB m11, m3, m6
    CLIPUB m12, m4, m5

    mova m8, m9  ; clip1
    mova m7, m10 ; clip2
    mova m6, m11 ; clip3
    mova m5, m12 ; clip4

    ABS_DIFF m9, m0, m1  ; c1
    ABS_DIFF m10, m0, m2 ; c2
    ABS_DIFF m11, m0, m3 ; c3
    ABS_DIFF m12, m0, m4 ; c4

    pminub m9, m10
    pminub m9, m11
    pminub m9, m12 ; mindiff

    pcmpeqb m10, m9
    pcmpeqb m11, m9
    pcmpeqb m12, m9

    ; Notice the order here: c1, c3, c2, c4
    BLEND m8, m6, m11
    BLEND m8, m7, m10
    BLEND m8, m5, m12

    movu [dstq], m8
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
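
; Mode 6: like mode 5, but the choice also takes the pair's own range into
; account.  For each pair it computes c = 2*|clip - centre| + (max - min) and
; picks the pair with the smallest c.  Word precision is needed because these
; sums exceed 8 bits.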
cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    ; Some register saving suggestions: the zero can be somewhere other than a
    ; register, the center pixels could be on the stack.

    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16

    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPW m9, m1, m8   ; clip1
    CLIPW m10, m2, m7  ; clip2
    CLIPW m11, m3, m6  ; clip3
    CLIPW m12, m4, m5  ; clip4

    psubw m8, m1 ; d1
    psubw m7, m2 ; d2
    psubw m6, m3 ; d3
    psubw m5, m4 ; d4

    mova m1, m9
    mova m2, m10
    mova m3, m11
    mova m4, m12
    ABS_DIFF_W m1, m0, m13
    ABS_DIFF_W m2, m0, m14
    ABS_DIFF_W m3, m0, m13
    ABS_DIFF_W m4, m0, m14
    psllw m1, 1
    psllw m2, 1
    psllw m3, 1
    psllw m4, 1
    paddw m1, m8 ; c1
    paddw m2, m7 ; c2
    paddw m3, m6 ; c3
    paddw m4, m5 ; c4
    ; As the differences (d1..d4) can only be positive, there is no need to
    ; clip to zero.  Also, the maximum positive value is less than 768.

    pminsw m1, m2
    pminsw m1, m3
    pminsw m1, m4

    pcmpeqw m2, m1
    pcmpeqw m3, m1
    pcmpeqw m4, m1

    BLEND m9, m11, m3
    BLEND m9, m10, m2
    BLEND m9, m12, m4
    packuswb m9, m9

    movh [dstq], m9
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

; This is just copy-pasted straight from mode 6 with the left shifts removed.
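; Mode 7 therefore weights |clip - centre| and the pair range equally:
; c = |clip - centre| + (max - min).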
cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    ; Can this be done without unpacking?

    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16

    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPW m9, m1, m8   ; clip1
    CLIPW m10, m2, m7  ; clip2
    CLIPW m11, m3, m6  ; clip3
    CLIPW m12, m4, m5  ; clip4

    psubw m8, m1 ; d1
    psubw m7, m2 ; d2
    psubw m6, m3 ; d3
    psubw m5, m4 ; d4

    mova m1, m9
    mova m2, m10
    mova m3, m11
    mova m4, m12
    ABS_DIFF_W m1, m0, m13
    ABS_DIFF_W m2, m0, m14
    ABS_DIFF_W m3, m0, m13
    ABS_DIFF_W m4, m0, m14
    paddw m1, m8 ; c1
    paddw m2, m7 ; c2
    paddw m3, m6 ; c3
    paddw m4, m5 ; c4

    pminsw m1, m2
    pminsw m1, m3
    pminsw m1, m4

    pcmpeqw m2, m1
    pcmpeqw m3, m1
    pcmpeqw m4, m1

    BLEND m9, m11, m3
    BLEND m9, m10, m2
    BLEND m9, m12, m4
    packuswb m9, m9

    movh [dstq], m9
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET

; This is just copy-pasted straight from mode 6 with a few changes.
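; In mode 8 the weighting is reversed: the pair range is doubled instead,
; c = |clip - centre| + 2*(max - min).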
cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16

    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPW m9, m1, m8   ; clip1
    CLIPW m10, m2, m7  ; clip2
    CLIPW m11, m3, m6  ; clip3
    CLIPW m12, m4, m5  ; clip4

    psubw m8, m1 ; d1
    psubw m7, m2 ; d2
    psubw m6, m3 ; d3
    psubw m5, m4 ; d4
    psllw m8, 1
    psllw m7, 1
    psllw m6, 1
    psllw m5, 1

    mova m1, m9
    mova m2, m10
    mova m3, m11
    mova m4, m12
    ABS_DIFF_W m1, m0, m13
    ABS_DIFF_W m2, m0, m14
    ABS_DIFF_W m3, m0, m13
    ABS_DIFF_W m4, m0, m14
    paddw m1, m8 ; c1
    paddw m2, m7 ; c2
    paddw m3, m6 ; c3
    paddw m4, m5 ; c4
    ; As the differences (d1..d4) can only be positive, there is no need to
    ; clip to zero.  Also, the maximum positive value is less than 768.

    pminsw m1, m2
    pminsw m1, m3
    pminsw m1, m4

    pcmpeqw m2, m1
    pcmpeqw m3, m1
    pcmpeqw m4, m1

    BLEND m9, m11, m3
    BLEND m9, m10, m2
    BLEND m9, m12, m4
    packuswb m9, m9

    movh [dstq], m9
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
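
; Mode 9: clip the centre to the opposite pair with the smallest range
; (max - min); only the pair ranges are compared, not the centre pixel.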
cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE
    SORT_AXIS

    mova m9, m0
    mova m10, m0
    mova m11, m0
    mova m12, m0
    CLIPUB m9, m1, m8   ; clip1
    CLIPUB m10, m2, m7  ; clip2
    CLIPUB m11, m3, m6  ; clip3
    CLIPUB m12, m4, m5  ; clip4

    psubb m8, m1 ; d1
    psubb m7, m2 ; d2
    psubb m6, m3 ; d3
    psubb m5, m4 ; d4

    pminub m8, m7
    pminub m8, m6
    pminub m8, m5

    pcmpeqb m7, m8
    pcmpeqb m6, m8
    pcmpeqb m5, m8

    BLEND m9, m11, m6
    BLEND m9, m10, m7
    BLEND m9, m12, m5

    movu [dstq], m9
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
%endif
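
; Mode 10: replace the centre with whichever neighbour is closest to it in
; value (smallest absolute difference).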
cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    movu m0, [c]

    movu m1, [a4]
    mova m2, m1
    ABS_DIFF m1, m0, m7

    movu m3, [a5]       ; load pixel
    mova m4, m3
    ABS_DIFF m4, m0, m7 ; absolute difference from center
    pminub m1, m4       ; mindiff
    pcmpeqb m4, m1      ; if (difference == mindiff)
    BLEND m2, m3, m4    ; return pixel

    movu m5, [a1]
    mova m6, m5
    ABS_DIFF m6, m0, m7
    pminub m1, m6
    pcmpeqb m6, m1
    BLEND m2, m5, m6

    movu m3, [a3]
    mova m4, m3
    ABS_DIFF m4, m0, m7
    pminub m1, m4
    pcmpeqb m4, m1
    BLEND m2, m3, m4

    movu m5, [a2]
    mova m6, m5
    ABS_DIFF m6, m0, m7
    pminub m1, m6
    pcmpeqb m6, m1
    BLEND m2, m5, m6

    movu m3, [a6]
    mova m4, m3
    ABS_DIFF m4, m0, m7
    pminub m1, m4
    pcmpeqb m4, m1
    BLEND m2, m3, m4

    movu m5, [a8]
    mova m6, m5
    ABS_DIFF m6, m0, m7
    pminub m1, m6
    pcmpeqb m6, m1
    BLEND m2, m5, m6

    movu m3, [a7]
    mova m4, m3
    ABS_DIFF m4, m0, m7
    pminub m1, m4
    pcmpeqb m4, m1
    BLEND m2, m3, m4

    movu [dstq], m2
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
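
; Modes 11 and 12 (identical here): a [1 2 1 / 2 4 2 / 1 2 1]/16 blur of the
; 3x3 neighbourhood, i.e. (4*c + 2*(a2+a4+a5+a7) + (a1+a3+a6+a8) + 8) >> 4.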
cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m0, m0
.loop:
    LOAD m1, [c], m0
    LOAD m2, [a2], m0
    LOAD m3, [a4], m0
    LOAD m4, [a5], m0
    LOAD m5, [a7], m0

    psllw m1, 2
    paddw m2, m3
    paddw m4, m5
    paddw m2, m4
    psllw m2, 1

    LOAD m3, [a1], m0
    LOAD m4, [a3], m0
    LOAD m5, [a6], m0
    LOAD m6, [a8], m0
    paddw m1, m2
    paddw m3, m4
    paddw m5, m6
    paddw m1, m3
    paddw m1, m5

    paddw m1, [pw_8]
    psraw m1, 4

    packuswb m1, m1

    movh [dstq], m1
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
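
; Modes 13 and 14 (deinterlacing): of the three opposite pairs that cross the
; centre, (a1,a8), (a3,a6) and (a2,a7), pick the pair with the smallest
; absolute difference and output its average.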
cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    movu m1, [a1]
    movu m2, [a8]
    mova m0, m1
    pavgb m1, m2
    ABS_DIFF m0, m2, m6

    movu m3, [a3]
    movu m4, [a6]
    mova m5, m3
    pavgb m3, m4
    ABS_DIFF m5, m4, m7
    pminub m0, m5
    pcmpeqb m5, m0
    BLEND m1, m3, m5

    movu m2, [a2]
    movu m3, [a7]
    mova m4, m2
    pavgb m2, m3
    ABS_DIFF m4, m3, m6
    pminub m0, m4
    pcmpeqb m4, m0
    BLEND m1, m2, m4

    movu [dstq], m1
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

%if ARCH_X86_64
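
; Modes 15 and 16: like 13/14, but instead of the pair average the output is a
; weighted average of the top and bottom rows, (a1+a3+a6+a8 + 2*(a2+a7) + 4) >> 3,
; clipped to the min/max of the pair with the smallest difference.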
cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15

    mova m9, m1
    mova m10, m2
    mova m11, m3
    ABS_DIFF_W m9, m8, m12
    ABS_DIFF_W m10, m7, m13
    ABS_DIFF_W m11, m6, m14
    pminsw m9, m10
    pminsw m9, m11
    pcmpeqw m10, m9
    pcmpeqw m11, m9

    mova m12, m2
    mova m13, m1
    mova m14, m6
    paddw m12, m7
    psllw m12, 1
    paddw m13, m3
    paddw m14, m8
    paddw m12, [pw_4]
    paddw m13, m14
    paddw m12, m13
    psrlw m12, 3

    SORT_PAIR ub, m1, m8, m0
    SORT_PAIR ub, m2, m7, m9
    SORT_PAIR ub, m3, m6, m14
    mova m4, m12
    mova m5, m12
    CLIPW m4, m1, m8
    CLIPW m5, m2, m7
    CLIPW m12, m3, m6

    BLEND m4, m12, m11
    BLEND m4, m5, m10
    packuswb m4, m4

    movh [dstq], m4
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
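
; Mode 17: clip the centre to the range bounded by the largest of the four
; pair-minima and the smallest of the four pair-maxima; the two bounds are
; sorted first in case they cross.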
cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE
    SORT_AXIS

    pmaxub m1, m2
    pmaxub m3, m4

    pminub m8, m7
    pminub m5, m6

    pmaxub m1, m3
    pminub m8, m5

    mova m2, m1
    pminub m1, m8
    pmaxub m8, m2

    CLIPUB m0, m1, m8

    movu [dstq], m0
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
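
; Mode 18: for each opposite pair take the larger of the two distances from the
; centre; clip the centre to the pair for which this distance is smallest.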
cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    LOAD_SQUARE

    mova m9, m1
    mova m10, m8
    ABS_DIFF m9, m0, m11
    ABS_DIFF m10, m0, m12
    pmaxub m9, m10 ; m9 = d1

    mova m10, m2
    mova m11, m7
    ABS_DIFF m10, m0, m12
    ABS_DIFF m11, m0, m13
    pmaxub m10, m11 ; m10 = d2

    mova m11, m3
    mova m12, m6
    ABS_DIFF m11, m0, m13
    ABS_DIFF m12, m0, m14
    pmaxub m11, m12 ; m11 = d3

    mova m12, m4
    mova m13, m5
    ABS_DIFF m12, m0, m14
    ABS_DIFF m13, m0, m15
    pmaxub m12, m13 ; m12 = d4

    mova m13, m9
    pminub m13, m10
    pminub m13, m11
    pminub m13, m12 ; m13 = mindiff

    pcmpeqb m10, m13
    pcmpeqb m11, m13
    pcmpeqb m12, m13

    mova m14, m1
    pminub m1, m8
    pmaxub m8, m14

    mova m13, m0
    mova m14, m1
    pminub m1, m8
    pmaxub m8, m14
    CLIPUB m13, m1, m8 ; m13 = ret...d1

    mova m14, m0
    mova m15, m3
    pminub m3, m6
    pmaxub m6, m15
    CLIPUB m14, m3, m6
    pand m14, m11
    pandn m11, m13
    por m14, m11 ; m14 = ret...d3

    mova m15, m0
    mova m1, m2
    pminub m2, m7
    pmaxub m7, m1
    CLIPUB m15, m2, m7
    pand m15, m10
    pandn m10, m14
    por m15, m10 ; m15 = ret...d2

    mova m1, m0
    mova m2, m4
    pminub m4, m5
    pmaxub m5, m2
    CLIPUB m1, m4, m5
    pand m1, m12
    pandn m12, m15
    por m1, m12 ; m1 = ret...d4

    movu [dstq], m1
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
%endif
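
; Mode 19: plain average of the eight neighbours, (sum + 4) >> 3; the centre
; pixel itself is ignored.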
cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m0, m0
.loop:
    LOAD m1, [a1], m0
    LOAD m2, [a2], m0
    paddw m1, m2

    LOAD m3, [a3], m0
    LOAD m4, [a4], m0
    paddw m3, m4

    LOAD m5, [a5], m0
    LOAD m6, [a6], m0
    paddw m5, m6

    LOAD m2, [a7], m0
    LOAD m4, [a8], m0
    paddw m2, m4

    paddw m1, m3
    paddw m2, m5
    paddw m1, m2

    paddw m1, [pw_4]
    psraw m1, 3

    packuswb m1, m1

    movh [dstq], m1
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
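
; Mode 20: average of all nine pixels, (sum + 4) / 9, implemented with the
; pw_div9 fixed-point multiply instead of a division.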
cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m0, m0
.loop:
    LOAD m1, [a1], m0
    LOAD m2, [a2], m0
    paddw m1, m2

    LOAD m3, [a3], m0
    LOAD m4, [a4], m0
    paddw m3, m4

    LOAD m5, [a5], m0
    LOAD m6, [a6], m0
    paddw m5, m6

    LOAD m2, [a7], m0
    LOAD m4, [a8], m0
    paddw m2, m4

    LOAD m6, [c], m0
    paddw m1, m3
    paddw m2, m5
    paddw m6, [pw_4]

    paddw m1, m2
    paddw m1, m6

    pmulhuw m1, [pw_div9]

    packuswb m1, m1

    movh [dstq], m1
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
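
; Mode 21: clip the centre to the range spanned by the averages of the four
; opposite pairs; the lower bound uses averages rounded down (unpack, add,
; shift), the upper bound averages rounded up (pavgb).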
cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m0, m0
.loop:
    movu m1, [a1]
    movu m2, [a8]
    pavgb m7, m1, m2
    punpckhbw m3, m1, m0
    punpcklbw m1, m0
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    paddw m3, m4
    paddw m1, m2
    psrlw m3, 1
    psrlw m1, 1
    packuswb m1, m3

    movu m2, [a2]
    movu m3, [a7]
    pavgb m6, m2, m3
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    punpckhbw m5, m3, m0
    punpcklbw m3, m0
    paddw m4, m5
    paddw m2, m3
    psrlw m4, 1
    psrlw m2, 1
    packuswb m2, m4

    pminub m1, m2
    pmaxub m7, m6

    movu m2, [a3]
    movu m3, [a6]
    pavgb m6, m2, m3
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    punpckhbw m5, m3, m0
    punpcklbw m3, m0
    paddw m4, m5
    paddw m2, m3
    psrlw m4, 1
    psrlw m2, 1
    packuswb m2, m4

    pminub m1, m2
    pmaxub m7, m6

    movu m2, [a4]
    movu m3, [a5]
    pavgb m6, m2, m3
    punpckhbw m4, m2, m0
    punpcklbw m2, m0
    punpckhbw m5, m3, m0
    punpcklbw m3, m0
    paddw m4, m5
    paddw m2, m3
    psrlw m4, 1
    psrlw m2, 1
    packuswb m2, m4

    pminub m1, m2
    pmaxub m7, m6

    movu m3, [c]
    CLIPUB m3, m1, m7

    movu [dstq], m3
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET
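
; Mode 22: same as mode 21 but both bounds use the rounded-up pavgb average,
; which is cheaper because no unpacking is needed.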
cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

.loop:
    movu m0, [a1]
    movu m1, [a8]
    pavgb m0, m1
    movu m2, [a2]
    movu m3, [a7]
    pavgb m2, m3
    movu m4, [a3]
    movu m5, [a6]
    pavgb m4, m5
    movu m6, [a4]
    movu m7, [a5]
    pavgb m6, m7

    mova m1, m0
    mova m3, m2
    mova m5, m4
    mova m7, m6
    pminub m0, m2
    pminub m4, m6
    pmaxub m1, m3
    pmaxub m5, m7
    pminub m0, m4
    pmaxub m1, m5

    movu m2, [c]
    CLIPUB m2, m0, m1

    movu [dstq], m2
    add srcq, mmsize
    add dstq, mmsize
    sub pixelsd, mmsize
    jg .loop
    RET

%if ARCH_X86_64
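
; Mode 23 (mild spike/halo removal): for each opposite pair, if the centre lies
; above the pair maximum it is pulled down, and if it lies below the pair
; minimum it is pushed up, each correction limited by the pair's own range
; (max - min); the largest suggested correction in each direction is applied.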
cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    SORT_AXIS_16

    mova m9, m8
    mova m10, m7
    mova m11, m6
    mova m12, m5
    psubw m9, m1  ; linediff1
    psubw m10, m2 ; linediff2
    psubw m11, m3 ; linediff3
    psubw m12, m4 ; linediff4

    psubw m1, m0
    psubw m2, m0
    psubw m3, m0
    psubw m4, m0
    pminsw m1, m9  ; d1
    pminsw m2, m10 ; d2
    pminsw m3, m11 ; d3
    pminsw m4, m12 ; d4
    pmaxsw m1, m2
    pmaxsw m3, m4
    pmaxsw m1, m3
    pmaxsw m1, m15 ; d

    mova m13, m0
    mova m14, m0
    mova m2, m0
    mova m4, m0
    psubw m13, m8
    psubw m14, m7
    psubw m2, m6
    psubw m4, m5
    pminsw m9, m13  ; u1
    pminsw m10, m14 ; u2
    pminsw m11, m2  ; u3
    pminsw m12, m4  ; u4
    pmaxsw m9, m10
    pmaxsw m11, m12
    pmaxsw m9, m11
    pmaxsw m9, m15 ; u

    paddw m0, m1
    psubw m0, m9
    packuswb m0, m0

    movh [dstq], m0
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
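
; Mode 24: like mode 23, but each correction is additionally capped by the
; remaining room inside the pair's range, making it even milder.  The centre
; value is kept on the stack because all 16 XMM registers are in use.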
cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels
    mov r4q, strideq
    neg r4q
    %define stride_p strideq
    %define stride_n r4q

    pxor m15, m15
.loop:
    LOAD_SQUARE_16 m15
    mova [rsp], m0
    SORT_AXIS_16

    mova m9, m8
    mova m10, m7
    mova m11, m6
    mova m12, m5
    psubw m9, m1  ; linediff1
    psubw m10, m2 ; linediff2
    psubw m11, m3 ; linediff3
    psubw m12, m4 ; linediff4

    psubw m1, [rsp] ; td1
    psubw m2, [rsp] ; td2
    psubw m3, [rsp] ; td3
    psubw m4, [rsp] ; td4
    mova m0, m9
    mova m13, m10
    mova m14, m11
    mova m15, m12
    psubw m0, m1
    psubw m13, m2
    psubw m14, m3
    psubw m15, m4
    pminsw m1, m0  ; d1
    pminsw m2, m13 ; d2
    pminsw m3, m14 ; d3
    pminsw m4, m15 ; d4
    pmaxsw m1, m2
    pmaxsw m3, m4

    mova m0, [rsp]
    mova m13, [rsp]
    mova m14, [rsp]
    mova m15, [rsp]
    psubw m0, m8  ; tu1
    psubw m13, m7 ; tu2
    psubw m14, m6 ; tu3
    psubw m15, m5 ; tu4
    psubw m9, m0
    psubw m10, m13
    psubw m11, m14
    psubw m12, m15
    pminsw m9, m0   ; u1
    pminsw m10, m13 ; u2
    pminsw m11, m14 ; u3
    pminsw m12, m15 ; u4
    pmaxsw m9, m10
    pmaxsw m11, m12

    pmaxsw m1, m3  ; d without max(d,0)
    pmaxsw m9, m11 ; u without max(u,0)
    pxor m15, m15
    pmaxsw m1, m15
    pmaxsw m9, m15

    mova m0, [rsp]
    paddw m0, m1
    psubw m0, m9
    packuswb m0, m0

    movh [dstq], m0
    add srcq, mmsize/2
    add dstq, mmsize/2
    sub pixelsd, mmsize/2
    jg .loop
    RET
%endif