;*****************************************************************************
;* x86-optimized functions for removegrain filter
;*
;* Copyright (C) 2015 James Darnley
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License along
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

pw_div9: times 16 dw ((1<<16)+4)/9
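; pw_div9 is a fixed-point reciprocal of 9: ((1<<16)+4)/9 = 7282.  A pmulhuw
; by this constant returns the high 16 bits of x*7282, i.e. roughly x/9 (it is
; exact when x is a multiple of 9, and approximately rounded otherwise).  It
; is used to divide the 9-pixel sums in mode 20.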
;*** Preprocessor helpers

%define a1 srcq + stride_n - 1
%define a2 srcq + stride_n
%define a3 srcq + stride_n + 1
%define a6 srcq + stride_p - 1
%define a7 srcq + stride_p
%define a8 srcq + stride_p + 1

; %1 dest simd register
; %2 source memory location
; %3 zero location (simd register/memory)
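; A minimal sketch of the load-and-widen step these parameters describe (the
; actual macro body is elided here): load 8 packed bytes, then zero-extend
; them to unsigned words by interleaving with the zero register.
;     movh      %1, %2
;     punpcklbw %1, %3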
; %1 zero location (simd register/memory)
%macro LOAD_SQUARE_16 1

; %2 simd register to hold maximums
; %3 simd register to hold minimums
; %4 temp location (simd register/memory)
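; Going by the parameter comments above, the (elided) body amounts to the
; usual compare-exchange idiom, with %1 the pmin/pmax suffix (ub or sw).
; Sketch only, not the verbatim macro:
;     mova    %4, %2
;     pmax%1  %2, %3      ; %2 = per-lane maximum
;     pmin%1  %3, %4      ; %3 = per-lane minimum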
    SORT_PAIR ub, m1, m8, m9
    SORT_PAIR ub, m2, m7, m10
    SORT_PAIR ub, m3, m6, m11
    SORT_PAIR ub, m4, m5, m12

%macro SORT_AXIS_16 0
    SORT_PAIR sw, m1, m8, m9
    SORT_PAIR sw, m2, m7, m10
    SORT_PAIR sw, m3, m6, m11
    SORT_PAIR sw, m4, m5, m12

; The loop doesn't need to do all the iterations. It could stop when the right
; pixels are in the right registers.
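; (Assuming the usual triangular %rep structure, a full pass performs
; 7+6+5+4+3+2+1 = 28 SORT_PAIR compare-exchanges to completely sort the eight
; neighbours, which is more work than most modes actually need.)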
            SORT_PAIR ub, m %+ i, m %+ j, m9

; %1 dest simd register
; %2 source (simd register/memory)
; %3 temp simd register

; %1 dest simd register
; %2 source (simd register/memory)
; %3 temp simd register
; %1 simd register that holds the "false" values and will hold the result
; %2 simd register that holds the "true" values
; %3 location (simd register/memory) that holds the mask
    vpblendvb %1, %1, %2, %3
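    ; Where vpblendvb (AVX2) is not available, the same per-byte select is
    ; normally built from the classic pand/pandn/por idiom.  Sketch only; the
    ; non-AVX2 body is elided here, it assumes a fully saturated per-byte mask
    ; (e.g. from pcmpeqb), and the result lands in %3, so a register swap is
    ; still needed:
    ;     pand  %2, %3      ; true values where the mask is set
    ;     pandn %3, %1      ; false values where the mask is clear
    ;     por   %3, %2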
cglobal rg_fl_mode_1, 4, 5, 3, 0, dst, src, stride, pixels
    %define stride_p strideq

cglobal rg_fl_mode_2, 4, 5, 10, 0, dst, src, stride, pixels
    %define stride_p strideq

cglobal rg_fl_mode_3, 4, 5, 10, 0, dst, src, stride, pixels
    %define stride_p strideq

cglobal rg_fl_mode_4, 4, 5, 10, 0, dst, src, stride, pixels
    %define stride_p strideq

cglobal rg_fl_mode_5, 4, 5, 13, 0, dst, src, stride, pixels
    %define stride_p strideq

        ABS_DIFF m9, m0, m1   ; c1
        ABS_DIFF m10, m0, m2  ; c2
        ABS_DIFF m11, m0, m3  ; c3
        ABS_DIFF m12, m0, m4  ; c4

        pminub m9, m12        ; mindiff

        ; Notice the order here: c1, c3, c2, c4
cglobal rg_fl_mode_6, 4, 5, 16, 0, dst, src, stride, pixels
    %define stride_p strideq

    ; Some register saving suggestions: the zero can be somewhere other than a
    ; register, the center pixels could be on the stack.

        CLIPW m9, m1, m8   ; clip1
        CLIPW m10, m2, m7  ; clip2
        CLIPW m11, m3, m6  ; clip3
        CLIPW m12, m4, m5  ; clip4

        ABS_DIFF_W m1, m0, m13
        ABS_DIFF_W m2, m0, m14
        ABS_DIFF_W m3, m0, m13
        ABS_DIFF_W m4, m0, m14

        ; As the differences (d1..d4) can only be positive, there is no need to
        ; clip to zero. Also, the maximum positive value is less than 768.
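        ; (Assuming the usual mode-6 weighting of 2*|c - clip| plus the
        ; neighbour span, the largest possible value is 2*255 + 255 = 765,
        ; hence the "less than 768" headroom noted above.)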
        sub pixelsd, mmsize/2

; This is just copy-pasted straight from mode 6 with the left shifts removed.
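; (Dropping the shifts means the distance to the clipped value is no longer
; weighted double relative to the neighbour span when picking the best axis,
; assuming the usual RemoveGrain mode 6/7 definitions.)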
cglobal rg_fl_mode_7, 4, 5, 16, 0, dst, src, stride, pixels
    %define stride_p strideq

    ; Can this be done without unpacking?

        CLIPW m9, m1, m8   ; clip1
        CLIPW m10, m2, m7  ; clip2
        CLIPW m11, m3, m6  ; clip3
        CLIPW m12, m4, m5  ; clip4

        ABS_DIFF_W m1, m0, m13
        ABS_DIFF_W m2, m0, m14
        ABS_DIFF_W m3, m0, m13
        ABS_DIFF_W m4, m0, m14

        sub pixelsd, mmsize/2
; This is just copy-pasted straight from mode 6 with a few changes.
cglobal rg_fl_mode_8, 4, 5, 16, 0, dst, src, stride, pixels
    %define stride_p strideq

        CLIPW m9, m1, m8   ; clip1
        CLIPW m10, m2, m7  ; clip2
        CLIPW m11, m3, m6  ; clip3
        CLIPW m12, m4, m5  ; clip4

        ABS_DIFF_W m1, m0, m13
        ABS_DIFF_W m2, m0, m14
        ABS_DIFF_W m3, m0, m13
        ABS_DIFF_W m4, m0, m14

        ; As the differences (d1..d4) can only be positive, there is no need to
        ; clip to zero. Also, the maximum positive value is less than 768.

        sub pixelsd, mmsize/2
cglobal rg_fl_mode_9, 4, 5, 13, 0, dst, src, stride, pixels
    %define stride_p strideq

        CLIPUB m9, m1, m8   ; clip1
        CLIPUB m10, m2, m7  ; clip2
        CLIPUB m11, m3, m6  ; clip3
        CLIPUB m12, m4, m5  ; clip4

cglobal rg_fl_mode_10, 4, 5, 8, 0, dst, src, stride, pixels
    %define stride_p strideq
        movu m3, [a5]        ; load pixel
        ABS_DIFF m4, m0, m7  ; absolute difference from center
        pminub m1, m4        ; mindiff
        pcmpeqb m4, m1       ; if (difference == mindiff)
        BLEND m2, m3, m4     ;     return pixel
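        ; This pminub/pcmpeqb/BLEND sequence keeps, per byte lane, the
        ; neighbour whose absolute difference from the centre is the smallest
        ; seen so far: pcmpeqb yields an all-ones mask in the lanes where this
        ; candidate matches the new minimum, and BLEND selects its pixel there.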
cglobal rg_fl_mode_11_12, 4, 5, 7, 0, dst, src, stride, pixels
    %define stride_p strideq

        sub pixelsd, mmsize/2

cglobal rg_fl_mode_13_14, 4, 5, 8, 0, dst, src, stride, pixels
    %define stride_p strideq

cglobal rg_fl_mode_15_16, 4, 5, 16, 0, dst, src, stride, pixels
    %define stride_p strideq

        ABS_DIFF_W m9, m8, m12
        ABS_DIFF_W m10, m7, m13
        ABS_DIFF_W m11, m6, m14

        SORT_PAIR ub, m1, m8, m0
        SORT_PAIR ub, m2, m7, m9
        SORT_PAIR ub, m3, m6, m14

        sub pixelsd, mmsize/2
cglobal rg_fl_mode_17, 4, 5, 9, 0, dst, src, stride, pixels
    %define stride_p strideq

cglobal rg_fl_mode_18, 4, 5, 16, 0, dst, src, stride, pixels
    %define stride_p strideq

        ABS_DIFF m10, m0, m12
        pmaxub m9, m10      ; m9 = d1

        ABS_DIFF m10, m0, m12
        ABS_DIFF m11, m0, m13
        pmaxub m10, m11     ; m10 = d2

        ABS_DIFF m11, m0, m13
        ABS_DIFF m12, m0, m14
        pmaxub m11, m12     ; m11 = d3

        ABS_DIFF m12, m0, m14
        ABS_DIFF m13, m0, m15
        pmaxub m12, m13     ; m12 = d4

        pminub m13, m12     ; m13 = mindiff

        CLIPUB m13, m1, m8  ; m13 = ret...d1

        por m14, m11        ; m14 = ret...d3

        por m15, m10        ; m15 = ret...d2
        por m1, m12         ; m1 = ret...d4
cglobal rg_fl_mode_19, 4, 5, 7, 0, dst, src, stride, pixels
    %define stride_p strideq

        sub pixelsd, mmsize/2

cglobal rg_fl_mode_20, 4, 5, 7, 0, dst, src, stride, pixels
    %define stride_p strideq

        pmulhuw m1, [pw_div9]
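        ; Divide the accumulated 9-pixel word sums by 9 via the fixed-point
        ; reciprocal pw_div9 defined at the top of the file (mode 20 is the
        ; plain 3x3 mean).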
        sub pixelsd, mmsize/2

cglobal rg_fl_mode_21, 4, 5, 8, 0, dst, src, stride, pixels
    %define stride_p strideq

        punpckhbw m4, m2, m0
        punpckhbw m5, m3, m0

        punpckhbw m4, m2, m0
        punpckhbw m5, m3, m0

cglobal rg_fl_mode_22, 4, 5, 8, 0, dst, src, stride, pixels
    %define stride_p strideq
    %define stride_n r4q
cglobal rg_fl_mode_23, 4, 5, 16, 0, dst, src, stride, pixels
    %define stride_p strideq
    %define stride_n r4q

        psubw m9, m1    ; linediff1
        psubw m10, m2   ; linediff2
        psubw m11, m3   ; linediff3
        psubw m12, m4   ; linediff4

        pminsw m10, m14 ; u2

        sub pixelsd, mmsize/2

cglobal rg_fl_mode_24, 4, 5, 16, mmsize, dst, src, stride, pixels
    %define stride_p strideq
    %define stride_n r4q

        psubw m9, m1    ; linediff1
        psubw m10, m2   ; linediff2
        psubw m11, m3   ; linediff3
        psubw m12, m4   ; linediff4

        psubw m1, [rsp] ; td1
        psubw m2, [rsp] ; td2
        psubw m3, [rsp] ; td3
        psubw m4, [rsp] ; td4

        pminsw m10, m13 ; u2
        pminsw m11, m14 ; u3
        pminsw m12, m15 ; u4

        pmaxsw m1, m3   ; d without max(d,0)
        pmaxsw m9, m11  ; u without max(u,0)

        sub pixelsd, mmsize/2