1 ;*****************************************************************************
2 ;* x86-optimized functions for blend filter
4 ;* Copyright (C) 2015 Paul B Mahol
5 ;* Copyright (C) 2018 Henrik Gramner
6 ;* Copyright (C) 2018 Jokyo Images
8 ;* This file is part of FFmpeg.
10 ;* FFmpeg is free software; you can redistribute it and/or
11 ;* modify it under the terms of the GNU Lesser General Public
12 ;* License as published by the Free Software Foundation; either
13 ;* version 2.1 of the License, or (at your option) any later version.
15 ;* FFmpeg is distributed in the hope that it will be useful,
16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;* Lesser General Public License for more details.
20 ;* You should have received a copy of the GNU Lesser General Public
21 ;* License along with FFmpeg; if not, write to the Free Software
22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 ;******************************************************************************
25 %include "libavutil/x86/x86util.asm"
29 ps_255: times
4 dd 255.0
30 pd_32768
: times
4 dd 32768
31 pd_65535
: times
4 dd 65535
33 pw_128: times
8 dw 128
34 pw_255: times
8 dw 255
35 pb_127: times
16 db 127
36 pb_128: times
16 db 128
37 pb_255: times
16 db 255
41 %macro BLEND_INIT
2-3 0
43 cglobal blend_
%1, 6, 9, %2, top
, top_linesize
, bottom
, bottom_linesize
, dst
, dst_linesize
, width, end, x
44 mov widthd
, dword widthm
46 add widthq
, widthq
; doesn't compile on x86_32
49 cglobal blend_
%1, 5, 7, %2, top
, top_linesize
, bottom
, bottom_linesize
, dst
, end, x
50 %define dst_linesizeq r5mp
61 add topq
, top_linesizeq
62 add bottomq
, bottom_linesizeq
63 add dstq
, dst_linesizeq
69 %macro BLEND_SIMPLE
2-3 0
76 movu m1
, [bottomq
+ xq
]
84 ; %1 = name, %2 = source suffix (b or w), %3 = intermediate suffix (w or d), %4 = 1 for 16-bit (defaults to 0 for 8-bit)
85 %macro GRAINEXTRACT
3-4 0
89 VBROADCASTI128 m5
, [pd_32768
]
91 VBROADCASTI128 m5
, [pw_128
]
97 movu m3
, [bottomq
+ xq
]
99 punpckl
%2%3 m0
, m1
, m4
101 punpckl
%2%3 m2
, m3
, m4
117 %macro MULTIPLY
3 ; a, b, pw_1
118 pmullw
%1, %2 ; xxxxxxxx a * b
122 psrlw
%1, 8 ; 00xx00xx a * b / 255
125 %macro SCREEN
4 ; a, b, pw_1, pw_255
126 pxor
%1, %4 ; 00xx00xx 255 - a
129 pxor
%1, %4 ; 00xx00xx 255 - x / 255
132 %macro BLEND_MULTIPLY
0
133 BLEND_INIT multiply
, 6
135 VBROADCASTI128 m5
, [pw_1
]
141 movu m3
, [bottomq
+ xq
]
157 %macro BLEND_SCREEN
0
161 VBROADCASTI128 m5
, [pw_1
]
162 VBROADCASTI128 m6
, [pw_255
]
168 movu m3
, [bottomq
+ xq
]
174 SCREEN m0
, m2
, m5
, m6
175 SCREEN m1
, m3
, m5
, m6
184 ; %1 = name, %2 = suffix (b or w), %3 = set if 16-bit
194 movu m1
, [bottomq
+ xq
]
205 ; %1 = name, %2 = source suffix (b or w), %3 = intermediate suffix (w or d), %4 = 1 for 16-bit (defaults to 0 for 8-bit)
206 %macro GRAINMERGE
3-4 0
210 VBROADCASTI128 m5
, [pd_32768
]
212 VBROADCASTI128 m5
, [pw_128
]
219 movu m3
, [bottomq
+ xq
]
221 punpckl
%2%3 m0
, m1
, m4
223 punpckl
%2%3 m2
, m3
, m4
240 BLEND_INIT hardmix
, 5
241 VBROADCASTI128 m2
, [pb_255
]
242 VBROADCASTI128 m3
, [pb_128
]
243 VBROADCASTI128 m4
, [pb_127
]
249 movu m1
, [bottomq
+ xq
]
268 movd m0
, [topq
+ xq
] ; 000000xx
269 movd m1
, [bottomq
+ xq
]
270 punpcklbw m0
, m2
; 00000x0x
272 punpcklwd m0
, m2
; 000x000x
278 mulps m0
, m3
; a / b * 255
282 packssdw m0
, m0
; 00000x0x
283 packuswb m0
, m0
; 000000xx
292 ; %1 = name, %2 = suffix (b or w), %3 = 1 if 16-bit (optional)
294 VBROADCASTI128 m3
, [pb_255
]
300 movu m1
, [bottomq
+ xq
]
313 ; %1 = name, %2 = source suffix (b or w), %3 = intermediate suffix (w or d), %4 = 1 for 16-bit (defaults to 0 for 8-bit)
314 %macro DIFFERENCE
3-4 0
322 movu m1
, [bottomq
+ xq
]
323 punpckh
%2%3 m3
, m0
, m2
325 punpckh
%2%3 m4
, m1
, m2
342 ; %1 = name, %2 = source suffix (b or w), %3 = intermediate suffix (w or d), %4 = 1 for 16-bit (defaults to 0 for 8-bit)
343 %macro EXTREMITY
3-4 0
347 VBROADCASTI128 m4
, [pd_65535
]
349 VBROADCASTI128 m4
, [pw_255
]
356 movu m1
, [bottomq
+ xq
]
357 punpckh
%2%3 m5
, m0
, m2
359 punpckh
%2%3 m6
, m1
, m2
378 %macro NEGATION
3-4 0
382 VBROADCASTI128 m4
, [pd_65535
]
384 VBROADCASTI128 m4
, [pw_255
]
391 movu m1
, [bottomq
+ xq
]
392 punpckh
%2%3 m5
, m0
, m2
394 punpckh
%2%3 m6
, m1
, m2
416 BLEND_SIMPLE
xor, xor
418 BLEND_SIMPLE
and, and
419 BLEND_SIMPLE addition
, addusb
420 BLEND_SIMPLE subtract
, subusb
421 BLEND_SIMPLE darken
, minub
422 BLEND_SIMPLE lighten
, maxub
423 GRAINEXTRACT grainextract
, b
, w
427 GRAINMERGE grainmerge
, b
, w
430 DIFFERENCE difference
, b
, w
432 EXTREMITY extremity
, b
, w
433 NEGATION negation
, b
, w
436 BLEND_SIMPLE addition_16
, addusw
, 1
437 BLEND_SIMPLE and_16
, and, 1
438 BLEND_SIMPLE or_16
, or, 1
439 BLEND_SIMPLE subtract_16
, subusw
, 1
440 BLEND_SIMPLE xor_16
, xor, 1
441 AVERAGE average_16
, w
, 1
445 DIFFERENCE difference
, b
, w
446 EXTREMITY extremity
, b
, w
447 NEGATION negation
, b
, w
451 BLEND_SIMPLE darken_16
, minuw
, 1
452 BLEND_SIMPLE lighten_16
, maxuw
, 1
453 GRAINEXTRACT grainextract_16
, w
, d
, 1
454 GRAINMERGE grainmerge_16
, w
, d
, 1
455 PHOENIX phoenix_16
, w
, 1
456 DIFFERENCE difference_16
, w
, d
, 1
457 EXTREMITY extremity_16
, w
, d
, 1
458 NEGATION negation_16
, w
, d
, 1
461 %if HAVE_AVX2_EXTERNAL
463 BLEND_SIMPLE
xor, xor
465 BLEND_SIMPLE
and, and
466 BLEND_SIMPLE addition
, addusb
467 BLEND_SIMPLE subtract
, subusb
468 BLEND_SIMPLE darken
, minub
469 BLEND_SIMPLE lighten
, maxub
470 GRAINEXTRACT grainextract
, b
, w
474 GRAINMERGE grainmerge
, b
, w
478 DIFFERENCE difference
, b
, w
479 EXTREMITY extremity
, b
, w
480 NEGATION negation
, b
, w
483 BLEND_SIMPLE addition_16
, addusw
, 1
484 BLEND_SIMPLE and_16
, and, 1
485 BLEND_SIMPLE darken_16
, minuw
, 1
486 BLEND_SIMPLE lighten_16
, maxuw
, 1
487 BLEND_SIMPLE or_16
, or, 1
488 BLEND_SIMPLE subtract_16
, subusw
, 1
489 BLEND_SIMPLE xor_16
, xor, 1
490 GRAINEXTRACT grainextract_16
, w
, d
, 1
491 AVERAGE average_16
, w
, 1
492 GRAINMERGE grainmerge_16
, w
, d
, 1
493 PHOENIX phoenix_16
, w
, 1
494 DIFFERENCE difference_16
, w
, d
, 1
495 EXTREMITY extremity_16
, w
, d
, 1
496 NEGATION negation_16
, w
, d
, 1