1 ;*****************************************************************************
2 ;* x86-optimized functions for w3fdif filter
4 ;* Copyright (c) 2015 Paul B Mahol
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
; w3fdif_scale(out_pixel, work_pixel, linesize)
; Declared through cglobal with 3 arguments, 3 general-purpose registers and
; 2 vector registers (x86inc argument order: args, gprs, vector regs, stack).
; The visible code loads 2*mmsize bytes from work_pixel, then advances
; out_pixel by mmsize/2 and work_pixel by mmsize*2, so four bytes of work
; data are consumed for every output byte.
; NOTE(review): the instructions that process and store m0/m1, the loop
; label/branch and the return are not present in this view.
28 cglobal w3fdif_scale
, 3, 3, 2, 0, out_pixel
, work_pixel
, linesize
; Load two consecutive vectors of work data into m0 and m1.
30 mova m0
, [work_pixelq
]
31 mova m1
, [work_pixelq
+mmsize
]
; Step the output pointer by half a vector per iteration.
37 add out_pixelq
, mmsize
/2
; Step the work pointer by the two vectors that were just read.
38 add work_pixelq
, mmsize
*2
; linesize counts down in the same units as the out_pixel advance.
39 sub linesized
, mmsize
/2
; w3fdif_simple_low(work_line, in_lines_cur0, coef, linesize)
; Declared with 4 arguments, 5 general-purpose registers (the fifth is named
; offset) and 6 vector registers.
; in_lines_cur0 arrives as a pointer to an array of line pointers; the first
; two entries are read. Half a vector is loaded from each line at the current
; offset, the two are combined with SBUTTERFLY wd, and m2/m3 are written to
; work_line at byte offset offset*4.
; NOTE(review): the coefficient load, any arithmetic applied to m2/m3, the
; offset update, the loop label/branch and the return are not present in
; this view.
43 cglobal w3fdif_simple_low
, 4, 5, 6, 0, work_line
, in_lines_cur0
, coef
, linesize
, offset
; Rename the register that held coef to in_lines_cur1; presumably coef was
; consumed by lines not shown here before this point - TODO confirm.
45 DEFINE_ARGS work_line
, in_lines_cur0
, in_lines_cur1
, linesize
, offset
; Fetch line pointer [1] first, because the next load overwrites the array
; base register with line pointer [0].
50 mov in_lines_cur1q
, [in_lines_cur0q
+ gprsize
]
51 mov in_lines_cur0q
, [in_lines_cur0q
]
; Load half a vector from each of the two lines at the same offset.
54 movh m2
, [in_lines_cur0q
+offsetq
]
55 movh m3
, [in_lines_cur1q
+offsetq
]
; SBUTTERFLY wd (x86util macro): word-interleave m2 with m3, m5 is scratch.
58 SBUTTERFLY wd
, 2, 3, 5
; Store two vectors to the work line; the work index is offset*4 bytes.
61 mova
[work_lineq
+offsetq
*4], m2
62 mova
[work_lineq
+offsetq
*4+mmsize
], m3
; Count down the remaining width by the half vector consumed per line.
64 sub linesized
, mmsize
/2
; w3fdif_complex_low(work_line, in_lines_cur0, coef, linesize)
; Declared with 4 arguments, 7 general-purpose registers and 8 vector
; registers.
; in_lines_cur0 arrives as a pointer to an array of line pointers; the first
; four entries are read. Lines 0/1 are paired into m4/m5 and lines 2/3 into
; m6/m3 with SBUTTERFLY wd; m4 and m5 are written to work_line at byte offset
; offset*4.
; NOTE(review): the coefficient load, the arithmetic that folds the m6/m3
; pair into m4/m5, the offset update, the loop label/branch and the return
; are not present in this view.
68 cglobal w3fdif_complex_low
, 4, 7, 8, 0, work_line
, in_lines_cur0
, coef
, linesize
; Rename the coef register to in_lines_cur1 and name the three extra
; registers offset, in_lines_cur2 and in_lines_cur3.
70 DEFINE_ARGS work_line
, in_lines_cur0
, in_lines_cur1
, linesize
, offset, in_lines_cur2
, in_lines_cur3
; Fetch line pointers [3], [2], [1] before [0]: the final load overwrites
; the array base register with line pointer [0].
75 mov in_lines_cur3q
, [in_lines_cur0q
+gprsize
*3]
76 mov in_lines_cur2q
, [in_lines_cur0q
+gprsize
*2]
77 mov in_lines_cur1q
, [in_lines_cur0q
+gprsize
]
78 mov in_lines_cur0q
, [in_lines_cur0q
]
; First pair: half a vector from line 0 and from line 1.
81 movh m4
, [in_lines_cur0q
+offsetq
]
82 movh m5
, [in_lines_cur1q
+offsetq
]
; Word-interleave m4 with m5 (m7 is scratch).
85 SBUTTERFLY wd
, 4, 5, 7
; Second pair: half a vector from line 2 and from line 3.
88 movh m6
, [in_lines_cur2q
+offsetq
]
89 movh m3
, [in_lines_cur3q
+offsetq
]
; Word-interleave m6 with m3 (m7 is scratch).
92 SBUTTERFLY wd
, 6, 3, 7
; Store the result vectors to the work line at byte offset offset*4.
97 mova
[work_lineq
+offsetq
*4], m4
98 mova
[work_lineq
+offsetq
*4+mmsize
], m5
; Count down the remaining width by the half vector consumed per line.
100 sub linesized
, mmsize
/2
; w3fdif_simple_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
; Two alternative declarations and register layouts appear back to back.
; NOTE(review): they are presumably selected by conditional-assembly
; directives (%if/%else/%endif) that are not present in this view - confirm
; against the complete file.
;   * Variant A: 5 register arguments, 9 general-purpose registers, with a
;     separate offset register and linesize in a register.
;   * Variant B: 4 register arguments, 7 general-purpose registers; linesize
;     is accessed through r4mp and offsetq is aliased to in_lines_cur0q.
; in_lines_cur0 and in_lines_adj0 arrive as pointers to arrays of line
; pointers; three entries are read from each. Lines are paired as
; (cur0, cur1), (adj0, adj1) and (cur2, adj2), and m3/m4 are added to the
; existing contents of work_line and stored back.
; NOTE(review): the coefficient load, the arithmetic that folds the m5/m6
; pairs into m3/m4, the loop label/branch and the return are not present in
; this view.
; Variant A declaration.
105 cglobal w3fdif_simple_high
, 5, 9, 8, 0, work_line
, in_lines_cur0
, in_lines_adj0
, coef
, linesize
; Variant B declaration (only four arguments are loaded into registers).
107 cglobal w3fdif_simple_high
, 4, 7, 8, 0, work_line
, in_lines_cur0
, in_lines_adj0
, coef
, linesize
; Variant A register names: coef's register becomes in_lines_cur1.
111 DEFINE_ARGS work_line
, in_lines_cur0
, in_lines_adj0
, in_lines_cur1
, linesize
, offset, in_lines_cur2
, in_lines_adj1
, in_lines_adj2
; Variant B register names: no dedicated linesize or offset register.
114 DEFINE_ARGS work_line
, in_lines_cur0
, in_lines_adj0
, in_lines_cur1
, in_lines_cur2
, in_lines_adj1
, in_lines_adj2
; Variant B: linesized refers to argument 4 through r4mp (x86inc's
; memory-operand form of the argument - TODO confirm).
115 %define linesized r4mp
; Fetch the current-field line pointers; [0] is loaded last because it
; overwrites the array base register.
121 mov in_lines_cur2q
, [in_lines_cur0q
+gprsize
*2]
122 mov in_lines_cur1q
, [in_lines_cur0q
+gprsize
]
123 mov in_lines_cur0q
, [in_lines_cur0q
]
; Fetch the adjacent-field line pointers in the same order.
124 mov in_lines_adj2q
, [in_lines_adj0q
+gprsize
*2]
125 mov in_lines_adj1q
, [in_lines_adj0q
+gprsize
]
126 mov in_lines_adj0q
, [in_lines_adj0q
]
; Turn every other line pointer into a displacement relative to cur0, so
; that [ptr + in_lines_cur0q] addresses the original line. This goes with
; the offsetq alias below (presumably variant B only).
129 sub in_lines_cur1q
, in_lines_cur0q
130 sub in_lines_cur2q
, in_lines_cur0q
131 sub in_lines_adj0q
, in_lines_cur0q
132 sub in_lines_adj1q
, in_lines_cur0q
133 sub in_lines_adj2q
, in_lines_cur0q
; With this alias, advancing offsetq advances the cur0 pointer itself and,
; through the displacements above, every other line as well.
134 %define offsetq in_lines_cur0q
; Variant A load of line cur0: base pointer plus offset register.
139 movh m3
, [in_lines_cur0q
+offsetq
]
; Variant B load of line cur0: the pointer already includes the offset.
141 movh m3
, [in_lines_cur0q
]
143 movh m4
, [in_lines_cur1q
+offsetq
]
; Word-interleave the (cur0, cur1) pair; m1 is scratch.
146 SBUTTERFLY wd
, 3, 4, 1
; Pair (adj0, adj1).
149 movh m5
, [in_lines_adj0q
+offsetq
]
150 movh m6
, [in_lines_adj1q
+offsetq
]
153 SBUTTERFLY wd
, 5, 6, 1
; Pair (cur2, adj2); m5/m6 are reused, so the previous pair must have been
; consumed by instructions not shown here.
158 movh m5
, [in_lines_cur2q
+offsetq
]
159 movh m6
, [in_lines_adj2q
+offsetq
]
162 SBUTTERFLY wd
, 5, 6, 1
; Variant A accumulate: add the work line contents at byte offset offset*4
; and store the sums back to the same location.
168 paddd m3
, [work_lineq
+offsetq
*4]
169 paddd m4
, [work_lineq
+offsetq
*4+mmsize
]
170 mova
[work_lineq
+offsetq
*4], m3
171 mova
[work_lineq
+offsetq
*4+mmsize
], m4
; Variant B accumulate: work_line is used directly and then advanced by the
; two vectors written, since no separate offset register exists there.
173 paddd m3
, [work_lineq
]
174 paddd m4
, [work_lineq
+mmsize
]
175 mova
[work_lineq
], m3
176 mova
[work_lineq
+mmsize
], m4
177 add work_lineq
, mmsize
*2
; Advance the input position by half a vector (under the variant B alias
; this moves in_lines_cur0q).
179 add offsetq
, mmsize
/2
; Count down the remaining width by the same amount.
180 sub linesized
, mmsize
/2
; w3fdif_complex_high(work_line, in_lines_cur0, in_lines_adj0, coef, linesize)
; Declared with 5 arguments, 13 general-purpose registers and 10 vector
; registers (m8/m9 are used below).
; in_lines_cur0 and in_lines_adj0 arrive as pointers to arrays of line
; pointers; five entries are read from each. Lines are paired as
; (cur0, cur1), (cur2, cur3), (adj0, adj1), (adj2, adj3) and (cur4, adj4),
; and m5/m6 are added to the existing contents of work_line and stored back.
; NOTE(review): the coefficient load, the arithmetic that folds the m8/m9
; pairs into m5/m6, the loop label/branch and the return are not present in
; this view; the function may also continue past the last visible line.
186 cglobal w3fdif_complex_high
, 5, 13, 10, 0, work_line
, in_lines_cur0
, in_lines_adj0
, coef
, linesize
; Rename coef's register to in_lines_cur1 and name the eight extra registers.
189 DEFINE_ARGS work_line
, in_lines_cur0
, in_lines_adj0
, in_lines_cur1
, linesize
, offset, in_lines_cur2
, in_lines_cur3
, in_lines_cur4
, in_lines_adj1
, in_lines_adj2
, in_lines_adj3
, in_lines_adj4
; Fetch the current-field line pointers [4]..[1] before [0]: the final load
; overwrites the array base register with line pointer [0].
195 mov in_lines_cur4q
, [in_lines_cur0q
+gprsize
*4]
196 mov in_lines_cur3q
, [in_lines_cur0q
+gprsize
*3]
197 mov in_lines_cur2q
, [in_lines_cur0q
+gprsize
*2]
198 mov in_lines_cur1q
, [in_lines_cur0q
+gprsize
]
199 mov in_lines_cur0q
, [in_lines_cur0q
]
; Fetch the adjacent-field line pointers in the same order.
200 mov in_lines_adj4q
, [in_lines_adj0q
+gprsize
*4]
201 mov in_lines_adj3q
, [in_lines_adj0q
+gprsize
*3]
202 mov in_lines_adj2q
, [in_lines_adj0q
+gprsize
*2]
203 mov in_lines_adj1q
, [in_lines_adj0q
+gprsize
]
204 mov in_lines_adj0q
, [in_lines_adj0q
]
; Pair (cur0, cur1) goes into the result registers m5/m6.
207 movh m5
, [in_lines_cur0q
+offsetq
]
208 movh m6
, [in_lines_cur1q
+offsetq
]
; Word-interleave m5 with m6; m2 is scratch.
211 SBUTTERFLY wd
, 5, 6, 2
; Pair (cur2, cur3). m8/m9 are reused for every following pair, so each pair
; must be consumed by instructions not shown here before the next load.
214 movh m8
, [in_lines_cur2q
+offsetq
]
215 movh m9
, [in_lines_cur3q
+offsetq
]
218 SBUTTERFLY wd
, 8, 9, 2
; Pair (adj0, adj1).
223 movh m8
, [in_lines_adj0q
+offsetq
]
224 movh m9
, [in_lines_adj1q
+offsetq
]
227 SBUTTERFLY wd
, 8, 9, 2
; Pair (adj2, adj3).
232 movh m8
, [in_lines_adj2q
+offsetq
]
233 movh m9
, [in_lines_adj3q
+offsetq
]
236 SBUTTERFLY wd
, 8, 9, 2
; Pair (cur4, adj4): the last line of each field is combined across fields.
241 movh m8
, [in_lines_cur4q
+offsetq
]
242 movh m9
, [in_lines_adj4q
+offsetq
]
245 SBUTTERFLY wd
, 8, 9, 2
; Accumulate: add the work line contents at byte offset offset*4 and store
; the sums back to the same location.
250 paddd m5
, [work_lineq
+offsetq
*4]
251 paddd m6
, [work_lineq
+offsetq
*4+mmsize
]
252 mova
[work_lineq
+offsetq
*4], m5
253 mova
[work_lineq
+offsetq
*4+mmsize
], m6
; Advance the input offset by the half vector consumed from each line.
254 add offsetq
, mmsize
/2
; Count down the remaining width by the same amount.
255 sub linesized
, mmsize
/2