;******************************************************************************
;* VP9 loop filter SIMD optimizations
;*
;* Copyright (C) 2015 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

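; Clipping constants: pw_511/pw_m512 and pw_2047/pw_m2048 are the signed
; filter-delta bounds for 10 and 12 bpp respectively; pw_16384 is the
; pmulhrsw multiplier used further below to compute (x + 1) >> 1.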
pw_511:   times 16 dw   511
pw_2047:  times 16 dw  2047
pw_16384: times 16 dw 16384
pw_m512:  times 16 dw  -512
pw_m2048: times 16 dw -2048

cextern pw_1
cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_16
cextern pw_256
cextern pw_1023
cextern pw_4095
cextern pw_m1

SECTION .text

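; back up xmm register %1: on x86-64 by swapping it into the spare register %2,
; on x86-32 by spilling it to the stack slot %3; the optional 4th argument
; defines reg_<name> so later code can refer to the saved value either way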
%macro SCRATCH 3-4
%if ARCH_X86_64
    SWAP        %1, %2
%if %0 == 4
%define reg_%4 m%2
%endif
%else
    mova        [%3], m%1
%if %0 == 4
%define reg_%4 [%3]
%endif
%endif
%endmacro

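; restore a register saved by SCRATCH (swap back on x86-64, reload from the
; stack on x86-32) and, if a 4th argument is given, drop the reg_<name> alias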
%macro UNSCRATCH 3-4
%if ARCH_X86_64
    SWAP        %1, %2
%else
    mova        m%1, [%3]
%endif
%if %0 == 4
%undef reg_%4
%endif
%endmacro

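; on x86-64, load [%2] into xmm register %1 and optionally name it reg_%3;
; on x86-32, where no spare register is available, reg_%3 simply aliases the
; memory operand instead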
%macro PRELOAD 2-3
%if ARCH_X86_64
    mova        m%1, [%2]
%if %0 == 3
%define reg_%3 m%1
%endif
%elif %0 == 3
%define reg_%3 [%2]
%endif
%endmacro

; calculate p or q portion of flat8out
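; for the q half this is, per pixel, roughly:
;   !flat8out_q = (|q4-q0| > F) || (|q5-q0| > F) || (|q6-q0| > F) || (|q7-q0| > F)
; with F = 1 << (bpp - 8); the p half is obtained by invoking the macro again
; with p4-p7/p0 loaded instead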
%macro FLAT8OUT_HALF 0
    psubw       m4, m0              ; q4-q0
    psubw       m5, m0              ; q5-q0
    psubw       m6, m0              ; q6-q0
    psubw       m7, m0              ; q7-q0
    ABS2        m4, m5, m2, m3      ; abs(q4-q0) | abs(q5-q0)
    ABS2        m6, m7, m2, m3      ; abs(q6-q0) | abs(q7-q0)
    pcmpgtw     m4, reg_F           ; abs(q4-q0) > F
    pcmpgtw     m5, reg_F           ; abs(q5-q0) > F
    pcmpgtw     m6, reg_F           ; abs(q6-q0) > F
    pcmpgtw     m7, reg_F           ; abs(q7-q0) > F
    por         m5, m4
    por         m7, m6
    por         m7, m5              ; !flat8out, q portion
%endmacro

; calculate p or q portion of flat8in/hev/fm (excluding mb_edge condition)
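; on return (for the q half): m7 = hev portion (|q1-q0| > H), m2 = !fm portion
; (any of |q3-q2|, |q2-q1|, |q1-q0| > I) and, for wd > 4, m4 = !flat8in
; portion (any of |q3-q0|, |q2-q0|, |q1-q0| > F)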
%macro FLAT8IN_HALF 1
%if %1 > 4
    psubw       m4, m3, m0          ; q3-q0
    psubw       m5, m2, m0          ; q2-q0
    ABS2        m4, m5, m6, m7      ; abs(q3-q0) | abs(q2-q0)
    pcmpgtw     m4, reg_F           ; abs(q3-q0) > F
    pcmpgtw     m5, reg_F           ; abs(q2-q0) > F
%endif
    psubw       m3, m2              ; q3-q2
    psubw       m2, m1              ; q2-q1
    ABS2        m3, m2, m6, m7      ; abs(q3-q2) | abs(q2-q1)
    pcmpgtw     m3, reg_I           ; abs(q3-q2) > I
    pcmpgtw     m2, reg_I           ; abs(q2-q1) > I
%if %1 > 4
    por         m4, m5
%endif
    por         m2, m3
    psubw       m3, m1, m0          ; q1-q0
    ABS1        m3, m5              ; abs(q1-q0)
%if %1 > 4
    pcmpgtw     m6, m3, reg_F       ; abs(q1-q0) > F
%endif
    pcmpgtw     m7, m3, reg_H       ; abs(q1-q0) > H
    pcmpgtw     m3, reg_I           ; abs(q1-q0) > I
%if %1 > 4
    por         m4, m6
%endif
    por         m2, m3
%endmacro

; one step in filter_14/filter_6
;
; take sum $reg, downshift, apply mask and write into dst
;
; if sub2/add1-2 are present, add/sub as appropriate to prepare for the next
; step's sum $reg. This is omitted for the last row in each filter.
;
; if dont_store is set, don't write the result into memory, instead keep the
; values in register so we can write it out later
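;
; for example, in the wd=16 path below,
;   FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
; (with m3=p6, m2=p7, m6=p5) stores p6 += (((sum >> 4) - p6) & filter16_mask)
; and slides the running sum for the p5 step: sum += p5 + q1 - p6 - p7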
%macro FILTER_STEP 6-10 "", "", "", 0 ; tmp, reg, mask, shift, dst, \
                                      ; src/sub1, sub2, add1, add2, dont_store
    psrlw       %1, %2, %4
    psubw       %1, %6              ; abs->delta
%ifnidn %7, ""
    psubw       %2, %6
    psubw       %2, %7
    paddw       %2, %8
    paddw       %2, %9
%endif
    pand        %1, reg_%3          ; apply mask
%if %10 == 1
    paddw       %6, %1              ; delta->abs
%else
    paddw       %1, %6              ; delta->abs
    mova        [%5], %1
%endif
%endmacro

; FIXME avx2 versions for 16_16 and mix2_{4,8}{4,8}

%macro LOOP_FILTER 3 ; dir[h/v], wd[4/8/16], bpp[10/12]

%if ARCH_X86_64
%if %2 == 16
%assign %%num_xmm_regs 16
%elif %2 == 8
%assign %%num_xmm_regs 15
%else ; %2 == 4
%assign %%num_xmm_regs 14
%endif ; %2
%assign %%bak_mem 0
%else ; ARCH_X86_32
%assign %%num_xmm_regs 8
%if %2 == 16
%assign %%bak_mem 7
%elif %2 == 8
%assign %%bak_mem 6
%else ; %2 == 4
%assign %%bak_mem 5
%endif ; %2
%endif ; ARCH_X86_64/32

%if %2 == 16
%ifidn %1, v
%assign %%num_gpr_regs 6
%else ; %1 == h
%assign %%num_gpr_regs 5
%endif ; %1
%assign %%wd_mem 6
%else ; %2 == 8/4
%assign %%num_gpr_regs 5
%if ARCH_X86_32 && %2 == 8
%assign %%wd_mem 2
%else ; ARCH_X86_64 || %2 == 4
%assign %%wd_mem 0
%endif ; ARCH_X86_64/32 etc.
%endif ; %2

%ifidn %1, v
%assign %%tsp_mem 0
%elif %2 == 16 ; && %1 == h
%assign %%tsp_mem 16
%else ; %1 == h && %2 == 8/4
%assign %%tsp_mem 8
%endif ; %1/%2

%assign %%off %%wd_mem
%assign %%tspoff %%bak_mem+%%wd_mem
%assign %%stack_mem ((%%bak_mem+%%wd_mem+%%tsp_mem)*mmsize)

%if %3 == 10
%define %%maxsgn 511
%define %%minsgn m512
%define %%maxusgn 1023
%define %%maxf 4
%else ; %3 == 12
%define %%maxsgn 2047
%define %%minsgn m2048
%define %%maxusgn 4095
%define %%maxf 16
%endif ; %3

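; i.e. for 10 bpp, pixels lie in [0, 1023], filter deltas are clipped to
; [-512, 511] and the "flat" threshold of 1 is scaled to 1 << (10 - 8) = 4;
; the 12 bpp values are the same numbers scaled by another factor of 4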
cglobal vp9_loop_filter_%1_%2_%3, 5, %%num_gpr_regs, %%num_xmm_regs, %%stack_mem, dst, stride, E, I, H
    ; prepare E, I and H masks
    shl         Ed, %3-8
    shl         Id, %3-8
    shl         Hd, %3-8
%if cpuflag(ssse3)
    mova        m0, [pw_256]
%endif
    movd        m1, Ed
    movd        m2, Id
    movd        m3, Hd
%if cpuflag(ssse3)
    pshufb      m1, m0              ; E << (bit_depth - 8)
    pshufb      m2, m0              ; I << (bit_depth - 8)
    pshufb      m3, m0              ; H << (bit_depth - 8)
%else
    punpcklwd   m1, m1
    punpcklwd   m2, m2
    punpcklwd   m3, m3
    pshufd      m1, m1, q0000
    pshufd      m2, m2, q0000
    pshufd      m3, m3, q0000
%endif
    SCRATCH     1, 8, rsp+(%%off+0)*mmsize, E
    SCRATCH     2, 9, rsp+(%%off+1)*mmsize, I
    SCRATCH     3, 10, rsp+(%%off+2)*mmsize, H
%if %2 > 4
    PRELOAD     11, pw_ %+ %%maxf, F
%endif

    ; set up variables to load data
%ifidn %1, v
    DEFINE_ARGS dst8, stride, stride3, dst0, dst4, dst12
    lea         stride3q, [strideq*3]
    neg         strideq
%if %2 == 16
    lea         dst0q, [dst8q+strideq*8]
%else
    lea         dst4q, [dst8q+strideq*4]
%endif
    neg         strideq
%if %2 == 16
    lea         dst12q, [dst8q+strideq*4]
    lea         dst4q, [dst0q+strideq*4]
%endif

%if %2 == 16
%define %%p7 dst0q
%define %%p6 dst0q+strideq
%define %%p5 dst0q+strideq*2
%define %%p4 dst0q+stride3q
%endif
%define %%p3 dst4q
%define %%p2 dst4q+strideq
%define %%p1 dst4q+strideq*2
%define %%p0 dst4q+stride3q
%define %%q0 dst8q
%define %%q1 dst8q+strideq
%define %%q2 dst8q+strideq*2
%define %%q3 dst8q+stride3q
%if %2 == 16
%define %%q4 dst12q
%define %%q5 dst12q+strideq
%define %%q6 dst12q+strideq*2
%define %%q7 dst12q+stride3q
%endif
%else ; %1 == h
    DEFINE_ARGS dst0, stride, stride3, dst4
    lea         stride3q, [strideq*3]
    lea         dst4q, [dst0q+strideq*4]

%define %%p3 rsp+(%%tspoff+0)*mmsize
%define %%p2 rsp+(%%tspoff+1)*mmsize
%define %%p1 rsp+(%%tspoff+2)*mmsize
%define %%p0 rsp+(%%tspoff+3)*mmsize
%define %%q0 rsp+(%%tspoff+4)*mmsize
%define %%q1 rsp+(%%tspoff+5)*mmsize
%define %%q2 rsp+(%%tspoff+6)*mmsize
%define %%q3 rsp+(%%tspoff+7)*mmsize

%if %2 < 16
    movu        m0, [dst0q+strideq*0-8]
    movu        m1, [dst0q+strideq*1-8]
    movu        m2, [dst0q+strideq*2-8]
    movu        m3, [dst0q+stride3q -8]
    movu        m4, [dst4q+strideq*0-8]
    movu        m5, [dst4q+strideq*1-8]
    movu        m6, [dst4q+strideq*2-8]
    movu        m7, [dst4q+stride3q -8]

%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%%p0], [%%q0]
%endif

    mova        [%%p3], m0
    mova        [%%p2], m1
    mova        [%%p1], m2
    mova        [%%p0], m3
%if ARCH_X86_64
    mova        [%%q0], m4
%endif
    mova        [%%q1], m5
    mova        [%%q2], m6
    mova        [%%q3], m7

    ; FIXME investigate if we can _not_ load q0-3 below if h, and adjust register
    ; order here accordingly
%else ; %2 == 16

%define %%p7 rsp+(%%tspoff+ 8)*mmsize
%define %%p6 rsp+(%%tspoff+ 9)*mmsize
%define %%p5 rsp+(%%tspoff+10)*mmsize
%define %%p4 rsp+(%%tspoff+11)*mmsize
%define %%q4 rsp+(%%tspoff+12)*mmsize
%define %%q5 rsp+(%%tspoff+13)*mmsize
%define %%q6 rsp+(%%tspoff+14)*mmsize
%define %%q7 rsp+(%%tspoff+15)*mmsize

    mova        m0, [dst0q+strideq*0-16]
    mova        m1, [dst0q+strideq*1-16]
    mova        m2, [dst0q+strideq*2-16]
    mova        m3, [dst0q+stride3q -16]
    mova        m4, [dst4q+strideq*0-16]
    mova        m5, [dst4q+strideq*1-16]
%if ARCH_X86_64
    mova        m6, [dst4q+strideq*2-16]
%endif
    mova        m7, [dst4q+stride3q -16]

%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2-16], [%%p3], 1
%endif

    mova        [%%p7], m0
    mova        [%%p6], m1
    mova        [%%p5], m2
    mova        [%%p4], m3
%if ARCH_X86_64
    mova        [%%p3], m4
%endif
    mova        [%%p2], m5
    mova        [%%p1], m6
    mova        [%%p0], m7

    mova        m0, [dst0q+strideq*0]
    mova        m1, [dst0q+strideq*1]
    mova        m2, [dst0q+strideq*2]
    mova        m3, [dst0q+stride3q ]
    mova        m4, [dst4q+strideq*0]
    mova        m5, [dst4q+strideq*1]
%if ARCH_X86_64
    mova        m6, [dst4q+strideq*2]
%endif
    mova        m7, [dst4q+stride3q ]

%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 12
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [dst4q+strideq*2], [%%q4], 1
%endif

    mova        [%%q0], m0
    mova        [%%q1], m1
    mova        [%%q2], m2
    mova        [%%q3], m3
%if ARCH_X86_64
    mova        [%%q4], m4
%endif
    mova        [%%q5], m5
    mova        [%%q6], m6
    mova        [%%q7], m7

    ; FIXME investigate if we can _not_ load q0|q4-7 below if h, and adjust register
    ; order here accordingly
%endif ; %2
%endif ; %1

    ; load q0|q4-7 data
    mova        m0, [%%q0]
%if %2 == 16
    mova        m4, [%%q4]
    mova        m5, [%%q5]
    mova        m6, [%%q6]
    mova        m7, [%%q7]

    ; flat8out q portion
    FLAT8OUT_HALF
    SCRATCH     7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; load q1-3 data
    mova        m1, [%%q1]
    mova        m2, [%%q2]
    mova        m3, [%%q3]

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flatout[q]
    ; m12-14=free
    ; m0-3=q0-q3
    ; m4-7=free

    ; flat8in|fm|hev q portion
    FLAT8IN_HALF %2
    SCRATCH     7, 13, rsp+(%%off+4)*mmsize, HEV
%if %2 > 4
    SCRATCH     4, 14, rsp+(%%off+5)*mmsize, F8I
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; m2=!fm[q]
    ; m0,1=q0-q1
    ; m2-7=free
    ; m12=free

    ; load p0-1
    mova        m3, [%%p0]
    mova        m4, [%%p1]

    ; fm mb_edge portion
    psubw       m5, m3, m0          ; q0-p0
    psubw       m6, m4, m1          ; q1-p1
%if ARCH_X86_64
    ABS2        m5, m6, m7, m12     ; abs(q0-p0) | abs(q1-p1)
%else
    ABS1        m5, m7              ; abs(q0-p0)
    ABS1        m6, m7              ; abs(q1-p1)
%endif
    paddw       m5, m5
    psraw       m6, 1
    paddw       m6, m5              ; abs(q0-p0)*2+(abs(q1-p1)>>1)
    pcmpgtw     m6, reg_E
    por         m2, m6
    SCRATCH     2, 12, rsp+(%%off+3)*mmsize, FM

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out[q]
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m3-4=q0-1
    ; m0-2/5-7=free

    ; load p4-7 data
    SWAP        3, 0                ; p0
    SWAP        4, 1                ; p1
%if %2 == 16
    mova        m7, [%%p7]
    mova        m6, [%%p6]
    mova        m5, [%%p5]
    mova        m4, [%%p4]

    ; flat8out p portion
    FLAT8OUT_HALF
    por         m7, reg_F8O
    SCRATCH     7, 15, rsp+(%%off+6)*mmsize, F8O
%endif

    ; r6-8|pw_4[m8-11]=reg_E/I/H/F
    ; r9[m15]=!flat8out
    ; r10[m13]=hev[q]
    ; r11[m14]=!flat8in[q]
    ; r12[m12]=!fm[q]
    ; m0=p0
    ; m1-7=free

    ; load p2-3 data
    mova        m2, [%%p2]
    mova        m3, [%%p3]

    ; flat8in|fm|hev p portion
    FLAT8IN_HALF %2
    por         m7, reg_HEV
%if %2 > 4
    por         m4, reg_F8I
%endif
    por         m2, reg_FM
%if %2 > 4
    por         m4, m2              ; !flat8|!fm
%if %2 == 16
    por         m5, m4, reg_F8O     ; !flat16|!fm
    pandn       m2, m4              ; filter4_mask
    pandn       m4, m5              ; filter8_mask
    pxor        m5, [pw_m1]         ; filter16_mask
    SCRATCH     5, 15, rsp+(%%off+6)*mmsize, F16M
%else
    pandn       m2, m4              ; filter4_mask
    pxor        m4, [pw_m1]         ; filter8_mask
%endif
    SCRATCH     4, 14, rsp+(%%off+5)*mmsize, F8M
%else
    pxor        m2, [pw_m1]         ; filter4_mask
%endif
    SCRATCH     7, 13, rsp+(%%off+4)*mmsize, HEV
    SCRATCH     2, 12, rsp+(%%off+3)*mmsize, F4M

    ; r9[m15]=filter16_mask
    ; r10[m13]=hev
    ; r11[m14]=filter8_mask
    ; r12[m12]=filter4_mask
    ; m0,1=p0-p1
    ; m2-7=free
    ; m8-11=free

%if %2 > 4
%if %2 == 16
    ; filter_14
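    ; each output pixel is a 15-tap average under filter16_mask, e.g.
    ;   p6' = (p7*7 + p6*2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
    ; the sum is built once below and then slid along (two taps in, two taps
    ; out) by each FILTER_STEP instead of being recomputed per pixel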
    mova        m2, [%%p7]
    mova        m3, [%%p6]
    mova        m6, [%%p5]
    mova        m7, [%%p4]
    PRELOAD     8, %%p3, P3
    PRELOAD     9, %%p2, P2
%endif
    PRELOAD     10, %%q0, Q0
    PRELOAD     11, %%q1, Q1
%if %2 == 16
    psllw       m4, m2, 3
    paddw       m5, m3, m3
    paddw       m4, m6
    paddw       m5, m7
    paddw       m4, reg_P3
    paddw       m5, reg_P2
    paddw       m4, m1
    paddw       m5, m0
    paddw       m4, reg_Q0          ; q0+p1+p3+p5+p7*8
    psubw       m5, m2              ; p0+p2+p4+p6*2-p7
    paddw       m4, [pw_8]
    paddw       m5, m4              ; q0+p0+p1+p2+p3+p4+p5+p6*2+p7*7+8

    ; below, we use r0-5 for storing pre-filter pixels for subsequent subtraction
    ; at the end of the filter

    mova        [rsp+0*mmsize], m3
    FILTER_STEP m4, m5, F16M, 4, %%p6, m3, m2, m6, reg_Q1
%endif
    mova        m3, [%%q2]
%if %2 == 16
    mova        [rsp+1*mmsize], m6
    FILTER_STEP m4, m5, F16M, 4, %%p5, m6, m2, m7, m3
%endif
    mova        m6, [%%q3]
%if %2 == 16
    mova        [rsp+2*mmsize], m7
    FILTER_STEP m4, m5, F16M, 4, %%p4, m7, m2, reg_P3, m6
    mova        m7, [%%q4]
%if ARCH_X86_64
    mova        [rsp+3*mmsize], reg_P3
%else
    mova        m4, reg_P3
    mova        [rsp+3*mmsize], m4
%endif
    FILTER_STEP m4, m5, F16M, 4, %%p3, reg_P3, m2, reg_P2, m7
    PRELOAD     8, %%q5, Q5
%if ARCH_X86_64
    mova        [rsp+4*mmsize], reg_P2
%else
    mova        m4, reg_P2
    mova        [rsp+4*mmsize], m4
%endif
    FILTER_STEP m4, m5, F16M, 4, %%p2, reg_P2, m2, m1, reg_Q5
    PRELOAD     9, %%q6, Q6
    mova        [rsp+5*mmsize], m1
    FILTER_STEP m4, m5, F16M, 4, %%p1, m1, m2, m0, reg_Q6
    mova        m1, [%%q7]
    FILTER_STEP m4, m5, F16M, 4, %%p0, m0, m2, reg_Q0, m1, 1
    FILTER_STEP m4, m5, F16M, 4, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m1, ARCH_X86_64
    FILTER_STEP m4, m5, F16M, 4, %%q1, reg_Q1, [rsp+1*mmsize], m3, m1, ARCH_X86_64
    FILTER_STEP m4, m5, F16M, 4, %%q2, m3, [rsp+2*mmsize], m6, m1, 1
    FILTER_STEP m4, m5, F16M, 4, %%q3, m6, [rsp+3*mmsize], m7, m1
    FILTER_STEP m4, m5, F16M, 4, %%q4, m7, [rsp+4*mmsize], reg_Q5, m1
    FILTER_STEP m4, m5, F16M, 4, %%q5, reg_Q5, [rsp+5*mmsize], reg_Q6, m1
    FILTER_STEP m4, m5, F16M, 4, %%q6, reg_Q6

    mova        m7, [%%p1]
%else
    SWAP        1, 7
%endif

    mova        m2, [%%p3]
    mova        m1, [%%p2]

    ; reg_Q0-1 (m10-m11)
    ; m0=p0
    ; m1=p2
    ; m2=p3
    ; m3=q2
    ; m4-5=free
    ; m6=q3
    ; m7=p1
    ; m8-9 unused

    ; filter_6
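    ; same idea with a 7-tap average and shift 3 under filter8_mask, e.g.
    ;   p2' = (p3*3 + p2*2 + p1 + p0 + q0 + 4) >> 3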
    psllw       m4, m2, 2
    paddw       m5, m1, m1
    paddw       m4, m7
    psubw       m5, m2
    paddw       m4, m0
    paddw       m5, reg_Q0
    paddw       m4, [pw_4]
    paddw       m5, m4

%if ARCH_X86_64
    mova        m8, m1
    mova        m9, m7
%else
    mova        [rsp+0*mmsize], m1
    mova        [rsp+1*mmsize], m7
%endif
%ifidn %1, v
    FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1
%else
    FILTER_STEP m4, m5, F8M, 3, %%p2, m1, m2, m7, reg_Q1, 1
%endif
    FILTER_STEP m4, m5, F8M, 3, %%p1, m7, m2, m0, m3, 1
    FILTER_STEP m4, m5, F8M, 3, %%p0, m0, m2, reg_Q0, m6, 1
%if ARCH_X86_64
    FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, m8, reg_Q1, m6, ARCH_X86_64
    FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, m9, m3, m6, ARCH_X86_64
%else
    FILTER_STEP m4, m5, F8M, 3, %%q0, reg_Q0, [rsp+0*mmsize], reg_Q1, m6, ARCH_X86_64
    FILTER_STEP m4, m5, F8M, 3, %%q1, reg_Q1, [rsp+1*mmsize], m3, m6, ARCH_X86_64
%endif
    FILTER_STEP m4, m5, F8M, 3, %%q2, m3

    UNSCRATCH   2, 10, %%q0
    UNSCRATCH   6, 11, %%q1
%else
    SWAP        1, 7
    mova        m2, [%%q0]
    mova        m6, [%%q1]
%endif
    UNSCRATCH   3, 13, rsp+(%%off+4)*mmsize, HEV

    ; m0=p0
    ; m1=p2
    ; m2=q0
    ; m3=hev_mask
    ; m4-5=free
    ; m6=q1
    ; m7=p1

    ; filter_4
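    ; roughly, per pixel (clips are to the signed pw_%%minsgn/pw_%%maxsgn range):
    ;   f  = clip(p1 - q1) & hev_mask
    ;   f  = clip(3 * (q0 - p0) + f) & filter4_mask
    ;   f1 = min(f + 4, maxsgn) >> 3,  f2 = min(f + 3, maxsgn) >> 3
    ;   q0 -= f1,  p0 += f2
    ;   where hev is not set, p1 += (f1 + 1) >> 1 and q1 -= (f1 + 1) >> 1
    ; with all outputs clamped to the [0, maxusgn] pixel range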
    psubw       m4, m7, m6          ; p1-q1
    psubw       m5, m2, m0          ; q0-p0
    pand        m4, m3
    pminsw      m4, [pw_ %+ %%maxsgn]
    pmaxsw      m4, [pw_ %+ %%minsgn] ; clip_intp2(p1-q1, 9) -> f
    paddw       m4, m5
    paddw       m5, m5
    paddw       m4, m5              ; 3*(q0-p0)+f
    pminsw      m4, [pw_ %+ %%maxsgn]
    pmaxsw      m4, [pw_ %+ %%minsgn] ; clip_intp2(3*(q0-p0)+f, 9) -> f
    pand        m4, reg_F4M
    paddw       m5, m4, [pw_4]
    paddw       m4, [pw_3]
    pminsw      m5, [pw_ %+ %%maxsgn]
    pminsw      m4, [pw_ %+ %%maxsgn]
    psraw       m5, 3               ; min_intp2(f+4, 9)>>3 -> f1
    psraw       m4, 3               ; min_intp2(f+3, 9)>>3 -> f2
    psubw       m2, m5              ; q0-f1
    paddw       m0, m4              ; p0+f2
    pandn       m3, m5              ; f1 & !hev (for p1/q1 adj)
    pxor        m4, m4
    mova        m5, [pw_ %+ %%maxusgn]
    pmaxsw      m2, m4
    pmaxsw      m0, m4
    pminsw      m2, m5
    pminsw      m0, m5
%if cpuflag(ssse3)
    pmulhrsw    m3, [pw_16384]      ; (f1+1)>>1
%else
    paddw       m3, [pw_1]
    psraw       m3, 1
%endif
    paddw       m7, m3              ; p1+f
    psubw       m6, m3              ; q1-f
    pmaxsw      m7, m4
    pmaxsw      m6, m4
    pminsw      m7, m5
    pminsw      m6, m5

    ; store
%ifidn %1, v
    mova        [%%p1], m7
    mova        [%%p0], m0
    mova        [%%q0], m2
    mova        [%%q1], m6
%else ; %1 == h
%if %2 == 4
    TRANSPOSE4x4W 7, 0, 2, 6, 1
    movh        [dst0q+strideq*0-4], m7
    movhps      [dst0q+strideq*1-4], m7
    movh        [dst0q+strideq*2-4], m0
    movhps      [dst0q+stride3q -4], m0
    movh        [dst4q+strideq*0-4], m2
    movhps      [dst4q+strideq*1-4], m2
    movh        [dst4q+strideq*2-4], m6
    movhps      [dst4q+stride3q -4], m6
%elif %2 == 8
    mova        m3, [%%p3]
    mova        m4, [%%q2]
    mova        m5, [%%q3]

%if ARCH_X86_64
    TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, 8
%else
    TRANSPOSE8x8W 3, 1, 7, 0, 2, 6, 4, 5, [%%q2], [%%q0], 1
    mova        m2, [%%q0]
%endif

    movu        [dst0q+strideq*0-8], m3
    movu        [dst0q+strideq*1-8], m1
    movu        [dst0q+strideq*2-8], m7
    movu        [dst0q+stride3q -8], m0
    movu        [dst4q+strideq*0-8], m2
    movu        [dst4q+strideq*1-8], m6
    movu        [dst4q+strideq*2-8], m4
    movu        [dst4q+stride3q -8], m5
%else ; %2 == 16
    SCRATCH     2, 8, %%q0
    SCRATCH     6, 9, %%q1
    mova        m2, [%%p7]
    mova        m3, [%%p6]
    mova        m4, [%%p5]
    mova        m5, [%%p4]
    mova        m6, [%%p3]

%if ARCH_X86_64
    TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, 10
%else
    mova        [%%p1], m7
    TRANSPOSE8x8W 2, 3, 4, 5, 6, 1, 7, 0, [%%p1], [dst4q+strideq*0-16], 1
%endif

    mova        [dst0q+strideq*0-16], m2
    mova        [dst0q+strideq*1-16], m3
    mova        [dst0q+strideq*2-16], m4
    mova        [dst0q+stride3q -16], m5
%if ARCH_X86_64
    mova        [dst4q+strideq*0-16], m6
%endif
    mova        [dst4q+strideq*1-16], m1
    mova        [dst4q+strideq*2-16], m7
    mova        [dst4q+stride3q -16], m0

    UNSCRATCH   2, 8, %%q0
    UNSCRATCH   6, 9, %%q1
    mova        m0, [%%q2]
    mova        m1, [%%q3]
    mova        m3, [%%q4]
    mova        m4, [%%q5]
%if ARCH_X86_64
    mova        m5, [%%q6]
%endif
    mova        m7, [%%q7]

%if ARCH_X86_64
    TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, 8
%else
    TRANSPOSE8x8W 2, 6, 0, 1, 3, 4, 5, 7, [%%q6], [dst4q+strideq*0], 1
%endif

    mova        [dst0q+strideq*0], m2
    mova        [dst0q+strideq*1], m6
    mova        [dst0q+strideq*2], m0
    mova        [dst0q+stride3q ], m1
%if ARCH_X86_64
    mova        [dst4q+strideq*0], m3
%endif
    mova        [dst4q+strideq*1], m4
    mova        [dst4q+strideq*2], m5
    mova        [dst4q+stride3q ], m7
%endif ; %2
%endif ; %1

    RET
%endmacro

%macro LOOP_FILTER_CPUSETS 3
INIT_XMM sse2
LOOP_FILTER %1, %2, %3
INIT_XMM ssse3
LOOP_FILTER %1, %2, %3
INIT_XMM avx
LOOP_FILTER %1, %2, %3
%endmacro

%macro LOOP_FILTER_WDSETS 2
LOOP_FILTER_CPUSETS %1, 4, %2
LOOP_FILTER_CPUSETS %1, 8, %2
LOOP_FILTER_CPUSETS %1, 16, %2
%endmacro

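; the instantiations below yield 2 (h/v) x 3 (wd 4/8/16) x 3 (sse2/ssse3/avx)
; entry points per bit depth, named through cglobal's usual mangling, e.g.
; ff_vp9_loop_filter_h_16_10_ssse3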
LOOP_FILTER_WDSETS h, 10
LOOP_FILTER_WDSETS v, 10
LOOP_FILTER_WDSETS h, 12
LOOP_FILTER_WDSETS v, 12