1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
5 ;*
6 ;* This file is part of Libav.
7 ;*
8 ;* Libav is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
12 ;*
13 ;* Libav is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
17 ;*
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with Libav; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
25 SECTION_RODATA
27 fourtap_filter_hw_m: times 4 dw -6, 123
28 times 4 dw 12, -1
29 times 4 dw -9, 93
30 times 4 dw 50, -6
31 times 4 dw -6, 50
32 times 4 dw 93, -9
33 times 4 dw -1, 12
34 times 4 dw 123, -6
36 sixtap_filter_hw_m: times 4 dw 2, -11
37 times 4 dw 108, 36
38 times 4 dw -8, 1
39 times 4 dw 3, -16
40 times 4 dw 77, 77
41 times 4 dw -16, 3
42 times 4 dw 1, -8
43 times 4 dw 36, 108
44 times 4 dw -11, 2
46 fourtap_filter_hb_m: times 8 db -6, 123
47 times 8 db 12, -1
48 times 8 db -9, 93
49 times 8 db 50, -6
50 times 8 db -6, 50
51 times 8 db 93, -9
52 times 8 db -1, 12
53 times 8 db 123, -6
55 sixtap_filter_hb_m: times 8 db 2, 1
56 times 8 db -11, 108
57 times 8 db 36, -8
58 times 8 db 3, 3
59 times 8 db -16, 77
60 times 8 db 77, -16
61 times 8 db 1, 2
62 times 8 db -8, 36
63 times 8 db 108, -11
65 fourtap_filter_v_m: times 8 dw -6
66 times 8 dw 123
67 times 8 dw 12
68 times 8 dw -1
69 times 8 dw -9
70 times 8 dw 93
71 times 8 dw 50
72 times 8 dw -6
73 times 8 dw -6
74 times 8 dw 50
75 times 8 dw 93
76 times 8 dw -9
77 times 8 dw -1
78 times 8 dw 12
79 times 8 dw 123
80 times 8 dw -6
82 sixtap_filter_v_m: times 8 dw 2
83 times 8 dw -11
84 times 8 dw 108
85 times 8 dw 36
86 times 8 dw -8
87 times 8 dw 1
88 times 8 dw 3
89 times 8 dw -16
90 times 8 dw 77
91 times 8 dw 77
92 times 8 dw -16
93 times 8 dw 3
94 times 8 dw 1
95 times 8 dw -8
96 times 8 dw 36
97 times 8 dw 108
98 times 8 dw -11
99 times 8 dw 2
101 bilinear_filter_vw_m: times 8 dw 1
102 times 8 dw 2
103 times 8 dw 3
104 times 8 dw 4
105 times 8 dw 5
106 times 8 dw 6
107 times 8 dw 7
109 bilinear_filter_vb_m: times 8 db 7, 1
110 times 8 db 6, 2
111 times 8 db 5, 3
112 times 8 db 4, 4
113 times 8 db 3, 5
114 times 8 db 2, 6
115 times 8 db 1, 7
117 %ifdef PIC
118 %define fourtap_filter_hw picregq
119 %define sixtap_filter_hw picregq
120 %define fourtap_filter_hb picregq
121 %define sixtap_filter_hb picregq
122 %define fourtap_filter_v picregq
123 %define sixtap_filter_v picregq
124 %define bilinear_filter_vw picregq
125 %define bilinear_filter_vb picregq
126 %define npicregs 1
127 %else
128 %define fourtap_filter_hw fourtap_filter_hw_m
129 %define sixtap_filter_hw sixtap_filter_hw_m
130 %define fourtap_filter_hb fourtap_filter_hb_m
131 %define sixtap_filter_hb sixtap_filter_hb_m
132 %define fourtap_filter_v fourtap_filter_v_m
133 %define sixtap_filter_v sixtap_filter_v_m
134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m
136 %define npicregs 0
137 %endif
139 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
140 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
142 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
143 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
144 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
146 pw_256: times 8 dw 256
148 pw_20091: times 4 dw 20091
149 pw_17734: times 4 dw 17734
151 pb_27_63: times 8 db 27, 63
152 pb_18_63: times 8 db 18, 63
153 pb_9_63: times 8 db 9, 63
155 cextern pb_1
156 cextern pw_3
157 cextern pb_3
158 cextern pw_4
159 cextern pb_4
160 cextern pw_9
161 cextern pw_18
162 cextern pw_27
163 cextern pw_63
164 cextern pw_64
165 cextern pb_80
166 cextern pb_F8
167 cextern pb_FE
169 SECTION .text
171 ;-----------------------------------------------------------------------------
172 ; subpel MC functions:
174 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
175 ; uint8_t *src, int srcstride,
176 ; int height, int mx, int my);
177 ;-----------------------------------------------------------------------------
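; Rough C-style reference of what the epel filters below compute (an
; illustrative sketch, not part of the build): for each output pixel
;     dst[x] = av_clip_uint8((F[0]*src[x-2] + F[1]*src[x-1] + F[2]*src[x+0] +
;                             F[3]*src[x+1] + F[4]*src[x+2] + F[5]*src[x+3] +
;                             64) >> 7)
; with F taken from the 6-tap (or zero-padded 4-tap) tables above, and the
; same weights applied along columns in the _v variants. The MMXEXT/SSE2
; paths add pw_64 and shift right by 7; the SSSE3 paths get the identical
; rounding via pmulhrsw with pw_256.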
179 %macro FILTER_SSSE3 1
180 cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
181 lea mxd, [mxq*3]
182 mova m3, [filter_h6_shuf2]
183 mova m4, [filter_h6_shuf3]
184 %ifdef PIC
185 lea picregq, [sixtap_filter_hb_m]
186 %endif
187 mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
188 mova m6, [sixtap_filter_hb+mxq*8-32]
189 mova m7, [sixtap_filter_hb+mxq*8-16]
191 .nextrow:
192 movu m0, [srcq-2]
193 mova m1, m0
194 mova m2, m0
195 %if mmsize == 8
196 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
197 ; shuffle with a memory operand
198 punpcklbw m0, [srcq+3]
199 %else
200 pshufb m0, [filter_h6_shuf1]
201 %endif
202 pshufb m1, m3
203 pshufb m2, m4
204 pmaddubsw m0, m5
205 pmaddubsw m1, m6
206 pmaddubsw m2, m7
207 paddsw m0, m1
208 paddsw m0, m2
209 pmulhrsw m0, [pw_256]
210 packuswb m0, m0
211 movh [dstq], m0 ; store
213 ; go to next line
214 add dstq, dststrideq
215 add srcq, srcstrideq
216 dec heightd ; next row
217 jg .nextrow
218 REP_RET
220 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
221 shl mxd, 4
222 mova m2, [pw_256]
223 mova m3, [filter_h2_shuf]
224 mova m4, [filter_h4_shuf]
225 %ifdef PIC
226 lea picregq, [fourtap_filter_hb_m]
227 %endif
228 mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
229 mova m6, [fourtap_filter_hb+mxq]
231 .nextrow:
232 movu m0, [srcq-1]
233 mova m1, m0
234 pshufb m0, m3
235 pshufb m1, m4
236 pmaddubsw m0, m5
237 pmaddubsw m1, m6
238 paddsw m0, m1
239 pmulhrsw m0, m2
240 packuswb m0, m0
241 movh [dstq], m0 ; store
243 ; go to next line
244 add dstq, dststrideq
245 add srcq, srcstrideq
246 dec heightd ; next row
247 jg .nextrow
248 REP_RET
250 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
251 shl myd, 4
252 %ifdef PIC
253 lea picregq, [fourtap_filter_hb_m]
254 %endif
255 mova m5, [fourtap_filter_hb+myq-16]
256 mova m6, [fourtap_filter_hb+myq]
257 mova m7, [pw_256]
259 ; read 3 lines
260 sub srcq, srcstrideq
261 movh m0, [srcq]
262 movh m1, [srcq+ srcstrideq]
263 movh m2, [srcq+2*srcstrideq]
264 add srcq, srcstrideq
266 .nextrow:
267 movh m3, [srcq+2*srcstrideq] ; read new row
268 mova m4, m0
269 mova m0, m1
270 punpcklbw m4, m1
271 mova m1, m2
272 punpcklbw m2, m3
273 pmaddubsw m4, m5
274 pmaddubsw m2, m6
275 paddsw m4, m2
276 mova m2, m3
277 pmulhrsw m4, m7
278 packuswb m4, m4
279 movh [dstq], m4
281 ; go to next line
282 add dstq, dststrideq
283 add srcq, srcstrideq
284 dec heightd ; next row
285 jg .nextrow
286 REP_RET
288 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
289 lea myd, [myq*3]
290 %ifdef PIC
291 lea picregq, [sixtap_filter_hb_m]
292 %endif
293 lea myq, [sixtap_filter_hb+myq*8]
295 ; read 5 lines
296 sub srcq, srcstrideq
297 sub srcq, srcstrideq
298 movh m0, [srcq]
299 movh m1, [srcq+srcstrideq]
300 movh m2, [srcq+srcstrideq*2]
301 lea srcq, [srcq+srcstrideq*2]
302 add srcq, srcstrideq
303 movh m3, [srcq]
304 movh m4, [srcq+srcstrideq]
306 .nextrow:
307 movh m5, [srcq+2*srcstrideq] ; read new row
308 mova m6, m0
309 punpcklbw m6, m5
310 mova m0, m1
311 punpcklbw m1, m2
312 mova m7, m3
313 punpcklbw m7, m4
314 pmaddubsw m6, [myq-48]
315 pmaddubsw m1, [myq-32]
316 pmaddubsw m7, [myq-16]
317 paddsw m6, m1
318 paddsw m6, m7
319 mova m1, m2
320 mova m2, m3
321 pmulhrsw m6, [pw_256]
322 mova m3, m4
323 packuswb m6, m6
324 mova m4, m5
325 movh [dstq], m6
327 ; go to next line
328 add dstq, dststrideq
329 add srcq, srcstrideq
330 dec heightd ; next row
331 jg .nextrow
332 REP_RET
333 %endmacro
335 INIT_MMX ssse3
336 FILTER_SSSE3 4
337 INIT_XMM ssse3
338 FILTER_SSSE3 8
340 ; 4x4 block, H-only 4-tap filter
341 INIT_MMX mmxext
342 cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
343 shl mxd, 4
344 %ifdef PIC
345 lea picregq, [fourtap_filter_hw_m]
346 %endif
347 movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
348 movq mm5, [fourtap_filter_hw+mxq]
349 movq mm7, [pw_64]
350 pxor mm6, mm6
352 .nextrow:
353 movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
355 ; first set of 2 pixels
356 movq mm2, mm1 ; byte ABCD..
357 punpcklbw mm1, mm6 ; byte->word ABCD
358 pshufw mm0, mm2, 9 ; byte CDEF..
359 punpcklbw mm0, mm6 ; byte->word CDEF
360 pshufw mm3, mm1, 0x94 ; word ABBC
361 pshufw mm1, mm0, 0x94 ; word CDDE
362 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
363 movq mm0, mm1 ; backup for second set of pixels
364 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
365 paddd mm3, mm1 ; finish 1st 2px
367 ; second set of 2 pixels, use backup of above
368 punpckhbw mm2, mm6 ; byte->word EFGH
369 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
370 pshufw mm1, mm2, 0x94 ; word EFFG
371 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
372 paddd mm0, mm1 ; finish 2nd 2px
374 ; merge two sets of 2 pixels into one set of 4, round/clip/store
375 packssdw mm3, mm0 ; merge dword->word (4px)
376 paddsw mm3, mm7 ; rounding
377 psraw mm3, 7
378 packuswb mm3, mm6 ; clip and word->bytes
379 movd [dstq], mm3 ; store
381 ; go to next line
382 add dstq, dststrideq
383 add srcq, srcstrideq
384 dec heightd ; next row
385 jg .nextrow
386 REP_RET
388 ; 4x4 block, H-only 6-tap filter
389 INIT_MMX mmxext
390 cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
391 lea mxd, [mxq*3]
392 %ifdef PIC
393 lea picregq, [sixtap_filter_hw_m]
394 %endif
395 movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
396 movq mm5, [sixtap_filter_hw+mxq*8-32]
397 movq mm6, [sixtap_filter_hw+mxq*8-16]
398 movq mm7, [pw_64]
399 pxor mm3, mm3
401 .nextrow:
402 movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
404 ; first set of 2 pixels
405 movq mm2, mm1 ; byte ABCD..
406 punpcklbw mm1, mm3 ; byte->word ABCD
407 pshufw mm0, mm2, 0x9 ; byte CDEF..
408 punpckhbw mm2, mm3 ; byte->word EFGH
409 punpcklbw mm0, mm3 ; byte->word CDEF
410 pshufw mm1, mm1, 0x94 ; word ABBC
411 pshufw mm2, mm2, 0x94 ; word EFFG
412 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
413 pshufw mm3, mm0, 0x94 ; word CDDE
414 movq mm0, mm3 ; backup for second set of pixels
415 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
416 paddd mm1, mm3 ; add to 1st 2px cache
417 movq mm3, mm2 ; backup for second set of pixels
418 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
419 paddd mm1, mm2 ; finish 1st 2px
421 ; second set of 2 pixels, use backup of above
422 movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
423 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
424 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
425 paddd mm0, mm3 ; add to 2nd 2px cache
426 pxor mm3, mm3
427 punpcklbw mm2, mm3 ; byte->word FGHI
428 pshufw mm2, mm2, 0xE9 ; word GHHI
429 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
430 paddd mm0, mm2 ; finish 2nd 2px
432 ; merge two sets of 2 pixels into one set of 4, round/clip/store
433 packssdw mm1, mm0 ; merge dword->word (4px)
434 paddsw mm1, mm7 ; rounding
435 psraw mm1, 7
436 packuswb mm1, mm3 ; clip and word->bytes
437 movd [dstq], mm1 ; store
439 ; go to next line
440 add dstq, dststrideq
441 add srcq, srcstrideq
442 dec heightd ; next row
443 jg .nextrow
444 REP_RET
446 INIT_XMM sse2
447 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
448 shl mxd, 5
449 %ifdef PIC
450 lea picregq, [fourtap_filter_v_m]
451 %endif
452 lea mxq, [fourtap_filter_v+mxq-32]
453 pxor m7, m7
454 mova m4, [pw_64]
455 mova m5, [mxq+ 0]
456 mova m6, [mxq+16]
457 %ifdef m8
458 mova m8, [mxq+32]
459 mova m9, [mxq+48]
460 %endif
461 .nextrow:
462 movq m0, [srcq-1]
463 movq m1, [srcq-0]
464 movq m2, [srcq+1]
465 movq m3, [srcq+2]
466 punpcklbw m0, m7
467 punpcklbw m1, m7
468 punpcklbw m2, m7
469 punpcklbw m3, m7
470 pmullw m0, m5
471 pmullw m1, m6
472 %ifdef m8
473 pmullw m2, m8
474 pmullw m3, m9
475 %else
476 pmullw m2, [mxq+32]
477 pmullw m3, [mxq+48]
478 %endif
479 paddsw m0, m1
480 paddsw m2, m3
481 paddsw m0, m2
482 paddsw m0, m4
483 psraw m0, 7
484 packuswb m0, m7
485 movh [dstq], m0 ; store
487 ; go to next line
488 add dstq, dststrideq
489 add srcq, srcstrideq
490 dec heightd ; next row
491 jg .nextrow
492 REP_RET
494 INIT_XMM sse2
495 cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
496 lea mxd, [mxq*3]
497 shl mxd, 4
498 %ifdef PIC
499 lea picregq, [sixtap_filter_v_m]
500 %endif
501 lea mxq, [sixtap_filter_v+mxq-96]
502 pxor m7, m7
503 mova m6, [pw_64]
504 %ifdef m8
505 mova m8, [mxq+ 0]
506 mova m9, [mxq+16]
507 mova m10, [mxq+32]
508 mova m11, [mxq+48]
509 mova m12, [mxq+64]
510 mova m13, [mxq+80]
511 %endif
512 .nextrow:
513 movq m0, [srcq-2]
514 movq m1, [srcq-1]
515 movq m2, [srcq-0]
516 movq m3, [srcq+1]
517 movq m4, [srcq+2]
518 movq m5, [srcq+3]
519 punpcklbw m0, m7
520 punpcklbw m1, m7
521 punpcklbw m2, m7
522 punpcklbw m3, m7
523 punpcklbw m4, m7
524 punpcklbw m5, m7
525 %ifdef m8
526 pmullw m0, m8
527 pmullw m1, m9
528 pmullw m2, m10
529 pmullw m3, m11
530 pmullw m4, m12
531 pmullw m5, m13
532 %else
533 pmullw m0, [mxq+ 0]
534 pmullw m1, [mxq+16]
535 pmullw m2, [mxq+32]
536 pmullw m3, [mxq+48]
537 pmullw m4, [mxq+64]
538 pmullw m5, [mxq+80]
539 %endif
540 paddsw m1, m4
541 paddsw m0, m5
542 paddsw m1, m2
543 paddsw m0, m3
544 paddsw m0, m1
545 paddsw m0, m6
546 psraw m0, 7
547 packuswb m0, m7
548 movh [dstq], m0 ; store
550 ; go to next line
551 add dstq, dststrideq
552 add srcq, srcstrideq
553 dec heightd ; next row
554 jg .nextrow
555 REP_RET
557 %macro FILTER_V 1
558 ; 4x4 block, V-only 4-tap filter
559 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
560 shl myd, 5
561 %ifdef PIC
562 lea picregq, [fourtap_filter_v_m]
563 %endif
564 lea myq, [fourtap_filter_v+myq-32]
565 mova m6, [pw_64]
566 pxor m7, m7
567 mova m5, [myq+48]
569 ; read 3 lines
570 sub srcq, srcstrideq
571 movh m0, [srcq]
572 movh m1, [srcq+ srcstrideq]
573 movh m2, [srcq+2*srcstrideq]
574 add srcq, srcstrideq
575 punpcklbw m0, m7
576 punpcklbw m1, m7
577 punpcklbw m2, m7
579 .nextrow:
580 ; first calculate negative taps (to prevent losing positive overflows)
581 movh m4, [srcq+2*srcstrideq] ; read new row
582 punpcklbw m4, m7
583 mova m3, m4
584 pmullw m0, [myq+0]
585 pmullw m4, m5
586 paddsw m4, m0
588 ; then calculate positive taps
589 mova m0, m1
590 pmullw m1, [myq+16]
591 paddsw m4, m1
592 mova m1, m2
593 pmullw m2, [myq+32]
594 paddsw m4, m2
595 mova m2, m3
597 ; round/clip/store
598 paddsw m4, m6
599 psraw m4, 7
600 packuswb m4, m7
601 movh [dstq], m4
603 ; go to next line
604 add dstq, dststrideq
605 add srcq, srcstrideq
606 dec heightd ; next row
607 jg .nextrow
608 REP_RET
611 ; 4x4 block, V-only 6-tap filter
612 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
613 shl myd, 4
614 lea myq, [myq*3]
615 %ifdef PIC
616 lea picregq, [sixtap_filter_v_m]
617 %endif
618 lea myq, [sixtap_filter_v+myq-96]
619 pxor m7, m7
621 ; read 5 lines
622 sub srcq, srcstrideq
623 sub srcq, srcstrideq
624 movh m0, [srcq]
625 movh m1, [srcq+srcstrideq]
626 movh m2, [srcq+srcstrideq*2]
627 lea srcq, [srcq+srcstrideq*2]
628 add srcq, srcstrideq
629 movh m3, [srcq]
630 movh m4, [srcq+srcstrideq]
631 punpcklbw m0, m7
632 punpcklbw m1, m7
633 punpcklbw m2, m7
634 punpcklbw m3, m7
635 punpcklbw m4, m7
637 .nextrow:
638 ; first calculate negative taps (to prevent losing positive overflows)
639 mova m5, m1
640 pmullw m5, [myq+16]
641 mova m6, m4
642 pmullw m6, [myq+64]
643 paddsw m6, m5
645 ; then calculate positive taps
646 movh m5, [srcq+2*srcstrideq] ; read new row
647 punpcklbw m5, m7
648 pmullw m0, [myq+0]
649 paddsw m6, m0
650 mova m0, m1
651 mova m1, m2
652 pmullw m2, [myq+32]
653 paddsw m6, m2
654 mova m2, m3
655 pmullw m3, [myq+48]
656 paddsw m6, m3
657 mova m3, m4
658 mova m4, m5
659 pmullw m5, [myq+80]
660 paddsw m6, m5
662 ; round/clip/store
663 paddsw m6, [pw_64]
664 psraw m6, 7
665 packuswb m6, m7
666 movh [dstq], m6
668 ; go to next line
669 add dstq, dststrideq
670 add srcq, srcstrideq
671 dec heightd ; next row
672 jg .nextrow
673 REP_RET
674 %endmacro
676 INIT_MMX mmxext
677 FILTER_V 4
678 INIT_XMM sse2
679 FILTER_V 8
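; The bilinear MC below uses 3-bit fractional positions (mx/my in 0..7).
; Roughly, in C terms (sketch only):
;     dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
; (same along columns for the _v variants). The code implements the rounding
; as a shift right by 2 followed by pavgw against zero, i.e. ((v>>2)+1)>>1,
; which matches (v+4)>>3 for the non-negative sums that occur here.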
681 %macro FILTER_BILINEAR 1
682 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
683 shl myd, 4
684 %ifdef PIC
685 lea picregq, [bilinear_filter_vw_m]
686 %endif
687 pxor m6, m6
688 mova m5, [bilinear_filter_vw+myq-1*16]
689 neg myq
690 mova m4, [bilinear_filter_vw+myq+7*16]
691 .nextrow:
692 movh m0, [srcq+srcstrideq*0]
693 movh m1, [srcq+srcstrideq*1]
694 movh m3, [srcq+srcstrideq*2]
695 punpcklbw m0, m6
696 punpcklbw m1, m6
697 punpcklbw m3, m6
698 mova m2, m1
699 pmullw m0, m4
700 pmullw m1, m5
701 pmullw m2, m4
702 pmullw m3, m5
703 paddsw m0, m1
704 paddsw m2, m3
705 psraw m0, 2
706 psraw m2, 2
707 pavgw m0, m6
708 pavgw m2, m6
709 %if mmsize == 8
710 packuswb m0, m0
711 packuswb m2, m2
712 movh [dstq+dststrideq*0], m0
713 movh [dstq+dststrideq*1], m2
714 %else
715 packuswb m0, m2
716 movh [dstq+dststrideq*0], m0
717 movhps [dstq+dststrideq*1], m0
718 %endif
720 lea dstq, [dstq+dststrideq*2]
721 lea srcq, [srcq+srcstrideq*2]
722 sub heightd, 2
723 jg .nextrow
724 REP_RET
726 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
727 shl mxd, 4
728 %ifdef PIC
729 lea picregq, [bilinear_filter_vw_m]
730 %endif
731 pxor m6, m6
732 mova m5, [bilinear_filter_vw+mxq-1*16]
733 neg mxq
734 mova m4, [bilinear_filter_vw+mxq+7*16]
735 .nextrow:
736 movh m0, [srcq+srcstrideq*0+0]
737 movh m1, [srcq+srcstrideq*0+1]
738 movh m2, [srcq+srcstrideq*1+0]
739 movh m3, [srcq+srcstrideq*1+1]
740 punpcklbw m0, m6
741 punpcklbw m1, m6
742 punpcklbw m2, m6
743 punpcklbw m3, m6
744 pmullw m0, m4
745 pmullw m1, m5
746 pmullw m2, m4
747 pmullw m3, m5
748 paddsw m0, m1
749 paddsw m2, m3
750 psraw m0, 2
751 psraw m2, 2
752 pavgw m0, m6
753 pavgw m2, m6
754 %if mmsize == 8
755 packuswb m0, m0
756 packuswb m2, m2
757 movh [dstq+dststrideq*0], m0
758 movh [dstq+dststrideq*1], m2
759 %else
760 packuswb m0, m2
761 movh [dstq+dststrideq*0], m0
762 movhps [dstq+dststrideq*1], m0
763 %endif
765 lea dstq, [dstq+dststrideq*2]
766 lea srcq, [srcq+srcstrideq*2]
767 sub heightd, 2
768 jg .nextrow
769 REP_RET
770 %endmacro
772 INIT_MMX mmxext
773 FILTER_BILINEAR 4
774 INIT_XMM sse2
775 FILTER_BILINEAR 8
777 %macro FILTER_BILINEAR_SSSE3 1
778 cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
779 shl myd, 4
780 %ifdef PIC
781 lea picregq, [bilinear_filter_vb_m]
782 %endif
783 pxor m4, m4
784 mova m3, [bilinear_filter_vb+myq-16]
785 .nextrow:
786 movh m0, [srcq+srcstrideq*0]
787 movh m1, [srcq+srcstrideq*1]
788 movh m2, [srcq+srcstrideq*2]
789 punpcklbw m0, m1
790 punpcklbw m1, m2
791 pmaddubsw m0, m3
792 pmaddubsw m1, m3
793 psraw m0, 2
794 psraw m1, 2
795 pavgw m0, m4
796 pavgw m1, m4
797 %if mmsize==8
798 packuswb m0, m0
799 packuswb m1, m1
800 movh [dstq+dststrideq*0], m0
801 movh [dstq+dststrideq*1], m1
802 %else
803 packuswb m0, m1
804 movh [dstq+dststrideq*0], m0
805 movhps [dstq+dststrideq*1], m0
806 %endif
808 lea dstq, [dstq+dststrideq*2]
809 lea srcq, [srcq+srcstrideq*2]
810 sub heightd, 2
811 jg .nextrow
812 REP_RET
814 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
815 shl mxd, 4
816 %ifdef PIC
817 lea picregq, [bilinear_filter_vb_m]
818 %endif
819 pxor m4, m4
820 mova m2, [filter_h2_shuf]
821 mova m3, [bilinear_filter_vb+mxq-16]
822 .nextrow:
823 movu m0, [srcq+srcstrideq*0]
824 movu m1, [srcq+srcstrideq*1]
825 pshufb m0, m2
826 pshufb m1, m2
827 pmaddubsw m0, m3
828 pmaddubsw m1, m3
829 psraw m0, 2
830 psraw m1, 2
831 pavgw m0, m4
832 pavgw m1, m4
833 %if mmsize==8
834 packuswb m0, m0
835 packuswb m1, m1
836 movh [dstq+dststrideq*0], m0
837 movh [dstq+dststrideq*1], m1
838 %else
839 packuswb m0, m1
840 movh [dstq+dststrideq*0], m0
841 movhps [dstq+dststrideq*1], m0
842 %endif
844 lea dstq, [dstq+dststrideq*2]
845 lea srcq, [srcq+srcstrideq*2]
846 sub heightd, 2
847 jg .nextrow
848 REP_RET
849 %endmacro
851 INIT_MMX ssse3
852 FILTER_BILINEAR_SSSE3 4
853 INIT_XMM ssse3
854 FILTER_BILINEAR_SSSE3 8
856 INIT_MMX mmx
857 cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
858 .nextrow:
859 movq mm0, [srcq+srcstrideq*0]
860 movq mm1, [srcq+srcstrideq*1]
861 lea srcq, [srcq+srcstrideq*2]
862 movq [dstq+dststrideq*0], mm0
863 movq [dstq+dststrideq*1], mm1
864 lea dstq, [dstq+dststrideq*2]
865 sub heightd, 2
866 jg .nextrow
867 REP_RET
869 %if ARCH_X86_32
870 INIT_MMX mmx
871 cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
872 .nextrow:
873 movq mm0, [srcq+srcstrideq*0+0]
874 movq mm1, [srcq+srcstrideq*0+8]
875 movq mm2, [srcq+srcstrideq*1+0]
876 movq mm3, [srcq+srcstrideq*1+8]
877 lea srcq, [srcq+srcstrideq*2]
878 movq [dstq+dststrideq*0+0], mm0
879 movq [dstq+dststrideq*0+8], mm1
880 movq [dstq+dststrideq*1+0], mm2
881 movq [dstq+dststrideq*1+8], mm3
882 lea dstq, [dstq+dststrideq*2]
883 sub heightd, 2
884 jg .nextrow
885 REP_RET
886 %endif
888 INIT_XMM sse
889 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
890 .nextrow:
891 movups xmm0, [srcq+srcstrideq*0]
892 movups xmm1, [srcq+srcstrideq*1]
893 lea srcq, [srcq+srcstrideq*2]
894 movaps [dstq+dststrideq*0], xmm0
895 movaps [dstq+dststrideq*1], xmm1
896 lea dstq, [dstq+dststrideq*2]
897 sub heightd, 2
898 jg .nextrow
899 REP_RET
901 ;-----------------------------------------------------------------------------
902 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
903 ;-----------------------------------------------------------------------------
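; DC-only inverse transform: only block[0] is nonzero, so this reduces to
; (C-style sketch)
;     dc = (block[0] + 4) >> 3;
;     dst[x] = av_clip_uint8(dst[x] + dc);   for all 16 pixels of the block
; with block[0] cleared on the way. The MMX version splats +dc and -dc into
; byte registers and uses paddusb/psubusb so either sign ends up clipped.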
905 %macro ADD_DC 4
906 %4 m2, [dst1q+%3]
907 %4 m3, [dst1q+strideq+%3]
908 %4 m4, [dst2q+%3]
909 %4 m5, [dst2q+strideq+%3]
910 paddusb m2, %1
911 paddusb m3, %1
912 paddusb m4, %1
913 paddusb m5, %1
914 psubusb m2, %2
915 psubusb m3, %2
916 psubusb m4, %2
917 psubusb m5, %2
918 %4 [dst1q+%3], m2
919 %4 [dst1q+strideq+%3], m3
920 %4 [dst2q+%3], m4
921 %4 [dst2q+strideq+%3], m5
922 %endmacro
924 INIT_MMX mmx
925 cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
926 ; load data
927 movd m0, [blockq]
929 ; calculate DC
930 paddw m0, [pw_4]
931 pxor m1, m1
932 psraw m0, 3
933 movd [blockq], m1
934 psubw m1, m0
935 packuswb m0, m0
936 packuswb m1, m1
937 punpcklbw m0, m0
938 punpcklbw m1, m1
939 punpcklwd m0, m0
940 punpcklwd m1, m1
942 ; add DC
943 DEFINE_ARGS dst1, dst2, stride
944 lea dst2q, [dst1q+strideq*2]
945 ADD_DC m0, m1, 0, movh
946 RET
948 INIT_XMM sse4
949 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
950 ; load data
951 movd m0, [blockq]
952 pxor m1, m1
954 ; calculate DC
955 paddw m0, [pw_4]
956 movd [blockq], m1
957 DEFINE_ARGS dst1, dst2, stride
958 lea dst2q, [dst1q+strideq*2]
959 movd m2, [dst1q]
960 movd m3, [dst1q+strideq]
961 movd m4, [dst2q]
962 movd m5, [dst2q+strideq]
963 psraw m0, 3
964 pshuflw m0, m0, 0
965 punpcklqdq m0, m0
966 punpckldq m2, m3
967 punpckldq m4, m5
968 punpcklbw m2, m1
969 punpcklbw m4, m1
970 paddw m2, m0
971 paddw m4, m0
972 packuswb m2, m4
973 movd [dst1q], m2
974 pextrd [dst1q+strideq], m2, 1
975 pextrd [dst2q], m2, 2
976 pextrd [dst2q+strideq], m2, 3
977 RET
979 ;-----------------------------------------------------------------------------
980 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
981 ;-----------------------------------------------------------------------------
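; Same DC-only add as above, applied to four horizontally adjacent 4x4 luma
; blocks (one 16x4 strip); block n's DC sits at block[n][0], i.e. at a
; 32-byte (16 int16_t) stride, hence the +32*n load offsets below.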
983 %if ARCH_X86_32
984 INIT_MMX mmx
985 cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
986 ; load data
987 movd m0, [blockq+32*0] ; A
988 movd m1, [blockq+32*2] ; C
989 punpcklwd m0, [blockq+32*1] ; A B
990 punpcklwd m1, [blockq+32*3] ; C D
991 punpckldq m0, m1 ; A B C D
992 pxor m6, m6
994 ; calculate DC
995 paddw m0, [pw_4]
996 movd [blockq+32*0], m6
997 movd [blockq+32*1], m6
998 movd [blockq+32*2], m6
999 movd [blockq+32*3], m6
1000 psraw m0, 3
1001 psubw m6, m0
1002 packuswb m0, m0
1003 packuswb m6, m6
1004 punpcklbw m0, m0 ; AABBCCDD
1005 punpcklbw m6, m6 ; AABBCCDD
1006 movq m1, m0
1007 movq m7, m6
1008 punpcklbw m0, m0 ; AAAABBBB
1009 punpckhbw m1, m1 ; CCCCDDDD
1010 punpcklbw m6, m6 ; AAAABBBB
1011 punpckhbw m7, m7 ; CCCCDDDD
1013 ; add DC
1014 DEFINE_ARGS dst1, dst2, stride
1015 lea dst2q, [dst1q+strideq*2]
1016 ADD_DC m0, m6, 0, mova
1017 ADD_DC m1, m7, 8, mova
1018 RET
1019 %endif
1021 INIT_XMM sse2
1022 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
1023 ; load data
1024 movd m0, [blockq+32*0] ; A
1025 movd m1, [blockq+32*2] ; C
1026 punpcklwd m0, [blockq+32*1] ; A B
1027 punpcklwd m1, [blockq+32*3] ; C D
1028 punpckldq m0, m1 ; A B C D
1029 pxor m1, m1
1031 ; calculate DC
1032 paddw m0, [pw_4]
1033 movd [blockq+32*0], m1
1034 movd [blockq+32*1], m1
1035 movd [blockq+32*2], m1
1036 movd [blockq+32*3], m1
1037 psraw m0, 3
1038 psubw m1, m0
1039 packuswb m0, m0
1040 packuswb m1, m1
1041 punpcklbw m0, m0
1042 punpcklbw m1, m1
1043 punpcklbw m0, m0
1044 punpcklbw m1, m1
1046 ; add DC
1047 DEFINE_ARGS dst1, dst2, stride
1048 lea dst2q, [dst1q+strideq*2]
1049 ADD_DC m0, m1, 0, mova
1050 RET
1052 ;-----------------------------------------------------------------------------
1053 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
1054 ;-----------------------------------------------------------------------------
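; DC-only add for four 4x4 chroma blocks laid out 2x2 (an 8x8 pixel area):
; the first ADD_DC below handles the top two blocks (rows 0-3), then dst is
; advanced by 4 lines for the bottom two (rows 4-7).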
1056 INIT_MMX mmx
1057 cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
1058 ; load data
1059 movd m0, [blockq+32*0] ; A
1060 movd m1, [blockq+32*2] ; C
1061 punpcklwd m0, [blockq+32*1] ; A B
1062 punpcklwd m1, [blockq+32*3] ; C D
1063 punpckldq m0, m1 ; A B C D
1064 pxor m6, m6
1066 ; calculate DC
1067 paddw m0, [pw_4]
1068 movd [blockq+32*0], m6
1069 movd [blockq+32*1], m6
1070 movd [blockq+32*2], m6
1071 movd [blockq+32*3], m6
1072 psraw m0, 3
1073 psubw m6, m0
1074 packuswb m0, m0
1075 packuswb m6, m6
1076 punpcklbw m0, m0 ; AABBCCDD
1077 punpcklbw m6, m6 ; AABBCCDD
1078 movq m1, m0
1079 movq m7, m6
1080 punpcklbw m0, m0 ; AAAABBBB
1081 punpckhbw m1, m1 ; CCCCDDDD
1082 punpcklbw m6, m6 ; AAAABBBB
1083 punpckhbw m7, m7 ; CCCCDDDD
1085 ; add DC
1086 DEFINE_ARGS dst1, dst2, stride
1087 lea dst2q, [dst1q+strideq*2]
1088 ADD_DC m0, m6, 0, mova
1089 lea dst1q, [dst1q+strideq*4]
1090 lea dst2q, [dst2q+strideq*4]
1091 ADD_DC m1, m7, 0, mova
1092 RET
1094 ;-----------------------------------------------------------------------------
1095 ; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
1096 ;-----------------------------------------------------------------------------
1098 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
1099 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
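; note on the constants: mul_20091(x) is ((x*20091) >> 16) + x, which pmulhw
; plus an add gives directly; mul_35468(x) would not fit a signed 16-bit
; multiplier (35468 > 32767), so x is doubled first and multiplied by
; 17734 = 35468/2, since ((2*x)*17734) >> 16 == (x*35468) >> 16.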
1100 %macro VP8_MULTIPLY_SUMSUB 4
1101 mova %3, %1
1102 mova %4, %2
1103 pmulhw %3, m6 ;20091(1)
1104 pmulhw %4, m6 ;20091(2)
1105 paddw %3, %1
1106 paddw %4, %2
1107 paddw %1, %1
1108 paddw %2, %2
1109 pmulhw %1, m7 ;35468(1)
1110 pmulhw %2, m7 ;35468(2)
1111 psubw %1, %4
1112 paddw %2, %3
1113 %endmacro
1115 ; calculate x0=%1+%3; x1=%1-%3
1116 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
1117 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
1118 ; %5/%6 are temporary registers
1119 ; we assume m6/m7 have constant words 20091/17734 loaded in them
1120 %macro VP8_IDCT_TRANSFORM4x4_1D 6
1121 SUMSUB_BA w, %3, %1, %5 ;t0, t1
1122 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
1123 SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
1124 SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
1125 SWAP %4, %1
1126 SWAP %4, %3
1127 %endmacro
1129 %macro VP8_IDCT_ADD 0
1130 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
1131 ; load block data
1132 movq m0, [blockq+ 0]
1133 movq m1, [blockq+ 8]
1134 movq m2, [blockq+16]
1135 movq m3, [blockq+24]
1136 movq m6, [pw_20091]
1137 movq m7, [pw_17734]
1138 %if cpuflag(sse)
1139 xorps xmm0, xmm0
1140 movaps [blockq+ 0], xmm0
1141 movaps [blockq+16], xmm0
1142 %else
1143 pxor m4, m4
1144 movq [blockq+ 0], m4
1145 movq [blockq+ 8], m4
1146 movq [blockq+16], m4
1147 movq [blockq+24], m4
1148 %endif
1150 ; actual IDCT
1151 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1152 TRANSPOSE4x4W 0, 1, 2, 3, 4
1153 paddw m0, [pw_4]
1154 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1155 TRANSPOSE4x4W 0, 1, 2, 3, 4
1157 ; store
1158 pxor m4, m4
1159 DEFINE_ARGS dst1, dst2, stride
1160 lea dst2q, [dst1q+2*strideq]
1161 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
1162 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
1163 RET
1165 %endmacro
1167 %if ARCH_X86_32
1168 INIT_MMX mmx
1169 VP8_IDCT_ADD
1170 %endif
1171 INIT_MMX sse
1172 VP8_IDCT_ADD
1174 ;-----------------------------------------------------------------------------
1175 ; void vp8_luma_dc_wht_mmxext(int16_t block[4][4][16], int16_t dc[16])
1176 ;-----------------------------------------------------------------------------
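; This is the inverse 4x4 Walsh-Hadamard transform of the 16 luma DC values:
; one 1D Hadamard pass, a transpose, and a second pass with (x+3)>>3
; rounding, after which result n is scattered into coefficient 0 of luma
; block n (hence the 2*16*n byte offsets in SCATTER_WHT); dc[] is cleared.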
1178 %macro SCATTER_WHT 3
1179 movd dc1d, m%1
1180 movd dc2d, m%2
1181 mov [blockq+2*16*(0+%3)], dc1w
1182 mov [blockq+2*16*(1+%3)], dc2w
1183 shr dc1d, 16
1184 shr dc2d, 16
1185 psrlq m%1, 32
1186 psrlq m%2, 32
1187 mov [blockq+2*16*(4+%3)], dc1w
1188 mov [blockq+2*16*(5+%3)], dc2w
1189 movd dc1d, m%1
1190 movd dc2d, m%2
1191 mov [blockq+2*16*(8+%3)], dc1w
1192 mov [blockq+2*16*(9+%3)], dc2w
1193 shr dc1d, 16
1194 shr dc2d, 16
1195 mov [blockq+2*16*(12+%3)], dc1w
1196 mov [blockq+2*16*(13+%3)], dc2w
1197 %endmacro
1199 %macro HADAMARD4_1D 4
1200 SUMSUB_BADC w, %2, %1, %4, %3
1201 SUMSUB_BADC w, %4, %2, %3, %1
1202 SWAP %1, %4, %3
1203 %endmacro
1205 %macro VP8_DC_WHT 0
1206 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
1207 movq m0, [dc1q]
1208 movq m1, [dc1q+8]
1209 movq m2, [dc1q+16]
1210 movq m3, [dc1q+24]
1211 %if cpuflag(sse)
1212 xorps xmm0, xmm0
1213 movaps [dc1q+ 0], xmm0
1214 movaps [dc1q+16], xmm0
1215 %else
1216 pxor m4, m4
1217 movq [dc1q+ 0], m4
1218 movq [dc1q+ 8], m4
1219 movq [dc1q+16], m4
1220 movq [dc1q+24], m4
1221 %endif
1222 HADAMARD4_1D 0, 1, 2, 3
1223 TRANSPOSE4x4W 0, 1, 2, 3, 4
1224 paddw m0, [pw_3]
1225 HADAMARD4_1D 0, 1, 2, 3
1226 psraw m0, 3
1227 psraw m1, 3
1228 psraw m2, 3
1229 psraw m3, 3
1230 SCATTER_WHT 0, 1, 0
1231 SCATTER_WHT 2, 3, 2
1232 RET
1233 %endmacro
1235 %if ARCH_X86_32
1236 INIT_MMX mmx
1237 VP8_DC_WHT
1238 %endif
1239 INIT_MMX sse
1240 VP8_DC_WHT
1242 ;-----------------------------------------------------------------------------
1243 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
1244 ;-----------------------------------------------------------------------------
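; Simple loop filter, roughly (C-style sketch; clamp() = signed 8-bit
; saturation, p1/p0 left of the edge, q0/q1 right of it):
;     if (2*|p0-q0| + |p1-q1|/2 <= flim) {
;         a  = clamp(clamp(p1 - q1) + 3*(q0 - p0));
;         f1 = clamp(a + 4) >> 3;    f2 = clamp(a + 3) >> 3;
;         q0 = av_clip_uint8(q0 - f1);    p0 = av_clip_uint8(p0 + f2);
;     }
; The h variants transpose 4 (or 16) columns into rows first and write the
; two modified columns back afterwards.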
1246 ; macro called with 7 mm register indexes as arguments, and 4 regular registers
1248 ; first 4 mm registers will carry the transposed pixel data
1249 ; the other three are scratch space (one would be sufficient, but this allows
1250 ; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
1252 ; first two regular registers are buf+4*stride and buf+5*stride
1253 ; third is -stride, fourth is +stride
1254 %macro READ_8x4_INTERLEAVED 11
1255 ; interleave 8 (A-H) rows of 4 pixels each
1256 movd m%1, [%8+%10*4] ; A0-3
1257 movd m%5, [%9+%10*4] ; B0-3
1258 movd m%2, [%8+%10*2] ; C0-3
1259 movd m%6, [%8+%10] ; D0-3
1260 movd m%3, [%8] ; E0-3
1261 movd m%7, [%9] ; F0-3
1262 movd m%4, [%9+%11] ; G0-3
1263 punpcklbw m%1, m%5 ; A/B interleaved
1264 movd m%5, [%9+%11*2] ; H0-3
1265 punpcklbw m%2, m%6 ; C/D interleaved
1266 punpcklbw m%3, m%7 ; E/F interleaved
1267 punpcklbw m%4, m%5 ; G/H interleaved
1268 %endmacro
1270 ; macro called with 7 mm register indexes as arguments, and 5 regular registers
1271 ; the first 11 mean the same as in READ_8x4_INTERLEAVED above
1272 ; fifth regular register is scratch space to reach the bottom 8 rows; it
1273 ; will be set to second regular register + 8*stride at the end
1274 %macro READ_16x4_INTERLEAVED 12
1275 ; transpose 16 (A-P) rows of 4 pixels each
1276 lea %12, [r0+8*r2]
1278 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
1279 movd m%1, [%8+%10*4] ; A0-3
1280 movd m%3, [%12+%10*4] ; I0-3
1281 movd m%2, [%8+%10*2] ; C0-3
1282 movd m%4, [%12+%10*2] ; K0-3
1283 movd m%6, [%8+%10] ; D0-3
1284 movd m%5, [%12+%10] ; L0-3
1285 movd m%7, [%12] ; M0-3
1286 add %12, %11
1287 punpcklbw m%1, m%3 ; A/I
1288 movd m%3, [%8] ; E0-3
1289 punpcklbw m%2, m%4 ; C/K
1290 punpcklbw m%6, m%5 ; D/L
1291 punpcklbw m%3, m%7 ; E/M
1292 punpcklbw m%2, m%6 ; C/D/K/L interleaved
1294 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
1295 movd m%5, [%9+%10*4] ; B0-3
1296 movd m%4, [%12+%10*4] ; J0-3
1297 movd m%7, [%9] ; F0-3
1298 movd m%6, [%12] ; N0-3
1299 punpcklbw m%5, m%4 ; B/J
1300 punpcklbw m%7, m%6 ; F/N
1301 punpcklbw m%1, m%5 ; A/B/I/J interleaved
1302 punpcklbw m%3, m%7 ; E/F/M/N interleaved
1303 movd m%4, [%9+%11] ; G0-3
1304 movd m%6, [%12+%11] ; O0-3
1305 movd m%5, [%9+%11*2] ; H0-3
1306 movd m%7, [%12+%11*2] ; P0-3
1307 punpcklbw m%4, m%6 ; G/O
1308 punpcklbw m%5, m%7 ; H/P
1309 punpcklbw m%4, m%5 ; G/H/O/P interleaved
1310 %endmacro
1312 ; write 4 mm registers of 2 dwords each
1313 ; first four arguments are mm register indexes containing source data
1314 ; last four are registers containing buf+4*stride, buf+5*stride,
1315 ; -stride and +stride
1316 %macro WRITE_4x2D 8
1317 ; write out (2 dwords per register)
1318 movd [%5+%7*4], m%1
1319 movd [%5+%7*2], m%2
1320 movd [%5], m%3
1321 movd [%6+%8], m%4
1322 punpckhdq m%1, m%1
1323 punpckhdq m%2, m%2
1324 punpckhdq m%3, m%3
1325 punpckhdq m%4, m%4
1326 movd [%6+%7*4], m%1
1327 movd [%5+%7], m%2
1328 movd [%6], m%3
1329 movd [%6+%8*2], m%4
1330 %endmacro
1332 ; write 4 xmm registers of 4 dwords each
1333 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
1334 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
1335 ; we add 1*stride to the third regular register in the process
1336 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
1337 ; same memory region), or 8 if they cover two separate buffers (third one points to
1338 ; a different memory region than the first two), allowing for more optimal code for
1339 ; the 16-width case
1340 %macro WRITE_4x4D 10
1341 ; write out (4 dwords per register), start with dwords zero
1342 movd [%5+%8*4], m%1
1343 movd [%5], m%2
1344 movd [%7+%8*4], m%3
1345 movd [%7], m%4
1347 ; store dwords 1
1348 psrldq m%1, 4
1349 psrldq m%2, 4
1350 psrldq m%3, 4
1351 psrldq m%4, 4
1352 movd [%6+%8*4], m%1
1353 movd [%6], m%2
1354 %if %10 == 16
1355 movd [%6+%9*4], m%3
1356 %endif
1357 movd [%7+%9], m%4
1359 ; write dwords 2
1360 psrldq m%1, 4
1361 psrldq m%2, 4
1362 %if %10 == 8
1363 movd [%5+%8*2], m%1
1364 movd %5d, m%3
1365 %endif
1366 psrldq m%3, 4
1367 psrldq m%4, 4
1368 %if %10 == 16
1369 movd [%5+%8*2], m%1
1370 %endif
1371 movd [%6+%9], m%2
1372 movd [%7+%8*2], m%3
1373 movd [%7+%9*2], m%4
1374 add %7, %9
1376 ; store dwords 3
1377 psrldq m%1, 4
1378 psrldq m%2, 4
1379 psrldq m%3, 4
1380 psrldq m%4, 4
1381 %if %10 == 8
1382 mov [%7+%8*4], %5d
1383 movd [%6+%8*2], m%1
1384 %else
1385 movd [%5+%8], m%1
1386 %endif
1387 movd [%6+%9*2], m%2
1388 movd [%7+%8*2], m%3
1389 movd [%7+%9*2], m%4
1390 %endmacro
1392 ; write 4 or 8 words in the mmx/xmm registers as 8 lines
1393 ; 1 and 2 are the registers to write; they can be the same (for SSE2)
1394 ; for pre-SSE4:
1395 ; 3 is a general-purpose register that we will clobber
1396 ; for SSE4:
1397 ; 3 is a pointer to the destination's 5th line
1398 ; 4 is a pointer to the destination's 4th line
1399 ; 5/6 are -stride and +stride
1400 %macro WRITE_2x4W 6
1401 movd %3d, %1
1402 punpckhdq %1, %1
1403 mov [%4+%5*4], %3w
1404 shr %3, 16
1405 add %4, %6
1406 mov [%4+%5*4], %3w
1408 movd %3d, %1
1409 add %4, %5
1410 mov [%4+%5*2], %3w
1411 shr %3, 16
1412 mov [%4+%5 ], %3w
1414 movd %3d, %2
1415 punpckhdq %2, %2
1416 mov [%4 ], %3w
1417 shr %3, 16
1418 mov [%4+%6 ], %3w
1420 movd %3d, %2
1421 add %4, %6
1422 mov [%4+%6 ], %3w
1423 shr %3, 16
1424 mov [%4+%6*2], %3w
1425 add %4, %5
1426 %endmacro
1428 %macro WRITE_8W 5
1429 %if cpuflag(sse4)
1430 pextrw [%3+%4*4], %1, 0
1431 pextrw [%2+%4*4], %1, 1
1432 pextrw [%3+%4*2], %1, 2
1433 pextrw [%3+%4 ], %1, 3
1434 pextrw [%3 ], %1, 4
1435 pextrw [%2 ], %1, 5
1436 pextrw [%2+%5 ], %1, 6
1437 pextrw [%2+%5*2], %1, 7
1438 %else
1439 movd %2d, %1
1440 psrldq %1, 4
1441 mov [%3+%4*4], %2w
1442 shr %2, 16
1443 add %3, %5
1444 mov [%3+%4*4], %2w
1446 movd %2d, %1
1447 psrldq %1, 4
1448 add %3, %4
1449 mov [%3+%4*2], %2w
1450 shr %2, 16
1451 mov [%3+%4 ], %2w
1453 movd %2d, %1
1454 psrldq %1, 4
1455 mov [%3 ], %2w
1456 shr %2, 16
1457 mov [%3+%5 ], %2w
1459 movd %2d, %1
1460 add %3, %5
1461 mov [%3+%5 ], %2w
1462 shr %2, 16
1463 mov [%3+%5*2], %2w
1464 %endif
1465 %endmacro
1467 %macro SIMPLE_LOOPFILTER 2
1468 cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
1469 %if mmsize == 8 ; mmx/mmxext
1470 mov cntrq, 2
1471 %endif
1472 %if cpuflag(ssse3)
1473 pxor m0, m0
1474 %endif
1475 SPLATB_REG m7, flim, m0 ; splat "flim" into register
1477 ; set up indexes to address 4 rows
1478 %if mmsize == 8
1479 DEFINE_ARGS dst1, mstride, stride, cntr, dst2
1480 %else
1481 DEFINE_ARGS dst1, mstride, stride, dst3, dst2
1482 %endif
1483 mov strideq, mstrideq
1484 neg mstrideq
1485 %ifidn %1, h
1486 lea dst1q, [dst1q+4*strideq-2]
1487 %endif
1489 %if mmsize == 8 ; mmx / mmxext
1490 .next8px:
1491 %endif
1492 %ifidn %1, v
1493 ; read 4 half/full rows of pixels
1494 mova m0, [dst1q+mstrideq*2] ; p1
1495 mova m1, [dst1q+mstrideq] ; p0
1496 mova m2, [dst1q] ; q0
1497 mova m3, [dst1q+ strideq] ; q1
1498 %else ; h
1499 lea dst2q, [dst1q+ strideq]
1501 %if mmsize == 8 ; mmx/mmxext
1502 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
1503 %else ; sse2
1504 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
1505 %endif
1506 TRANSPOSE4x4W 0, 1, 2, 3, 4
1507 %endif
1509 ; simple_limit
1510 mova m5, m2 ; m5=backup of q0
1511 mova m6, m1 ; m6=backup of p0
1512 psubusb m1, m2 ; p0-q0
1513 psubusb m2, m6 ; q0-p0
1514 por m1, m2 ; FFABS(p0-q0)
1515 paddusb m1, m1 ; m1=FFABS(p0-q0)*2
1517 mova m4, m3
1518 mova m2, m0
1519 psubusb m3, m0 ; q1-p1
1520 psubusb m0, m4 ; p1-q1
1521 por m3, m0 ; FFABS(p1-q1)
1522 mova m0, [pb_80]
1523 pxor m2, m0
1524 pxor m4, m0
1525 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
1526 pand m3, [pb_FE]
1527 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
1528 paddusb m3, m1
1529 psubusb m3, m7
1530 pxor m1, m1
1531 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
1533 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
1534 mova m4, m5
1535 pxor m5, m0
1536 pxor m0, m6
1537 psubsb m5, m0 ; q0-p0 (signed)
1538 paddsb m2, m5
1539 paddsb m2, m5
1540 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
1541 pand m2, m3 ; apply filter mask (m3)
1543 mova m3, [pb_F8]
1544 mova m1, m2
1545 paddsb m2, [pb_4] ; f1<<3=a+4
1546 paddsb m1, [pb_3] ; f2<<3=a+3
1547 pand m2, m3
1548 pand m1, m3 ; cache f2<<3
1550 pxor m0, m0
1551 pxor m3, m3
1552 pcmpgtb m0, m2 ; which values are <0?
1553 psubb m3, m2 ; -f1<<3
1554 psrlq m2, 3 ; +f1
1555 psrlq m3, 3 ; -f1
1556 pand m3, m0
1557 pandn m0, m2
1558 psubusb m4, m0
1559 paddusb m4, m3 ; q0-f1
1561 pxor m0, m0
1562 pxor m3, m3
1563 pcmpgtb m0, m1 ; which values are <0?
1564 psubb m3, m1 ; -f2<<3
1565 psrlq m1, 3 ; +f2
1566 psrlq m3, 3 ; -f2
1567 pand m3, m0
1568 pandn m0, m1
1569 paddusb m6, m0
1570 psubusb m6, m3 ; p0+f2
1572 ; store
1573 %ifidn %1, v
1574 mova [dst1q], m4
1575 mova [dst1q+mstrideq], m6
1576 %else ; h
1577 inc dst1q
1578 SBUTTERFLY bw, 6, 4, 0
1580 %if mmsize == 16 ; sse2
1581 %if cpuflag(sse4)
1582 inc dst2q
1583 %endif
1584 WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
1585 lea dst2q, [dst3q+mstrideq+1]
1586 %if cpuflag(sse4)
1587 inc dst3q
1588 %endif
1589 WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
1590 %else ; mmx/mmxext
1591 WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
1592 %endif
1593 %endif
1595 %if mmsize == 8 ; mmx/mmxext
1596 ; next 8 pixels
1597 %ifidn %1, v
1598 add dst1q, 8 ; advance 8 cols = pixels
1599 %else ; h
1600 lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
1601 %endif
1602 dec cntrq
1603 jg .next8px
1604 REP_RET
1605 %else ; sse2
1606 RET
1607 %endif
1608 %endmacro
1610 %if ARCH_X86_32
1611 INIT_MMX mmx
1612 SIMPLE_LOOPFILTER v, 4
1613 SIMPLE_LOOPFILTER h, 5
1614 INIT_MMX mmxext
1615 SIMPLE_LOOPFILTER v, 4
1616 SIMPLE_LOOPFILTER h, 5
1617 %endif
1619 INIT_XMM sse2
1620 SIMPLE_LOOPFILTER v, 3
1621 SIMPLE_LOOPFILTER h, 5
1622 INIT_XMM ssse3
1623 SIMPLE_LOOPFILTER v, 3
1624 SIMPLE_LOOPFILTER h, 5
1625 INIT_XMM sse4
1626 SIMPLE_LOOPFILTER h, 5
1628 ;-----------------------------------------------------------------------------
1629 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
1630 ; int flimE, int flimI, int hev_thr);
1631 ;-----------------------------------------------------------------------------
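; "Inner" (normal) filter, roughly: the edge is filtered when
;     2*|p0-q0| + |p1-q1|/2 <= E   and  all of |p3-p2|, |p2-p1|, |p1-p0|,
;     |q1-q0|, |q2-q1|, |q3-q2| <= I;
; high edge variance (hev) means |p1-p0| > hev_thr or |q1-q0| > hev_thr.
; C-style sketch of the update (clamp() = signed 8-bit saturation):
;     a  = clamp((hev ? clamp(p1-q1) : 0) + 3*(q0-p0));
;     f1 = clamp(a+4) >> 3;    f2 = clamp(a+3) >> 3;
;     q0 -= f1;    p0 += f2;
;     if (!hev) { a = (f1+1) >> 1;    q1 -= a;    p1 += a; }
; with all pixel updates clipped to 0..255; see the code for the exact
; saturating-arithmetic formulation.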
1633 %macro INNER_LOOPFILTER 2
1634 %define stack_size 0
1635 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1636 %ifidn %1, v ; [3]=hev() result
1637 %define stack_size mmsize * -4
1638 %else ; h ; extra storage space for transposes
1639 %define stack_size mmsize * -5
1640 %endif
1641 %endif
1643 %if %2 == 8 ; chroma
1644 cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
1645 %else ; luma
1646 cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
1647 %endif
1649 %if cpuflag(ssse3)
1650 pxor m7, m7
1651 %endif
1653 %ifndef m8
1654 ; splat function arguments
1655 SPLATB_REG m0, flimEq, m7 ; E
1656 SPLATB_REG m1, flimIq, m7 ; I
1657 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
1659 %define m_flimE [rsp]
1660 %define m_flimI [rsp+mmsize]
1661 %define m_hevthr [rsp+mmsize*2]
1662 %define m_maskres [rsp+mmsize*3]
1663 %define m_p0backup [rsp+mmsize*3]
1664 %define m_q0backup [rsp+mmsize*4]
1666 mova m_flimE, m0
1667 mova m_flimI, m1
1668 mova m_hevthr, m2
1669 %else
1670 %define m_flimE m9
1671 %define m_flimI m10
1672 %define m_hevthr m11
1673 %define m_maskres m12
1674 %define m_p0backup m12
1675 %define m_q0backup m8
1677 ; splat function arguments
1678 SPLATB_REG m_flimE, flimEq, m7 ; E
1679 SPLATB_REG m_flimI, flimIq, m7 ; I
1680 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
1681 %endif
1683 %if %2 == 8 ; chroma
1684 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
1685 %elif mmsize == 8
1686 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
1687 mov cntrq, 2
1688 %else
1689 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
1690 %endif
1691 mov strideq, mstrideq
1692 neg mstrideq
1693 %ifidn %1, h
1694 lea dst1q, [dst1q+strideq*4-4]
1695 %if %2 == 8 ; chroma
1696 lea dst8q, [dst8q+strideq*4-4]
1697 %endif
1698 %endif
1700 %if mmsize == 8
1701 .next8px:
1702 %endif
1703 ; read
1704 lea dst2q, [dst1q+strideq]
1705 %ifidn %1, v
1706 %if %2 == 8 && mmsize == 16
1707 %define movrow movh
1708 %else
1709 %define movrow mova
1710 %endif
1711 movrow m0, [dst1q+mstrideq*4] ; p3
1712 movrow m1, [dst2q+mstrideq*4] ; p2
1713 movrow m2, [dst1q+mstrideq*2] ; p1
1714 movrow m5, [dst2q] ; q1
1715 movrow m6, [dst2q+ strideq*1] ; q2
1716 movrow m7, [dst2q+ strideq*2] ; q3
1717 %if mmsize == 16 && %2 == 8
1718 movhps m0, [dst8q+mstrideq*4]
1719 movhps m2, [dst8q+mstrideq*2]
1720 add dst8q, strideq
1721 movhps m1, [dst8q+mstrideq*4]
1722 movhps m5, [dst8q]
1723 movhps m6, [dst8q+ strideq ]
1724 movhps m7, [dst8q+ strideq*2]
1725 add dst8q, mstrideq
1726 %endif
1727 %elif mmsize == 8 ; mmx/mmxext (h)
1728 ; read 8 rows of 8px each
1729 movu m0, [dst1q+mstrideq*4]
1730 movu m1, [dst2q+mstrideq*4]
1731 movu m2, [dst1q+mstrideq*2]
1732 movu m3, [dst1q+mstrideq ]
1733 movu m4, [dst1q]
1734 movu m5, [dst2q]
1735 movu m6, [dst2q+ strideq ]
1737 ; 8x8 transpose
1738 TRANSPOSE4x4B 0, 1, 2, 3, 7
1739 mova m_q0backup, m1
1740 movu m7, [dst2q+ strideq*2]
1741 TRANSPOSE4x4B 4, 5, 6, 7, 1
1742 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1743 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1744 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1745 mova m1, m_q0backup
1746 mova m_q0backup, m2 ; store q0
1747 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1748 mova m_p0backup, m5 ; store p0
1749 SWAP 1, 4
1750 SWAP 2, 4
1751 SWAP 6, 3
1752 SWAP 5, 3
1753 %else ; sse2 (h)
1754 %if %2 == 16
1755 lea dst8q, [dst1q+ strideq*8]
1756 %endif
1758 ; read 16 rows of 8px each, interleave
1759 movh m0, [dst1q+mstrideq*4]
1760 movh m1, [dst8q+mstrideq*4]
1761 movh m2, [dst1q+mstrideq*2]
1762 movh m5, [dst8q+mstrideq*2]
1763 movh m3, [dst1q+mstrideq ]
1764 movh m6, [dst8q+mstrideq ]
1765 movh m4, [dst1q]
1766 movh m7, [dst8q]
1767 punpcklbw m0, m1 ; A/I
1768 punpcklbw m2, m5 ; C/K
1769 punpcklbw m3, m6 ; D/L
1770 punpcklbw m4, m7 ; E/M
1772 add dst8q, strideq
1773 movh m1, [dst2q+mstrideq*4]
1774 movh m6, [dst8q+mstrideq*4]
1775 movh m5, [dst2q]
1776 movh m7, [dst8q]
1777 punpcklbw m1, m6 ; B/J
1778 punpcklbw m5, m7 ; F/N
1779 movh m6, [dst2q+ strideq ]
1780 movh m7, [dst8q+ strideq ]
1781 punpcklbw m6, m7 ; G/O
1783 ; 8x16 transpose
1784 TRANSPOSE4x4B 0, 1, 2, 3, 7
1785 %ifdef m8
1786 SWAP 1, 8
1787 %else
1788 mova m_q0backup, m1
1789 %endif
1790 movh m7, [dst2q+ strideq*2]
1791 movh m1, [dst8q+ strideq*2]
1792 punpcklbw m7, m1 ; H/P
1793 TRANSPOSE4x4B 4, 5, 6, 7, 1
1794 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1795 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1796 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1797 %ifdef m8
1798 SWAP 1, 8
1799 SWAP 2, 8
1800 %else
1801 mova m1, m_q0backup
1802 mova m_q0backup, m2 ; store q0
1803 %endif
1804 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1805 %ifdef m12
1806 SWAP 5, 12
1807 %else
1808 mova m_p0backup, m5 ; store p0
1809 %endif
1810 SWAP 1, 4
1811 SWAP 2, 4
1812 SWAP 6, 3
1813 SWAP 5, 3
1814 %endif
1816 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1817 mova m4, m1
1818 SWAP 4, 1
1819 psubusb m4, m0 ; p2-p3
1820 psubusb m0, m1 ; p3-p2
1821 por m0, m4 ; abs(p3-p2)
1823 mova m4, m2
1824 SWAP 4, 2
1825 psubusb m4, m1 ; p1-p2
1826 psubusb m1, m2 ; p2-p1
1827 por m1, m4 ; abs(p2-p1)
1829 mova m4, m6
1830 SWAP 4, 6
1831 psubusb m4, m7 ; q2-q3
1832 psubusb m7, m6 ; q3-q2
1833 por m7, m4 ; abs(q3-q2)
1835 mova m4, m5
1836 SWAP 4, 5
1837 psubusb m4, m6 ; q1-q2
1838 psubusb m6, m5 ; q2-q1
1839 por m6, m4 ; abs(q2-q1)
1841 %if notcpuflag(mmxext)
1842 mova m4, m_flimI
1843 pxor m3, m3
1844 psubusb m0, m4
1845 psubusb m1, m4
1846 psubusb m7, m4
1847 psubusb m6, m4
1848 pcmpeqb m0, m3 ; abs(p3-p2) <= I
1849 pcmpeqb m1, m3 ; abs(p2-p1) <= I
1850 pcmpeqb m7, m3 ; abs(q3-q2) <= I
1851 pcmpeqb m6, m3 ; abs(q2-q1) <= I
1852 pand m0, m1
1853 pand m7, m6
1854 pand m0, m7
1855 %else ; mmxext/sse2
1856 pmaxub m0, m1
1857 pmaxub m6, m7
1858 pmaxub m0, m6
1859 %endif
1861 ; normal_limit and high_edge_variance for p1-p0, q1-q0
1862 SWAP 7, 3 ; now m7 is zero
1863 %ifidn %1, v
1864 movrow m3, [dst1q+mstrideq ] ; p0
1865 %if mmsize == 16 && %2 == 8
1866 movhps m3, [dst8q+mstrideq ]
1867 %endif
1868 %elifdef m12
1869 SWAP 3, 12
1870 %else
1871 mova m3, m_p0backup
1872 %endif
1874 mova m1, m2
1875 SWAP 1, 2
1876 mova m6, m3
1877 SWAP 3, 6
1878 psubusb m1, m3 ; p1-p0
1879 psubusb m6, m2 ; p0-p1
1880 por m1, m6 ; abs(p1-p0)
1881 %if notcpuflag(mmxext)
1882 mova m6, m1
1883 psubusb m1, m4
1884 psubusb m6, m_hevthr
1885 pcmpeqb m1, m7 ; abs(p1-p0) <= I
1886 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
1887 pand m0, m1
1888 mova m_maskres, m6
1889 %else ; mmxext/sse2
1890 pmaxub m0, m1 ; max_I
1891 SWAP 1, 4 ; max_hev_thresh
1892 %endif
1894 SWAP 6, 4 ; now m6 is I
1895 %ifidn %1, v
1896 movrow m4, [dst1q] ; q0
1897 %if mmsize == 16 && %2 == 8
1898 movhps m4, [dst8q]
1899 %endif
1900 %elifdef m8
1901 SWAP 4, 8
1902 %else
1903 mova m4, m_q0backup
1904 %endif
1905 mova m1, m4
1906 SWAP 1, 4
1907 mova m7, m5
1908 SWAP 7, 5
1909 psubusb m1, m5 ; q0-q1
1910 psubusb m7, m4 ; q1-q0
1911 por m1, m7 ; abs(q1-q0)
1912 %if notcpuflag(mmxext)
1913 mova m7, m1
1914 psubusb m1, m6
1915 psubusb m7, m_hevthr
1916 pxor m6, m6
1917 pcmpeqb m1, m6 ; abs(q1-q0) <= I
1918 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
1919 mova m6, m_maskres
1920 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
1921 pand m6, m7
1922 %else ; mmxext/sse2
1923 pxor m7, m7
1924 pmaxub m0, m1
1925 pmaxub m6, m1
1926 psubusb m0, m_flimI
1927 psubusb m6, m_hevthr
1928 pcmpeqb m0, m7 ; max(abs(..)) <= I
1929 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
1930 %endif
1931 %ifdef m12
1932 SWAP 6, 12
1933 %else
1934 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1935 %endif
1937 ; simple_limit
1938 mova m1, m3
1939 SWAP 1, 3
1940 mova m6, m4 ; keep copies of p0/q0 around for later use
1941 SWAP 6, 4
1942 psubusb m1, m4 ; p0-q0
1943 psubusb m6, m3 ; q0-p0
1944 por m1, m6 ; abs(q0-p0)
1945 paddusb m1, m1 ; m1=2*abs(q0-p0)
1947 mova m7, m2
1948 SWAP 7, 2
1949 mova m6, m5
1950 SWAP 6, 5
1951 psubusb m7, m5 ; p1-q1
1952 psubusb m6, m2 ; q1-p1
1953 por m7, m6 ; abs(q1-p1)
1954 pxor m6, m6
1955 pand m7, [pb_FE]
1956 psrlq m7, 1 ; abs(q1-p1)/2
1957 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
1958 psubusb m7, m_flimE
1959 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1960 pand m0, m7 ; normal_limit result
1962 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1963 %ifdef m8 ; x86-64 && sse2
1964 mova m8, [pb_80]
1965 %define m_pb_80 m8
1966 %else ; x86-32 or mmx/mmxext
1967 %define m_pb_80 [pb_80]
1968 %endif
1969 mova m1, m4
1970 mova m7, m3
1971 pxor m1, m_pb_80
1972 pxor m7, m_pb_80
1973 psubsb m1, m7 ; (signed) q0-p0
1974 mova m6, m2
1975 mova m7, m5
1976 pxor m6, m_pb_80
1977 pxor m7, m_pb_80
1978 psubsb m6, m7 ; (signed) p1-q1
1979 mova m7, m_maskres
1980 pandn m7, m6
1981 paddsb m7, m1
1982 paddsb m7, m1
1983 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
1985 pand m7, m0
1986 mova m1, [pb_F8]
1987 mova m6, m7
1988 paddsb m7, [pb_3]
1989 paddsb m6, [pb_4]
1990 pand m7, m1
1991 pand m6, m1
1993 pxor m1, m1
1994 pxor m0, m0
1995 pcmpgtb m1, m7
1996 psubb m0, m7
1997 psrlq m7, 3 ; +f2
1998 psrlq m0, 3 ; -f2
1999 pand m0, m1
2000 pandn m1, m7
2001 psubusb m3, m0
2002 paddusb m3, m1 ; p0+f2
2004 pxor m1, m1
2005 pxor m0, m0
2006 pcmpgtb m0, m6
2007 psubb m1, m6
2008 psrlq m6, 3 ; +f1
2009 psrlq m1, 3 ; -f1
2010 pand m1, m0
2011 pandn m0, m6
2012 psubusb m4, m0
2013 paddusb m4, m1 ; q0-f1
2015 %ifdef m12
2016 SWAP 6, 12
2017 %else
2018 mova m6, m_maskres
2019 %endif
2020 %if notcpuflag(mmxext)
2021 mova m7, [pb_1]
2022 %else ; mmxext/sse2
2023 pxor m7, m7
2024 %endif
2025 pand m0, m6
2026 pand m1, m6
2027 %if notcpuflag(mmxext)
2028 paddusb m0, m7
2029 pand m1, [pb_FE]
2030 pandn m7, m0
2031 psrlq m1, 1
2032 psrlq m7, 1
2033 SWAP 0, 7
2034 %else ; mmxext/sse2
2035 psubusb m1, [pb_1]
2036 pavgb m0, m7 ; a
2037 pavgb m1, m7 ; -a
2038 %endif
2039 psubusb m5, m0
2040 psubusb m2, m1
2041 paddusb m5, m1 ; q1-a
2042 paddusb m2, m0 ; p1+a
2044 ; store
2045 %ifidn %1, v
2046 movrow [dst1q+mstrideq*2], m2
2047 movrow [dst1q+mstrideq ], m3
2048 movrow [dst1q], m4
2049 movrow [dst1q+ strideq ], m5
2050 %if mmsize == 16 && %2 == 8
2051 movhps [dst8q+mstrideq*2], m2
2052 movhps [dst8q+mstrideq ], m3
2053 movhps [dst8q], m4
2054 movhps [dst8q+ strideq ], m5
2055 %endif
2056 %else ; h
2057 add dst1q, 2
2058 add dst2q, 2
2060 ; 4x8/16 transpose
2061 TRANSPOSE4x4B 2, 3, 4, 5, 6
2063 %if mmsize == 8 ; mmx/mmxext (h)
2064 WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
2065 %else ; sse2 (h)
2066 lea dst8q, [dst8q+mstrideq +2]
2067 WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
2068 %endif
2069 %endif
2071 %if mmsize == 8
2072 %if %2 == 8 ; chroma
2073 %ifidn %1, h
2074 sub dst1q, 2
2075 %endif
2076 cmp dst1q, dst8q
2077 mov dst1q, dst8q
2078 jnz .next8px
2079 %else
2080 %ifidn %1, h
2081 lea dst1q, [dst1q+ strideq*8-2]
2082 %else ; v
2083 add dst1q, 8
2084 %endif
2085 dec cntrq
2086 jg .next8px
2087 %endif
2088 REP_RET
2089 %else ; mmsize == 16
2090 RET
2091 %endif
2092 %endmacro
2094 %if ARCH_X86_32
2095 INIT_MMX mmx
2096 INNER_LOOPFILTER v, 16
2097 INNER_LOOPFILTER h, 16
2098 INNER_LOOPFILTER v, 8
2099 INNER_LOOPFILTER h, 8
2101 INIT_MMX mmxext
2102 INNER_LOOPFILTER v, 16
2103 INNER_LOOPFILTER h, 16
2104 INNER_LOOPFILTER v, 8
2105 INNER_LOOPFILTER h, 8
2106 %endif
2108 INIT_XMM sse2
2109 INNER_LOOPFILTER v, 16
2110 INNER_LOOPFILTER h, 16
2111 INNER_LOOPFILTER v, 8
2112 INNER_LOOPFILTER h, 8
2114 INIT_XMM ssse3
2115 INNER_LOOPFILTER v, 16
2116 INNER_LOOPFILTER h, 16
2117 INNER_LOOPFILTER v, 8
2118 INNER_LOOPFILTER h, 8
2120 ;-----------------------------------------------------------------------------
2121 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
2122 ; int flimE, int flimI, int hev_thr);
2123 ;-----------------------------------------------------------------------------
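; Macroblock-edge filter: same E/I/hev masks as the inner filter. On a
; high-variance edge only p0/q0 are adjusted (as in the inner filter with
; the p1-q1 term); otherwise a stronger correction is applied (sketch):
;     w  = clamp(clamp(p1-q1) + 3*(q0-p0));
;     a0 = (27*w + 63) >> 7;   a1 = (18*w + 63) >> 7;   a2 = (9*w + 63) >> 7;
;     p0 += a0; q0 -= a0;   p1 += a1; q1 -= a1;   p2 += a2; q2 -= a2;
; which is what the pw_27/pw_18/pw_9, pw_63 and pb_*_63 constants above
; implement.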
2125 %macro MBEDGE_LOOPFILTER 2
2126 %define stack_size 0
2127 %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
2128 %if mmsize == 16 ; [3]=hev() result
2129 ; [4]=filter tmp result
2130 ; [5]/[6] = p2/q2 backup
2131 ; [7]=lim_res sign result
2132 %define stack_size mmsize * -7
2133 %else ; 8 ; extra storage space for transposes
2134 %define stack_size mmsize * -8
2135 %endif
2136 %endif
2138 %if %2 == 8 ; chroma
2139 cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
2140 %else ; luma
2141 cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
2142 %endif
2144 %if cpuflag(ssse3)
2145 pxor m7, m7
2146 %endif
2148 %ifndef m8
2149 ; splat function arguments
2150 SPLATB_REG m0, flimEq, m7 ; E
2151 SPLATB_REG m1, flimIq, m7 ; I
2152 SPLATB_REG m2, hevthrq, m7 ; hev_thresh
2154 %define m_flimE [rsp]
2155 %define m_flimI [rsp+mmsize]
2156 %define m_hevthr [rsp+mmsize*2]
2157 %define m_maskres [rsp+mmsize*3]
2158 %define m_limres [rsp+mmsize*4]
2159 %define m_p0backup [rsp+mmsize*3]
2160 %define m_q0backup [rsp+mmsize*4]
2161 %define m_p2backup [rsp+mmsize*5]
2162 %define m_q2backup [rsp+mmsize*6]
2163 %if mmsize == 16
2164 %define m_limsign [rsp]
2165 %else
2166 %define m_limsign [rsp+mmsize*7]
2167 %endif
2169 mova m_flimE, m0
2170 mova m_flimI, m1
2171 mova m_hevthr, m2
2172 %else ; sse2 on x86-64
2173 %define m_flimE m9
2174 %define m_flimI m10
2175 %define m_hevthr m11
2176 %define m_maskres m12
2177 %define m_limres m8
2178 %define m_p0backup m12
2179 %define m_q0backup m8
2180 %define m_p2backup m13
2181 %define m_q2backup m14
2182 %define m_limsign m9
2184 ; splat function arguments
2185 SPLATB_REG m_flimE, flimEq, m7 ; E
2186 SPLATB_REG m_flimI, flimIq, m7 ; I
2187 SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
2188 %endif
2190 %if %2 == 8 ; chroma
2191 DEFINE_ARGS dst1, dst8, mstride, stride, dst2
2192 %elif mmsize == 8
2193 DEFINE_ARGS dst1, mstride, stride, dst2, cntr
2194 mov cntrq, 2
2195 %else
2196 DEFINE_ARGS dst1, mstride, stride, dst2, dst8
2197 %endif
2198 mov strideq, mstrideq
2199 neg mstrideq
2200 %ifidn %1, h
2201 lea dst1q, [dst1q+strideq*4-4]
2202 %if %2 == 8 ; chroma
2203 lea dst8q, [dst8q+strideq*4-4]
2204 %endif
2205 %endif
2207 %if mmsize == 8
2208 .next8px:
2209 %endif
2210 ; read
2211 lea dst2q, [dst1q+ strideq ]
2212 %ifidn %1, v
2213 %if %2 == 8 && mmsize == 16
2214 %define movrow movh
2215 %else
2216 %define movrow mova
2217 %endif
2218 movrow m0, [dst1q+mstrideq*4] ; p3
2219 movrow m1, [dst2q+mstrideq*4] ; p2
2220 movrow m2, [dst1q+mstrideq*2] ; p1
2221 movrow m5, [dst2q] ; q1
2222 movrow m6, [dst2q+ strideq ] ; q2
2223 movrow m7, [dst2q+ strideq*2] ; q3
2224 %if mmsize == 16 && %2 == 8
2225 movhps m0, [dst8q+mstrideq*4]
2226 movhps m2, [dst8q+mstrideq*2]
2227 add dst8q, strideq
2228 movhps m1, [dst8q+mstrideq*4]
2229 movhps m5, [dst8q]
2230 movhps m6, [dst8q+ strideq ]
2231 movhps m7, [dst8q+ strideq*2]
2232 add dst8q, mstrideq
2233 %endif
2234 %elif mmsize == 8 ; mmx/mmxext (h)
2235 ; read 8 rows of 8px each
2236 movu m0, [dst1q+mstrideq*4]
2237 movu m1, [dst2q+mstrideq*4]
2238 movu m2, [dst1q+mstrideq*2]
2239 movu m3, [dst1q+mstrideq ]
2240 movu m4, [dst1q]
2241 movu m5, [dst2q]
2242 movu m6, [dst2q+ strideq ]
2244 ; 8x8 transpose
2245 TRANSPOSE4x4B 0, 1, 2, 3, 7
2246 mova m_q0backup, m1
2247 movu m7, [dst2q+ strideq*2]
2248 TRANSPOSE4x4B 4, 5, 6, 7, 1
2249 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
2250 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
2251 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
2252 mova m1, m_q0backup
2253 mova m_q0backup, m2 ; store q0
2254 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
2255 mova m_p0backup, m5 ; store p0
2256 SWAP 1, 4
2257 SWAP 2, 4
2258 SWAP 6, 3
2259 SWAP 5, 3
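; with only 8 mmx registers, q0 and p0 are parked in their stack slots while the
; two 4x4 transposes and the dq SBUTTERFLYs run; the trailing SWAPs are pure
; assembly-time renames (no instructions) that leave the rows in the same
; m0=p3, m1=p2, m2=p1, m5=q1, m6=q2, m7=q3 layout as the vertical path.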
2260 %else ; sse2 (h)
2261 %if %2 == 16
2262 lea dst8q, [dst1q+ strideq*8 ]
2263 %endif
2265 ; read 16 rows of 8px each, interleave
2266 movh m0, [dst1q+mstrideq*4]
2267 movh m1, [dst8q+mstrideq*4]
2268 movh m2, [dst1q+mstrideq*2]
2269 movh m5, [dst8q+mstrideq*2]
2270 movh m3, [dst1q+mstrideq ]
2271 movh m6, [dst8q+mstrideq ]
2272 movh m4, [dst1q]
2273 movh m7, [dst8q]
2274 punpcklbw m0, m1 ; A/I
2275 punpcklbw m2, m5 ; C/K
2276 punpcklbw m3, m6 ; D/L
2277 punpcklbw m4, m7 ; E/M
2279 add dst8q, strideq
2280 movh m1, [dst2q+mstrideq*4]
2281 movh m6, [dst8q+mstrideq*4]
2282 movh m5, [dst2q]
2283 movh m7, [dst8q]
2284 punpcklbw m1, m6 ; B/J
2285 punpcklbw m5, m7 ; F/N
2286 movh m6, [dst2q+ strideq ]
2287 movh m7, [dst8q+ strideq ]
2288 punpcklbw m6, m7 ; G/O
2290 ; 8x16 transpose
2291 TRANSPOSE4x4B 0, 1, 2, 3, 7
2292 %ifdef m8
2293 SWAP 1, 8
2294 %else
2295 mova m_q0backup, m1
2296 %endif
2297 movh m7, [dst2q+ strideq*2]
2298 movh m1, [dst8q+ strideq*2]
2299 punpcklbw m7, m1 ; H/P
2300 TRANSPOSE4x4B 4, 5, 6, 7, 1
2301 SBUTTERFLY dq, 0, 4, 1 ; p3/p2
2302 SBUTTERFLY dq, 2, 6, 1 ; q0/q1
2303 SBUTTERFLY dq, 3, 7, 1 ; q2/q3
2304 %ifdef m8
2305 SWAP 1, 8
2306 SWAP 2, 8
2307 %else
2308 mova m1, m_q0backup
2309 mova m_q0backup, m2 ; store q0
2310 %endif
2311 SBUTTERFLY dq, 1, 5, 2 ; p1/p0
2312 %ifdef m12
2313 SWAP 5, 12
2314 %else
2315 mova m_p0backup, m5 ; store p0
2316 %endif
2317 SWAP 1, 4
2318 SWAP 2, 4
2319 SWAP 6, 3
2320 SWAP 5, 3
2321 %endif
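; from here on, all paths agree: m0=p3, m1=p2, m2=p1, m5=q1, m6=q2, m7=q3, with
; p0/q0 either still in memory (vertical filter) or parked in
; m_p0backup/m_q0backup (m12/m8 on x86-64) for the transposed cases.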
2323 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
2324 mova m4, m1
2325 SWAP 4, 1
2326 psubusb m4, m0 ; p2-p3
2327 psubusb m0, m1 ; p3-p2
2328 por m0, m4 ; abs(p3-p2)
2330 mova m4, m2
2331 SWAP 4, 2
2332 psubusb m4, m1 ; p1-p2
2333 mova m_p2backup, m1
2334 psubusb m1, m2 ; p2-p1
2335 por m1, m4 ; abs(p2-p1)
2337 mova m4, m6
2338 SWAP 4, 6
2339 psubusb m4, m7 ; q2-q3
2340 psubusb m7, m6 ; q3-q2
2341 por m7, m4 ; abs(q3-q2)
2343 mova m4, m5
2344 SWAP 4, 5
2345 psubusb m4, m6 ; q1-q2
2346 mova m_q2backup, m6
2347 psubusb m6, m5 ; q2-q1
2348 por m6, m4 ; abs(q2-q1)
2350 %if notcpuflag(mmxext)
2351 mova m4, m_flimI
2352 pxor m3, m3
2353 psubusb m0, m4
2354 psubusb m1, m4
2355 psubusb m7, m4
2356 psubusb m6, m4
2357 pcmpeqb m0, m3 ; abs(p3-p2) <= I
2358 pcmpeqb m1, m3 ; abs(p2-p1) <= I
2359 pcmpeqb m7, m3 ; abs(q3-q2) <= I
2360 pcmpeqb m6, m3 ; abs(q2-q1) <= I
2361 pand m0, m1
2362 pand m7, m6
2363 pand m0, m7
2364 %else ; mmxext/sse2
2365 pmaxub m0, m1
2366 pmaxub m6, m7
2367 pmaxub m0, m6
2368 %endif
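; two strategies for the same test: plain mmx compares each abs(..) against I
; right away (psubusb + pcmpeqb vs zero) and ANDs the byte masks, while
; mmxext/sse2 keep a running pmaxub and do a single compare against I further
; down; either way this is, roughly, mask = max(outer diffs) <= I.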
2370 ; normal_limit and high_edge_variance for p1-p0, q1-q0
2371 SWAP 7, 3 ; free m3 for p0 (m7 holds the zero reg in the mmx path)
2372 %ifidn %1, v
2373 movrow m3, [dst1q+mstrideq ] ; p0
2374 %if mmsize == 16 && %2 == 8
2375 movhps m3, [dst8q+mstrideq ]
2376 %endif
2377 %elifdef m12
2378 SWAP 3, 12
2379 %else
2380 mova m3, m_p0backup
2381 %endif
2383 mova m1, m2
2384 SWAP 1, 2
2385 mova m6, m3
2386 SWAP 3, 6
2387 psubusb m1, m3 ; p1-p0
2388 psubusb m6, m2 ; p0-p1
2389 por m1, m6 ; abs(p1-p0)
2390 %if notcpuflag(mmxext)
2391 mova m6, m1
2392 psubusb m1, m4
2393 psubusb m6, m_hevthr
2394 pcmpeqb m1, m7 ; abs(p1-p0) <= I
2395 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
2396 pand m0, m1
2397 mova m_maskres, m6
2398 %else ; mmxext/sse2
2399 pmaxub m0, m1 ; max_I
2400 SWAP 1, 4 ; max_hev_thresh
2401 %endif
2403 SWAP 6, 4 ; now m6 is I
2404 %ifidn %1, v
2405 movrow m4, [dst1q] ; q0
2406 %if mmsize == 16 && %2 == 8
2407 movhps m4, [dst8q]
2408 %endif
2409 %elifdef m8
2410 SWAP 4, 8
2411 %else
2412 mova m4, m_q0backup
2413 %endif
2414 mova m1, m4
2415 SWAP 1, 4
2416 mova m7, m5
2417 SWAP 7, 5
2418 psubusb m1, m5 ; q0-q1
2419 psubusb m7, m4 ; q1-q0
2420 por m1, m7 ; abs(q1-q0)
2421 %if notcpuflag(mmxext)
2422 mova m7, m1
2423 psubusb m1, m6
2424 psubusb m7, m_hevthr
2425 pxor m6, m6
2426 pcmpeqb m1, m6 ; abs(q1-q0) <= I
2427 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
2428 mova m6, m_maskres
2429 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
2430 pand m6, m7
2431 %else ; mmxext/sse2
2432 pxor m7, m7
2433 pmaxub m0, m1
2434 pmaxub m6, m1
2435 psubusb m0, m_flimI
2436 psubusb m6, m_hevthr
2437 pcmpeqb m0, m7 ; max(abs(..)) <= I
2438 pcmpeqb m6, m7 ; !(max(abs..) > thresh)
2439 %endif
2440 %ifdef m12
2441 SWAP 6, 12
2442 %else
2443 mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
2444 %endif
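; m_maskres (m12 on x86-64) now holds the inverted hev mask: 0xff where both
; abs(p1-p0) and abs(q1-q0) are <= hev_thr; the plain +/-f1/f2 adjustment of
; p0/q0 below takes effect where hev is set, the 27/18/9 taps where it is not.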
2446 ; simple_limit
2447 mova m1, m3
2448 SWAP 1, 3
2449 mova m6, m4 ; keep copies of p0/q0 around for later use
2450 SWAP 6, 4
2451 psubusb m1, m4 ; p0-q0
2452 psubusb m6, m3 ; q0-p0
2453 por m1, m6 ; abs(q0-p0)
2454 paddusb m1, m1 ; m1=2*abs(q0-p0)
2456 mova m7, m2
2457 SWAP 7, 2
2458 mova m6, m5
2459 SWAP 6, 5
2460 psubusb m7, m5 ; p1-q1
2461 psubusb m6, m2 ; q1-p1
2462 por m7, m6 ; abs(q1-p1)
2463 pxor m6, m6
2464 pand m7, [pb_FE]
2465 psrlq m7, 1 ; abs(q1-p1)/2
2466 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
2467 psubusb m7, m_flimE
2468 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
2469 pand m0, m7 ; normal_limit result
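; roughly: mask &= (abs(p0-q0)*2 + abs(p1-q1)/2 <= E); m0 is now the complete
; per-pixel filter mask used to zero w before both filters.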
2471 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
2472 %ifdef m8 ; x86-64 && sse2
2473 mova m8, [pb_80]
2474 %define m_pb_80 m8
2475 %else ; x86-32 or mmx/mmxext
2476 %define m_pb_80 [pb_80]
2477 %endif
2478 mova m1, m4
2479 mova m7, m3
2480 pxor m1, m_pb_80
2481 pxor m7, m_pb_80
2482 psubsb m1, m7 ; (signed) q0-p0
2483 mova m6, m2
2484 mova m7, m5
2485 pxor m6, m_pb_80
2486 pxor m7, m_pb_80
2487 psubsb m6, m7 ; (signed) p1-q1
2488 mova m7, m_maskres
2489 paddsb m6, m1
2490 paddsb m6, m1
2491 paddsb m6, m1
2492 pand m6, m0
2493 %ifdef m8
2494 mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
2495 pand m_limres, m7
2496 %else
2497 mova m0, m6
2498 pand m0, m7
2499 mova m_limres, m0
2500 %endif
2501 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
2503 mova m1, [pb_F8]
2504 mova m6, m7
2505 paddsb m7, [pb_3]
2506 paddsb m6, [pb_4]
2507 pand m7, m1
2508 pand m6, m1
2510 pxor m1, m1
2511 pxor m0, m0
2512 pcmpgtb m1, m7
2513 psubb m0, m7
2514 psrlq m7, 3 ; +f2
2515 psrlq m0, 3 ; -f2
2516 pand m0, m1
2517 pandn m1, m7
2518 psubusb m3, m0
2519 paddusb m3, m1 ; p0+f2
2521 pxor m1, m1
2522 pxor m0, m0
2523 pcmpgtb m0, m6
2524 psubb m1, m6
2525 psrlq m6, 3 ; +f1
2526 psrlq m1, 3 ; -f1
2527 pand m1, m0
2528 pandn m0, m6
2529 psubusb m4, m0
2530 paddusb m4, m1 ; q0-f1
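; filter_common in scalar terms: where hev, p0 += (w+3)>>3 and q0 -= (w+4)>>3;
; the pcmpgtb/psubb pairs split w into positive and negative magnitudes so that
; pand pb_F8 + psrlq 3 (a per-byte logical shift) can stand in for a signed >>3.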
2532 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
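; the remaining (non-hev) pixels get three graduated adjustments:
; a0=(27*w+63)>>7 on p0/q0, a1=(18*w+63)>>7 on p1/q1, a2=(9*w+63)>>7 on p2/q2.
; ssse3 interleaves w with the constant 1 (pb_1) so pmaddubsw against the packed
; {27,63}/{18,63}/{9,63} tables yields coeff*w+63 directly; older cpus use
; pmullw with pw_27/pw_18/pw_9 plus a paddw of pw_63.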
2533 %if cpuflag(ssse3)
2534 mova m7, [pb_1]
2535 %else
2536 mova m7, [pw_63]
2537 %endif
2538 %ifdef m8
2539 SWAP 1, 8
2540 %else
2541 mova m1, m_limres
2542 %endif
2543 pxor m0, m0
2544 mova m6, m1
2545 pcmpgtb m0, m1 ; which are negative
2546 %if cpuflag(ssse3)
2547 punpcklbw m6, m7 ; interleave with "1" for rounding
2548 punpckhbw m1, m7
2549 %else
2550 punpcklbw m6, m0 ; signed byte->word
2551 punpckhbw m1, m0
2552 %endif
2553 mova m_limsign, m0
2554 %if cpuflag(ssse3)
2555 mova m7, [pb_27_63]
2556 %ifndef m8
2557 mova m_limres, m1
2558 %endif
2559 %ifdef m10
2560 SWAP 0, 10 ; don't lose lim_sign copy
2561 %endif
2562 mova m0, m7
2563 pmaddubsw m7, m6
2564 SWAP 6, 7
2565 pmaddubsw m0, m1
2566 SWAP 1, 0
2567 %ifdef m10
2568 SWAP 0, 10
2569 %else
2570 mova m0, m_limsign
2571 %endif
2572 %else
2573 mova m_maskres, m6 ; backup for later in filter
2574 mova m_limres, m1
2575 pmullw m6, [pw_27]
2576 pmullw m1, [pw_27]
2577 paddw m6, m7
2578 paddw m1, m7
2579 %endif
2580 psraw m6, 7
2581 psraw m1, 7
2582 packsswb m6, m1 ; a0
2583 pxor m1, m1
2584 psubb m1, m6
2585 pand m1, m0 ; -a0
2586 pandn m0, m6 ; +a0
2587 %if cpuflag(ssse3)
2588 mova m6, [pb_18_63] ; pipelining
2589 %endif
2590 psubusb m3, m1
2591 paddusb m4, m1
2592 paddusb m3, m0 ; p0+a0
2593 psubusb m4, m0 ; q0-a0
2595 %if cpuflag(ssse3)
2596 SWAP 6, 7
2597 %ifdef m10
2598 SWAP 1, 10
2599 %else
2600 mova m1, m_limres
2601 %endif
2602 mova m0, m7
2603 pmaddubsw m7, m6
2604 SWAP 6, 7
2605 pmaddubsw m0, m1
2606 SWAP 1, 0
2607 %ifdef m10
2608 SWAP 0, 10
2609 %endif
2610 mova m0, m_limsign
2611 %else
2612 mova m6, m_maskres
2613 mova m1, m_limres
2614 pmullw m6, [pw_18]
2615 pmullw m1, [pw_18]
2616 paddw m6, m7
2617 paddw m1, m7
2618 %endif
2619 mova m0, m_limsign
2620 psraw m6, 7
2621 psraw m1, 7
2622 packsswb m6, m1 ; a1
2623 pxor m1, m1
2624 psubb m1, m6
2625 pand m1, m0 ; -a1
2626 pandn m0, m6 ; +a1
2627 %if cpuflag(ssse3)
2628 mova m6, [pb_9_63]
2629 %endif
2630 psubusb m2, m1
2631 paddusb m5, m1
2632 paddusb m2, m0 ; p1+a1
2633 psubusb m5, m0 ; q1-a1
2635 %if cpuflag(ssse3)
2636 SWAP 6, 7
2637 %ifdef m10
2638 SWAP 1, 10
2639 %else
2640 mova m1, m_limres
2641 %endif
2642 mova m0, m7
2643 pmaddubsw m7, m6
2644 SWAP 6, 7
2645 pmaddubsw m0, m1
2646 SWAP 1, 0
2647 %else
2648 %ifdef m8
2649 SWAP 6, 12
2650 SWAP 1, 8
2651 %else
2652 mova m6, m_maskres
2653 mova m1, m_limres
2654 %endif
2655 pmullw m6, [pw_9]
2656 pmullw m1, [pw_9]
2657 paddw m6, m7
2658 paddw m1, m7
2659 %endif
2660 %ifdef m9
2661 SWAP 7, 9
2662 %else
2663 mova m7, m_limsign
2664 %endif
2665 psraw m6, 7
2666 psraw m1, 7
2667 packsswb m6, m1 ; a2
2668 pxor m0, m0
2669 psubb m0, m6
2670 pand m0, m7 ; -a2
2671 pandn m7, m6 ; +a2
2672 %ifdef m8
2673 SWAP 1, 13
2674 SWAP 6, 14
2675 %else
2676 mova m1, m_p2backup
2677 mova m6, m_q2backup
2678 %endif
2679 psubusb m1, m0
2680 paddusb m6, m0
2681 paddusb m1, m7 ; p2+a2
2682 psubusb m6, m7 ; q2-a2
2684 ; store
2685 %ifidn %1, v
2686 movrow [dst2q+mstrideq*4], m1
2687 movrow [dst1q+mstrideq*2], m2
2688 movrow [dst1q+mstrideq ], m3
2689 movrow [dst1q], m4
2690 movrow [dst2q], m5
2691 movrow [dst2q+ strideq ], m6
2692 %if mmsize == 16 && %2 == 8
2693 add dst8q, mstrideq
2694 movhps [dst8q+mstrideq*2], m1
2695 movhps [dst8q+mstrideq ], m2
2696 movhps [dst8q], m3
2697 add dst8q, strideq
2698 movhps [dst8q], m4
2699 movhps [dst8q+ strideq ], m5
2700 movhps [dst8q+ strideq*2], m6
2701 %endif
2702 %else ; h
2703 inc dst1q
2704 inc dst2q
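; only p2..q2 (six pixels per row) were modified, so the store window starts one
; column to the right of the 8-pixel load window; hence the inc before the
; transpose back.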
2706 ; 4x8/16 transpose
2707 TRANSPOSE4x4B 1, 2, 3, 4, 0
2708 SBUTTERFLY bw, 5, 6, 0
2710 %if mmsize == 8 ; mmx/mmxext (h)
2711 WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
2712 add dst1q, 4
2713 WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
2714 %else ; sse2 (h)
2715 lea dst8q, [dst8q+mstrideq+1]
2716 WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
2717 lea dst1q, [dst2q+mstrideq+4]
2718 lea dst8q, [dst8q+mstrideq+4]
2719 %if cpuflag(sse4)
2720 add dst2q, 4
2721 %endif
2722 WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
2723 %if cpuflag(sse4)
2724 lea dst2q, [dst8q+ strideq ]
2725 %endif
2726 WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
2727 %endif
2728 %endif
2730 %if mmsize == 8
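; the 8-pixel-wide mmx/mmxext versions loop: luma runs twice (cntrq was set to 2)
; to cover the 16-pixel edge, chroma re-runs once with dst1q switched to the
; second (V) plane pointer in dst8q.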
2731 %if %2 == 8 ; chroma
2732 %ifidn %1, h
2733 sub dst1q, 5
2734 %endif
2735 cmp dst1q, dst8q
2736 mov dst1q, dst8q
2737 jnz .next8px
2738 %else
2739 %ifidn %1, h
2740 lea dst1q, [dst1q+ strideq*8-5]
2741 %else ; v
2742 add dst1q, 8
2743 %endif
2744 dec cntrq
2745 jg .next8px
2746 %endif
2747 REP_RET
2748 %else ; mmsize == 16
2749 RET
2750 %endif
2751 %endmacro
2753 %if ARCH_X86_32
2754 INIT_MMX mmx
2755 MBEDGE_LOOPFILTER v, 16
2756 MBEDGE_LOOPFILTER h, 16
2757 MBEDGE_LOOPFILTER v, 8
2758 MBEDGE_LOOPFILTER h, 8
2760 INIT_MMX mmxext
2761 MBEDGE_LOOPFILTER v, 16
2762 MBEDGE_LOOPFILTER h, 16
2763 MBEDGE_LOOPFILTER v, 8
2764 MBEDGE_LOOPFILTER h, 8
2765 %endif
2767 INIT_XMM sse2
2768 MBEDGE_LOOPFILTER v, 16
2769 MBEDGE_LOOPFILTER h, 16
2770 MBEDGE_LOOPFILTER v, 8
2771 MBEDGE_LOOPFILTER h, 8
2773 INIT_XMM ssse3
2774 MBEDGE_LOOPFILTER v, 16
2775 MBEDGE_LOOPFILTER h, 16
2776 MBEDGE_LOOPFILTER v, 8
2777 MBEDGE_LOOPFILTER h, 8
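; sse4 only changes the horizontal store path (the cpuflag(sse4) branches around
; WRITE_8W above), so only the h variants are instantiated; v would be identical
; to ssse3.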
2779 INIT_XMM sse4
2780 MBEDGE_LOOPFILTER h, 16
2781 MBEDGE_LOOPFILTER h, 8