; libavcodec/x86/vp8dsp.asm — VP8 DSP x86 SIMD optimizations (libav.git)
; [libav.git] / libavcodec / x86 / vp8dsp.asm
; blob 4270cdd64470c7e526992ba31c4e606ed8f9874c
1 ;******************************************************************************
2 ;* VP8 MMXEXT optimizations
3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5 ;*
6 ;* This file is part of Libav.
7 ;*
8 ;* Libav is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* Libav is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with Libav; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
23 %include "libavutil/x86/x86util.asm"
25 SECTION_RODATA
; VP8 subpel filter coefficient tables, stored in several layouts to suit
; different instruction sets:
;   *_hw: interleaved word pairs, consumed with pmaddwd (MMX horizontal)
;   *_hb: interleaved byte pairs, consumed with pmaddubsw (SSSE3)
;   *_v:  one full vector per tap, consumed with pmullw (MMX/SSE2)
; The 4-tap tables hold rows for the odd subpel phases; the 6-tap tables
; hold 3 rows (one per even phase), each split over several vectors.
27 fourtap_filter_hw_m: times 4 dw -6, 123
28 times 4 dw 12, -1
29 times 4 dw -9, 93
30 times 4 dw 50, -6
31 times 4 dw -6, 50
32 times 4 dw 93, -9
33 times 4 dw -1, 12
34 times 4 dw 123, -6
36 sixtap_filter_hw_m: times 4 dw 2, -11
37 times 4 dw 108, 36
38 times 4 dw -8, 1
39 times 4 dw 3, -16
40 times 4 dw 77, 77
41 times 4 dw -16, 3
42 times 4 dw 1, -8
43 times 4 dw 36, 108
44 times 4 dw -11, 2
46 fourtap_filter_hb_m: times 8 db -6, 123
47 times 8 db 12, -1
48 times 8 db -9, 93
49 times 8 db 50, -6
50 times 8 db -6, 50
51 times 8 db 93, -9
52 times 8 db -1, 12
53 times 8 db 123, -6
; Byte-packed 6-tap rows pair taps (0,5), (1,2) and (3,4); this matches the
; pixel pairing produced by filter_h6_shuf1/2/3 below for pmaddubsw.
55 sixtap_filter_hb_m: times 8 db 2, 1
56 times 8 db -11, 108
57 times 8 db 36, -8
58 times 8 db 3, 3
59 times 8 db -16, 77
60 times 8 db 77, -16
61 times 8 db 1, 2
62 times 8 db -8, 36
63 times 8 db 108, -11
65 fourtap_filter_v_m: times 8 dw -6
66 times 8 dw 123
67 times 8 dw 12
68 times 8 dw -1
69 times 8 dw -9
70 times 8 dw 93
71 times 8 dw 50
72 times 8 dw -6
73 times 8 dw -6
74 times 8 dw 50
75 times 8 dw 93
76 times 8 dw -9
77 times 8 dw -1
78 times 8 dw 12
79 times 8 dw 123
80 times 8 dw -6
82 sixtap_filter_v_m: times 8 dw 2
83 times 8 dw -11
84 times 8 dw 108
85 times 8 dw 36
86 times 8 dw -8
87 times 8 dw 1
88 times 8 dw 3
89 times 8 dw -16
90 times 8 dw 77
91 times 8 dw 77
92 times 8 dw -16
93 times 8 dw 3
94 times 8 dw 1
95 times 8 dw -8
96 times 8 dw 36
97 times 8 dw 108
98 times 8 dw -11
99 times 8 dw 2
; Bilinear weights: row i holds weight i (words) or the (8-i, i) byte pair.
101 bilinear_filter_vw_m: times 8 dw 1
102 times 8 dw 2
103 times 8 dw 3
104 times 8 dw 4
105 times 8 dw 5
106 times 8 dw 6
107 times 8 dw 7
109 bilinear_filter_vb_m: times 8 db 7, 1
110 times 8 db 6, 2
111 times 8 db 5, 3
112 times 8 db 4, 4
113 times 8 db 3, 5
114 times 8 db 2, 6
115 times 8 db 1, 7
; With PIC, the tables cannot be addressed absolutely: each function
; reserves one extra GPR (npicregs) and loads the table base into picregq
; with lea; the bare names then alias that register.  Without PIC they
; alias the _m labels directly.
117 %ifdef PIC
118 %define fourtap_filter_hw picregq
119 %define sixtap_filter_hw picregq
120 %define fourtap_filter_hb picregq
121 %define sixtap_filter_hb picregq
122 %define fourtap_filter_v picregq
123 %define sixtap_filter_v picregq
124 %define bilinear_filter_vw picregq
125 %define bilinear_filter_vb picregq
126 %define npicregs 1
127 %else
128 %define fourtap_filter_hw fourtap_filter_hw_m
129 %define sixtap_filter_hw sixtap_filter_hw_m
130 %define fourtap_filter_hb fourtap_filter_hb_m
131 %define sixtap_filter_hb sixtap_filter_hb_m
132 %define fourtap_filter_v fourtap_filter_v_m
133 %define sixtap_filter_v sixtap_filter_v_m
134 %define bilinear_filter_vw bilinear_filter_vw_m
135 %define bilinear_filter_vb bilinear_filter_vb_m
136 %define npicregs 0
137 %endif
; pshufb masks forming the sliding source-pixel pairs for the horizontal
; SSSE3 filters (bytes n,n+1 for 4-tap; pairs at distance 5, 1 and 1 for
; the three 6-tap partial products).
139 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
140 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
142 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
143 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
144 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
; IDCT multipliers (Q16 fixed point); 17734 is half of VP8's 35468 and is
; applied to a pre-doubled input in VP8_MULTIPLY_SUMSUB below.
146 pw_20091: times 4 dw 20091
147 pw_17734: times 4 dw 17734
; Shared rounding constants defined elsewhere in libavcodec.
149 cextern pw_3
150 cextern pw_4
151 cextern pw_64
152 cextern pw_256
154 SECTION .text
156 ;-------------------------------------------------------------------------------
157 ; subpel MC functions:
159 ; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
160 ; uint8_t *src, ptrdiff_t srcstride,
161 ; int height, int mx, int my);
162 ;-------------------------------------------------------------------------------
; Emits the SSSE3 subpel MC functions put_vp8_epel%1_{h6,h4,v4,v6} for block
; width %1 (4 under INIT_MMX, 8 under INIT_XMM).  All variants multiply
; byte-packed coefficient pairs with pmaddubsw and round with
; pmulhrsw(x, 256), which equals (x + 64) >> 7.
164 %macro FILTER_SSSE3 1
165 cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
166 lea mxd, [mxq*3]
167 mova m3, [filter_h6_shuf2]
168 mova m4, [filter_h6_shuf3]
169 %ifdef PIC
170 lea picregq, [sixtap_filter_hb_m]
171 %endif
; mx was scaled by 3 above, so mxq*8 steps over the 48-byte 6-tap rows.
172 mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
173 mova m6, [sixtap_filter_hb+mxq*8-32]
174 mova m7, [sixtap_filter_hb+mxq*8-16]
176 .nextrow:
177 movu m0, [srcq-2]
178 mova m1, m0
179 mova m2, m0
180 %if mmsize == 8
181 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
182 ; shuffle with a memory operand
183 punpcklbw m0, [srcq+3]
184 %else
185 pshufb m0, [filter_h6_shuf1]
186 %endif
187 pshufb m1, m3
188 pshufb m2, m4
; Three partial products, one per coefficient pair, then saturating sum.
189 pmaddubsw m0, m5
190 pmaddubsw m1, m6
191 pmaddubsw m2, m7
192 paddsw m0, m1
193 paddsw m0, m2
194 pmulhrsw m0, [pw_256]
195 packuswb m0, m0
196 movh [dstq], m0 ; store
198 ; go to next line
199 add dstq, dststrideq
200 add srcq, srcstrideq
201 dec heightd ; next row
202 jg .nextrow
203 REP_RET
205 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
; mx*16 selects the 32-byte 4-tap row; the -16/+0 loads fetch its two halves.
206 shl mxd, 4
207 mova m2, [pw_256]
208 mova m3, [filter_h2_shuf]
209 mova m4, [filter_h4_shuf]
210 %ifdef PIC
211 lea picregq, [fourtap_filter_hb_m]
212 %endif
213 mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
214 mova m6, [fourtap_filter_hb+mxq]
216 .nextrow:
217 movu m0, [srcq-1]
218 mova m1, m0
219 pshufb m0, m3
220 pshufb m1, m4
221 pmaddubsw m0, m5
222 pmaddubsw m1, m6
223 paddsw m0, m1
224 pmulhrsw m0, m2
225 packuswb m0, m0
226 movh [dstq], m0 ; store
228 ; go to next line
229 add dstq, dststrideq
230 add srcq, srcstrideq
231 dec heightd ; next row
232 jg .nextrow
233 REP_RET
235 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
236 shl myd, 4
237 %ifdef PIC
238 lea picregq, [fourtap_filter_hb_m]
239 %endif
240 mova m5, [fourtap_filter_hb+myq-16]
241 mova m6, [fourtap_filter_hb+myq]
242 mova m7, [pw_256]
; Prime the 3-row sliding window (rows -1, 0, 1 relative to dst row).
244 ; read 3 lines
245 sub srcq, srcstrideq
246 movh m0, [srcq]
247 movh m1, [srcq+ srcstrideq]
248 movh m2, [srcq+2*srcstrideq]
249 add srcq, srcstrideq
251 .nextrow:
252 movh m3, [srcq+2*srcstrideq] ; read new row
; Interleave vertically adjacent rows so pmaddubsw applies a tap pair
; per output pixel; m0..m2 are rotated to slide the window down one row.
253 mova m4, m0
254 mova m0, m1
255 punpcklbw m4, m1
256 mova m1, m2
257 punpcklbw m2, m3
258 pmaddubsw m4, m5
259 pmaddubsw m2, m6
260 paddsw m4, m2
261 mova m2, m3
262 pmulhrsw m4, m7
263 packuswb m4, m4
264 movh [dstq], m4
266 ; go to next line
267 add dstq, dststrideq
268 add srcq, srcstrideq
269 dec heightd ; next row
270 jg .nextrow
271 REP_RET
273 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
274 lea myd, [myq*3]
275 %ifdef PIC
276 lea picregq, [sixtap_filter_hb_m]
277 %endif
; myq becomes a pointer just past the selected filter row; the three
; coefficient-pair vectors live at [myq-48], [myq-32] and [myq-16].
278 lea myq, [sixtap_filter_hb+myq*8]
280 ; read 5 lines
281 sub srcq, srcstrideq
282 sub srcq, srcstrideq
283 movh m0, [srcq]
284 movh m1, [srcq+srcstrideq]
285 movh m2, [srcq+srcstrideq*2]
286 lea srcq, [srcq+srcstrideq*2]
287 add srcq, srcstrideq
288 movh m3, [srcq]
289 movh m4, [srcq+srcstrideq]
291 .nextrow:
292 movh m5, [srcq+2*srcstrideq] ; read new row
; Tap pairing (rows 0&5, 1&2, 3&4) matches sixtap_filter_hb's byte layout.
293 mova m6, m0
294 punpcklbw m6, m5
295 mova m0, m1
296 punpcklbw m1, m2
297 mova m7, m3
298 punpcklbw m7, m4
299 pmaddubsw m6, [myq-48]
300 pmaddubsw m1, [myq-32]
301 pmaddubsw m7, [myq-16]
302 paddsw m6, m1
303 paddsw m6, m7
304 mova m1, m2
305 mova m2, m3
306 pmulhrsw m6, [pw_256]
307 mova m3, m4
308 packuswb m6, m6
309 mova m4, m5
310 movh [dstq], m6
312 ; go to next line
313 add dstq, dststrideq
314 add srcq, srcstrideq
315 dec heightd ; next row
316 jg .nextrow
317 REP_RET
318 %endmacro
; Instantiate for 4-wide blocks (MMX regs) and 8-wide blocks (XMM regs).
320 INIT_MMX ssse3
321 FILTER_SSSE3 4
322 INIT_XMM ssse3
323 FILTER_SSSE3 8
325 ; 4x4 block, H-only 4-tap filter
; MMXEXT version: no pshufb/pmaddubsw available, so pixel pairs are built
; with pshufw on word-widened data and multiplied with pmaddwd, producing
; dword sums; pw_64 + psraw 7 performs the rounded >>7.
326 INIT_MMX mmxext
327 cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
328 shl mxd, 4
329 %ifdef PIC
330 lea picregq, [fourtap_filter_hw_m]
331 %endif
332 movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
333 movq mm5, [fourtap_filter_hw+mxq]
334 movq mm7, [pw_64]
335 pxor mm6, mm6
337 .nextrow:
338 movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
340 ; first set of 2 pixels
341 movq mm2, mm1 ; byte ABCD..
342 punpcklbw mm1, mm6 ; byte->word ABCD
343 pshufw mm0, mm2, 9 ; byte CDEF..
344 punpcklbw mm0, mm6 ; byte->word CDEF
345 pshufw mm3, mm1, 0x94 ; word ABBC
346 pshufw mm1, mm0, 0x94 ; word CDDE
347 pmaddwd mm3, mm4 ; multiply 2px with F0/F1
348 movq mm0, mm1 ; backup for second set of pixels
349 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
350 paddd mm3, mm1 ; finish 1st 2px
352 ; second set of 2 pixels, use backup of above
353 punpckhbw mm2, mm6 ; byte->word EFGH
354 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
355 pshufw mm1, mm2, 0x94 ; word EFFG
356 pmaddwd mm1, mm5 ; multiply 2px with F2/F3
357 paddd mm0, mm1 ; finish 2nd 2px
359 ; merge two sets of 2 pixels into one set of 4, round/clip/store
360 packssdw mm3, mm0 ; merge dword->word (4px)
361 paddsw mm3, mm7 ; rounding
362 psraw mm3, 7
363 packuswb mm3, mm6 ; clip and word->bytes
364 movd [dstq], mm3 ; store
366 ; go to next line
367 add dstq, dststrideq
368 add srcq, srcstrideq
369 dec heightd ; next row
370 jg .nextrow
371 REP_RET
373 ; 4x4 block, H-only 6-tap filter
; Same pshufw/pmaddwd scheme as the 4-tap version, with a third coefficient
; pair; a 4-byte reload at srcq+3 supplies the tail pixels so the function
; never reads more than 12 bytes per row.
374 INIT_MMX mmxext
375 cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
376 lea mxd, [mxq*3]
377 %ifdef PIC
378 lea picregq, [sixtap_filter_hw_m]
379 %endif
380 movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
381 movq mm5, [sixtap_filter_hw+mxq*8-32]
382 movq mm6, [sixtap_filter_hw+mxq*8-16]
383 movq mm7, [pw_64]
384 pxor mm3, mm3
386 .nextrow:
387 movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
389 ; first set of 2 pixels
390 movq mm2, mm1 ; byte ABCD..
391 punpcklbw mm1, mm3 ; byte->word ABCD
392 pshufw mm0, mm2, 0x9 ; byte CDEF..
393 punpckhbw mm2, mm3 ; byte->word EFGH
394 punpcklbw mm0, mm3 ; byte->word CDEF
395 pshufw mm1, mm1, 0x94 ; word ABBC
396 pshufw mm2, mm2, 0x94 ; word EFFG
397 pmaddwd mm1, mm4 ; multiply 2px with F0/F1
398 pshufw mm3, mm0, 0x94 ; word CDDE
399 movq mm0, mm3 ; backup for second set of pixels
400 pmaddwd mm3, mm5 ; multiply 2px with F2/F3
401 paddd mm1, mm3 ; add to 1st 2px cache
402 movq mm3, mm2 ; backup for second set of pixels
403 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
404 paddd mm1, mm2 ; finish 1st 2px
406 ; second set of 2 pixels, use backup of above
407 movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
408 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
409 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
410 paddd mm0, mm3 ; add to 2nd 2px cache
; mm3 was consumed as a scratch above; re-zero it for the unpack and the
; final pack below.
411 pxor mm3, mm3
412 punpcklbw mm2, mm3 ; byte->word FGHI
413 pshufw mm2, mm2, 0xE9 ; word GHHI
414 pmaddwd mm2, mm6 ; multiply 2px with F4/F5
415 paddd mm0, mm2 ; finish 2nd 2px
417 ; merge two sets of 2 pixels into one set of 4, round/clip/store
418 packssdw mm1, mm0 ; merge dword->word (4px)
419 paddsw mm1, mm7 ; rounding
420 psraw mm1, 7
421 packuswb mm1, mm3 ; clip and word->bytes
422 movd [dstq], mm1 ; store
424 ; go to next line
425 add dstq, dststrideq
426 add srcq, srcstrideq
427 dec heightd ; next row
428 jg .nextrow
429 REP_RET
; 8-wide H-only 4-tap filter, SSE2: widens each shifted source load to
; words and multiplies with one per-tap coefficient vector (pmullw),
; rounding via pw_64 + psraw 7.
431 INIT_XMM sse2
432 cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
433 shl mxd, 5
434 %ifdef PIC
435 lea picregq, [fourtap_filter_v_m]
436 %endif
437 lea mxq, [fourtap_filter_v+mxq-32]
438 pxor m7, m7
439 mova m4, [pw_64]
440 mova m5, [mxq+ 0]
441 mova m6, [mxq+16]
; m8/m9 only exist when >8 XMM registers are available (x86-64); the
; 32-bit build re-reads the last two coefficient vectors from memory.
442 %ifdef m8
443 mova m8, [mxq+32]
444 mova m9, [mxq+48]
445 %endif
446 .nextrow:
; Four shifted loads of the same row; each supplies one filter tap.
447 movq m0, [srcq-1]
448 movq m1, [srcq-0]
449 movq m2, [srcq+1]
450 movq m3, [srcq+2]
451 punpcklbw m0, m7
452 punpcklbw m1, m7
453 punpcklbw m2, m7
454 punpcklbw m3, m7
455 pmullw m0, m5
456 pmullw m1, m6
457 %ifdef m8
458 pmullw m2, m8
459 pmullw m3, m9
460 %else
461 pmullw m2, [mxq+32]
462 pmullw m3, [mxq+48]
463 %endif
464 paddsw m0, m1
465 paddsw m2, m3
466 paddsw m0, m2
467 paddsw m0, m4
468 psraw m0, 7
469 packuswb m0, m7
470 movh [dstq], m0 ; store
472 ; go to next line
473 add dstq, dststrideq
474 add srcq, srcstrideq
475 dec heightd ; next row
476 jg .nextrow
477 REP_RET
; 8-wide H-only 6-tap filter, SSE2: six shifted loads, one pmullw per tap.
; Negative/positive taps are summed in an order that keeps intermediate
; saturating additions from discarding positive overflow.
479 INIT_XMM sse2
480 cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
; mx*48 selects the 96-byte 6-tap row (6 coefficient vectors of 16 bytes).
481 lea mxd, [mxq*3]
482 shl mxd, 4
483 %ifdef PIC
484 lea picregq, [sixtap_filter_v_m]
485 %endif
486 lea mxq, [sixtap_filter_v+mxq-96]
487 pxor m7, m7
488 mova m6, [pw_64]
; m8..m13 exist only on x86-64; 32-bit reloads coefficients each row.
489 %ifdef m8
490 mova m8, [mxq+ 0]
491 mova m9, [mxq+16]
492 mova m10, [mxq+32]
493 mova m11, [mxq+48]
494 mova m12, [mxq+64]
495 mova m13, [mxq+80]
496 %endif
497 .nextrow:
498 movq m0, [srcq-2]
499 movq m1, [srcq-1]
500 movq m2, [srcq-0]
501 movq m3, [srcq+1]
502 movq m4, [srcq+2]
503 movq m5, [srcq+3]
504 punpcklbw m0, m7
505 punpcklbw m1, m7
506 punpcklbw m2, m7
507 punpcklbw m3, m7
508 punpcklbw m4, m7
509 punpcklbw m5, m7
510 %ifdef m8
511 pmullw m0, m8
512 pmullw m1, m9
513 pmullw m2, m10
514 pmullw m3, m11
515 pmullw m4, m12
516 pmullw m5, m13
517 %else
518 pmullw m0, [mxq+ 0]
519 pmullw m1, [mxq+16]
520 pmullw m2, [mxq+32]
521 pmullw m3, [mxq+48]
522 pmullw m4, [mxq+64]
523 pmullw m5, [mxq+80]
524 %endif
525 paddsw m1, m4
526 paddsw m0, m5
527 paddsw m1, m2
528 paddsw m0, m3
529 paddsw m0, m1
530 paddsw m0, m6
531 psraw m0, 7
532 packuswb m0, m7
533 movh [dstq], m0 ; store
535 ; go to next line
536 add dstq, dststrideq
537 add srcq, srcstrideq
538 dec heightd ; next row
539 jg .nextrow
540 REP_RET
; Emits put_vp8_epel%1_{v4,v6}: vertical 4/6-tap filters using a sliding
; window of word-widened rows, one pmullw per tap, pw_64 + psraw 7 rounding.
; %1 = block width (4 = MMX regs, 8 = XMM regs; see instantiations below).
542 %macro FILTER_V 1
543 ; 4x4 block, V-only 4-tap filter
544 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
; my*32 selects the 64-byte 4-tap row (4 coefficient vectors of 16 bytes).
545 shl myd, 5
546 %ifdef PIC
547 lea picregq, [fourtap_filter_v_m]
548 %endif
549 lea myq, [fourtap_filter_v+myq-32]
550 mova m6, [pw_64]
551 pxor m7, m7
552 mova m5, [myq+48]
; Prime the 3-row window before entering the loop.
554 ; read 3 lines
555 sub srcq, srcstrideq
556 movh m0, [srcq]
557 movh m1, [srcq+ srcstrideq]
558 movh m2, [srcq+2*srcstrideq]
559 add srcq, srcstrideq
560 punpcklbw m0, m7
561 punpcklbw m1, m7
562 punpcklbw m2, m7
564 .nextrow:
565 ; first calculate negative taps (to prevent losing positive overflows)
566 movh m4, [srcq+2*srcstrideq] ; read new row
567 punpcklbw m4, m7
568 mova m3, m4
569 pmullw m0, [myq+0]
570 pmullw m4, m5
571 paddsw m4, m0
573 ; then calculate positive taps
574 mova m0, m1
575 pmullw m1, [myq+16]
576 paddsw m4, m1
577 mova m1, m2
578 pmullw m2, [myq+32]
579 paddsw m4, m2
580 mova m2, m3
582 ; round/clip/store
583 paddsw m4, m6
584 psraw m4, 7
585 packuswb m4, m7
586 movh [dstq], m4
588 ; go to next line
589 add dstq, dststrideq
590 add srcq, srcstrideq
591 dec heightd ; next row
592 jg .nextrow
593 REP_RET
596 ; 4x4 block, V-only 6-tap filter
597 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
; my*48 selects the 96-byte 6-tap row.
598 shl myd, 4
599 lea myq, [myq*3]
600 %ifdef PIC
601 lea picregq, [sixtap_filter_v_m]
602 %endif
603 lea myq, [sixtap_filter_v+myq-96]
604 pxor m7, m7
; Prime the 5-row window (rows -2..+2 around the destination row).
606 ; read 5 lines
607 sub srcq, srcstrideq
608 sub srcq, srcstrideq
609 movh m0, [srcq]
610 movh m1, [srcq+srcstrideq]
611 movh m2, [srcq+srcstrideq*2]
612 lea srcq, [srcq+srcstrideq*2]
613 add srcq, srcstrideq
614 movh m3, [srcq]
615 movh m4, [srcq+srcstrideq]
616 punpcklbw m0, m7
617 punpcklbw m1, m7
618 punpcklbw m2, m7
619 punpcklbw m3, m7
620 punpcklbw m4, m7
622 .nextrow:
623 ; first calculate negative taps (to prevent losing positive overflows)
624 mova m5, m1
625 pmullw m5, [myq+16]
626 mova m6, m4
627 pmullw m6, [myq+64]
628 paddsw m6, m5
630 ; then calculate positive taps
631 movh m5, [srcq+2*srcstrideq] ; read new row
632 punpcklbw m5, m7
633 pmullw m0, [myq+0]
634 paddsw m6, m0
; Rotate the window down one row while accumulating the remaining taps.
635 mova m0, m1
636 mova m1, m2
637 pmullw m2, [myq+32]
638 paddsw m6, m2
639 mova m2, m3
640 pmullw m3, [myq+48]
641 paddsw m6, m3
642 mova m3, m4
643 mova m4, m5
644 pmullw m5, [myq+80]
645 paddsw m6, m5
647 ; round/clip/store
648 paddsw m6, [pw_64]
649 psraw m6, 7
650 packuswb m6, m7
651 movh [dstq], m6
653 ; go to next line
654 add dstq, dststrideq
655 add srcq, srcstrideq
656 dec heightd ; next row
657 jg .nextrow
658 REP_RET
659 %endmacro
; Instantiate for 4-wide (MMXEXT) and 8-wide (SSE2) blocks.
661 INIT_MMX mmxext
662 FILTER_V 4
663 INIT_XMM sse2
664 FILTER_V 8
; Emits put_vp8_bilinear%1_{v,h} for block width %1, processing two rows
; per iteration.  SSSE3 builds use byte-pair weights with pmaddubsw;
; pre-SSSE3 builds use word weights with pmullw.  In both, the /8 with
; rounding is done as psraw 2 followed by pavgw against zero.
666 %macro FILTER_BILINEAR 1
667 %if cpuflag(ssse3)
668 cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
669 shl myd, 4
670 %ifdef PIC
671 lea picregq, [bilinear_filter_vb_m]
672 %endif
673 pxor m4, m4
674 mova m3, [bilinear_filter_vb+myq-16]
675 .nextrow:
676 movh m0, [srcq+srcstrideq*0]
677 movh m1, [srcq+srcstrideq*1]
678 movh m2, [srcq+srcstrideq*2]
; Interleave vertically adjacent rows: pmaddubsw applies the (8-f, f)
; weight pair per output pixel.
679 punpcklbw m0, m1
680 punpcklbw m1, m2
681 pmaddubsw m0, m3
682 pmaddubsw m1, m3
683 psraw m0, 2
684 psraw m1, 2
685 pavgw m0, m4
686 pavgw m1, m4
687 %if mmsize==8
688 packuswb m0, m0
689 packuswb m1, m1
690 movh [dstq+dststrideq*0], m0
691 movh [dstq+dststrideq*1], m1
692 %else
693 packuswb m0, m1
694 movh [dstq+dststrideq*0], m0
695 movhps [dstq+dststrideq*1], m0
696 %endif
697 %else ; cpuflag(ssse3)
698 cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
699 shl myd, 4
700 %ifdef PIC
701 lea picregq, [bilinear_filter_vw_m]
702 %endif
703 pxor m6, m6
; Word-weight variant: m5 = weight my, m4 = weight (8-my) fetched by
; negating the index into the table.
704 mova m5, [bilinear_filter_vw+myq-1*16]
705 neg myq
706 mova m4, [bilinear_filter_vw+myq+7*16]
707 .nextrow:
708 movh m0, [srcq+srcstrideq*0]
709 movh m1, [srcq+srcstrideq*1]
710 movh m3, [srcq+srcstrideq*2]
711 punpcklbw m0, m6
712 punpcklbw m1, m6
713 punpcklbw m3, m6
714 mova m2, m1
715 pmullw m0, m4
716 pmullw m1, m5
717 pmullw m2, m4
718 pmullw m3, m5
719 paddsw m0, m1
720 paddsw m2, m3
721 psraw m0, 2
722 psraw m2, 2
723 pavgw m0, m6
724 pavgw m2, m6
725 %if mmsize == 8
726 packuswb m0, m0
727 packuswb m2, m2
728 movh [dstq+dststrideq*0], m0
729 movh [dstq+dststrideq*1], m2
730 %else
731 packuswb m0, m2
732 movh [dstq+dststrideq*0], m0
733 movhps [dstq+dststrideq*1], m0
734 %endif
735 %endif ; cpuflag(ssse3)
737 lea dstq, [dstq+dststrideq*2]
738 lea srcq, [srcq+srcstrideq*2]
739 sub heightd, 2
740 jg .nextrow
741 REP_RET
743 %if cpuflag(ssse3)
744 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
745 shl mxd, 4
746 %ifdef PIC
747 lea picregq, [bilinear_filter_vb_m]
748 %endif
749 pxor m4, m4
; filter_h2_shuf pairs horizontally adjacent bytes (n, n+1) for pmaddubsw.
750 mova m2, [filter_h2_shuf]
751 mova m3, [bilinear_filter_vb+mxq-16]
752 .nextrow:
753 movu m0, [srcq+srcstrideq*0]
754 movu m1, [srcq+srcstrideq*1]
755 pshufb m0, m2
756 pshufb m1, m2
757 pmaddubsw m0, m3
758 pmaddubsw m1, m3
759 psraw m0, 2
760 psraw m1, 2
761 pavgw m0, m4
762 pavgw m1, m4
763 %if mmsize==8
764 packuswb m0, m0
765 packuswb m1, m1
766 movh [dstq+dststrideq*0], m0
767 movh [dstq+dststrideq*1], m1
768 %else
769 packuswb m0, m1
770 movh [dstq+dststrideq*0], m0
771 movhps [dstq+dststrideq*1], m0
772 %endif
773 %else ; cpuflag(ssse3)
774 cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
775 shl mxd, 4
776 %ifdef PIC
777 lea picregq, [bilinear_filter_vw_m]
778 %endif
779 pxor m6, m6
780 mova m5, [bilinear_filter_vw+mxq-1*16]
781 neg mxq
782 mova m4, [bilinear_filter_vw+mxq+7*16]
783 .nextrow:
; Two loads one byte apart supply the left/right neighbor of each pixel.
784 movh m0, [srcq+srcstrideq*0+0]
785 movh m1, [srcq+srcstrideq*0+1]
786 movh m2, [srcq+srcstrideq*1+0]
787 movh m3, [srcq+srcstrideq*1+1]
788 punpcklbw m0, m6
789 punpcklbw m1, m6
790 punpcklbw m2, m6
791 punpcklbw m3, m6
792 pmullw m0, m4
793 pmullw m1, m5
794 pmullw m2, m4
795 pmullw m3, m5
796 paddsw m0, m1
797 paddsw m2, m3
798 psraw m0, 2
799 psraw m2, 2
800 pavgw m0, m6
801 pavgw m2, m6
802 %if mmsize == 8
803 packuswb m0, m0
804 packuswb m2, m2
805 movh [dstq+dststrideq*0], m0
806 movh [dstq+dststrideq*1], m2
807 %else
808 packuswb m0, m2
809 movh [dstq+dststrideq*0], m0
810 movhps [dstq+dststrideq*1], m0
811 %endif
812 %endif ; cpuflag(ssse3)
814 lea dstq, [dstq+dststrideq*2]
815 lea srcq, [srcq+srcstrideq*2]
816 sub heightd, 2
817 jg .nextrow
818 REP_RET
819 %endmacro
; Instantiate the pre-SSSE3 and SSSE3 variants for both block widths.
821 INIT_MMX mmxext
822 FILTER_BILINEAR 4
823 INIT_XMM sse2
824 FILTER_BILINEAR 8
825 INIT_MMX ssse3
826 FILTER_BILINEAR 4
827 INIT_XMM ssse3
828 FILTER_BILINEAR 8
; Plain 8-wide copy (no filtering), two rows per iteration.
830 INIT_MMX mmx
831 cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
832 .nextrow:
833 movq mm0, [srcq+srcstrideq*0]
834 movq mm1, [srcq+srcstrideq*1]
835 lea srcq, [srcq+srcstrideq*2]
836 movq [dstq+dststrideq*0], mm0
837 movq [dstq+dststrideq*1], mm1
838 lea dstq, [dstq+dststrideq*2]
839 sub heightd, 2
840 jg .nextrow
841 REP_RET
; 16-wide copy for 32-bit builds only (four MMX loads/stores per pair of
; rows); 64-bit always uses the SSE version below.
843 %if ARCH_X86_32
844 INIT_MMX mmx
845 cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
846 .nextrow:
847 movq mm0, [srcq+srcstrideq*0+0]
848 movq mm1, [srcq+srcstrideq*0+8]
849 movq mm2, [srcq+srcstrideq*1+0]
850 movq mm3, [srcq+srcstrideq*1+8]
851 lea srcq, [srcq+srcstrideq*2]
852 movq [dstq+dststrideq*0+0], mm0
853 movq [dstq+dststrideq*0+8], mm1
854 movq [dstq+dststrideq*1+0], mm2
855 movq [dstq+dststrideq*1+8], mm3
856 lea dstq, [dstq+dststrideq*2]
857 sub heightd, 2
858 jg .nextrow
859 REP_RET
860 %endif
; 16-wide copy: unaligned loads (movups) but aligned stores (movaps) —
; the destination is expected to be 16-byte aligned.
862 INIT_XMM sse
863 cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
864 .nextrow:
865 movups xmm0, [srcq+srcstrideq*0]
866 movups xmm1, [srcq+srcstrideq*1]
867 lea srcq, [srcq+srcstrideq*2]
868 movaps [dstq+dststrideq*0], xmm0
869 movaps [dstq+dststrideq*1], xmm1
870 lea dstq, [dstq+dststrideq*2]
871 sub heightd, 2
872 jg .nextrow
873 REP_RET
875 ;-----------------------------------------------------------------------------
876 ; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
877 ;-----------------------------------------------------------------------------
; Add a signed DC value to 4 rows of pixels with unsigned saturation.
; %1 = vector of max(dc, 0) replicated as bytes
; %2 = vector of max(-dc, 0) replicated as bytes
; %3 = byte offset into each row, %4 = load/store op (movh or mova)
; Adding %1 then subtracting %2, each with saturation, implements a signed
; add clamped to [0,255] using only unsigned byte ops.
879 %macro ADD_DC 4
880 %4 m2, [dst1q+%3]
881 %4 m3, [dst1q+strideq+%3]
882 %4 m4, [dst2q+%3]
883 %4 m5, [dst2q+strideq+%3]
884 paddusb m2, %1
885 paddusb m3, %1
886 paddusb m4, %1
887 paddusb m5, %1
888 psubusb m2, %2
889 psubusb m3, %2
890 psubusb m4, %2
891 psubusb m5, %2
892 %4 [dst1q+%3], m2
893 %4 [dst1q+strideq+%3], m3
894 %4 [dst2q+%3], m4
895 %4 [dst2q+strideq+%3], m5
896 %endmacro
; void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
; DC-only IDCT: dc = (block[0] + 4) >> 3 is broadcast to all 16 pixels of a
; 4x4 block via ADD_DC; block[0] is cleared.  m0 = max(dc,0), m1 = max(-dc,0).
; NOTE(review): the terminating RET was missing in this copy (the function
; fell through into the next one); restored below.
898 INIT_MMX mmx
899 cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
900 ; load data
901 movd m0, [blockq]
903 ; calculate DC
904 paddw m0, [pw_4]
905 pxor m1, m1
906 psraw m0, 3
907 movd [blockq], m1
908 psubw m1, m0
; Broadcast dc (and -dc) to all 8 byte lanes; packuswb also clamps the
; negative branch of each to zero.
909 packuswb m0, m0
910 packuswb m1, m1
911 punpcklbw m0, m0
912 punpcklbw m1, m1
913 punpcklwd m0, m0
914 punpcklwd m1, m1
916 ; add DC
917 DEFINE_ARGS dst1, dst2, stride
918 lea dst2q, [dst1q+strideq*2]
919 ADD_DC m0, m1, 0, movh
920 RET
; void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
; DC-only IDCT, SSE4.1 version: works in words (no saturation trick needed)
; and scatters the four output rows with pextrd.  Clears block[0].
; NOTE(review): the terminating RET was missing in this copy; restored below.
922 INIT_XMM sse4
923 cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
924 ; load data
925 movd m0, [blockq]
926 pxor m1, m1
928 ; calculate DC
929 paddw m0, [pw_4]
930 movd [blockq], m1
931 DEFINE_ARGS dst1, dst2, stride
932 lea dst2q, [dst1q+strideq*2]
933 movd m2, [dst1q]
934 movd m3, [dst1q+strideq]
935 movd m4, [dst2q]
936 movd m5, [dst2q+strideq]
937 psraw m0, 3
; Broadcast the dc word across the whole register.
938 pshuflw m0, m0, 0
939 punpcklqdq m0, m0
; Gather the four 4-pixel rows into one register, widen to words, add dc,
; then pack with unsigned saturation and scatter the rows back.
940 punpckldq m2, m3
941 punpckldq m4, m5
942 punpcklbw m2, m1
943 punpcklbw m4, m1
944 paddw m2, m0
945 paddw m4, m0
946 packuswb m2, m4
947 movd [dst1q], m2
948 pextrd [dst1q+strideq], m2, 1
949 pextrd [dst2q], m2, 2
950 pextrd [dst2q+strideq], m2, 3
951 RET
953 ;-----------------------------------------------------------------------------
954 ; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
955 ;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
; DC-only IDCT for four horizontally adjacent luma 4x4 blocks (32-bit builds
; only).  The four DCs A..D are gathered, biased and shifted together, then
; expanded so each covers its own 4-pixel column span; all four block[i][0]
; entries are cleared.
; NOTE(review): the terminating RET was missing in this copy; restored below.
957 %if ARCH_X86_32
958 INIT_MMX mmx
959 cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
960 ; load data
961 movd m0, [blockq+32*0] ; A
962 movd m1, [blockq+32*2] ; C
963 punpcklwd m0, [blockq+32*1] ; A B
964 punpcklwd m1, [blockq+32*3] ; C D
965 punpckldq m0, m1 ; A B C D
966 pxor m6, m6
968 ; calculate DC
969 paddw m0, [pw_4]
970 movd [blockq+32*0], m6
971 movd [blockq+32*1], m6
972 movd [blockq+32*2], m6
973 movd [blockq+32*3], m6
974 psraw m0, 3
975 psubw m6, m0
; m0/m6 hold max(dc,0)/max(-dc,0) per block; expand each DC to 4 bytes.
976 packuswb m0, m0
977 packuswb m6, m6
978 punpcklbw m0, m0 ; AABBCCDD
979 punpcklbw m6, m6 ; AABBCCDD
980 movq m1, m0
981 movq m7, m6
982 punpcklbw m0, m0 ; AAAABBBB
983 punpckhbw m1, m1 ; CCCCDDDD
984 punpcklbw m6, m6 ; AAAABBBB
985 punpckhbw m7, m7 ; CCCCDDDD
987 ; add DC
988 DEFINE_ARGS dst1, dst2, stride
989 lea dst2q, [dst1q+strideq*2]
990 ADD_DC m0, m6, 0, mova
991 ADD_DC m1, m7, 8, mova
992 RET
993 %endif
; void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
; DC-only IDCT for four adjacent luma 4x4 blocks: one 16-byte-wide ADD_DC
; covers all four blocks at once.  Clears all four block[i][0] entries.
; NOTE(review): the terminating RET was missing in this copy; restored below.
995 INIT_XMM sse2
996 cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
997 ; load data
998 movd m0, [blockq+32*0] ; A
999 movd m1, [blockq+32*2] ; C
1000 punpcklwd m0, [blockq+32*1] ; A B
1001 punpcklwd m1, [blockq+32*3] ; C D
1002 punpckldq m0, m1 ; A B C D
1003 pxor m1, m1
1005 ; calculate DC
1006 paddw m0, [pw_4]
1007 movd [blockq+32*0], m1
1008 movd [blockq+32*1], m1
1009 movd [blockq+32*2], m1
1010 movd [blockq+32*3], m1
1011 psraw m0, 3
1012 psubw m1, m0
; Expand each of the four DCs (positive and negated, clamped at zero by
; packuswb) to fill its 4-byte column span: ABCD -> AAAABBBBCCCCDDDD.
1013 packuswb m0, m0
1014 packuswb m1, m1
1015 punpcklbw m0, m0
1016 punpcklbw m1, m1
1017 punpcklbw m0, m0
1018 punpcklbw m1, m1
1020 ; add DC
1021 DEFINE_ARGS dst1, dst2, stride
1022 lea dst2q, [dst1q+strideq*2]
1023 ADD_DC m0, m1, 0, mova
1024 RET
1026 ;-----------------------------------------------------------------------------
1027 ; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
1028 ;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
; DC-only IDCT for the four chroma 4x4 blocks arranged 2x2: blocks A/B go to
; the top 8x4 half, C/D to the bottom half (dst advanced by 4 rows between
; the two ADD_DC calls).  Clears all four block[i][0] entries.
; NOTE(review): the terminating RET was missing in this copy; restored below.
1030 INIT_MMX mmx
1031 cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
1032 ; load data
1033 movd m0, [blockq+32*0] ; A
1034 movd m1, [blockq+32*2] ; C
1035 punpcklwd m0, [blockq+32*1] ; A B
1036 punpcklwd m1, [blockq+32*3] ; C D
1037 punpckldq m0, m1 ; A B C D
1038 pxor m6, m6
1040 ; calculate DC
1041 paddw m0, [pw_4]
1042 movd [blockq+32*0], m6
1043 movd [blockq+32*1], m6
1044 movd [blockq+32*2], m6
1045 movd [blockq+32*3], m6
1046 psraw m0, 3
1047 psubw m6, m0
; m0/m6 = max(dc,0)/max(-dc,0) per block, expanded to 4 bytes per DC.
1048 packuswb m0, m0
1049 packuswb m6, m6
1050 punpcklbw m0, m0 ; AABBCCDD
1051 punpcklbw m6, m6 ; AABBCCDD
1052 movq m1, m0
1053 movq m7, m6
1054 punpcklbw m0, m0 ; AAAABBBB
1055 punpckhbw m1, m1 ; CCCCDDDD
1056 punpcklbw m6, m6 ; AAAABBBB
1057 punpckhbw m7, m7 ; CCCCDDDD
1059 ; add DC
1060 DEFINE_ARGS dst1, dst2, stride
1061 lea dst2q, [dst1q+strideq*2]
1062 ADD_DC m0, m6, 0, mova
1063 lea dst1q, [dst1q+strideq*4]
1064 lea dst2q, [dst2q+strideq*4]
1065 ADD_DC m1, m7, 0, mova
1066 RET
1068 ;-----------------------------------------------------------------------------
1069 ; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
1070 ;-----------------------------------------------------------------------------
1072 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
1073 ; this macro assumes that m6/m7 have words for 20091/17734 loaded
; mul_20091(x) = x + ((x*20091) >> 16): pmulhw supplies the fractional part
; and the extra paddw the integer 1.0.  mul_35468(x) = ((2x)*17734) >> 16:
; the inputs are doubled first because 35468 does not fit a signed word.
1074 %macro VP8_MULTIPLY_SUMSUB 4
1075 mova %3, %1
1076 mova %4, %2
1077 pmulhw %3, m6 ;20091(1)
1078 pmulhw %4, m6 ;20091(2)
1079 paddw %3, %1
1080 paddw %4, %2
1081 paddw %1, %1
1082 paddw %2, %2
1083 pmulhw %1, m7 ;35468(1)
1084 pmulhw %2, m7 ;35468(2)
1085 psubw %1, %4
1086 paddw %2, %3
1087 %endmacro
1089 ; calculate x0=%1+%3; x1=%1-%3
1090 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
1091 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
1092 ; %5/%6 are temporary registers
1093 ; we assume m6/m7 have constant words 20091/17734 loaded in them
; One 1-D pass of the VP8 4x4 inverse transform over register indices
; %1..%4; the trailing SWAPs restore the natural tmp0..tmp3 ordering that
; SUMSUB_BA left permuted.
1094 %macro VP8_IDCT_TRANSFORM4x4_1D 6
1095 SUMSUB_BA w, %3, %1, %5 ;t0, t1
1096 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
1097 SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
1098 SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
1099 SWAP %4, %1
1100 SWAP %4, %3
1101 %endmacro
; Emits vp8_idct_add: full 4x4 inverse transform added to dst.
; Two 1-D passes with a transpose between and after; the +4 bias before the
; second pass plus the >>3 in STORE_DIFFx2 implement the final rounding.
; The coefficient block is cleared after loading (SSE stores 16 bytes at a
; time; plain MMX uses four 8-byte stores).
; NOTE(review): the terminating RET was missing in this copy (the macro
; ended without returning); restored below.
1103 %macro VP8_IDCT_ADD 0
1104 cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
1105 ; load block data
1106 movq m0, [blockq+ 0]
1107 movq m1, [blockq+ 8]
1108 movq m2, [blockq+16]
1109 movq m3, [blockq+24]
1110 movq m6, [pw_20091]
1111 movq m7, [pw_17734]
1112 %if cpuflag(sse)
1113 xorps xmm0, xmm0
1114 movaps [blockq+ 0], xmm0
1115 movaps [blockq+16], xmm0
1116 %else
1117 pxor m4, m4
1118 movq [blockq+ 0], m4
1119 movq [blockq+ 8], m4
1120 movq [blockq+16], m4
1121 movq [blockq+24], m4
1122 %endif
1124 ; actual IDCT
1125 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1126 TRANSPOSE4x4W 0, 1, 2, 3, 4
1127 paddw m0, [pw_4]
1128 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1129 TRANSPOSE4x4W 0, 1, 2, 3, 4
1131 ; store
1132 pxor m4, m4
1133 DEFINE_ARGS dst1, dst2, stride
1134 lea dst2q, [dst1q+2*strideq]
1135 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
1136 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
1137 RET
1139 %endmacro
; MMX version only needed on 32-bit; "sse" variant still uses MMX regs but
; may use SSE stores for the block clear.
1141 %if ARCH_X86_32
1142 INIT_MMX mmx
1143 VP8_IDCT_ADD
1144 %endif
1145 INIT_MMX sse
1146 VP8_IDCT_ADD
1148 ;-----------------------------------------------------------------------------
1149 ; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
1150 ;-----------------------------------------------------------------------------
; Scatter the 8 DC words held in registers m%1/m%2 into the DC position of
; eight 4x4 coefficient blocks: block[2*16*i] for rows %3 and %3+1 of the
; 4x4 DC grid (each block is 16 words = 32 bytes apart).  dc1/dc2 are GPRs
; used to move two words at a time out of the MMX registers.
1152 %macro SCATTER_WHT 3
1153 movd dc1d, m%1
1154 movd dc2d, m%2
1155 mov [blockq+2*16*(0+%3)], dc1w
1156 mov [blockq+2*16*(1+%3)], dc2w
1157 shr dc1d, 16
1158 shr dc2d, 16
1159 psrlq m%1, 32
1160 psrlq m%2, 32
1161 mov [blockq+2*16*(4+%3)], dc1w
1162 mov [blockq+2*16*(5+%3)], dc2w
1163 movd dc1d, m%1
1164 movd dc2d, m%2
1165 mov [blockq+2*16*(8+%3)], dc1w
1166 mov [blockq+2*16*(9+%3)], dc2w
1167 shr dc1d, 16
1168 shr dc2d, 16
1169 mov [blockq+2*16*(12+%3)], dc1w
1170 mov [blockq+2*16*(13+%3)], dc2w
1171 %endmacro
; One 4-point Walsh-Hadamard pass (word lanes) over register indices
; %1..%4, built from two butterfly levels; the SWAP fixes up the output
; ordering left by SUMSUB_BADC.
1173 %macro HADAMARD4_1D 4
1174 SUMSUB_BADC w, %2, %1, %4, %3
1175 SUMSUB_BADC w, %4, %2, %3, %1
1176 SWAP %1, %4, %3
1177 %endmacro
; Emits vp8_luma_dc_wht: inverse 4x4 Walsh-Hadamard transform of the luma
; DC coefficients (dc1q) scattered into the 16 per-block DC slots of
; blockq, with (x+3)>>3 rounding.  The dc array is cleared after loading.
; dc2 is a scratch GPR declared so SCATTER_WHT has two word carriers.
; NOTE(review): the terminating RET was missing in this copy (the macro
; ended without returning); restored below.
1179 %macro VP8_DC_WHT 0
1180 cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
1181 movq m0, [dc1q]
1182 movq m1, [dc1q+8]
1183 movq m2, [dc1q+16]
1184 movq m3, [dc1q+24]
1185 %if cpuflag(sse)
1186 xorps xmm0, xmm0
1187 movaps [dc1q+ 0], xmm0
1188 movaps [dc1q+16], xmm0
1189 %else
1190 pxor m4, m4
1191 movq [dc1q+ 0], m4
1192 movq [dc1q+ 8], m4
1193 movq [dc1q+16], m4
1194 movq [dc1q+24], m4
1195 %endif
; Two 1-D WHT passes with a transpose between; pw_3 + psraw 3 rounds.
1196 HADAMARD4_1D 0, 1, 2, 3
1197 TRANSPOSE4x4W 0, 1, 2, 3, 4
1198 paddw m0, [pw_3]
1199 HADAMARD4_1D 0, 1, 2, 3
1200 psraw m0, 3
1201 psraw m1, 3
1202 psraw m2, 3
1203 psraw m3, 3
1204 SCATTER_WHT 0, 1, 0
1205 SCATTER_WHT 2, 3, 2
1206 RET
1207 %endmacro
; MMX version only needed on 32-bit; "sse" variant uses SSE for the clear.
1209 %if ARCH_X86_32
1210 INIT_MMX mmx
1211 VP8_DC_WHT
1212 %endif
1213 INIT_MMX sse
1214 VP8_DC_WHT