; Origin: libvpx.git / vp8 / common / x86 / subpixel_ssse3.asm
;         blob 7f6fd93e4eb8a30b3802b818b746c3d5a755eec0
;         (tree at: Merge "vp8_rd_pick_best_mbsegmentation code restructure")
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
; ABI helper macros used throughout this file (sym, arg, GLOBAL,
; SHADOW_ARGS_TO_STACK, GET_GOT, SAVE_XMM, ALIGN_STACK, SECTION_RODATA, ...)
; come from this include.
%include "vpx_ports/x86_abi_support.asm"

; NOTE(review): BLOCK_HEIGHT_WIDTH and VP8_FILTER_WEIGHT are not referenced
; by the code visible in this file; they are kept in case other code relies
; on them.
%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
; Right shift applied after adding the rounding constant `rd` (0x40 per word).
%define VP8_FILTER_SHIFT  7
;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels.
; The input pixel array has output_height rows. This function handles 8 pixels in
; the horizontal direction and produces ONE output row per loop iteration, using
; 128-bit operations.
;
; output_height must be at least 1: the row loop is a dec/jnz loop and would wrap
; around if entered with a count of 0.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Register roles inside the row loops:
;   rsi = source row pointer      rax = src_pixels_per_line
;   rdi = destination row pointer rdx = output_pitch
;   rcx = rows remaining          xmm7 = rounding constant (rd)
;   xmm4/xmm5/xmm6 = tap pairs k0_k5 / k2_k4 / k1_k3 (6-tap path)
;
; When the first dword of the selected k0_k5 entry is zero (outer taps are 0),
; the cheaper 4-tap path below is taken instead.
;
; xmm6/xmm7 are written here, so they are saved/restored with
; SAVE_XMM/RESTORE_XMM in the same way the bilinear predictors in this file do
; (xmm6-xmm15 are callee-saved under the Microsoft x64 ABI).
global sym(vp8_filter_block1d8_h6_ssse3)
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the taps below
    shl         rdx, 4                  ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]    ;outer taps all zero?
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-decrement: the loop advances rdi before storing
;xmm3 free
filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; taps 0 and 5

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1,   xmm5                    ; taps 2 and 4

    lea         rdi,    [rdi + rdx]
    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3

    lea         rsi,    [rsi + rax]
    dec         rcx                             ; ZF consumed by jnz below; nothing in between writes flags

    paddsw      xmm0,   xmm1
    paddsw      xmm2,   xmm7                    ; + rounding

    paddsw      xmm0,   xmm2

    psraw       xmm0,   7

    packuswb    xmm0,   xmm0                    ; saturate to 8 unsigned bytes

    movq        MMWORD Ptr [rdi], xmm0
    jnz         filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant: entered from above with rax -> k0_k5 entry, rdi = output_ptr,
; xmm7 = rd, and the prolog already executed.
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-decrement, see 6-tap loop

filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2,   xmm0
    pshufb      xmm0,   xmm3

    pshufb      xmm2,   xmm4
    pmaddubsw   xmm0,   xmm5                    ; taps 2 and 4

    lea         rdi,    [rdi + rdx]
    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3

    lea         rsi,    [rsi + rax]
    dec         rcx                             ; ZF consumed by jnz below

    paddsw      xmm0,   xmm7                    ; + rounding

    paddsw      xmm0,   xmm2

    psraw       xmm0,   7

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal 6-tap filter, 16 output pixels per row, one row per iteration.
; Each row is computed as two 8-pixel halves that are merged and written with
; a single movdqa, so every destination row address must be 16-byte aligned.
; output_height must be at least 1 (dec/jnz loop).
;
; The prolog executes SAVE_XMM, so every exit path executes RESTORE_XMM at
; the mirrored position (after RESTORE_GOT), matching the bilinear
; predictors in this file.
global sym(vp8_filter_block1d16_h6_ssse3)
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter

    mov         rdi, arg(2)             ;output_ptr

    ; The dispatch to the 4-tap variant is disabled, so the 6-tap loop is
    ; always used and vp8_filter_block1d16_h4_ssse3 below is never reached
    ; from this function.
;;    cmp         esi, DWORD PTR [rax]
;;    je          vp8_filter_block1d16_h4_ssse3

    mov         rsi, arg(0)             ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h6_rowloop_ssse3:
    ; ---- left 8 pixels ----
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; taps 0 and 5

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    movq        xmm3,   MMWORD PTR [rsi +  6]   ; right half: pixels 6..13

    pmaddubsw   xmm1,   xmm5                    ; taps 2 and 4
    movq        xmm7,   MMWORD PTR [rsi + 11]   ; right half: pixels 11..18

    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3
    punpcklbw   xmm3,   xmm7                    ; same interleave as above, shifted by 8 pixels

    paddsw      xmm0,   xmm1
    movdqa      xmm1,   xmm3

    ; ---- right 8 pixels (interleaved with finishing the left half) ----
    pmaddubsw   xmm3,   xmm4                    ; taps 0 and 5
    paddsw      xmm0,   xmm2

    movdqa      xmm2,   xmm1
    paddsw      xmm0,   [GLOBAL(rd)]            ; + rounding

    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

    psraw       xmm0,   7
    pmaddubsw   xmm1,   xmm5                    ; taps 2 and 4

    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3
    packuswb    xmm0,   xmm0

    lea         rsi,    [rsi + rax]
    paddsw      xmm3,   xmm1

    paddsw      xmm3,   xmm2

    paddsw      xmm3,   [GLOBAL(rd)]            ; + rounding

    psraw       xmm3,   7

    packuswb    xmm3,   xmm3

    punpcklqdq  xmm0,   xmm3                    ; low qword = left half, high qword = right half

    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant. Currently unreachable (see the disabled branch above); kept
; with a balanced epilog so it is correct if the dispatch is re-enabled.
; Expects rax -> k0_k5 entry and rdi = output_ptr on entry.
vp8_filter_block1d16_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d16_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; bytes for the left 8 outputs

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2b)]
    pshufb      xmm2,   [GLOBAL(shuf3b)]
    pmaddubsw   xmm1,   xmm5                    ; taps 2 and 4

    movdqu      xmm3,   XMMWORD PTR [rsi + 6]   ; bytes for the right 8 outputs

    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3
    movdqa      xmm0,   xmm3
    pshufb      xmm3,   [GLOBAL(shuf3b)]
    pshufb      xmm0,   [GLOBAL(shuf2b)]

    paddsw      xmm1,   [GLOBAL(rd)]            ; + rounding
    paddsw      xmm1,   xmm2

    pmaddubsw   xmm0,   xmm5                    ; taps 2 and 4
    pmaddubsw   xmm3,   xmm6                    ; taps 1 and 3

    psraw       xmm1,   7
    packuswb    xmm1,   xmm1
    lea         rsi,    [rsi + rax]
    paddsw      xmm3,   xmm0
    paddsw      xmm3,   [GLOBAL(rd)]            ; + rounding
    psraw       xmm3,   7
    packuswb    xmm3,   xmm3

    punpcklqdq  xmm1,   xmm3                    ; merge both halves

    movdqa      XMMWORD Ptr [rdi], xmm1         ; aligned 16-byte store

    add         rdi,    rdx
    dec         rcx
    jnz         filter_block1d16_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal 6-tap filter, 4 output pixels per row, one row per iteration.
; Each iteration loads 16 unaligned bytes starting at src - 2 and stores the
; low 4 result bytes. output_height must be at least 1 (dec/jnz loop).
;
; Takes the 4-tap path when the first dword of the selected k0_k5 entry is 0.
;
; xmm6/xmm7 are written here, so they are saved/restored with
; SAVE_XMM/RESTORE_XMM in the same way the bilinear predictors in this file do
; (xmm6-xmm15 are callee-saved under the Microsoft x64 ABI).
global sym(vp8_filter_block1d4_h6_ssse3)
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the taps below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]    ;outer taps all zero?
    je          vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ; byte i holds pixel i-2

    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf1b)]        ; pair pixel x-2 with x+3

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2b)]        ; pair pixel x   with x+2
    pmaddubsw   xmm0,   xmm4                    ; taps 0 and 5
    pshufb      xmm2,   [GLOBAL(shuf3b)]        ; pair pixel x-1 with x+1
    pmaddubsw   xmm1,   xmm5                    ; taps 2 and 4

    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3

    lea         rsi,    [rsi + rax]

    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm7                    ; + rounding
    paddsw      xmm0,   xmm2
    psraw       xmm0,   7
    packuswb    xmm0,   xmm0

    movd        DWORD PTR [rdi], xmm0           ; 4 output pixels

    add         rdi,    rdx
    dec         rcx
    jnz         filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant: entered from above with rax -> k0_k5 entry, xmm7 = rd, and
; the prolog already executed.
vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; byte i holds pixel i-2

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2,   xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1,   xmm5                    ; taps 2 and 4

    pmaddubsw   xmm2,   xmm6                    ; taps 1 and 3

    lea         rsi,    [rsi + rax]

    paddsw      xmm1,   xmm7                    ; + rounding
    paddsw      xmm1,   xmm2
    psraw       xmm1,   7
    packuswb    xmm1,   xmm1

    movd        DWORD PTR [rdi], xmm1           ; 4 output pixels

    add         rdi,    rdx
    dec         rcx
    jnz         filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical 6-tap filter, 16 pixels wide, one output row per iteration.
; For an output row the six source rows A..F are read at
; rsi + {0,1,2,3,4,5} * src_pitch (rax = rsi + src_pitch is used to reach the
; odd multiples). Rows are paired B/D, C/E and A/F to match the k1_k3, k2_k4
; and k0_k5 tap tables.
;
; The 6-tap path stores each row as two 8-byte halves; the 4-tap path stores
; one aligned 16-byte vector, so there every destination row address must be
; 16-byte aligned. output_height must be at least 1 (dec/jnz loop).
;
; xmm6/xmm7 are written here, so they are saved/restored with
; SAVE_XMM/RESTORE_XMM in the same way the bilinear predictors in this file do
; (xmm6-xmm15 are callee-saved under the Microsoft x64 ABI).
global sym(vp8_filter_block1d16_v6_ssse3)
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the taps below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter

    cmp         esi, DWORD PTR [rax]    ;outer taps all zero?
    je          vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)           ;output_height
    add         rax, rdx                        ;rax = rsi + pitch


vp8_filter_block1d16_v6_ssse3_loop:
    ; ---- columns 0..7 ----
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2      ;store the results

    ; ---- columns 8..15 ----
    movq        xmm1, MMWORD PTR [rsi + 8]              ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi,  rdx                   ; advance both row bases by one row
    add         rax,  rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant: entered from above with rax -> k0_k5 entry and the prolog
; already executed. Row A and row F are not read.
vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)           ;output_height
    add         rax, rdx                        ;rax = rsi + pitch

vp8_filter_block1d16_v4_ssse3_loop:
    ; ---- columns 0..7 ----
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    ; ---- columns 8..15 (loads issued early) ----
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    paddsw      xmm2, [GLOBAL(rd)]          ; + rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi,  rdx                   ; advance both row bases by one row
    add         rax,  rdx

    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                  ; + rounding
    psraw       xmm5, 7
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                  ; merge both halves

    movdqa       XMMWORD PTR [rdi], xmm2    ; aligned 16-byte store

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical 6-tap filter, 8 pixels wide, one output row per iteration.
; Source rows A..F are read at rsi + {0..5} * src_pitch, with
; rax = rsi + src_pitch used to reach the odd multiples.
; output_height must be at least 1 (dec/jnz loop).
;
; Takes the 4-tap path when the first dword of the selected k0_k5 entry is 0.
; rdx, rdi, rcx (and r8 on 64-bit) are loaded before the dispatch and are
; shared by both paths.
;
; xmm6/xmm7 are written here, so they are saved/restored with
; SAVE_XMM/RESTORE_XMM in the same way the bilinear predictors in this file do
; (xmm6-xmm15 are callee-saved under the Microsoft x64 ABI).
global sym(vp8_filter_block1d8_v6_ssse3)
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the taps below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter

    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps all zero?
    je          vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)                     ;src_ptr

    mov         rax, rsi
    add         rax, rdx                        ;rax = rsi + pitch

vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi,  rdx                   ; advance both row bases by one row
    add         rax,  rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4                  ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant: entered from above with rax -> k0_k5 entry, rdx = src pitch,
; rdi = output_ptr, rcx = output_height (and r8 = out_pitch on 64-bit).
vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)                     ;src_ptr

    mov         rax, rsi
    add         rax, rdx                        ;rax = rsi + pitch

vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi,  rdx                   ; advance both row bases by one row
    add         rax,  rdx

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5                  ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
;
; Vertical 6-tap filter, 4 pixels wide, one output row per iteration.
; This routine works entirely in the 64-bit MMX registers (mm0-mm7), using
; the MMX-register forms of the SSSE3 instructions; no XMM register is touched.
; Source rows A..F are read at rsi + {0..5} * src_pitch, with
; rax = rsi + src_pitch used to reach the odd multiples.
; output_height must be at least 1 (dec/jnz loop).
;
; NOTE(review): no emms is executed before returning; presumably the caller
; is responsible for clearing MMX state - confirm against the call sites.
global sym(vp8_filter_block1d4_v6_ssse3)
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against the taps below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                ;rax -> k0_k5 entry for this filter

    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps all zero?
    je          vp8_filter_block1d4_v4_ssse3

    ; Only the low 8 bytes of each 16-byte table entry are needed.
    movq        mm5, MMWORD PTR [rax]           ;k0_k5
    movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ;k1_k3

    mov         rsi, arg(0)                     ;src_ptr

    mov         rax, rsi
    add         rax, rdx                        ;rax = rsi + pitch

vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                    ;A
    movd        mm2, DWORD PTR [rsi + rdx]              ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ;E

    punpcklbw   mm2, mm4                    ;B D
    punpcklbw   mm3, mm0                    ;C E

    movd        mm0, DWORD PTR [rax + rdx * 4]          ;F

    movq        mm4, [GLOBAL(rd)]

    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                    ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi,  rdx                   ; advance both row bases by one row
    add         rax,  rdx

    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4                    ; + rounding
    psraw       mm2, 7
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2        ; 4 output pixels

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant: entered from above with rax -> k0_k5 entry, rdx = src pitch,
; rdi = output_ptr, rcx = output_height (and r8 = out_pitch on 64-bit).
vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)                     ;src_ptr

    mov         rax, rsi
    add         rax, rdx                        ;rax = rsi + pitch

vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]              ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ;E

    punpcklbw   mm2, mm4                    ;B D
    punpcklbw   mm3, mm0                    ;C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi,  rdx                   ; advance both row bases by one row
    add         rax,  rdx

    paddsw      mm2, mm3
    paddsw      mm2, mm5                    ; + rounding
    psraw       mm2, 7
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2        ; 4 output pixels

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Bilinear prediction of a 16x16 block. xoffset / yoffset index the 16-byte
; entries of vp8_bilinear_filters_ssse3. Three code paths:
;   * both passes     : xoffset != 0 and yoffset != 0
;   * b16x16_sp_only  : xoffset == 0 (vertical pass only)
;   * b16x16_fp_only  : yoffset == 0 (horizontal pass only)
; Every path stops when rdi reaches dst_ptr + 16 * dst_pitch (kept in rcx).
; Rows are written with movdqa, so each destination row address must be
; 16-byte aligned.
;
; NOTE(review): the vertical-only path is also taken for xoffset == 0 and
; yoffset == 0, where it would use table entry 0 (128, 0). pmaddubsw treats
; the filter bytes as signed, so 128 acts as -128 there; presumably callers
; never pass both offsets as 0 - confirm against the call sites.
global sym(vp8_bilinear_predict16x16_ssse3)
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax,        dword ptr arg(2)    ; xoffset

    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          b16x16_sp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm1,       [rax]               ; xmm1 = horizontal filter pairs

    movsxd      rax,        dword ptr arg(3)    ; yoffset

    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          b16x16_fp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

    movdqa      xmm2,       [rax]               ; xmm2 = vertical filter pairs

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)    ; dst_pitch
%endif
    ; Horizontal pass for source row 0; the result is carried in xmm7.
    movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

    lea         rsi,        [rsi + rdx]         ; next line

    pmaddubsw   xmm3,       xmm1                ; filtered columns 0..7 (words)

    punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4,       xmm1                ; filtered columns 8..15 (words)

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    ; Horizontal pass for the next source row -> xmm6.
    movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6,       xmm5
    movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
    lea         rsi,        [rsi + rdx]         ; next line

    pmaddubsw   xmm6,       xmm1

    punpcklbw   xmm4,       xmm5
    pmaddubsw   xmm4,       xmm1

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

    paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128

    packuswb    xmm6,       xmm4
    movdqa      xmm5,       xmm7

    ; Vertical pass between the previous row (xmm7) and this row (xmm6).
    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm2

    punpckhbw   xmm7,       xmm6
    pmaddubsw   xmm7,       xmm2

    paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
    psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm5,       xmm7
    movdqa      xmm7,       xmm6                ; this row becomes the previous row

    movdqa      [rdi],      xmm5                ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5)    ; dst_pitch
%else
    add         rdi,        r8
%endif

    cmp         rdi,        rcx
    jne         .next_row

    jmp         done

b16x16_sp_only:
    movsxd      rax,        dword ptr arg(3)    ; yoffset
    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    mov         rdi,        arg(4)              ; dst_ptr
    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm1,       [rax]               ; VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4,       [rsi]               ; load row 0
    movq        xmm2,       [rsi + 8]           ; load row 0

    lea         rsi,        [rsi + rax]         ; next line
.next_row:
    ; Two output rows per iteration; xmm4/xmm2 carry the last loaded row.
    movq        xmm3,       [rsi]               ; load row + 1
    movq        xmm5,       [rsi + 8]           ; load row + 1

    punpcklbw   xmm4,       xmm3
    punpcklbw   xmm2,       xmm5

    pmaddubsw   xmm4,       xmm1
    movq        xmm7,       [rsi + rax]         ; load row + 2

    pmaddubsw   xmm2,       xmm1
    movq        xmm6,       [rsi + rax + 8]     ; load row + 2

    punpcklbw   xmm3,       xmm7
    punpcklbw   xmm5,       xmm6

    pmaddubsw   xmm3,       xmm1
    paddw       xmm4,       [GLOBAL(rd)]

    pmaddubsw   xmm5,       xmm1
    paddw       xmm2,       [GLOBAL(rd)]

    psraw       xmm4,       VP8_FILTER_SHIFT
    psraw       xmm2,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm2
    paddw       xmm3,       [GLOBAL(rd)]

    movdqa      [rdi],      xmm4                ; store row 0
    paddw       xmm5,       [GLOBAL(rd)]

    psraw       xmm3,       VP8_FILTER_SHIFT
    psraw       xmm5,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm5
    movdqa      xmm4,       xmm7                ; row + 2 becomes the carried row

    movdqa      [rdi + rdx],xmm3                ; store row 1
    lea         rsi,        [rsi + 2*rax]

    movdqa      xmm2,       xmm6
    lea         rdi,        [rdi + 2*rdx]

    cmp         rdi,        rcx
    jne         .next_row

    jmp         done

b16x16_fp_only:
    ; Entered with rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch, xmm1 = HFilter.
    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

.next_row:
    ; Two output rows per iteration.
    movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2,       xmm4
    movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2,       xmm1
    movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16

    lea         rsi,        [rsi + rax]         ; next line
    punpcklbw   xmm3,       xmm4

    pmaddubsw   xmm3,       xmm1
    movq        xmm5,       [rsi]

    paddw       xmm2,       [GLOBAL(rd)]
    movq        xmm7,       [rsi+1]

    movq        xmm6,       [rsi+8]
    psraw       xmm2,       VP8_FILTER_SHIFT

    punpcklbw   xmm5,       xmm7
    movq        xmm7,       [rsi+9]

    paddw       xmm3,       [GLOBAL(rd)]
    pmaddubsw   xmm5,       xmm1

    psraw       xmm3,       VP8_FILTER_SHIFT
    punpcklbw   xmm6,       xmm7

    packuswb    xmm2,       xmm3
    pmaddubsw   xmm6,       xmm1

    movdqa      [rdi],      xmm2                ; store the results in the destination
    paddw       xmm5,       [GLOBAL(rd)]

    lea         rdi,        [rdi + rdx]         ; dst_pitch
    psraw       xmm5,       VP8_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm6,       VP8_FILTER_SHIFT

    packuswb    xmm5,       xmm6
    lea         rsi,        [rsi + rax]         ; next line

    movdqa      [rdi],      xmm5                ; store the results in the destination
    lea         rdi,        [rdi + rdx]         ; dst_pitch

    cmp         rdi,        rcx

    jne         .next_row

done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Bilinear prediction of an 8x8 block. Nine source rows (16 bytes each, read
; unaligned) are first copied into a 144-byte, 16-byte-aligned stack buffer;
; the filter passes then read from that buffer.
;
; rsp itself is used as the read cursor over the buffer. Every path must
; therefore advance rsp by exactly 144 bytes in total before reaching
; done8x8, where `pop rsp` reloads the stack pointer from the slot directly
; above the buffer (presumably the value saved by ALIGN_STACK - confirm
; against x86_abi_support.asm):
;   * both passes  : 16 (first row) + 8 iterations * 16
;   * b8x8_sp_only : one lea of 144
;   * b8x8_fp_only : 2 iterations * 64 + final 16
global sym(vp8_bilinear_predict8x8_ssse3)
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; reserve 144 bytes

    lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi,        arg(0)  ;src_ptr
    movsxd      rdx,        dword ptr arg(1)     ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0,       [rsi]
    lea         rax,        [rdx + rdx*2]        ; rax = 3 * src pitch
    movdqu      xmm1,       [rsi+rdx]
    movdqu      xmm2,       [rsi+rdx*2]
    add         rsi,        rax
    movdqu      xmm3,       [rsi]
    movdqu      xmm4,       [rsi+rdx]
    movdqu      xmm5,       [rsi+rdx*2]
    add         rsi,        rax
    movdqu      xmm6,       [rsi]
    movdqu      xmm7,       [rsi+rdx]

    movdqa      XMMWORD PTR [rsp],            xmm0   ; row 0 (frees xmm0 for row 8)

    movdqu      xmm0,       [rsi+rdx*2]

    movdqa      XMMWORD PTR [rsp+16],         xmm1
    movdqa      XMMWORD PTR [rsp+32],         xmm2
    movdqa      XMMWORD PTR [rsp+48],         xmm3
    movdqa      XMMWORD PTR [rsp+64],         xmm4
    movdqa      XMMWORD PTR [rsp+80],         xmm5
    movdqa      XMMWORD PTR [rsp+96],         xmm6
    movdqa      XMMWORD PTR [rsp+112],        xmm7
    movdqa      XMMWORD PTR [rsp+128],        xmm0   ; row 8

    movsxd      rax,        dword ptr arg(2)    ; xoffset
    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          b8x8_sp_only

    shl         rax,        4
    add         rax,        rcx                 ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]               ; xmm0 = horizontal filter pairs

    movsxd      rax,        dword ptr arg(3)    ; yoffset
    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          b8x8_fp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch (end marker)

    movdqa      xmm1,       [rax]               ; xmm1 = vertical filter pairs

    ; get the first horizontal line done
    movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

    psrldq      xmm5,       1
    lea         rsp,        [rsp + 16]          ; next line

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3,       xmm0                ; filtered columns 0..7 (words)

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm7                ; 8 filtered pixels of row 0, duplicated in both halves

.next_row:
    ; Horizontal pass for the next buffered row -> xmm6.
    movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp,        [rsp + 16]          ; next line

    movdqa      xmm5,       xmm6

    psrldq      xmm5,       1

    punpcklbw   xmm6,       xmm5
    pmaddubsw   xmm6,       xmm0

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

    packuswb    xmm6,       xmm6

    ; Vertical pass between the previous row (xmm7) and this row (xmm6).
    punpcklbw   xmm7,       xmm6
    pmaddubsw   xmm7,       xmm1

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm7,       xmm7

    movq        [rdi],      xmm7                ; store the results in the destination
    lea         rdi,        [rdi + rdx]

    movdqa      xmm7,       xmm6                ; this row becomes the previous row

    cmp         rdi,        rcx
    jne         .next_row

    jmp         done8x8

b8x8_sp_only:
    movsxd      rax,        dword ptr arg(3)    ; yoffset
    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    mov         rdi,        arg(4) ;dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]               ; VFilter

    ; Fully unrolled: rows 0..6 are paired first, rows 7 and 8 are loaded later.
    movq        xmm1,       XMMWORD PTR [rsp]
    movq        xmm2,       XMMWORD PTR [rsp+16]

    movq        xmm3,       XMMWORD PTR [rsp+32]
    punpcklbw   xmm1,       xmm2                ; rows 0/1

    movq        xmm4,       XMMWORD PTR [rsp+48]
    punpcklbw   xmm2,       xmm3                ; rows 1/2

    movq        xmm5,       XMMWORD PTR [rsp+64]
    punpcklbw   xmm3,       xmm4                ; rows 2/3

    movq        xmm6,       XMMWORD PTR [rsp+80]
    punpcklbw   xmm4,       xmm5                ; rows 3/4

    movq        xmm7,       XMMWORD PTR [rsp+96]
    punpcklbw   xmm5,       xmm6                ; rows 4/5

    pmaddubsw   xmm1,       xmm0
    pmaddubsw   xmm2,       xmm0

    pmaddubsw   xmm3,       xmm0
    pmaddubsw   xmm4,       xmm0

    pmaddubsw   xmm5,       xmm0
    punpcklbw   xmm6,       xmm7                ; rows 5/6

    pmaddubsw   xmm6,       xmm0
    paddw       xmm1,       [GLOBAL(rd)]

    paddw       xmm2,       [GLOBAL(rd)]
    psraw       xmm1,       VP8_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm2,       VP8_FILTER_SHIFT

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm3,       VP8_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm5,       VP8_FILTER_SHIFT

    psraw       xmm6,       VP8_FILTER_SHIFT
    packuswb    xmm1,       xmm1

    packuswb    xmm2,       xmm2
    movq        [rdi],      xmm1                ; output row 0

    packuswb    xmm3,       xmm3
    movq        [rdi+rdx],  xmm2                ; output row 1

    packuswb    xmm4,       xmm4
    movq        xmm1,       XMMWORD PTR [rsp+112]   ; source row 7

    lea         rdi,        [rdi + 2*rdx]
    movq        xmm2,       XMMWORD PTR [rsp+128]   ; source row 8

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm3                ; output row 2

    packuswb    xmm6,       xmm6
    movq        [rdi+rdx],  xmm4                ; output row 3

    lea         rdi,        [rdi + 2*rdx]
    punpcklbw   xmm7,       xmm1                ; rows 6/7

    movq        [rdi],      xmm5                ; output row 4
    pmaddubsw   xmm7,       xmm0

    movq        [rdi+rdx],  xmm6                ; output row 5
    punpcklbw   xmm1,       xmm2                ; rows 7/8

    pmaddubsw   xmm1,       xmm0
    paddw       xmm7,       [GLOBAL(rd)]

    psraw       xmm7,       VP8_FILTER_SHIFT
    paddw       xmm1,       [GLOBAL(rd)]

    psraw       xmm1,       VP8_FILTER_SHIFT
    packuswb    xmm7,       xmm7

    packuswb    xmm1,       xmm1
    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm7                ; output row 6

    movq        [rdi+rdx],  xmm1                ; output row 7
    lea         rsp,        [rsp + 144]         ; skip the whole row buffer

    jmp         done8x8

b8x8_fp_only:
    ; Entered with rdi = dst_ptr, rdx = dst_pitch, xmm0 = HFilter.
    lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch (end marker)

.next_row:
    ; Four output rows per iteration.
    movdqa      xmm1,       XMMWORD PTR [rsp]
    movdqa      xmm3,       XMMWORD PTR [rsp+16]

    movdqa      xmm2,       xmm1
    movdqa      xmm5,       XMMWORD PTR [rsp+32]

    psrldq      xmm2,       1
    movdqa      xmm7,       XMMWORD PTR [rsp+48]

    movdqa      xmm4,       xmm3
    psrldq      xmm4,       1

    movdqa      xmm6,       xmm5
    psrldq      xmm6,       1

    punpcklbw   xmm1,       xmm2
    pmaddubsw   xmm1,       xmm0

    punpcklbw   xmm3,       xmm4
    pmaddubsw   xmm3,       xmm0

    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm0

    movdqa      xmm2,       xmm7
    psrldq      xmm2,       1

    punpcklbw   xmm7,       xmm2
    pmaddubsw   xmm7,       xmm0

    paddw       xmm1,       [GLOBAL(rd)]
    psraw       xmm1,       VP8_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm3,       VP8_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm5,       VP8_FILTER_SHIFT

    paddw       xmm7,       [GLOBAL(rd)]
    psraw       xmm7,       VP8_FILTER_SHIFT

    packuswb    xmm1,       xmm1
    packuswb    xmm3,       xmm3

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm1

    packuswb    xmm7,       xmm7
    movq        [rdi+rdx],  xmm3

    lea         rdi,        [rdi + 2*rdx]
    movq        [rdi],      xmm5

    lea         rsp,        [rsp + 4*16]        ; consumed four buffered rows
    movq        [rdi+rdx],  xmm7

    lea         rdi,        [rdi + 2*rdx]
    cmp         rdi,        rcx

    jne         .next_row

    lea         rsp,        [rsp + 16]          ; skip the unused ninth row

done8x8:
    ;add rsp, 144
    pop         rsp                             ; rsp is now just past the 144-byte buffer
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pshufb masks for a raw 16-byte load taken at src - 2 (byte i = pixel i - 2).
; For output pixel x each mask selects the two pixels multiplied by one tap pair.
align 16
shuf1b:                                 ; pixels x-2 and x+3  (k0_k5)
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:                                 ; pixels x   and x+2  (k2_k4)
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:                                 ; pixels x-1 and x+1  (k1_k3)
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

; pshufb masks applied to the already interleaved vector
;   -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
; built by punpcklbw of the loads at src - 2 and src + 3.
align 16
shuf2bfrom1:                            ; pixels x   and x+2  (k2_k4)
    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:                            ; pixels x-1 and x+1  (k1_k3)
    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

; Rounding constant added before the arithmetic shift right by 7:
; eight words of 0x40. Referenced as GLOBAL(rd) by every routine in this file.
align 16
rd:
    times 8 dw 0x40

; Six-tap filter coefficients, stored as byte pairs replicated 8 times so one
; 16-byte entry feeds pmaddubsw directly. Entry n (16 bytes) belongs to filter
; index n; the three tables are 128 bytes apart, which is why the code
; addresses k1_k3 as [k0_k5 entry + 128] and k2_k4 as [k0_k5 entry + 256].
; An all-zero k0_k5 entry selects the 4-tap code paths.
align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6
; Bilinear filter pairs, one 16-byte entry per offset 0..7; each pair sums
; to 128.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112