; Merge "vp8_rd_pick_best_mbsegmentation code restructure"
; [libvpx.git] / vp8 / common / x86 / subpixel_sse2.asm
; blob b87cad2593e09b3c7a725580d193ad36e7690be6
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
%include "vpx_ports/x86_abi_support.asm"

; Constants shared by the sub-pixel filters in this file.
%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128           ; 1 << VP8_FILTER_SHIFT
%define VP8_FILTER_SHIFT  7             ; right shift applied after rounding
;/************************************************************************************
; void vp8_filter_block1d8_h6_sse2
; (
;     unsigned char  *src_ptr,
;     unsigned short *output_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned int    pixel_step,
;     unsigned int    output_height,
;     unsigned int    output_width,
;     short          *vp8_filter
; )
;
; Notes: applies a 6-tap filter horizontally and produces 8 output pixels per
; row, ONE row per loop iteration, using 128-bit operations.
;
;   * Each row reads 16 source bytes starting at src_ptr - 2.
;   * The result is rounded, shifted right by VP8_FILTER_SHIFT, clamped to
;     0..255 and stored as eight 16-bit words with an aligned 16-byte store.
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80
;     (one vector of eight words per tap).
;   * output_width is added to output_ptr after every row, i.e. it is used as
;     the destination stride in bytes.
;   * pixel_step is not read by this routine.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d8_h6_sse2)
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6)                  ; rdx = vp8_filter
    mov         rsi,        arg(0)                  ; rsi = src_ptr

    mov         rdi,        arg(1)                  ; rdi = output_ptr

    movsxd      rcx,        dword ptr arg(4)        ; rcx = output_height (row counter)
    movsxd      rax,        dword ptr arg(2)        ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)        ; r8  = output_width (destination stride)
%endif
    pxor        xmm0,       xmm0                    ; xmm0 = 0, used for byte->word unpack

filter_block1d8_h6_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]    ; pixels -2 .. 05
    movq        xmm1,       MMWORD PTR [rsi + 6]    ; pixels 06 .. 0d

    prefetcht2  [rsi+rax-2]                         ; hint: next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                    ; xmm1 = pixels -2 .. 0d (16 bytes)

    movdqa      xmm4,       xmm1
    movdqa      xmm5,       xmm1

    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm1

    punpcklbw   xmm3,       xmm0                    ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                       ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                       ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                    ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3                       ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4                       ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                    ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1,       5                       ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

    pmullw      xmm7,       [rdx+64]                ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1,       xmm0                    ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1,       [rdx+80]                ; x[ 3] * H[ 3]; Tap 6

    ; Saturating accumulation; the summation order (2 5 3 1 4 6) is kept
    ; exactly as written because paddsw saturation makes it order-sensitive.
    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm1
    paddsw      xmm4,       [GLOBAL(rd)]            ; add rounding term

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                    ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                    ; widen back to 16-bit words

    movdqa      XMMWORD Ptr [rdi],         xmm4
    lea         rsi,        [rsi + rax]             ; next source row

%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(5)        ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         filter_block1d8_h6_rowloop          ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_filter_block1d16_h6_sse2
; (
;     unsigned char  *src_ptr,
;     unsigned short *output_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned int    pixel_step,
;     unsigned int    output_height,
;     unsigned int    output_width,
;     short          *vp8_filter
; )
;
; Notes: applies a 6-tap filter horizontally and produces 16 output pixels per
; row (two groups of 8), ONE row per loop iteration.
;
;   * Each row reads 24 source bytes starting at src_ptr - 2.
;   * Results are rounded, shifted right by VP8_FILTER_SHIFT, clamped to
;     0..255 and stored as sixteen 16-bit words (two aligned 16-byte stores
;     at output_ptr and output_ptr + 16).
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80.
;   * output_width is added to output_ptr after every row (byte stride).
;   * pixel_step is not read by this routine.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_sse2)
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6)                  ; rdx = vp8_filter
    mov         rsi,        arg(0)                  ; rsi = src_ptr

    mov         rdi,        arg(1)                  ; rdi = output_ptr

    movsxd      rcx,        dword ptr arg(4)        ; rcx = output_height (row counter)
    movsxd      rax,        dword ptr arg(2)        ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)        ; r8  = output_width (destination stride)
%endif

    pxor        xmm0,       xmm0                    ; xmm0 = 0, used for byte->word unpack

filter_block1d16_h6_sse2_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]    ; pixels -2 .. 05
    movq        xmm1,       MMWORD PTR [rsi + 6]    ; pixels 06 .. 0d

    movq        xmm2,       MMWORD PTR [rsi +14]    ; pixels 0e .. 15
    pslldq      xmm2,       8

    por         xmm2,       xmm1                    ; xmm2 = pixels 06 .. 15 (second half input)
    prefetcht2  [rsi+rax-2]                         ; hint: next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                    ; xmm1 = pixels -2 .. 0d (first half input)

    movdqa      xmm4,       xmm1
    movdqa      xmm5,       xmm1

    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm1

    ; ---- output pixels 0..7 ----
    punpcklbw   xmm3,       xmm0                    ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                       ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                       ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                    ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3                       ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4                       ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                    ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1,       5                       ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

    pmullw      xmm7,       [rdx+64]                ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1,       xmm0                    ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1,       [rdx+80]                ; x[ 3] * H[ 3]; Tap 6

    ; Saturating accumulation; order (2 5 3 1 4 6) kept as written.
    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm1
    paddsw      xmm4,       [GLOBAL(rd)]            ; add rounding term

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                    ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                    ; widen back to 16-bit words

    movdqa      XMMWORD Ptr [rdi],         xmm4

    ; ---- output pixels 8..15: same computation on xmm2 (pixels 06 .. 15) ----
    movdqa      xmm3,       xmm2
    movdqa      xmm4,       xmm2

    movdqa      xmm5,       xmm2
    movdqa      xmm6,       xmm2

    movdqa      xmm7,       xmm2

    punpcklbw   xmm3,       xmm0                    ; pixels 06 .. 0d as words
    psrldq      xmm4,       1

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; pixels 07 .. 0e as words

    psrldq      xmm5,       2
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                    ; pixels 08 .. 0f as words
    psrldq      xmm6,       3

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; pixels 09 .. 10 as words
    psrldq      xmm7,       4

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                    ; pixels 0a .. 11 as words
    psrldq      xmm2,       5

    pmullw      xmm7,       [rdx+64]                ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm2,       xmm0                    ; pixels 0b .. 12 as words
    pmullw      xmm2,       [rdx+80]                ; x[ 3] * H[ 3]; Tap 6

    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm2
    paddsw      xmm4,       [GLOBAL(rd)]            ; add rounding term

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                    ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                    ; widen back to 16-bit words

    movdqa      XMMWORD Ptr [rdi+16],      xmm4

    lea         rsi,        [rsi + rax]             ; next source row
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(5)        ;[output_width]
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         filter_block1d16_h6_sse2_rowloop    ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_filter_block1d8_v6_sse2
; (
;     short         *src_ptr,
;     unsigned char *output_ptr,
;     int            dst_pitch,
;     unsigned int   pixels_per_line,
;     unsigned int   pixel_step,
;     unsigned int   output_height,
;     unsigned int   output_width,
;     short         *vp8_filter
; )
;
; Notes: applies a 6-tap filter vertically to rows of eight 16-bit samples and
; writes 8 bytes per output row.
;
;   * The first tap row is src_ptr - 2 * pixels_per_line; pixels_per_line is
;     used unscaled as the byte distance between source rows.
;   * Source rows are read with aligned 16-byte loads.
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80.
;   * dst_pitch is added to output_ptr after every row.
;   * pixel_step and output_width are not read by this routine.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_sse2)
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(7)                  ; rax = vp8_filter
    movsxd      rdx,        dword ptr arg(3)        ; rdx = pixels_per_line (row stride, bytes)

    mov         rdi,        arg(1)                  ; rdi = output_ptr
    mov         rsi,        arg(0)                  ; rsi = src_ptr

    sub         rsi,        rdx
    sub         rsi,        rdx                     ; rsi -> first tap row (2 rows up)

    movsxd      rcx,        DWORD PTR arg(5)        ; rcx = output_height (row counter)
    pxor        xmm0,       xmm0                    ; clear xmm0

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)] ; rounding term, loop-invariant
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2)        ; r8 = dst_pitch
%endif

vp8_filter_block1d8_v6_sse2_loop:
    movdqa      xmm1,       XMMWORD PTR [rsi]               ; row 1
    pmullw      xmm1,       [rax]

    movdqa      xmm2,       XMMWORD PTR [rsi + rdx]         ; row 2
    pmullw      xmm2,       [rax + 16]

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]     ; row 3
    pmullw      xmm3,       [rax + 32]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]     ; row 5
    pmullw      xmm5,       [rax + 64]

    add         rsi,        rdx                     ; advance one row (also the per-iteration step)
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]     ; row 4

    pmullw      xmm4,       [rax + 48]
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]     ; row 6

    pmullw      xmm6,       [rax + 80]

    ; Saturating accumulation; order (2 5 3 1 4 6) kept as written.
    paddsw      xmm2,       xmm5
    paddsw      xmm2,       xmm3

    paddsw      xmm2,       xmm1
    paddsw      xmm2,       xmm4

    paddsw      xmm2,       xmm6
    paddsw      xmm2,       xmm7                    ; add rounding term

    psraw       xmm2,       VP8_FILTER_SHIFT
    packuswb    xmm2,       xmm0                    ; pack and saturate

    movq        QWORD PTR [rdi],    xmm2            ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2)        ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                                 ; decrement count
    jnz         vp8_filter_block1d8_v6_sse2_loop    ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_filter_block1d16_v6_sse2
; (
;     unsigned short *src_ptr,
;     unsigned char  *output_ptr,
;     int             dst_pitch,
;     unsigned int    pixels_per_line,
;     unsigned int    pixel_step,
;     unsigned int    output_height,
;     unsigned int    output_width,
;     const short    *vp8_filter
; )
;
; Notes: applies a 6-tap filter vertically to rows of sixteen 16-bit samples
; and writes 16 bytes per output row with an aligned store.
;
;   * The first tap row is src_ptr - 2 * pixels_per_line; pixels_per_line is
;     used unscaled as the byte distance between source rows.
;   * Each source row is read as two aligned 16-byte loads (+0 and +16).
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80.
;   * dst_pitch is added to output_ptr after every row.
;   * pixel_step and output_width are not read by this routine.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2)
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(7)                  ; rax = vp8_filter
    movsxd      rdx,        dword ptr arg(3)        ; rdx = pixels_per_line (row stride, bytes)

    mov         rdi,        arg(1)                  ; rdi = output_ptr
    mov         rsi,        arg(0)                  ; rsi = src_ptr

    sub         rsi,        rdx
    sub         rsi,        rdx                     ; rsi -> first tap row (2 rows up)

    movsxd      rcx,        DWORD PTR arg(5)        ; rcx = output_height (row counter)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2)        ; r8 = dst_pitch
%endif

vp8_filter_block1d16_v6_sse2_loop:
    ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
    movdqa      xmm1,       XMMWORD PTR [rsi + rdx]         ; line 2
    movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
    pmullw      xmm1,       [rax + 16]
    pmullw      xmm2,       [rax + 16]

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]     ; line 5
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm3,       [rax + 64]
    pmullw      xmm4,       [rax + 64]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]     ; line 3
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm5,       [rax + 32]
    pmullw      xmm6,       [rax + 32]

    movdqa      xmm7,       XMMWORD PTR [rsi]               ; line 1
    movdqa      xmm0,       XMMWORD PTR [rsi + 16]
    pmullw      xmm7,       [rax]
    pmullw      xmm0,       [rax]

    paddsw      xmm1,       xmm3
    paddsw      xmm2,       xmm4
    paddsw      xmm1,       xmm5
    paddsw      xmm2,       xmm6
    paddsw      xmm1,       xmm7
    paddsw      xmm2,       xmm0

    add         rsi,        rdx                     ; advance one row (also the per-iteration step)

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]     ; line 4
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm3,       [rax + 48]
    pmullw      xmm4,       [rax + 48]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]     ; line 6
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm5,       [rax + 80]
    pmullw      xmm6,       [rax + 80]

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]        ; rounding term
    pxor        xmm0,       xmm0                    ; clear xmm0

    paddsw      xmm1,       xmm3
    paddsw      xmm2,       xmm4
    paddsw      xmm1,       xmm5
    paddsw      xmm2,       xmm6

    paddsw      xmm1,       xmm7
    paddsw      xmm2,       xmm7

    psraw       xmm1,       VP8_FILTER_SHIFT
    psraw       xmm2,       VP8_FILTER_SHIFT

    packuswb    xmm1,       xmm2                    ; pack and saturate
    movdqa      XMMWORD PTR [rdi],  xmm1            ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2)        ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                                 ; decrement count
    jnz         vp8_filter_block1d16_v6_sse2_loop   ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_filter_block1d8_h6_only_sse2
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     int            dst_pitch,
;     unsigned int   output_height,
;     const short   *vp8_filter
; )
;
; First-pass filter only when yoffset==0.
;
; Applies the 6-tap horizontal filter and writes the final 8-bit result
; directly: 8 bytes per row at output_ptr, advancing by dst_pitch.
;
;   * Each row reads 16 source bytes starting at src_ptr - 2.
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d8_h6_only_sse2)
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(5)                  ; rdx = vp8_filter
    mov         rsi,        arg(0)                  ; rsi = src_ptr

    mov         rdi,        arg(2)                  ; rdi = output_ptr

    movsxd      rcx,        dword ptr arg(4)        ; rcx = output_height (row counter)
    movsxd      rax,        dword ptr arg(1)        ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3)        ; r8  = dst_pitch
%endif
    pxor        xmm0,       xmm0                    ; xmm0 = 0, used for byte->word unpack

filter_block1d8_h6_only_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]    ; pixels -2 .. 05
    movq        xmm1,       MMWORD PTR [rsi + 6]    ; pixels 06 .. 0d

    prefetcht2  [rsi+rax-2]                         ; hint: next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                    ; xmm1 = pixels -2 .. 0d (16 bytes)

    movdqa      xmm4,       xmm1
    movdqa      xmm5,       xmm1

    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm1

    punpcklbw   xmm3,       xmm0                    ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                       ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                       ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                    ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3                       ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4                       ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                    ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1,       5                       ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

    pmullw      xmm7,       [rdx+64]                ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1,       xmm0                    ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1,       [rdx+80]                ; x[ 3] * H[ 3]; Tap 6

    ; Saturating accumulation; order (2 5 3 1 4 6) kept as written.
    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm1
    paddsw      xmm4,       [GLOBAL(rd)]            ; add rounding term

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                    ; clamp to 0..255

    movq        QWORD PTR [rdi],    xmm4            ; store the results in the destination
    lea         rsi,        [rsi + rax]             ; next source row

%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(3)        ;dst_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         filter_block1d8_h6_only_rowloop     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_filter_block1d16_h6_only_sse2
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     int            dst_pitch,
;     unsigned int   output_height,
;     const short   *vp8_filter
; )
;
; First-pass filter only when yoffset==0.
;
; Applies the 6-tap horizontal filter and writes the final 8-bit result
; directly: 16 bytes per row (two 8-byte stores at output_ptr and
; output_ptr + 8), advancing by dst_pitch.
;
;   * Each row reads 24 source bytes starting at src_ptr - 2.
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_only_sse2)
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(5)                  ; rdx = vp8_filter
    mov         rsi,        arg(0)                  ; rsi = src_ptr

    mov         rdi,        arg(2)                  ; rdi = output_ptr

    movsxd      rcx,        dword ptr arg(4)        ; rcx = output_height (row counter)
    movsxd      rax,        dword ptr arg(1)        ; rax = src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3)        ; r8  = dst_pitch
%endif

    pxor        xmm0,       xmm0                    ; xmm0 = 0, used for byte->word unpack

filter_block1d16_h6_only_sse2_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2]    ; pixels -2 .. 05
    movq        xmm1,       MMWORD PTR [rsi + 6]    ; pixels 06 .. 0d

    movq        xmm2,       MMWORD PTR [rsi +14]    ; pixels 0e .. 15
    pslldq      xmm2,       8

    por         xmm2,       xmm1                    ; xmm2 = pixels 06 .. 15 (second half input)
    prefetcht2  [rsi+rax-2]                         ; hint: next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                    ; xmm1 = pixels -2 .. 0d (first half input)

    movdqa      xmm4,       xmm1
    movdqa      xmm5,       xmm1

    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm1

    ; ---- output pixels 0..7 ----
    punpcklbw   xmm3,       xmm0                    ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                       ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                       ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                    ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3                       ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4                       ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                    ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      xmm1,       5                       ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

    pmullw      xmm7,       [rdx+64]                ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm1,       xmm0                    ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      xmm1,       [rdx+80]                ; x[ 3] * H[ 3]; Tap 6

    ; Saturating accumulation; order (2 5 3 1 4 6) kept as written.
    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm1
    paddsw      xmm4,       [GLOBAL(rd)]            ; add rounding term

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                    ; lower 8 bytes

    movq        QWORD Ptr [rdi],    xmm4            ; store the results in the destination

    ; ---- output pixels 8..15: same computation on xmm2 (pixels 06 .. 15) ----
    movdqa      xmm3,       xmm2
    movdqa      xmm4,       xmm2

    movdqa      xmm5,       xmm2
    movdqa      xmm6,       xmm2

    movdqa      xmm7,       xmm2

    punpcklbw   xmm3,       xmm0                    ; pixels 06 .. 0d as words
    psrldq      xmm4,       1

    pmullw      xmm3,       XMMWORD PTR [rdx]       ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                    ; pixels 07 .. 0e as words

    psrldq      xmm5,       2
    pmullw      xmm4,       XMMWORD PTR [rdx+16]    ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                    ; pixels 08 .. 0f as words
    psrldq      xmm6,       3

    pmullw      xmm5,       [rdx+32]                ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                    ; pixels 09 .. 10 as words
    psrldq      xmm7,       4

    pmullw      xmm6,       [rdx+48]                ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                    ; pixels 0a .. 11 as words
    psrldq      xmm2,       5

    pmullw      xmm7,       [rdx+64]                ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   xmm2,       xmm0                    ; pixels 0b .. 12 as words
    pmullw      xmm2,       [rdx+80]                ; x[ 3] * H[ 3]; Tap 6

    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       xmm2
    paddsw      xmm4,       [GLOBAL(rd)]            ; add rounding term

    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm0                    ; higher 8 bytes

    movq        QWORD Ptr [rdi+8],  xmm4            ; store the results in the destination

    lea         rsi,        [rsi + rax]             ; next source row
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(3)        ;dst_pitch
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         filter_block1d16_h6_only_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_filter_block1d8_v6_only_sse2
; (
;     unsigned char *src_ptr,
;     unsigned int   src_pixels_per_line,
;     unsigned char *output_ptr,
;     int            dst_pitch,
;     unsigned int   output_height,
;     const short   *vp8_filter
; )
;
; Second-pass filter only when xoffset==0.
;
; Applies the 6-tap filter vertically to 8-bit source rows and writes 8 bytes
; per output row, advancing output_ptr by dst_pitch.
;
;   * The six tap rows are src_ptr + k * src_pixels_per_line for k = 0..5;
;     no backwards adjustment of src_ptr is made here.
;     NOTE(review): presumably the caller passes a pointer two rows above the
;     first output row - confirm against the caller.
;   * vp8_filter is read as six 16-byte vectors at byte offsets 0, 16, ..., 80.
;   * The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_only_sse2)
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                  ; rsi = src_ptr
    mov         rdi,        arg(2)                  ; rdi = output_ptr

    movsxd      rcx,        dword ptr arg(4)        ; rcx = output_height (row counter)
    movsxd      rdx,        dword ptr arg(1)        ; rdx = src_pixels_per_line (source pitch)

    mov         rax,        arg(5)                  ; rax = vp8_filter

    pxor        xmm0,       xmm0                    ; clear xmm0

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)] ; rounding term, loop-invariant
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3)        ; r8 = dst_pitch
%endif

vp8_filter_block1d8_v6_only_sse2_loop:
    movq        xmm1,       MMWORD PTR [rsi]                ; row 1
    movq        xmm2,       MMWORD PTR [rsi + rdx]          ; row 2
    movq        xmm3,       MMWORD PTR [rsi + rdx * 2]      ; row 3
    movq        xmm5,       MMWORD PTR [rsi + rdx * 4]      ; row 5
    add         rsi,        rdx                     ; advance one row (also the per-iteration step)
    movq        xmm4,       MMWORD PTR [rsi + rdx * 2]      ; row 4
    movq        xmm6,       MMWORD PTR [rsi + rdx * 4]      ; row 6

    punpcklbw   xmm1,       xmm0
    pmullw      xmm1,       [rax]

    punpcklbw   xmm2,       xmm0
    pmullw      xmm2,       [rax + 16]

    punpcklbw   xmm3,       xmm0
    pmullw      xmm3,       [rax + 32]

    punpcklbw   xmm5,       xmm0
    pmullw      xmm5,       [rax + 64]

    punpcklbw   xmm4,       xmm0
    pmullw      xmm4,       [rax + 48]

    punpcklbw   xmm6,       xmm0
    pmullw      xmm6,       [rax + 80]

    ; Saturating accumulation; order (2 5 3 1 4 6) kept as written.
    paddsw      xmm2,       xmm5
    paddsw      xmm2,       xmm3

    paddsw      xmm2,       xmm1
    paddsw      xmm2,       xmm4

    paddsw      xmm2,       xmm6
    paddsw      xmm2,       xmm7                    ; add rounding term

    psraw       xmm2,       VP8_FILTER_SHIFT
    packuswb    xmm2,       xmm0                    ; pack and saturate

    movq        QWORD PTR [rdi],    xmm2            ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3)        ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                                 ; decrement count
    jnz         vp8_filter_block1d8_v6_only_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_unpack_block1d16_h6_sse2
; (
;     unsigned char  *src_ptr,
;     unsigned short *output_ptr,
;     unsigned int    src_pixels_per_line,
;     unsigned int    output_height,
;     unsigned int    output_width
; )
;
; Widens 16 source bytes per row to sixteen zero-extended 16-bit words, with
; no filtering. Each row is written as two aligned 16-byte stores at
; output_ptr and output_ptr + 16; output_width is added to output_ptr after
; every row (byte stride).
;
; Only xmm0, xmm1 and xmm3 are used, so SAVE_XMM / RESTORE_XMM are omitted.
; The row loop is bottom-tested (dec/jnz), so output_height must be >= 1.
;*************************************************************************************/
global sym(vp8_unpack_block1d16_h6_sse2)
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ;SAVE_XMM                                       ;xmm6, xmm7 are not used here.
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)                  ; rsi = src_ptr
    mov         rdi,        arg(1)                  ; rdi = output_ptr

    movsxd      rcx,        dword ptr arg(3)        ; rcx = output_height (row counter)
    movsxd      rax,        dword ptr arg(2)        ; rax = src_pixels_per_line (source pitch)

    pxor        xmm0,       xmm0                    ; xmm0 = 0, used for byte->word unpack
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(4)        ; r8 = output_width (destination stride)
%endif

unpack_block1d16_h6_sse2_rowloop:
    movq        xmm1,       MMWORD PTR [rsi]        ; source bytes 0 .. 7
    movq        xmm3,       MMWORD PTR [rsi+8]      ; source bytes 8 .. 15

    punpcklbw   xmm3,       xmm0                    ; bytes 8 .. 15 as words
    punpcklbw   xmm1,       xmm0                    ; bytes 0 .. 7 as words

    movdqa      XMMWORD Ptr [rdi],         xmm1
    movdqa      XMMWORD Ptr [rdi + 16],    xmm3

    lea         rsi,        [rsi + rax]             ; next source row
%if ABI_IS_32BIT
    add         rdi,        DWORD Ptr arg(4)        ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         unpack_block1d16_h6_sse2_rowloop    ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    ;RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_bilinear_predict16x16_sse2
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-tap (bilinear) prediction of a 16x16 block.
;
;   * Filter coefficients come from vp8_bilinear_filters_mmx; each offset
;     selects a 32-byte entry (offset << 5) holding two 16-byte vectors, one
;     per tap.
;   * Three code paths:
;       - xoffset == 0           : vertical pass only   (b16x16_sp_only)
;       - xoffset != 0, yoffset == 0 : horizontal pass only (b16x16_fp_only)
;       - otherwise              : horizontal pass, then vertical pass on the
;                                  packed 8-bit horizontal result.
;   * Source rows are read unaligned (movdqu); destination rows are written
;     with aligned 16-byte stores (movdqa), so dst_ptr and dst_pitch must keep
;     every row 16-byte aligned.
;   * All loops end when rdi reaches dst_ptr + 16 * dst_pitch.
;*************************************************************************************/
extern sym(vp8_bilinear_filters_mmx)
global sym(vp8_bilinear_predict16x16_sse2)
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset]
    ;const short *VFilter = bilinear_filters_mmx[yoffset]

    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    movsxd      rax,        dword ptr arg(2)        ;xoffset

    test        rax,        rax                     ;skip first_pass filter if xoffset=0
    je          b16x16_sp_only

    shl         rax,        5                       ; 32 bytes per filter entry
    add         rax,        rcx                     ;HFilter

    mov         rdi,        arg(4)                  ;dst_ptr
    mov         rsi,        arg(0)                  ;src_ptr
    movsxd      rdx,        dword ptr arg(5)        ;dst_pitch

    movdqa      xmm1,       [rax]                   ; xmm1 = horizontal tap 0
    movdqa      xmm2,       [rax+16]                ; xmm2 = horizontal tap 1

    movsxd      rax,        dword ptr arg(3)        ;yoffset

    test        rax,        rax                     ;skip second_pass filter if yoffset=0
    je          b16x16_fp_only

    shl         rax,        5
    add         rax,        rcx                     ;VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]             ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rdx,        dword ptr arg(1)        ;src_pixels_per_line

    pxor        xmm0,       xmm0

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)        ;dst_pitch
%endif
    ; get the first horizontal line done
    movdqu      xmm3,       [rsi]                   ; source pixels 00 .. 15
    movdqa      xmm4,       xmm3                    ; make a copy of current line

    punpcklbw   xmm3,       xmm0                    ; pixels 00 .. 07 as words
    punpckhbw   xmm4,       xmm0                    ; pixels 08 .. 15 as words

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm1

    movdqu      xmm5,       [rsi+1]                 ; source pixels 01 .. 16
    movdqa      xmm6,       xmm5

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       xmm2
    pmullw      xmm6,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                    ; xmm7 = previous row, horizontally filtered

    add         rsi,        rdx                     ; next line
next_row:
    movdqu      xmm3,       [rsi]                   ; source pixels 00 .. 15
    movdqa      xmm4,       xmm3                    ; make a copy of current line

    punpcklbw   xmm3,       xmm0                    ; pixels 00 .. 07 as words
    punpckhbw   xmm4,       xmm0                    ; pixels 08 .. 15 as words

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm1

    movdqu      xmm5,       [rsi+1]                 ; source pixels 01 .. 16
    movdqa      xmm6,       xmm5

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       xmm2
    pmullw      xmm6,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    movdqa      xmm5,       xmm7                    ; unpack previous filtered row
    movdqa      xmm6,       xmm7

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       [rax]                   ; previous row * vertical tap 0
    pmullw      xmm6,       [rax]

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                    ; keep current filtered row for next iteration

    pmullw      xmm3,       [rax+16]                ; current row * vertical tap 1
    pmullw      xmm4,       [rax+16]

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                    ; store the results in the destination

    add         rsi,        rdx                     ; next line
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5)        ;dst_pitch
%else
    add         rdi,        r8
%endif

    cmp         rdi,        rcx
    jne         next_row

    jmp         done

b16x16_sp_only:
    ; xoffset == 0: vertical (second-pass) filter only.
    movsxd      rax,        dword ptr arg(3)        ;yoffset
    shl         rax,        5
    add         rax,        rcx                     ;VFilter

    mov         rdi,        arg(4)                  ;dst_ptr
    mov         rsi,        arg(0)                  ;src_ptr
    movsxd      rdx,        dword ptr arg(5)        ;dst_pitch

    movdqa      xmm1,       [rax]                   ; xmm1 = vertical tap 0
    movdqa      xmm2,       [rax+16]                ; xmm2 = vertical tap 1

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]             ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax,        dword ptr arg(1)        ;src_pixels_per_line

    pxor        xmm0,       xmm0

    ; load the first source line
    movdqu      xmm7,       [rsi]                   ; source pixels 00 .. 15

    add         rsi,        rax                     ; next line
next_row_spo:
    movdqu      xmm3,       [rsi]                   ; source pixels 00 .. 15

    movdqa      xmm5,       xmm7                    ; previous line
    movdqa      xmm6,       xmm7

    movdqa      xmm4,       xmm3                    ; make a copy of current line
    movdqa      xmm7,       xmm3                    ; current line becomes "previous" next time

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0
    punpcklbw   xmm3,       xmm0                    ; pixels 00 .. 07 as words
    punpckhbw   xmm4,       xmm0                    ; pixels 08 .. 15 as words

    pmullw      xmm5,       xmm1
    pmullw      xmm6,       xmm1
    pmullw      xmm3,       xmm2
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                    ; store the results in the destination

    add         rsi,        rax                     ; next line
    add         rdi,        rdx                     ;dst_pitch
    cmp         rdi,        rcx
    jne         next_row_spo

    jmp         done

b16x16_fp_only:
    ; yoffset == 0: horizontal (first-pass) filter only.
    ; rsi, rdi, rdx (= dst_pitch), xmm1 and xmm2 were set up before the branch.
    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]             ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax,        dword ptr arg(1)        ;src_pixels_per_line
    pxor        xmm0,       xmm0

next_row_fpo:
    movdqu      xmm3,       [rsi]                   ; source pixels 00 .. 15
    movdqa      xmm4,       xmm3                    ; make a copy of current line

    punpcklbw   xmm3,       xmm0                    ; pixels 00 .. 07 as words
    punpckhbw   xmm4,       xmm0                    ; pixels 08 .. 15 as words

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm1

    movdqu      xmm5,       [rsi+1]                 ; source pixels 01 .. 16
    movdqa      xmm6,       xmm5

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       xmm2
    pmullw      xmm6,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                    ; store the results in the destination

    add         rsi,        rax                     ; next line
    add         rdi,        rdx                     ; dst_pitch
    cmp         rdi,        rcx
    jne         next_row_fpo

done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;/************************************************************************************
; void vp8_bilinear_predict8x8_sse2
; (
;     unsigned char *src_ptr,
;     int            src_pixels_per_line,
;     int            xoffset,
;     int            yoffset,
;     unsigned char *dst_ptr,
;     int            dst_pitch
; )
;
; Two-tap (bilinear) prediction of an 8x8 block: horizontal pass followed by a
; vertical pass, both always applied.
;
;   * Filter coefficients come from vp8_bilinear_filters_mmx; each offset
;     selects a 32-byte entry (offset << 5) holding two 16-byte vectors.
;   * Nine source rows are first copied (16 bytes each, unaligned loads) into
;     a 144-byte aligned scratch area on the stack and then consumed with
;     aligned loads.
;   * rsp itself is used as the read cursor over that scratch area: it is
;     advanced by 16 once before the loop and once per loop iteration
;     (8 iterations), i.e. 9 * 16 = 144 bytes in total, so the scratch area
;     is fully released when the loop ends and no separate "add rsp, 144"
;     is needed.
;   * Output is 8 bytes per row, 8 rows (loop ends at dst_ptr + 8 * dst_pitch).
;*************************************************************************************/
extern sym(vp8_bilinear_filters_mmx)
global sym(vp8_bilinear_predict8x8_sse2)
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                            ; reserve 144 bytes (9 rows x 16 bytes)

    ;const short *HFilter = bilinear_filters_mmx[xoffset]
    ;const short *VFilter = bilinear_filters_mmx[yoffset]
    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]

    mov         rsi,        arg(0)                  ;src_ptr
    movsxd      rdx,        dword ptr arg(1)        ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0,       [rsi]                   ; row 0
    lea         rax,        [rdx + rdx*2]           ; rax = 3 * src_pixels_per_line
    movdqu      xmm1,       [rsi+rdx]               ; row 1
    movdqu      xmm2,       [rsi+rdx*2]             ; row 2
    add         rsi,        rax
    movdqu      xmm3,       [rsi]                   ; row 3
    movdqu      xmm4,       [rsi+rdx]               ; row 4
    movdqu      xmm5,       [rsi+rdx*2]             ; row 5
    add         rsi,        rax
    movdqu      xmm6,       [rsi]                   ; row 6
    movdqu      xmm7,       [rsi+rdx]               ; row 7

    movdqa      XMMWORD PTR [rsp],      xmm0

    movdqu      xmm0,       [rsi+rdx*2]             ; row 8 (xmm0 reused after its store)

    movdqa      XMMWORD PTR [rsp+16],   xmm1
    movdqa      XMMWORD PTR [rsp+32],   xmm2
    movdqa      XMMWORD PTR [rsp+48],   xmm3
    movdqa      XMMWORD PTR [rsp+64],   xmm4
    movdqa      XMMWORD PTR [rsp+80],   xmm5
    movdqa      XMMWORD PTR [rsp+96],   xmm6
    movdqa      XMMWORD PTR [rsp+112],  xmm7
    movdqa      XMMWORD PTR [rsp+128],  xmm0

    movsxd      rax,        dword ptr arg(2)        ;xoffset
    shl         rax,        5                       ; 32 bytes per filter entry
    add         rax,        rcx                     ;HFilter

    mov         rdi,        arg(4)                  ;dst_ptr
    movsxd      rdx,        dword ptr arg(5)        ;dst_pitch

    movdqa      xmm1,       [rax]                   ; xmm1 = horizontal tap 0
    movdqa      xmm2,       [rax+16]                ; xmm2 = horizontal tap 1

    movsxd      rax,        dword ptr arg(3)        ;yoffset
    shl         rax,        5
    add         rax,        rcx                     ;VFilter

    lea         rcx,        [rdi+rdx*8]             ; rcx = dst_ptr + 8 * dst_pitch (end marker)

    movdqa      xmm5,       [rax]                   ; xmm5 = vertical tap 0
    movdqa      xmm6,       [rax+16]                ; xmm6 = vertical tap 1

    pxor        xmm0,       xmm0

    ; get the first horizontal line done
    movdqa      xmm3,       XMMWORD PTR [rsp]
    movdqa      xmm4,       xmm3                    ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                    ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                    ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    movdqa      xmm7,       xmm3                    ; xmm7 = previous row, horizontally filtered
    add         rsp,        16                      ; next line
next_row8x8:
    movdqa      xmm3,       XMMWORD PTR [rsp]       ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm4,       xmm3                    ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                    ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                    ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4
    pmullw      xmm7,       xmm5                    ; previous row * vertical tap 0

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    movdqa      xmm4,       xmm3                    ; save current filtered row

    pmullw      xmm3,       xmm6                    ; current row * vertical tap 1
    paddw       xmm3,       xmm7

    movdqa      xmm7,       xmm4                    ; current row becomes "previous" next time

    paddw       xmm3,       [GLOBAL(rd)]            ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

    packuswb    xmm3,       xmm0
    movq        [rdi],      xmm3                    ; store the results in the destination

    add         rsp,        16                      ; next line
    add         rdi,        rdx

    cmp         rdi,        rcx
    jne         next_row8x8

    ;add rsp, 144                                   ; not needed: the loop already advanced rsp by 144
    pop         rsp                                 ; restore rsp saved by ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
SECTION_RODATA
align 16
; Rounding term added before the VP8_FILTER_SHIFT right shift:
; eight 16-bit words of 0x40 (= 1 << (VP8_FILTER_SHIFT - 1)).
; Referenced by every filter in this file as [GLOBAL(rd)].
rd:
    times 8 dw 0x40