2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void idct_dequant_0_2x_sse2
18 ; unsigned char *pre - 2
19 ; unsigned char *dst - 3
; Argument slots read by the visible code:
;   arg(0) qcoeff, arg(1) dequant, arg(4) dst_stride, arg(5) blk_stride
; NOTE(review): the embedded line numbers skip, so several instructions
; (prolog/epilog, register reloads, the arithmetic between the comments)
; are not visible in this listing.
24 global sym
(idct_dequant_0_2x_sse2
)
25 sym
(idct_dequant_0_2x_sse2
):
28 SHADOW_ARGS_TO_STACK
6
32 mov rdx
, arg
(1) ; dequant
33 mov rax
, arg
(0) ; qcoeff
; Insert the 16-bit word at qcoeff+32 into word lane 4 of xmm4
; (presumably the second block's first coefficient - TODO confirm).
38 pinsrw xmm4
, [rax
+32], 4
43 ; Zero out xmm5, for use unpacking
; Broadcast word 0 across the low four word lanes and word 4 across the
; high four word lanes of xmm4.
50 pshuflw xmm4
, xmm4
, 00000000b
51 pshufhw xmm4
, xmm4
, 00000000b
; Add the `fours` constant to every word lane (looks like a rounding bias;
; the shift that would follow is not visible here - TODO confirm).
54 paddw xmm4
, [GLOBAL(fours
)]
; blk_stride is read as a 32-bit value and sign-extended to 64 bits.
56 movsxd rcx
, dword ptr arg
(5) ; blk_stride
; 8-byte load from rax + 2*blk_stride.
; NOTE(review): rax last visibly holds qcoeff; presumably an unseen
; instruction re-points it at the predictor - confirm.
61 movq xmm2
, [rax
+2*rcx
]
; dst_stride is read as a 32-bit value and sign-extended to 64 bits;
; this overwrites the dequant pointer held in rdx.
71 movsxd rdx
, dword ptr arg
(4) ; dst_stride
73 ; Add to predict buffer
79 ; pack up before storing
85 ; store blocks back out
; The visible stores write 8 bytes at rax + dst_stride, then advance rax by
; 2*dst_stride and write 8 bytes at the new rax + dst_stride.
; NOTE(review): the instruction that puts dst into rax is not visible here.
87 movq
[rax
+ rdx
], xmm1
89 lea rax
, [rax
+ 2*rdx
]
92 movq
[rax
+ rdx
], xmm3
; idct_dequant_full_2x_sse2
; Argument slots read by the visible code:
;   arg(0) qcoeff, arg(1) dequant, arg(2) pre, arg(3) dst,
;   arg(4) dst_stride, arg(5) blk_stride
; The visible body dequantizes the coefficients, runs two butterfly passes
; with a transpose after each, then stores rows to dst.
; NOTE(review): the embedded line numbers skip, so several instructions
; (prolog/epilog, register copies, some loads/stores) are not visible in
; this listing.
100 global sym
(idct_dequant_full_2x_sse2
)
101 sym
(idct_dequant_full_2x_sse2
):
104 SHADOW_ARGS_TO_STACK
7
111 ; special case when 2 blocks have 0 or 1 coeffs
112 ; dc is set as first coeff, so no need to load qcoeff
; NOTE(review): the two lines above look copied from the DC-only variant;
; this routine does read qcoeff (see the loads from [rax+16..48] below).
113 mov rax
, arg
(0) ; qcoeff
114 mov rsi
, arg
(2) ; pre
115 mov rdi
, arg
(3) ; dst
; blk_stride is read as a 32-bit value and sign-extended to 64 bits.
116 movsxd rcx
, dword ptr arg
(5) ; blk_stride
118 ; Zero out xmm7, for use unpacking
121 mov rdx
, arg
(1) ; dequant
123 ; note the transpose of xmm1 and xmm2, necessary for shuffle
124 ; to spit out sensible data
; i.e. [rax+16] goes to xmm2 and [rax+32] goes to xmm1.
126 movdqa xmm2
, [rax
+16]
127 movdqa xmm1
, [rax
+32]
128 movdqa xmm3
, [rax
+48]
; Write xmm7 back over the coefficient buffer (xmm7 is zero per the comment
; above; the zeroing instruction itself is not visible here).
132 movdqa
[rax
+16], xmm7
133 movdqa
[rax
+32], xmm7
134 movdqa
[rax
+48], xmm7
136 ; dequantize qcoeff buffer
; Word-wise multiply by the dequant factors at [rdx+16].
138 pmullw xmm2
, [rdx
+16]
140 pmullw xmm3
, [rdx
+16]
142 ; repack so block 0 row x and block 1 row x are together
; pshufd with 11011000b keeps dwords 0 and 3 and swaps dwords 1 and 2.
147 pshufd xmm0
, xmm0
, 11011000b
148 pshufd xmm1
, xmm4
, 11011000b
154 pshufd xmm2
, xmm2
, 11011000b
155 pshufd xmm3
, xmm4
, 11011000b
; First butterfly pass.
158 psubw xmm0
, xmm2
; b1 = 0-2
162 paddw xmm2
, xmm0
; a1 = 0+2
; pmulhw keeps the high 16 bits of each signed 16x16-bit product.
164 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
165 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
168 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
170 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
171 psubw xmm7
, xmm5
; c1
176 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
179 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
182 paddw xmm3
, xmm5
; d1
183 movdqa xmm6
, xmm2
; a1
185 movdqa xmm4
, xmm0
; b1
193 ; transpose for the second pass
194 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
195 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
196 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
198 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
199 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
200 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
203 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
204 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
205 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
207 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
208 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
209 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
212 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
213 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
214 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
216 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
217 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
218 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
; Same dword reordering as before the first pass (swap dwords 1 and 2).
220 pshufd xmm0
, xmm2
, 11011000b
221 pshufd xmm2
, xmm1
, 11011000b
223 pshufd xmm1
, xmm5
, 11011000b
224 pshufd xmm3
, xmm7
, 11011000b
; Second butterfly pass.
227 psubw xmm0
, xmm2
; b1 = 0-2
231 paddw xmm2
, xmm0
; a1 = 0+2
233 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
234 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
237 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
239 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
240 psubw xmm7
, xmm5
; c1
245 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
248 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
251 paddw xmm3
, xmm5
; d1
; Add the `fours` constant to xmm0 and xmm2 (looks like a rounding bias
; ahead of a downshift that is not visible here - TODO confirm).
252 paddw xmm0
, [GLOBAL(fours
)]
254 paddw xmm2
, [GLOBAL(fours
)]
255 movdqa xmm6
, xmm2
; a1
257 movdqa xmm4
, xmm0
; b1
; Transpose again after the second pass.
272 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
273 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
274 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
276 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
277 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
278 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
281 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
282 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
283 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
285 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
286 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
287 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
290 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
291 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
292 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
294 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
295 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
296 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
298 pshufd xmm0
, xmm2
, 11011000b
299 pshufd xmm2
, xmm1
, 11011000b
301 pshufd xmm1
, xmm5
, 11011000b
302 pshufd xmm3
, xmm7
, 11011000b
306 ; Load up predict blocks
; 8-byte load from rsi + 2*rcx (rsi was loaded with pre and rcx with
; blk_stride at the top, unless changed by instructions not shown).
316 movq xmm4
, [rsi
+2*rcx
]
328 ; pack up before storing
334 ; Load destination stride before writing out,
335 ; doesn't need to persist
336 movsxd rdx
, dword ptr arg
(4) ; dst_stride
338 ; store blocks back out
; The visible stores write 8 bytes at rdi + dst_stride, then advance rdi by
; 2*dst_stride and write 8 bytes at the new rdi + dst_stride.
340 movq
[rdi
+ rdx
], xmm1
342 lea rdi
, [rdi
+ 2*rdx
]
345 movq
[rdi
+ rdx
], xmm3
356 ;void idct_dequant_dc_0_2x_sse2
360 ; unsigned char *pre - 2
361 ; unsigned char *dst - 3
; Argument slots read by the visible code:
;   arg(0) qcoeff, arg(2) pre, arg(3) dst, arg(4) dst_stride
; NOTE(review): the embedded line numbers skip, so the instructions that
; belong to most of the step comments below are not visible in this listing.
365 global sym
(idct_dequant_dc_0_2x_sse2
)
366 sym
(idct_dequant_dc_0_2x_sse2
):
369 SHADOW_ARGS_TO_STACK
7
375 ; special case when 2 blocks have 0 or 1 coeffs
376 ; dc is set as first coeff, so no need to load qcoeff
377 mov rax
, arg
(0) ; qcoeff
378 mov rsi
, arg
(2) ; pre
379 mov rdi
, arg
(3) ; dst
382 ; Zero out xmm5, for use unpacking
385 ; load up 2 dc words here == 2*16 = doubleword
388 ; Load up predict blocks
394 ; Duplicate and expand dc across
398 ; Rounding to dequant and downshift
; Adds the `fours` constant to every word lane of xmm4; the downshift
; itself is not visible here.
399 paddw xmm4
, [GLOBAL(fours
)]
402 ; Predict buffer needs to be expanded from bytes to words
408 ; Add to predict buffer
414 ; pack up before storing
420 ; Load destination stride before writing out,
421 ; doesn't need to persist
; dst_stride is read as a 32-bit value and sign-extended to 64 bits.
422 movsxd rdx
, dword ptr arg
(4) ; dst_stride
424 ; store blocks back out
; The visible stores write 8 bytes at rdi + dst_stride, then advance rdi by
; 2*dst_stride and write 8 bytes at the new rdi + dst_stride.
426 movq
[rdi
+ rdx
], xmm1
428 lea rdi
, [rdi
+ 2*rdx
]
431 movq
[rdi
+ rdx
], xmm3
; idct_dequant_dc_full_2x_sse2
; Argument slots read by the visible code:
;   arg(0) qcoeff, arg(1) dequant, arg(2) pre, arg(3) dst, arg(4) dst_stride
; The visible body dequantizes the coefficients, inserts two DC words into
; xmm0, runs two butterfly passes with a transpose after each, then stores
; rows to dst.
; NOTE(review): the embedded line numbers skip, so several instructions
; (prolog/epilog, register copies, some loads/stores) are not visible in
; this listing.
441 global sym
(idct_dequant_dc_full_2x_sse2
)
442 sym
(idct_dequant_dc_full_2x_sse2
):
445 SHADOW_ARGS_TO_STACK
7
452 ; special case when 2 blocks have 0 or 1 coeffs
453 ; dc is set as first coeff, so no need to load qcoeff
; NOTE(review): the two lines above look copied from the DC-only variant;
; this routine does read qcoeff (see the loads from [rax+16..48] below).
454 mov rax
, arg
(0) ; qcoeff
455 mov rsi
, arg
(2) ; pre
456 mov rdi
, arg
(3) ; dst
458 ; Zero out xmm7, for use unpacking
461 mov rdx
, arg
(1) ; dequant
463 ; note the transpose of xmm1 and xmm2, necessary for shuffle
464 ; to spit out sensible data
; i.e. [rax+16] goes to xmm2 and [rax+32] goes to xmm1.
466 movdqa xmm2
, [rax
+16]
467 movdqa xmm1
, [rax
+32]
468 movdqa xmm3
, [rax
+48]
; Write xmm7 back over the coefficient buffer (xmm7 is zero per the comment
; above; the zeroing instruction itself is not visible here).
472 movdqa
[rax
+16], xmm7
473 movdqa
[rax
+32], xmm7
474 movdqa
[rax
+48], xmm7
476 ; dequantize qcoeff buffer
; Word-wise multiply by the dequant factors at [rdx+16].
478 pmullw xmm2
, [rdx
+16]
480 pmullw xmm3
, [rdx
+16]
485 ; repack so block 0 row x and block 1 row x are together
; pshufd with 11011000b keeps dwords 0 and 3 and swaps dwords 1 and 2.
490 pshufd xmm0
, xmm0
, 11011000b
491 pshufd xmm1
, xmm4
, 11011000b
497 pshufd xmm2
, xmm2
, 11011000b
498 pshufd xmm3
, xmm4
, 11011000b
500 ; insert DC component
; Overwrites word lanes 0 and 4 of xmm0 with the 16-bit values at [rdx]
; and [rdx+2].
; NOTE(review): rdx last visibly holds dequant; presumably an unseen
; instruction points it at the dc values first - confirm.
501 pinsrw xmm0
, [rdx
], 0
502 pinsrw xmm0
, [rdx
+2], 4
; First butterfly pass.
505 psubw xmm0
, xmm2
; b1 = 0-2
509 paddw xmm2
, xmm0
; a1 = 0+2
; pmulhw keeps the high 16 bits of each signed 16x16-bit product.
511 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
512 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
515 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
517 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
518 psubw xmm7
, xmm5
; c1
523 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
526 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
529 paddw xmm3
, xmm5
; d1
530 movdqa xmm6
, xmm2
; a1
532 movdqa xmm4
, xmm0
; b1
540 ; transpose for the second pass
541 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
542 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
543 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
545 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
546 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
547 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
550 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
551 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
552 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
554 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
555 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
556 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
559 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
560 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
561 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
563 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
564 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
565 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
; Same dword reordering as before the first pass (swap dwords 1 and 2).
567 pshufd xmm0
, xmm2
, 11011000b
568 pshufd xmm2
, xmm1
, 11011000b
570 pshufd xmm1
, xmm5
, 11011000b
571 pshufd xmm3
, xmm7
, 11011000b
; Second butterfly pass.
574 psubw xmm0
, xmm2
; b1 = 0-2
578 paddw xmm2
, xmm0
; a1 = 0+2
580 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
581 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
584 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
586 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
587 psubw xmm7
, xmm5
; c1
592 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
595 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
598 paddw xmm3
, xmm5
; d1
; Add the `fours` constant to xmm0 and xmm2 (looks like a rounding bias
; ahead of a downshift that is not visible here - TODO confirm).
599 paddw xmm0
, [GLOBAL(fours
)]
601 paddw xmm2
, [GLOBAL(fours
)]
602 movdqa xmm6
, xmm2
; a1
604 movdqa xmm4
, xmm0
; b1
; Transpose again after the second pass.
619 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
620 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
621 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
623 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
624 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
625 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
628 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
629 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
630 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
632 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
633 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
634 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
637 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
638 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
639 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
641 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
642 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
643 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
645 pshufd xmm0
, xmm2
, 11011000b
646 pshufd xmm2
, xmm1
, 11011000b
648 pshufd xmm1
, xmm5
, 11011000b
649 pshufd xmm3
, xmm7
, 11011000b
653 ; Load up predict blocks
674 ; pack up before storing
680 ; Load destination stride before writing out,
681 ; doesn't need to persist
; dst_stride is read as a 32-bit value and sign-extended to 64 bits;
; this overwrites whatever pointer rdx held.
682 movsxd rdx
, dword ptr arg
(4) ; dst_stride
684 ; store blocks back out
; The visible stores write 8 bytes at rdi + dst_stride, then advance rdi by
; 2*dst_stride and write 8 bytes at the new rdi + dst_stride.
686 movq
[rdi
+ rdx
], xmm1
688 lea rdi
, [rdi
+ 2*rdx
]
691 movq
[rdi
+ rdx
], xmm3