2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void idct_dequant_0_2x_sse2
18 ; unsigned char *pre - 2
19 ; unsigned char *dst - 3
;
; Argument slots read by the code visible below (index = arg() slot):
;   0 = qcoeff, 1 = dequant, 4 = dst_stride (int), 5 = blk_stride (int).
; NOTE(review): the prologue, the epilogue/ret and several instructions are
; not visible in this listing; the comments below describe only what is shown.
24 global sym
(idct_dequant_0_2x_sse2
)
25 sym
(idct_dequant_0_2x_sse2
):
28 SHADOW_ARGS_TO_STACK
6
; rdx = dequant pointer, rax = qcoeff pointer
32 mov rdx
, arg
(1) ; dequant
33 mov rax
, arg
(0) ; qcoeff
35 ; Zero out xmm7, for use unpacking
; Put the word found 32 bytes into qcoeff in word lane 4 of xmm4.
; (presumably the DC of the second block, 16 coefficients on -- TODO confirm)
41 pinsrw xmm4
, [rax
+32], 4
; Broadcast word 0 across the low four words and word 4 across the high
; four words of xmm4 (shuffle selector 0 picks the first word of each half).
50 pshuflw xmm4
, xmm4
, 00000000b
51 pshufhw xmm4
, xmm4
, 00000000b
; Add the `fours` constant to every word.
; (presumably rounding ahead of a shift that is not visible here)
54 paddw xmm4
, [GLOBAL(fours
)]
; rcx = blk_stride, sign-extended from 32 bits
56 movsxd rcx
, dword ptr arg
(5) ; blk_stride
; Load 8 bytes from rax + 2*blk_stride.
; NOTE(review): rax was last set to qcoeff in the visible lines; it is
; presumably repointed at the predict buffer in lines not shown -- confirm.
61 movq xmm2
, [rax
+2*rcx
]
; rdx = dst_stride, sign-extended (the dequant pointer in rdx is overwritten)
71 movsxd rdx
, dword ptr arg
(4) ; dst_stride
73 ; Add to predict buffer
79 ; pack up before storing
85 ; store blocks back out
; The two stores below write the low 8 bytes of xmm1 to [rax + dst_stride],
; then, after rax advances by 2*dst_stride, the low 8 bytes of xmm3 to the
; address three strides past the rax value in effect before the lea.
; NOTE(review): rax presumably holds dst here; the load is not visible.
87 movq
[rax
+ rdx
], xmm1
89 lea rax
, [rax
+ 2*rdx
]
92 movq
[rax
+ rdx
], xmm3
; idct_dequant_full_2x_sse2
;
; Argument slots read by the code visible below (index = arg() slot):
;   0 = qcoeff, 1 = dequant, 2 = pre, 3 = dst,
;   4 = dst_stride (int), 5 = blk_stride (int).
; Register roles visible below: rax = qcoeff, rsi = pre, rdi = dst,
; rcx = blk_stride, rdx = dequant (later reused for dst_stride).
; NOTE(review): the prologue, the epilogue/ret and a number of instructions
; are not visible in this listing; comments describe only what is shown.
100 global sym
(idct_dequant_full_2x_sse2
)
101 sym
(idct_dequant_full_2x_sse2
):
104 SHADOW_ARGS_TO_STACK
7
110 ; Load the argument pointers and the block stride. (This routine does
111 ; read qcoeff: see the loads from rax+16/+32/+48 below.)
112 mov rax
, arg
(0) ; qcoeff
113 mov rsi
, arg
(2) ; pre
114 mov rdi
, arg
(3) ; dst
115 movsxd rcx
, dword ptr arg
(5) ; blk_stride
117 ; Zero out xmm7, for use unpacking
120 mov rdx
, arg
(1) ; dequant
122 ; note the transpose of xmm1 and xmm2, necessary for shuffle
123 ; to spit out sensible data
; xmm2 <- qcoeff+16, xmm1 <- qcoeff+32, xmm3 <- qcoeff+48 (aligned loads);
; register numbers 1 and 2 are swapped relative to memory order on purpose.
125 movdqa xmm2
, [rax
+16]
126 movdqa xmm1
, [rax
+32]
127 movdqa xmm3
, [rax
+48]
; Overwrite the qcoeff rows just read with xmm7.
; (xmm7 is zero per the comment above; the zeroing instruction itself is
; not visible in this listing)
131 movdqa
[rax
+16], xmm7
132 movdqa
[rax
+32], xmm7
133 movdqa
[rax
+48], xmm7
135 ; dequantize qcoeff buffer
; xmm2 and xmm3 are both scaled by the dequant vector at byte offset 16.
137 pmullw xmm2
, [rdx
+16]
139 pmullw xmm3
, [rdx
+16]
141 ; repack so block 0 row x and block 1 row x are together
; Selector 11011000b picks source dwords 0,2,1,3, i.e. it swaps the two
; middle dwords of the source register.
146 pshufd xmm0
, xmm0
, 11011000b
147 pshufd xmm1
, xmm4
, 11011000b
153 pshufd xmm2
, xmm2
, 11011000b
154 pshufd xmm3
, xmm4
, 11011000b
; First pass: butterfly terms, named a1/b1/c1/d1 in the inline comments.
157 psubw xmm0
, xmm2
; b1 = 0-2
161 paddw xmm2
, xmm0
; a1 = 0+2
163 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
164 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
167 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
169 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
170 psubw xmm7
, xmm5
; c1
175 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
178 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
181 paddw xmm3
, xmm5
; d1
182 movdqa xmm6
, xmm2
; a1
184 movdqa xmm4
, xmm0
; b1
192 ; transpose for the second pass
; In the lane maps below each three-digit entry is <block><coefficient>,
; listed from the highest word lane down to the lowest.
193 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
194 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
195 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
197 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
198 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
199 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
202 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
203 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
204 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
206 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
207 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
208 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
211 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
212 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
213 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
215 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
216 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
217 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
; Swap the middle dwords again and move the rows into xmm0..xmm3 for the
; second pass (xmm0 <- xmm2, xmm2 <- xmm1, xmm1 <- xmm5, xmm3 <- xmm7).
219 pshufd xmm0
, xmm2
, 11011000b
220 pshufd xmm2
, xmm1
, 11011000b
222 pshufd xmm1
, xmm5
, 11011000b
223 pshufd xmm3
, xmm7
, 11011000b
; Second pass: same butterfly sequence as the first pass; in addition the
; `fours` constant is added to xmm0 and xmm2.
; (presumably rounding ahead of a final shift that is not visible here)
226 psubw xmm0
, xmm2
; b1 = 0-2
230 paddw xmm2
, xmm0
; a1 = 0+2
232 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
233 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
236 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
238 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
239 psubw xmm7
, xmm5
; c1
244 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
247 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
250 paddw xmm3
, xmm5
; d1
251 paddw xmm0
, [GLOBAL(fours
)]
253 paddw xmm2
, [GLOBAL(fours
)]
254 movdqa xmm6
, xmm2
; a1
256 movdqa xmm4
, xmm0
; b1
; Transpose back; same unpack sequence as after the first pass.
271 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
272 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
273 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
275 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
276 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
277 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
280 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
281 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
282 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
284 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
285 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
286 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
289 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
290 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
291 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
293 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
294 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
295 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
297 pshufd xmm0
, xmm2
, 11011000b
298 pshufd xmm2
, xmm1
, 11011000b
300 pshufd xmm1
, xmm5
, 11011000b
301 pshufd xmm3
, xmm7
, 11011000b
305 ; Load up predict blocks
; 8 bytes from pre + 2*blk_stride (rsi = pre, rcx = blk_stride)
315 movq xmm4
, [rsi
+2*rcx
]
327 ; pack up before storing
333 ; Load destination stride before writing out,
334 ; doesn't need to persist
; rdx is reused here: the dequant pointer it held is overwritten.
335 movsxd rdx
, dword ptr arg
(4) ; dst_stride
337 ; store blocks back out
; Low 8 bytes of xmm1 go to dst + dst_stride; after rdi advances by
; 2*dst_stride, low 8 bytes of xmm3 go to the address three strides past
; the rdi value in effect before the lea.
339 movq
[rdi
+ rdx
], xmm1
341 lea rdi
, [rdi
+ 2*rdx
]
344 movq
[rdi
+ rdx
], xmm3
354 ;void idct_dequant_dc_0_2x_sse2
358 ; unsigned char *pre - 2
359 ; unsigned char *dst - 3
;
; Argument slots read by the code visible below (index = arg() slot):
;   0 = qcoeff, 2 = pre, 3 = dst, 4 = dst_stride (int).
; Register roles visible below: rax = qcoeff, rsi = pre, rdi = dst,
; rdx = dst_stride (loaded just before the stores).
; NOTE(review): the prologue, the epilogue/ret and most of the SIMD
; instructions are not visible in this listing; only their section comments
; remain. Comments describe only what is shown.
363 global sym
(idct_dequant_dc_0_2x_sse2
)
364 sym
(idct_dequant_dc_0_2x_sse2
):
367 SHADOW_ARGS_TO_STACK
7
373 ; special case when 2 blocks have 0 or 1 coeffs
374 ; dc is set as first coeff, so no need to load qcoeff
; NOTE(review): the qcoeff pointer is still fetched into rax below; no use
; of rax is visible in this listing -- confirm whether it is needed.
375 mov rax
, arg
(0) ; qcoeff
376 mov rsi
, arg
(2) ; pre
377 mov rdi
, arg
(3) ; dst
380 ; Zero out xmm7, for use unpacking
383 ; load up 2 dc words here == 2*16 = doubleword
386 ; Load up predict blocks
392 ; Duplicate and expand dc across
396 ; Rounding to dequant and downshift
; Adds the `fours` constant to every word of xmm4; the downshift mentioned
; above is not visible in this listing.
397 paddw xmm4
, [GLOBAL(fours
)]
400 ; Predict buffer needs to be expanded from bytes to words
406 ; Add to predict buffer
412 ; pack up before storing
418 ; Load destination stride before writing out,
419 ; doesn't need to persist
; rdx = dst_stride, sign-extended from 32 bits
420 movsxd rdx
, dword ptr arg
(4) ; dst_stride
422 ; store blocks back out
; Low 8 bytes of xmm1 go to dst + dst_stride; after rdi advances by
; 2*dst_stride, low 8 bytes of xmm3 go to the address three strides past
; the rdi value in effect before the lea.
424 movq
[rdi
+ rdx
], xmm1
426 lea rdi
, [rdi
+ 2*rdx
]
429 movq
[rdi
+ rdx
], xmm3
; idct_dequant_dc_full_2x_sse2
;
; Argument slots read by the code visible below (index = arg() slot):
;   0 = qcoeff, 1 = dequant, 2 = pre, 3 = dst, 4 = dst_stride (int).
; Register roles visible below: rax = qcoeff, rsi = pre, rdi = dst,
; rdx = dequant (later reused for dst_stride).
; NOTE(review): the prologue, the epilogue/ret and a number of instructions
; are not visible in this listing; comments describe only what is shown.
439 global sym
(idct_dequant_dc_full_2x_sse2
)
440 sym
(idct_dequant_dc_full_2x_sse2
):
443 SHADOW_ARGS_TO_STACK
7
449 ; Load the argument pointers. (This routine does read qcoeff:
450 ; see the loads from rax+16/+32/+48 below.)
451 mov rax
, arg
(0) ; qcoeff
452 mov rsi
, arg
(2) ; pre
453 mov rdi
, arg
(3) ; dst
455 ; Zero out xmm7, for use unpacking
458 mov rdx
, arg
(1) ; dequant
460 ; note the transpose of xmm1 and xmm2, necessary for shuffle
461 ; to spit out sensible data
; xmm2 <- qcoeff+16, xmm1 <- qcoeff+32, xmm3 <- qcoeff+48 (aligned loads);
; register numbers 1 and 2 are swapped relative to memory order on purpose.
463 movdqa xmm2
, [rax
+16]
464 movdqa xmm1
, [rax
+32]
465 movdqa xmm3
, [rax
+48]
; Overwrite the qcoeff rows just read with xmm7.
; (xmm7 is zero per the comment above; the zeroing instruction itself is
; not visible in this listing)
469 movdqa
[rax
+16], xmm7
470 movdqa
[rax
+32], xmm7
471 movdqa
[rax
+48], xmm7
473 ; dequantize qcoeff buffer
; xmm2 and xmm3 are both scaled by the dequant vector at byte offset 16.
475 pmullw xmm2
, [rdx
+16]
477 pmullw xmm3
, [rdx
+16]
482 ; repack so block 0 row x and block 1 row x are together
; Selector 11011000b picks source dwords 0,2,1,3, i.e. it swaps the two
; middle dwords of the source register.
487 pshufd xmm0
, xmm0
, 11011000b
488 pshufd xmm1
, xmm4
, 11011000b
494 pshufd xmm2
, xmm2
, 11011000b
495 pshufd xmm3
, xmm4
, 11011000b
497 ; insert DC component
; Words at [rdx] and [rdx+2] replace word lanes 0 and 4 of xmm0.
; NOTE(review): in the visible lines rdx still holds the dequant pointer;
; it is presumably reloaded with a DC pointer in a line not shown -- confirm.
498 pinsrw xmm0
, [rdx
], 0
499 pinsrw xmm0
, [rdx
+2], 4
; First pass: butterfly terms, named a1/b1/c1/d1 in the inline comments.
502 psubw xmm0
, xmm2
; b1 = 0-2
506 paddw xmm2
, xmm0
; a1 = 0+2
508 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
509 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
512 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
514 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
515 psubw xmm7
, xmm5
; c1
520 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
523 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
526 paddw xmm3
, xmm5
; d1
527 movdqa xmm6
, xmm2
; a1
529 movdqa xmm4
, xmm0
; b1
537 ; transpose for the second pass
; In the lane maps below each three-digit entry is <block><coefficient>,
; listed from the highest word lane down to the lowest.
538 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
539 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
540 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
542 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
543 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
544 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
547 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
548 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
549 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
551 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
552 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
553 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
556 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
557 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
558 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
560 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
561 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
562 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
; Swap the middle dwords again and move the rows into xmm0..xmm3 for the
; second pass (xmm0 <- xmm2, xmm2 <- xmm1, xmm1 <- xmm5, xmm3 <- xmm7).
564 pshufd xmm0
, xmm2
, 11011000b
565 pshufd xmm2
, xmm1
, 11011000b
567 pshufd xmm1
, xmm5
, 11011000b
568 pshufd xmm3
, xmm7
, 11011000b
; Second pass: same butterfly sequence as the first pass; in addition the
; `fours` constant is added to xmm0 and xmm2.
; (presumably rounding ahead of a final shift that is not visible here)
571 psubw xmm0
, xmm2
; b1 = 0-2
575 paddw xmm2
, xmm0
; a1 = 0+2
577 pmulhw xmm5
, [GLOBAL(x_s1sqr2
)]
578 paddw xmm5
, xmm1
; ip1 * sin(pi/8) * sqrt(2)
581 pmulhw xmm7
, [GLOBAL(x_c1sqr2less1
)]
583 paddw xmm7
, xmm3
; ip3 * cos(pi/8) * sqrt(2)
584 psubw xmm7
, xmm5
; c1
589 pmulhw xmm5
, [GLOBAL(x_c1sqr2less1
)]
592 pmulhw xmm3
, [GLOBAL(x_s1sqr2
)]
595 paddw xmm3
, xmm5
; d1
596 paddw xmm0
, [GLOBAL(fours
)]
598 paddw xmm2
, [GLOBAL(fours
)]
599 movdqa xmm6
, xmm2
; a1
601 movdqa xmm4
, xmm0
; b1
; Transpose back; same unpack sequence as after the first pass.
616 movdqa xmm7
, xmm2
; 103 102 101 100 003 002 001 000
617 punpcklwd xmm2
, xmm0
; 007 003 006 002 005 001 004 000
618 punpckhwd xmm7
, xmm0
; 107 103 106 102 105 101 104 100
620 movdqa xmm5
, xmm4
; 111 110 109 108 011 010 009 008
621 punpcklwd xmm4
, xmm6
; 015 011 014 010 013 009 012 008
622 punpckhwd xmm5
, xmm6
; 115 111 114 110 113 109 112 108
625 movdqa xmm1
, xmm2
; 007 003 006 002 005 001 004 000
626 punpckldq xmm2
, xmm4
; 013 009 005 001 012 008 004 000
627 punpckhdq xmm1
, xmm4
; 015 011 007 003 014 010 006 002
629 movdqa xmm6
, xmm7
; 107 103 106 102 105 101 104 100
630 punpckldq xmm7
, xmm5
; 113 109 105 101 112 108 104 100
631 punpckhdq xmm6
, xmm5
; 115 111 107 103 114 110 106 102
634 movdqa xmm5
, xmm2
; 013 009 005 001 012 008 004 000
635 punpckldq xmm2
, xmm7
; 112 108 012 008 104 100 004 000
636 punpckhdq xmm5
, xmm7
; 113 109 013 009 105 101 005 001
638 movdqa xmm7
, xmm1
; 015 011 007 003 014 010 006 002
639 punpckldq xmm1
, xmm6
; 114 110 014 010 106 102 006 002
640 punpckhdq xmm7
, xmm6
; 115 111 015 011 107 103 007 003
642 pshufd xmm0
, xmm2
, 11011000b
643 pshufd xmm2
, xmm1
, 11011000b
645 pshufd xmm1
, xmm5
, 11011000b
646 pshufd xmm3
, xmm7
, 11011000b
650 ; Load up predict blocks
671 ; pack up before storing
677 ; Load destination stride before writing out,
678 ; doesn't need to persist
; rdx is reused here: the pointer it held is overwritten with dst_stride.
679 movsxd rdx
, dword ptr arg
(4) ; dst_stride
681 ; store blocks back out
; Low 8 bytes of xmm1 go to dst + dst_stride; after rdi advances by
; 2*dst_stride, low 8 bytes of xmm3 go to the address three strides past
; the rdi value in effect before the lea.
683 movq
[rdi
+ rdx
], xmm1
685 lea rdi
, [rdi
+ 2*rdx
]
688 movq
[rdi
+ rdx
], xmm3