2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
12 %include "vpx_ports/x86_abi_support.asm"
14 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
15 global sym
(vp8_short_fdct4x4_mmx
)
16 sym
(vp8_short_fdct4x4_mmx
):
19 SHADOW_ARGS_TO_STACK
3
25 mov rsi
, arg
(0) ; input
26 mov rdi
, arg
(1) ; output
28 movsxd rax
, dword ptr arg
(2) ;pitch
30 lea rcx
, [rsi
+ rax
*2]
38 ; transpose for the first stage
39 movq mm3
, mm0
; 00 01 02 03
40 movq mm5
, mm2
; 20 21 22 23
42 punpcklwd mm0
, mm1
; 00 10 01 11
43 punpckhwd mm3
, mm1
; 02 12 03 13
45 punpcklwd mm2
, mm4
; 20 30 21 31
46 punpckhwd mm5
, mm4
; 22 32 23 33
48 movq mm1
, mm0
; 00 10 01 11
49 punpckldq mm0
, mm2
; 00 10 20 30
51 punpckhdq mm1
, mm2
; 01 11 21 31
53 movq mm2
, mm3
; 02 12 03 13
54 punpckldq mm2
, mm5
; 02 12 22 32
56 punpckhdq mm3
, mm5
; 03 13 23 33
67 paddw mm0
, mm3
; a1 = 0 + 3
68 paddw mm1
, mm2
; b1 = 1 + 2
70 psubw mm4
, mm2
; c1 = 1 - 2
71 psubw mm5
, mm3
; d1 = 0 - 3
82 paddw mm0
, mm1
; op[0] = a1 + b1
83 psubw mm2
, mm1
; op[2] = a1 - b1
88 punpcklwd mm1
, mm4
; c1 d1
89 punpckhwd mm5
, mm4
; c1 d1
94 pmaddwd mm1
, MMWORD
PTR[GLOBAL (_5352_2217
)] ; c1*2217 + d1*5352
95 pmaddwd mm4
, MMWORD
PTR[GLOBAL (_5352_2217
)] ; c1*2217 + d1*5352
97 pmaddwd mm3
, MMWORD
PTR[GLOBAL(_2217_neg5352
)] ; d1*2217 - c1*5352
98 pmaddwd mm5
, MMWORD
PTR[GLOBAL(_2217_neg5352
)] ; d1*2217 - c1*5352
100 paddd mm1
, MMWORD
PTR[GLOBAL(_14500
)]
101 paddd mm4
, MMWORD
PTR[GLOBAL(_14500
)]
102 paddd mm3
, MMWORD
PTR[GLOBAL(_7500
)]
103 paddd mm5
, MMWORD
PTR[GLOBAL(_7500
)]
105 psrad mm1
, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
106 psrad mm4
, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
107 psrad mm3
, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
108 psrad mm5
, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
110 packssdw mm1
, mm4
; op[1]
111 packssdw mm3
, mm5
; op[3]
114 ; transpose for the second stage
115 movq mm4
, mm0
; 00 10 20 30
116 movq mm5
, mm2
; 02 12 22 32
118 punpcklwd mm0
, mm1
; 00 01 10 11
119 punpckhwd mm4
, mm1
; 20 21 30 31
121 punpcklwd mm2
, mm3
; 02 03 12 13
122 punpckhwd mm5
, mm3
; 22 23 32 33
124 movq mm1
, mm0
; 00 01 10 11
125 punpckldq mm0
, mm2
; 00 01 02 03
127 punpckhdq mm1
, mm2
; 10 11 12 13
129 movq mm2
, mm4
; 20 21 30 31
130 punpckldq mm2
, mm5
; 20 21 22 23
132 punpckhdq mm4
, mm5
; 30 31 32 33
142 paddw mm0
, mm4
; a1 = 0 + 3
143 paddw mm1
, mm2
; b1 = 1 + 2
145 psubw mm3
, mm2
; c1 = 1 - 2
146 psubw mm5
, mm4
; d1 = 0 - 3
148 pxor mm6
, mm6
; zero out for compare
150 pcmpeqw mm6
, mm5
; mm6 = 0xFFFF in lanes where d1 == 0 (the pandn below inverts this to d1 != 0)
152 pandn mm6
, MMWORD
PTR[GLOBAL(_cmp_mask
)] ; clear upper,
153 ; and keep bit 0 of lower
158 paddw mm0
, mm1
; a1 + b1
159 psubw mm2
, mm1
; a1 - b1
161 paddw mm0
, MMWORD
PTR[GLOBAL(_7w
)]
162 paddw mm2
, MMWORD
PTR[GLOBAL(_7w
)]
164 psraw mm0
, 4 ; op[0] = (a1 + b1 + 7)>>4
165 psraw mm2
, 4 ; op[8] = (a1 - b1 + 7)>>4
167 movq MMWORD
PTR[rdi
+ 0 ], mm0
168 movq MMWORD
PTR[rdi
+ 16], mm2
173 punpcklwd mm1
, mm3
; c1 d1
174 punpckhwd mm5
, mm3
; c1 d1
179 pmaddwd mm1
, MMWORD
PTR[GLOBAL (_5352_2217
)] ; c1*2217 + d1*5352
180 pmaddwd mm4
, MMWORD
PTR[GLOBAL (_5352_2217
)] ; c1*2217 + d1*5352
182 pmaddwd mm3
, MMWORD
PTR[GLOBAL(_2217_neg5352
)] ; d1*2217 - c1*5352
183 pmaddwd mm5
, MMWORD
PTR[GLOBAL(_2217_neg5352
)] ; d1*2217 - c1*5352
185 paddd mm1
, MMWORD
PTR[GLOBAL(_12000
)]
186 paddd mm4
, MMWORD
PTR[GLOBAL(_12000
)]
187 paddd mm3
, MMWORD
PTR[GLOBAL(_51000
)]
188 paddd mm5
, MMWORD
PTR[GLOBAL(_51000
)]
190 psrad mm1
, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
191 psrad mm4
, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
192 psrad mm3
, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
193 psrad mm5
, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
195 packssdw mm1
, mm4
; op[4]
196 packssdw mm3
, mm5
; op[12]
198 paddw mm1
, mm6
; op[4] += (d1!=0)
200 movq MMWORD
PTR[rdi
+ 8 ], mm1
201 movq MMWORD
PTR[rdi
+ 24], mm3