Define RDCOST only once
[libvpx.git] / vp8 / encoder / x86 / ssim_opt.asm
blobc267cdb54bac731318342424e69b8b1f52689838
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
11 %include "vpx_ports/x86_abi_support.asm"
; TABULATE_SSIM - fold one batch of 16-bit samples into the five running sums.
;
; In:    xmm3 = packed 16-bit source (s) values
;        xmm4 = packed 16-bit reference (r) values
; Accum: xmm15 += s          (paddusw: unsigned saturating word add)
;        xmm14 += r          (paddusw: unsigned saturating word add)
;        xmm13 += s*s        (pmaddwd pair products, added with paddq)
;        xmm12 += r*r        (pmaddwd pair products, added with paddq)
;        xmm11 += s*r        (pmaddwd pair products, added with paddq)
; Clobb: xmm1, xmm2, xmm3 (xmm4 is preserved)
;
; pmaddwd yields 32-bit lanes that are then added with a 64-bit paddq, so each
; 64-bit accumulator lane really carries two 32-bit partial sums. A carry out
; of a low dword would spill into its neighbour; each partial sum must
; therefore stay below 2^32.
%macro TABULATE_SSIM 0
        ; keep private copies of s and r for the squared terms
        movdqa          xmm1, xmm3
        movdqa          xmm2, xmm4

        ; plain sums
        paddusw         xmm15, xmm3     ; sum_s
        paddusw         xmm14, xmm4     ; sum_r

        ; products (adjacent word pairs multiplied and added into dwords)
        pmaddwd         xmm1, xmm1      ; s * s
        pmaddwd         xmm2, xmm2      ; r * r
        pmaddwd         xmm3, xmm4      ; s * r

        paddq           xmm13, xmm1     ; sum_sq_s
        paddq           xmm12, xmm2     ; sum_sq_r
        paddq           xmm11, xmm3     ; sum_sxr
%endmacro
; SUM_ACROSS_Q reg - horizontal sum of the four 32-bit lanes of %1.
;
; The lanes are first zero-extended to 64 bits (interleaving with xmm0) and
; added pairwise, then the two resulting qwords are added together.
;
; In:    %1   = four unsigned 32-bit partial sums
;        xmm0 = all zero (required; used as the zero-extension source)
; Out:   low qword of %1 = total of the four lanes, high qword of %1 = 0
; Clobb: xmm2
; Note:  %1 must not be xmm0 or xmm2.
%macro SUM_ACROSS_Q 1
        ; stage 1: 4 x dword -> 2 x qword
        movdqa          xmm2, %1
        punpckhdq       xmm2, xmm0      ; lanes 2,3 zero-extended to qwords
        punpckldq       %1, xmm0        ; lanes 0,1 zero-extended to qwords
        paddq           %1, xmm2

        ; stage 2: 2 x qword -> 1 x qword (in the low half)
        movdqa          xmm2, %1
        punpckhqdq      xmm2, xmm0      ; high qword moved down, zero above
        punpcklqdq      %1, xmm0        ; low qword kept, zero above
        paddq           %1, xmm2
%endmacro
; SUM_ACROSS_W reg - horizontal sum of the eight 16-bit lanes of %1.
;
; The words are zero-extended to dwords (interleaving with xmm0) and added
; pairwise; the remaining four dwords are reduced by SUM_ACROSS_Q.
;
; In:    %1   = eight unsigned 16-bit partial sums
;        xmm0 = all zero (required; used as the zero-extension source)
; Out:   low qword of %1 = total of the eight lanes, high qword of %1 = 0
; Clobb: xmm1, xmm2 (xmm2 via SUM_ACROSS_Q)
; Note:  %1 must not be xmm0, xmm1 or xmm2.
%macro SUM_ACROSS_W 1
        ; 8 x word -> 4 x dword
        movdqa          xmm1, %1
        punpckhwd       xmm1, xmm0      ; words 4..7 zero-extended to dwords
        punpcklwd       %1, xmm0        ; words 0..3 zero-extended to dwords
        paddd           %1, xmm1

        ; 4 x dword -> 1 x qword
        SUM_ACROSS_Q    %1
%endmacro
;void vp8_ssim_parms_16x16_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 16 rows of 16 bytes from s (stride sp) and r (stride rp) and stores
; five totals through the output pointers:
;    *sum_s    = sum of s[i]
;    *sum_r    = sum of r[i]
;    *sum_sq_s = sum of s[i] * s[i]
;    *sum_sq_r = sum of r[i] * r[i]
;    *sum_sxr  = sum of s[i] * r[i]
;
; Register roles:
;    rsi = current s row      rcx = sp (sign-extended)
;    rdi = current r row      rax = rp (sign-extended)
;    rdx = rows remaining     xmm0 = zero (unpack / reduction helper)
;    xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;    xmm1..xmm6   = scratch
;
; Each result is written with an 8-byte movq store.
; NOTE(review): that assumes unsigned long is 64 bits wide at the call site -
;               confirm for LLP64 targets.
; NOTE(review): xmm6 and xmm11-xmm15 are modified and not restored here; they
;               are callee-saved under the Microsoft x64 convention - confirm
;               whether this routine is built for that ABI.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 16x16 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_16x16_sse3)
sym(vp8_ssim_parms_16x16_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;s
    mov         rcx, arg(1)             ;sp
    mov         rdi, arg(2)             ;r
    mov         rax, arg(3)             ;rp

    ; sp and rp are C ints: only their low 32 bits are defined, so widen them
    ; explicitly before they are added to 64-bit pointers.
    movsxd      rcx, ecx
    movsxd      rax, eax

    pxor        xmm0, xmm0              ; constant zero
    pxor        xmm15, xmm15            ;sum_s
    pxor        xmm14, xmm14            ;sum_r
    pxor        xmm13, xmm13            ;sum_sq_s
    pxor        xmm12, xmm12            ;sum_sq_r
    pxor        xmm11, xmm11            ;sum_sxr

    mov         rdx, 16                 ;row counter

.next_row:
    ;grab source and reference pixels (16 bytes each, no alignment assumed)
    movdqu      xmm5, [rsi]
    movdqu      xmm6, [rdi]

    ; upper 8 pixels, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpckhbw   xmm3, xmm0              ; high_s
    punpckhbw   xmm4, xmm0              ; high_r

    TABULATE_SSIM

    ; lower 8 pixels, widened to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0              ; low_s
    punpcklbw   xmm4, xmm0              ; low_r

    TABULATE_SSIM

    add         rsi, rcx                ; next s row
    add         rdi, rax                ; next r row

    dec         rdx                     ; counter
    jnz         .next_row

    ; collapse every accumulator to a single value in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov         rdi, arg(4)
    movq        [rdi], xmm15            ; *sum_s
    mov         rdi, arg(5)
    movq        [rdi], xmm14            ; *sum_r
    mov         rdi, arg(6)
    movq        [rdi], xmm13            ; *sum_sq_s
    mov         rdi, arg(7)
    movq        [rdi], xmm12            ; *sum_sq_r
    mov         rdi, arg(8)
    movq        [rdi], xmm11            ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; return to caller instead of falling
                                        ; through into whatever code follows

;void vp8_ssim_parms_8x8_sse3(
;    unsigned char *s,
;    int sp,
;    unsigned char *r,
;    int rp,
;    unsigned long *sum_s,
;    unsigned long *sum_r,
;    unsigned long *sum_sq_s,
;    unsigned long *sum_sq_r,
;    unsigned long *sum_sxr);
;
; Walks 8 rows of 8 bytes from s (stride sp) and r (stride rp) and stores
; five totals through the output pointers:
;    *sum_s    = sum of s[i]
;    *sum_r    = sum of r[i]
;    *sum_sq_s = sum of s[i] * s[i]
;    *sum_sq_r = sum of r[i] * r[i]
;    *sum_sxr  = sum of s[i] * r[i]
;
; Register roles:
;    rsi = current s row      rcx = sp (sign-extended)
;    rdi = current r row      rax = rp (sign-extended)
;    rdx = rows remaining     xmm0 = zero (unpack / reduction helper)
;    xmm15..xmm11 = sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr accumulators
;    xmm1..xmm6   = scratch
;
; Each result is written with an 8-byte movq store.
; NOTE(review): that assumes unsigned long is 64 bits wide at the call site -
;               confirm for LLP64 targets.
; NOTE(review): xmm6 and xmm11-xmm15 are modified and not restored here; they
;               are callee-saved under the Microsoft x64 convention - confirm
;               whether this routine is built for that ABI.
;
; TODO: Use parm passing through structure, probably don't need the pxors
; ( calling app will initialize to 0 ) could easily fit everything in sse2
; without too much hassle, and can probably do better estimates with psadbw
; or pavgb At this point this is just meant to be first pass for calculating
; all the parms needed for 8x8 ssim so we can play with dssim as distortion
; in mode selection code.
global sym(vp8_ssim_parms_8x8_sse3)
sym(vp8_ssim_parms_8x8_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;s
    mov         rcx, arg(1)             ;sp
    mov         rdi, arg(2)             ;r
    mov         rax, arg(3)             ;rp

    ; sp and rp are C ints: only their low 32 bits are defined, so widen them
    ; explicitly before they are added to 64-bit pointers.
    movsxd      rcx, ecx
    movsxd      rax, eax

    pxor        xmm0, xmm0              ; constant zero
    pxor        xmm15, xmm15            ;sum_s
    pxor        xmm14, xmm14            ;sum_r
    pxor        xmm13, xmm13            ;sum_sq_s
    pxor        xmm12, xmm12            ;sum_sq_r
    pxor        xmm11, xmm11            ;sum_sxr

    mov         rdx, 8                  ;row counter

.next_row:
    ;grab source and reference pixels (8 bytes each; movq clears the upper
    ;half of the destination register)
    movq        xmm5, [rsi]
    movq        xmm6, [rdi]

    ; widen the 8 pixels of each row to words
    movdqa      xmm3, xmm5
    movdqa      xmm4, xmm6
    punpcklbw   xmm3, xmm0              ; low_s
    punpcklbw   xmm4, xmm0              ; low_r

    TABULATE_SSIM

    add         rsi, rcx                ; next s row
    add         rdi, rax                ; next r row

    dec         rdx                     ; counter
    jnz         .next_row

    ; collapse every accumulator to a single value in its low qword
    SUM_ACROSS_W    xmm15
    SUM_ACROSS_W    xmm14
    SUM_ACROSS_Q    xmm13
    SUM_ACROSS_Q    xmm12
    SUM_ACROSS_Q    xmm11

    mov         rdi, arg(4)
    movq        [rdi], xmm15            ; *sum_s
    mov         rdi, arg(5)
    movq        [rdi], xmm14            ; *sum_r
    mov         rdi, arg(6)
    movq        [rdi], xmm13            ; *sum_sq_s
    mov         rdi, arg(7)
    movq        [rdi], xmm12            ; *sum_sq_r
    mov         rdi, arg(8)
    movq        [rdi], xmm11            ; *sum_sxr

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret                                 ; return to caller instead of running
                                        ; off the end of the routine