/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "vp8/encoder/variance.h"
#include "vp8/encoder/onyx_int.h"
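
/*
 * Run-time selection of x86-specific encoder functions.  The small wrappers
 * below adapt the MMX/SSE2/SSSE3/SSE4.1 implementations to the common RTCD
 * signatures; vp8_arch_x86_encoder_init() hooks them into cpi->rtcd based
 * on the CPU capabilities reported by x86_simd_caps().
 */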
static void short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_mmx(input,     output,      pitch);
    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
}
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
static void fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_mmx(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
static int mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
static int mbuverror_mmx(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}
void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
static void subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  const short *inv_scan_order, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
static void fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask;//d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_sse2(
                 coeff_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 vp8_default_inv_zig_zag,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
static int mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}
int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
static int mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}
void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
static void subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}
int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
                                   short *qcoeff_ptr, short *dequant_ptr,
                                   short *round_ptr,
                                   short *quant_ptr, short *dqcoeff_ptr);
static void fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
{
    d->eob = vp8_fast_quantize_b_impl_ssse3(
                 b->coeff,
                 d->qcoeff,
                 d->dequant,
                 b->round,
                 b->quant_fast,
                 d->dqcoeff
             );
}
typedef void ssimpf
(
    unsigned char *s,
    int sp,
    unsigned char *r,
    int rp,
    unsigned long *sum_s,
    unsigned long *sum_r,
    unsigned long *sum_sq_s,
    unsigned long *sum_sq_r,
    unsigned long *sum_sxr
);

extern ssimpf vp8_ssim_parms_16x16_sse3;
extern ssimpf vp8_ssim_parms_8x8_sse3;
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;
    int wmt_enabled = flags & HAS_SSE2;
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;
    int SSE4_1Enabled = flags & HAS_SSE4_1;
    /*
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h).
     */

    /* Override default functions with fastest ones for this CPU. */
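    /*
     * The overrides below only take effect when runtime CPU detection is
     * enabled; the rest of the encoder reaches these entries through the
     * RTCD invoke macros (e.g. VARIANCE_INVOKE(&cpi->rtcd.variance,
     * sad16x16)), so each assignment transparently replaces the default
     * C implementation.
     */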
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
        cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
        cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
        cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4 = short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4 = short_fdct8x4_mmx;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr = mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr = mbuverror_mmx;
        cpi->rtcd.encodemb.subb = subtract_b_mmx;
        cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;

        /*cpi->rtcd.quantize.fastquantb = fast_quantize_b_mmx;*/
    }
    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;

        cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
        cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
        cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
        cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;

        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;

        cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr = mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr = mbuverror_xmm;
        cpi->rtcd.encodemb.subb = subtract_b_sse2;
        cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;

        cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;
        cpi->rtcd.quantize.fastquantb = fast_quantize_b_sse2;

#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
#endif
    }
    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3;
#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search = vp8_full_search_sadx3;
#endif

        cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
    }
    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;

        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_ssse3;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_ssse3;

        cpi->rtcd.quantize.fastquantb = fast_quantize_b_ssse3;

        cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3;
        cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3;
    }
    if (SSE4_1Enabled)
    {
        cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
        cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
        cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
        cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
        cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search = vp8_full_search_sadx8;