2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
12 #include "vpx_ports/config.h"
13 #include "vpx_ports/x86.h"
14 #include "vp8/encoder/variance.h"
15 #include "vp8/encoder/onyx_int.h"
/*
 * 8x4 forward DCT built from the 4x4 MMX routine.
 *
 * Calls vp8_short_fdct4x4_mmx() twice with the same pitch: first on
 * (input, output), then on (input + 4, output + 16).
 *
 * input  - source coefficients; read starting at input and input + 4
 * output - destination; written starting at output and output + 16
 * pitch  - forwarded unchanged to both 4x4 calls
 */
static void short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    short *second_in  = input + 4;
    short *second_out = output + 16;

    vp8_short_fdct4x4_mmx(input, output, pitch);
    vp8_short_fdct4x4_mmx(second_in, second_out, pitch);
}
25 int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr
, short *zbin_ptr
,
26 short *qcoeff_ptr
, short *dequant_ptr
,
27 short *scan_mask
, short *round_ptr
,
28 short *quant_ptr
, short *dqcoeff_ptr
);
29 static void fast_quantize_b_mmx(BLOCK
*b
, BLOCKD
*d
)
31 short *scan_mask
= vp8_default_zig_zag_mask
;//d->scan_order_mask_ptr;
32 short *coeff_ptr
= b
->coeff
;
33 short *zbin_ptr
= b
->zbin
;
34 short *round_ptr
= b
->round
;
35 short *quant_ptr
= b
->quant_fast
;
36 short *qcoeff_ptr
= d
->qcoeff
;
37 short *dqcoeff_ptr
= d
->dqcoeff
;
38 short *dequant_ptr
= d
->dequant
;
40 d
->eob
= vp8_fast_quantize_b_impl_mmx(
53 int vp8_mbblock_error_mmx_impl(short *coeff_ptr
, short *dcoef_ptr
, int dc
);
54 static int mbblock_error_mmx(MACROBLOCK
*mb
, int dc
)
56 short *coeff_ptr
= mb
->block
[0].coeff
;
57 short *dcoef_ptr
= mb
->e_mbd
.block
[0].dqcoeff
;
58 return vp8_mbblock_error_mmx_impl(coeff_ptr
, dcoef_ptr
, dc
);
61 int vp8_mbuverror_mmx_impl(short *s_ptr
, short *d_ptr
);
62 static int mbuverror_mmx(MACROBLOCK
*mb
)
64 short *s_ptr
= &mb
->coeff
[256];
65 short *d_ptr
= &mb
->e_mbd
.dqcoeff
[256];
66 return vp8_mbuverror_mmx_impl(s_ptr
, d_ptr
);
69 void vp8_subtract_b_mmx_impl(unsigned char *z
, int src_stride
,
70 short *diff
, unsigned char *predictor
,
72 static void subtract_b_mmx(BLOCK
*be
, BLOCKD
*bd
, int pitch
)
74 unsigned char *z
= *(be
->base_src
) + be
->src
;
75 unsigned int src_stride
= be
->src_stride
;
76 short *diff
= &be
->src_diff
[0];
77 unsigned char *predictor
= &bd
->predictor
[0];
78 vp8_subtract_b_mmx_impl(z
, src_stride
, diff
, predictor
, pitch
);
84 int vp8_mbblock_error_xmm_impl(short *coeff_ptr
, short *dcoef_ptr
, int dc
);
85 static int mbblock_error_xmm(MACROBLOCK
*mb
, int dc
)
87 short *coeff_ptr
= mb
->block
[0].coeff
;
88 short *dcoef_ptr
= mb
->e_mbd
.block
[0].dqcoeff
;
89 return vp8_mbblock_error_xmm_impl(coeff_ptr
, dcoef_ptr
, dc
);
92 int vp8_mbuverror_xmm_impl(short *s_ptr
, short *d_ptr
);
93 static int mbuverror_xmm(MACROBLOCK
*mb
)
95 short *s_ptr
= &mb
->coeff
[256];
96 short *d_ptr
= &mb
->e_mbd
.dqcoeff
[256];
97 return vp8_mbuverror_xmm_impl(s_ptr
, d_ptr
);
100 void vp8_subtract_b_sse2_impl(unsigned char *z
, int src_stride
,
101 short *diff
, unsigned char *predictor
,
103 static void subtract_b_sse2(BLOCK
*be
, BLOCKD
*bd
, int pitch
)
105 unsigned char *z
= *(be
->base_src
) + be
->src
;
106 unsigned int src_stride
= be
->src_stride
;
107 short *diff
= &be
->src_diff
[0];
108 unsigned char *predictor
= &bd
->predictor
[0];
109 vp8_subtract_b_sse2_impl(z
, src_stride
, diff
, predictor
, pitch
);
123 unsigned long *sum_s
,
124 unsigned long *sum_r
,
125 unsigned long *sum_sq_s
,
126 unsigned long *sum_sq_r
,
127 unsigned long *sum_sxr
130 extern ssimpf vp8_ssim_parms_16x16_sse3
;
131 extern ssimpf vp8_ssim_parms_8x8_sse3
;
137 void vp8_arch_x86_encoder_init(VP8_COMP
*cpi
)
139 #if CONFIG_RUNTIME_CPU_DETECT
140 int flags
= x86_simd_caps();
141 int mmx_enabled
= flags
& HAS_MMX
;
142 int xmm_enabled
= flags
& HAS_SSE
;
143 int wmt_enabled
= flags
& HAS_SSE2
;
144 int SSE3Enabled
= flags
& HAS_SSE3
;
145 int SSSE3Enabled
= flags
& HAS_SSSE3
;
146 int SSE4_1Enabled
= flags
& HAS_SSE4_1
;
150 * This platform can be built without runtime CPU detection as well. If
151 * you modify any of the function mappings present in this file, be sure
152 * to also update them in static mappings (<arch>/filename_<arch>.h)
155 /* Override default functions with fastest ones for this CPU. */
159 cpi
->rtcd
.variance
.sad16x16
= vp8_sad16x16_mmx
;
160 cpi
->rtcd
.variance
.sad16x8
= vp8_sad16x8_mmx
;
161 cpi
->rtcd
.variance
.sad8x16
= vp8_sad8x16_mmx
;
162 cpi
->rtcd
.variance
.sad8x8
= vp8_sad8x8_mmx
;
163 cpi
->rtcd
.variance
.sad4x4
= vp8_sad4x4_mmx
;
165 cpi
->rtcd
.variance
.var4x4
= vp8_variance4x4_mmx
;
166 cpi
->rtcd
.variance
.var8x8
= vp8_variance8x8_mmx
;
167 cpi
->rtcd
.variance
.var8x16
= vp8_variance8x16_mmx
;
168 cpi
->rtcd
.variance
.var16x8
= vp8_variance16x8_mmx
;
169 cpi
->rtcd
.variance
.var16x16
= vp8_variance16x16_mmx
;
171 cpi
->rtcd
.variance
.subpixvar4x4
= vp8_sub_pixel_variance4x4_mmx
;
172 cpi
->rtcd
.variance
.subpixvar8x8
= vp8_sub_pixel_variance8x8_mmx
;
173 cpi
->rtcd
.variance
.subpixvar8x16
= vp8_sub_pixel_variance8x16_mmx
;
174 cpi
->rtcd
.variance
.subpixvar16x8
= vp8_sub_pixel_variance16x8_mmx
;
175 cpi
->rtcd
.variance
.subpixvar16x16
= vp8_sub_pixel_variance16x16_mmx
;
176 cpi
->rtcd
.variance
.halfpixvar16x16_h
= vp8_variance_halfpixvar16x16_h_mmx
;
177 cpi
->rtcd
.variance
.halfpixvar16x16_v
= vp8_variance_halfpixvar16x16_v_mmx
;
178 cpi
->rtcd
.variance
.halfpixvar16x16_hv
= vp8_variance_halfpixvar16x16_hv_mmx
;
179 cpi
->rtcd
.variance
.subpixmse16x16
= vp8_sub_pixel_mse16x16_mmx
;
181 cpi
->rtcd
.variance
.mse16x16
= vp8_mse16x16_mmx
;
182 cpi
->rtcd
.variance
.getmbss
= vp8_get_mb_ss_mmx
;
184 cpi
->rtcd
.variance
.get16x16prederror
= vp8_get16x16pred_error_mmx
;
185 cpi
->rtcd
.variance
.get8x8var
= vp8_get8x8var_mmx
;
186 cpi
->rtcd
.variance
.get16x16var
= vp8_get16x16var_mmx
;
187 cpi
->rtcd
.variance
.get4x4sse_cs
= vp8_get4x4sse_cs_mmx
;
189 cpi
->rtcd
.fdct
.short4x4
= vp8_short_fdct4x4_mmx
;
190 cpi
->rtcd
.fdct
.short8x4
= short_fdct8x4_mmx
;
191 cpi
->rtcd
.fdct
.fast4x4
= vp8_short_fdct4x4_mmx
;
192 cpi
->rtcd
.fdct
.fast8x4
= short_fdct8x4_mmx
;
194 cpi
->rtcd
.fdct
.walsh_short4x4
= vp8_short_walsh4x4_c
;
196 cpi
->rtcd
.encodemb
.berr
= vp8_block_error_mmx
;
197 cpi
->rtcd
.encodemb
.mberr
= mbblock_error_mmx
;
198 cpi
->rtcd
.encodemb
.mbuverr
= mbuverror_mmx
;
199 cpi
->rtcd
.encodemb
.subb
= subtract_b_mmx
;
200 cpi
->rtcd
.encodemb
.submby
= vp8_subtract_mby_mmx
;
201 cpi
->rtcd
.encodemb
.submbuv
= vp8_subtract_mbuv_mmx
;
203 /*cpi->rtcd.quantize.fastquantb = fast_quantize_b_mmx;*/
210 cpi
->rtcd
.variance
.sad16x16
= vp8_sad16x16_wmt
;
211 cpi
->rtcd
.variance
.sad16x8
= vp8_sad16x8_wmt
;
212 cpi
->rtcd
.variance
.sad8x16
= vp8_sad8x16_wmt
;
213 cpi
->rtcd
.variance
.sad8x8
= vp8_sad8x8_wmt
;
214 cpi
->rtcd
.variance
.sad4x4
= vp8_sad4x4_wmt
;
216 cpi
->rtcd
.variance
.var4x4
= vp8_variance4x4_wmt
;
217 cpi
->rtcd
.variance
.var8x8
= vp8_variance8x8_wmt
;
218 cpi
->rtcd
.variance
.var8x16
= vp8_variance8x16_wmt
;
219 cpi
->rtcd
.variance
.var16x8
= vp8_variance16x8_wmt
;
220 cpi
->rtcd
.variance
.var16x16
= vp8_variance16x16_wmt
;
222 cpi
->rtcd
.variance
.subpixvar4x4
= vp8_sub_pixel_variance4x4_wmt
;
223 cpi
->rtcd
.variance
.subpixvar8x8
= vp8_sub_pixel_variance8x8_wmt
;
224 cpi
->rtcd
.variance
.subpixvar8x16
= vp8_sub_pixel_variance8x16_wmt
;
225 cpi
->rtcd
.variance
.subpixvar16x8
= vp8_sub_pixel_variance16x8_wmt
;
226 cpi
->rtcd
.variance
.subpixvar16x16
= vp8_sub_pixel_variance16x16_wmt
;
227 cpi
->rtcd
.variance
.halfpixvar16x16_h
= vp8_variance_halfpixvar16x16_h_wmt
;
228 cpi
->rtcd
.variance
.halfpixvar16x16_v
= vp8_variance_halfpixvar16x16_v_wmt
;
229 cpi
->rtcd
.variance
.halfpixvar16x16_hv
= vp8_variance_halfpixvar16x16_hv_wmt
;
230 cpi
->rtcd
.variance
.subpixmse16x16
= vp8_sub_pixel_mse16x16_wmt
;
232 cpi
->rtcd
.variance
.mse16x16
= vp8_mse16x16_wmt
;
233 cpi
->rtcd
.variance
.getmbss
= vp8_get_mb_ss_sse2
;
235 cpi
->rtcd
.variance
.get16x16prederror
= vp8_get16x16pred_error_sse2
;
236 cpi
->rtcd
.variance
.get8x8var
= vp8_get8x8var_sse2
;
237 cpi
->rtcd
.variance
.get16x16var
= vp8_get16x16var_sse2
;
240 /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;
242 cpi
->rtcd
.fdct
.short4x4
= vp8_short_fdct4x4_sse2
;
243 cpi
->rtcd
.fdct
.short8x4
= vp8_short_fdct8x4_sse2
;
244 cpi
->rtcd
.fdct
.fast4x4
= vp8_short_fdct4x4_sse2
;
245 cpi
->rtcd
.fdct
.fast8x4
= vp8_short_fdct8x4_sse2
;
247 cpi
->rtcd
.fdct
.walsh_short4x4
= vp8_short_walsh4x4_sse2
;
249 cpi
->rtcd
.encodemb
.berr
= vp8_block_error_xmm
;
250 cpi
->rtcd
.encodemb
.mberr
= mbblock_error_xmm
;
251 cpi
->rtcd
.encodemb
.mbuverr
= mbuverror_xmm
;
252 cpi
->rtcd
.encodemb
.subb
= subtract_b_sse2
;
253 cpi
->rtcd
.encodemb
.submby
= vp8_subtract_mby_sse2
;
254 cpi
->rtcd
.encodemb
.submbuv
= vp8_subtract_mbuv_sse2
;
256 cpi
->rtcd
.quantize
.quantb
= vp8_regular_quantize_b_sse2
;
257 cpi
->rtcd
.quantize
.fastquantb
= vp8_fast_quantize_b_sse2
;
259 #if !(CONFIG_REALTIME_ONLY)
260 cpi
->rtcd
.temporal
.apply
= vp8_temporal_filter_apply_sse2
;
268 cpi
->rtcd
.variance
.sad16x16
= vp8_sad16x16_sse3
;
269 cpi
->rtcd
.variance
.sad16x16x3
= vp8_sad16x16x3_sse3
;
270 cpi
->rtcd
.variance
.sad16x8x3
= vp8_sad16x8x3_sse3
;
271 cpi
->rtcd
.variance
.sad8x16x3
= vp8_sad8x16x3_sse3
;
272 cpi
->rtcd
.variance
.sad8x8x3
= vp8_sad8x8x3_sse3
;
273 cpi
->rtcd
.variance
.sad4x4x3
= vp8_sad4x4x3_sse3
;
274 cpi
->rtcd
.search
.full_search
= vp8_full_search_sadx3
;
275 cpi
->rtcd
.variance
.sad16x16x4d
= vp8_sad16x16x4d_sse3
;
276 cpi
->rtcd
.variance
.sad16x8x4d
= vp8_sad16x8x4d_sse3
;
277 cpi
->rtcd
.variance
.sad8x16x4d
= vp8_sad8x16x4d_sse3
;
278 cpi
->rtcd
.variance
.sad8x8x4d
= vp8_sad8x8x4d_sse3
;
279 cpi
->rtcd
.variance
.sad4x4x4d
= vp8_sad4x4x4d_sse3
;
280 cpi
->rtcd
.search
.diamond_search
= vp8_diamond_search_sadx4
;
287 cpi
->rtcd
.variance
.sad16x16x3
= vp8_sad16x16x3_ssse3
;
288 cpi
->rtcd
.variance
.sad16x8x3
= vp8_sad16x8x3_ssse3
;
290 cpi
->rtcd
.variance
.subpixvar16x8
= vp8_sub_pixel_variance16x8_ssse3
;
291 cpi
->rtcd
.variance
.subpixvar16x16
= vp8_sub_pixel_variance16x16_ssse3
;
293 cpi
->rtcd
.quantize
.fastquantb
= vp8_fast_quantize_b_ssse3
;
297 cpi
->rtcd
.variance
.ssimpf_8x8
= vp8_ssim_parms_8x8_sse3
;
298 cpi
->rtcd
.variance
.ssimpf
= vp8_ssim_parms_16x16_sse3
;
310 cpi
->rtcd
.variance
.sad16x16x8
= vp8_sad16x16x8_sse4
;
311 cpi
->rtcd
.variance
.sad16x8x8
= vp8_sad16x8x8_sse4
;
312 cpi
->rtcd
.variance
.sad8x16x8
= vp8_sad8x16x8_sse4
;
313 cpi
->rtcd
.variance
.sad8x8x8
= vp8_sad8x8x8_sse4
;
314 cpi
->rtcd
.variance
.sad4x4x8
= vp8_sad4x4x8_sse4
;
315 cpi
->rtcd
.search
.full_search
= vp8_full_search_sadx8
;