/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
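/* This file is the C-level glue for the x86-optimized encoder routines:
 * thin wrappers around the SIMD kernels plus the runtime code that plugs
 * them into the encoder's RTCD (run-time CPU detection) function tables. */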
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_c(input,     output,      pitch);
    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
}
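/* The vp8_*_impl_* routines declared below are provided by the x86 assembly
 * sources.  Each C wrapper gathers the per-block pointers from the
 * BLOCK/BLOCKD structures, calls the kernel, and (for the quantizers)
 * stores the returned end-of-block index in d->eob. */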
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; //d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_mmx(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}
int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}
int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_mmx(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}
void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];

    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}
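/* SSE2 counterparts of the MMX wrappers above. */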
void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_sse2(input,     output,      pitch);
    vp8_short_fdct4x4_sse2(input + 4, output + 16, pitch);
}
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  short *scan_mask, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; //d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_sse2(
                 coeff_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 scan_mask,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr
             );
}
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
                                     short *qcoeff_ptr, short *dequant_ptr,
                                     const int *default_zig_zag, short *round_ptr,
                                     short *quant_ptr, short *dqcoeff_ptr,
                                     unsigned short zbin_oq_value,
                                     short *zbin_boost_ptr);

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *zbin_boost_ptr = b->zrun_zbin_boost;
    short *coeff_ptr      = b->coeff;
    short *zbin_ptr       = b->zbin;
    short *round_ptr      = b->round;
    short *quant_ptr      = b->quant;
    short *qcoeff_ptr     = d->qcoeff;
    short *dqcoeff_ptr    = d->dqcoeff;
    short *dequant_ptr    = d->dequant;
    short zbin_oq_value   = b->zbin_extra;

    d->eob = vp8_regular_quantize_b_impl_sse2(
                 coeff_ptr,
                 zbin_ptr,
                 qcoeff_ptr,
                 dequant_ptr,
                 vp8_default_zig_zag1d,
                 round_ptr,
                 quant_ptr,
                 dqcoeff_ptr,
                 zbin_oq_value,
                 zbin_boost_ptr
             );
}
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}
int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}
void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];

    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}
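/* Called at encoder initialization: query the CPU's SIMD capabilities and
 * point the RTCD tables at the fastest implementations available. */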
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;
    int wmt_enabled = flags & HAS_SSE2;
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;

    /* Note:
     *
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h)
     */
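    /* For reference, a static mapping entry follows this general pattern
     * (a sketch of the RTCD override convention; the exact header and macro
     * names vary per module and are not taken from this file):
     *
     *   #if HAVE_MMX && !CONFIG_RUNTIME_CPU_DETECT
     *   #undef  vp8_fdct_short4x4
     *   #define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
     *   #endif
     */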
    /* Override default functions with fastest ones for this CPU. */
#if HAVE_MMX
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4                = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8                = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16               = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8               = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16              = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_mmx;
        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;

        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_mmx;

        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_c;
        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_c;

        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr                  = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_mmx;
        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_mmx;
        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_mmx;

        /*cpi->rtcd.quantize.fastquantb          = vp8_fast_quantize_b_mmx;*/
    }
#endif

#if HAVE_SSE2
    if (wmt_enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_wmt;

        cpi->rtcd.variance.var4x4                = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8                = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16               = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8               = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16              = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_sse2;

        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;
        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;
        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;
        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */;

        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4                   = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4                   = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_sse2;

        cpi->rtcd.encodemb.berr                  = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_sse2;
        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_sse2;

        /*cpi->rtcd.quantize.quantb              = vp8_regular_quantize_b_sse2;*/
        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
    }
#endif

#if HAVE_SSE3
    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_sse3;
        cpi->rtcd.search.full_search             = vp8_full_search_sadx3;

        cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d             = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search          = vp8_diamond_search_sadx4;
    }
#endif

#if HAVE_SSSE3
    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;