/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "vpx_ports/config.h"
#include "vpx_ports/x86.h"
#include "variance.h"
#include "onyx_int.h"

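/* MMX-accelerated encoder primitives. Each wrapper below unpacks the
 * encoder's block structures into the flat argument list expected by the
 * corresponding assembly routine; the 8x4 forward DCT is built from two
 * 4x4 transforms applied to the left and right halves of the block. */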
void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
{
    vp8_short_fdct4x4_mmx(input,     output,      pitch);
    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
}
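
/* Fast quantizer: the MMX implementation reads the block's coefficient,
 * zero-bin, round, and quantizer tables, writes the quantized and
 * dequantized coefficients, and returns the end-of-block index that the
 * wrapper stores in d->eob. */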
int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
                                 short *qcoeff_ptr, short *dequant_ptr,
                                 short *scan_mask, short *round_ptr,
                                 short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; //d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *zbin_ptr    = b->zbin;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_mmx(coeff_ptr, zbin_ptr, qcoeff_ptr,
                                          dequant_ptr, scan_mask, round_ptr,
                                          quant_ptr, dqcoeff_ptr);
}

int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_mmx(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
                             short *diff, unsigned char *predictor,
                             int pitch);
void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
}
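
/* SSE2 wrappers mirror the MMX versions above. Note that the SSE2 fast
 * quantizer takes an inverse zig-zag table (vp8_default_inv_zig_zag)
 * rather than the forward scan mask used by the MMX path. */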
int vp8_fast_quantize_b_impl_sse2(short *coeff_ptr,
                                  short *qcoeff_ptr, short *dequant_ptr,
                                  const short *inv_scan_order, short *round_ptr,
                                  short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    short *scan_mask   = vp8_default_zig_zag_mask; //d->scan_order_mask_ptr;
    short *coeff_ptr   = b->coeff;
    short *round_ptr   = b->round;
    short *quant_ptr   = b->quant_fast;
    short *qcoeff_ptr  = d->qcoeff;
    short *dqcoeff_ptr = d->dqcoeff;
    short *dequant_ptr = d->dequant;

    d->eob = vp8_fast_quantize_b_impl_sse2(coeff_ptr,
                                           qcoeff_ptr,
                                           dequant_ptr,
                                           vp8_default_inv_zig_zag,
                                           round_ptr,
                                           quant_ptr,
                                           dqcoeff_ptr);
}
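
/* Regular quantizer: compared with the fast path it also consumes the
 * zero-bin boost table, the zbin_oq adjustment, and the quantizer shift
 * values listed in the prototype below. */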
int vp8_regular_quantize_b_impl_sse2(short *coeff_ptr, short *zbin_ptr,
                                     short *qcoeff_ptr, short *dequant_ptr,
                                     const int *default_zig_zag, short *round_ptr,
                                     short *quant_ptr, short *dqcoeff_ptr,
                                     unsigned short zbin_oq_value,
                                     short *zbin_boost_ptr,
                                     short *quant_shift_ptr);
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    d->eob = vp8_regular_quantize_b_impl_sse2(b->coeff,
                                              b->zbin,
                                              d->qcoeff,
                                              d->dequant,
                                              vp8_default_zig_zag1d,
                                              b->round,
                                              b->quant,
                                              d->dqcoeff,
                                              b->zbin_extra,
                                              b->zrun_zbin_boost,
                                              b->quant_shift);
}
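
/* The "xmm" suffix marks the SSE2 implementations of the block-error helpers. */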
int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
{
    short *coeff_ptr = mb->block[0].coeff;
    short *dcoef_ptr = mb->e_mbd.block[0].dqcoeff;
    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
}

int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
int vp8_mbuverror_xmm(MACROBLOCK *mb)
{
    short *s_ptr = &mb->coeff[256];
    short *d_ptr = &mb->e_mbd.dqcoeff[256];
    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}

void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
                              short *diff, unsigned char *predictor,
                              int pitch);
void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
{
    unsigned char *z = *(be->base_src) + be->src;
    unsigned int src_stride = be->src_stride;
    short *diff = &be->src_diff[0];
    unsigned char *predictor = &bd->predictor[0];
    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
}
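
/* SSSE3 variant of the fast quantizer; the wrapper passes the block and
 * dequant pointers straight through instead of caching them in locals. */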
int vp8_fast_quantize_b_impl_ssse3(short *coeff_ptr,
                                   short *qcoeff_ptr, short *dequant_ptr,
                                   short *round_ptr,
                                   short *quant_ptr, short *dqcoeff_ptr);
void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d)
{
    d->eob = vp8_fast_quantize_b_impl_ssse3(b->coeff,
                                            d->qcoeff,
                                            d->dequant,
                                            b->round,
                                            b->quant_fast,
                                            d->dqcoeff);
}
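
/*
 * vp8_arch_x86_encoder_init() performs runtime CPU detection (RTCD): it
 * probes the host once via x86_simd_caps() and replaces the default C
 * entries in cpi->rtcd with the fastest implementation the CPU supports,
 * applying each tier (MMX, SSE2, SSE3, SSSE3, SSE4.1) in turn so later
 * tiers override earlier ones. Callers then reach these routines through
 * the table rather than a fixed symbol; as an illustrative sketch only
 * (the exact function-pointer prototypes live in the encoder's RTCD
 * headers, not in this file):
 *
 *     unsigned int sad = cpi->rtcd.variance.sad16x16(src, src_stride,
 *                                                    ref, ref_stride, INT_MAX);
 */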
void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
{
#if CONFIG_RUNTIME_CPU_DETECT
    int flags = x86_simd_caps();
    int mmx_enabled = flags & HAS_MMX;
    int xmm_enabled = flags & HAS_SSE;
    int wmt_enabled = flags & HAS_SSE2;
    int SSE3Enabled = flags & HAS_SSE3;
    int SSSE3Enabled = flags & HAS_SSSE3;
    int SSE4_1Enabled = flags & HAS_SSE4_1;
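
    /* x86_simd_caps() returns a bitmask of the features reported by CPUID,
     * so each tier can be tested independently below (e.g. an SSE2-capable
     * part also reports HAS_MMX and HAS_SSE). */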

    /*
     * This platform can be built without runtime CPU detection as well. If
     * you modify any of the function mappings present in this file, be sure
     * to also update them in the static mappings (<arch>/filename_<arch>.h).
     */

    /* Override default functions with fastest ones for this CPU. */
    if (mmx_enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx;
        cpi->rtcd.variance.sad16x8 = vp8_sad16x8_mmx;
        cpi->rtcd.variance.sad8x16 = vp8_sad8x16_mmx;
        cpi->rtcd.variance.sad8x8 = vp8_sad8x8_mmx;
        cpi->rtcd.variance.sad4x4 = vp8_sad4x4_mmx;

        cpi->rtcd.variance.var4x4 = vp8_variance4x4_mmx;
        cpi->rtcd.variance.var8x8 = vp8_variance8x8_mmx;
        cpi->rtcd.variance.var8x16 = vp8_variance8x16_mmx;
        cpi->rtcd.variance.var16x8 = vp8_variance16x8_mmx;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_mmx;

        cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_mmx;
        cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_mmx;
        cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_mmx;
        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_mmx;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_mmx;
        cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_mmx;
        cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_mmx;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_mmx;
        cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_mmx;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_mmx;
        cpi->rtcd.variance.getmbss = vp8_get_mb_ss_mmx;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_mmx;
        cpi->rtcd.variance.get8x8var = vp8_get8x8var_mmx;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_mmx;
        cpi->rtcd.variance.get4x4sse_cs = vp8_get4x4sse_cs_mmx;

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_mmx;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_mmx;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_mmx;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_c;

        cpi->rtcd.encodemb.berr = vp8_block_error_mmx;
        cpi->rtcd.encodemb.mberr = vp8_mbblock_error_mmx;
        cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_mmx;
        cpi->rtcd.encodemb.subb = vp8_subtract_b_mmx;
        cpi->rtcd.encodemb.submby = vp8_subtract_mby_mmx;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_mmx;
        /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/
    }

    if (wmt_enabled)
    {
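        /* "wmt" (Willamette) is this codebase's historical shorthand for SSE2. */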
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt;
        cpi->rtcd.variance.sad16x8 = vp8_sad16x8_wmt;
        cpi->rtcd.variance.sad8x16 = vp8_sad8x16_wmt;
        cpi->rtcd.variance.sad8x8 = vp8_sad8x8_wmt;
        cpi->rtcd.variance.sad4x4 = vp8_sad4x4_wmt;

        cpi->rtcd.variance.var4x4 = vp8_variance4x4_wmt;
        cpi->rtcd.variance.var8x8 = vp8_variance8x8_wmt;
        cpi->rtcd.variance.var8x16 = vp8_variance8x16_wmt;
        cpi->rtcd.variance.var16x8 = vp8_variance16x8_wmt;
        cpi->rtcd.variance.var16x16 = vp8_variance16x16_wmt;

        cpi->rtcd.variance.subpixvar4x4 = vp8_sub_pixel_variance4x4_wmt;
        cpi->rtcd.variance.subpixvar8x8 = vp8_sub_pixel_variance8x8_wmt;
        cpi->rtcd.variance.subpixvar8x16 = vp8_sub_pixel_variance8x16_wmt;
        cpi->rtcd.variance.subpixvar16x8 = vp8_sub_pixel_variance16x8_wmt;
        cpi->rtcd.variance.subpixvar16x16 = vp8_sub_pixel_variance16x16_wmt;
        cpi->rtcd.variance.halfpixvar16x16_h = vp8_variance_halfpixvar16x16_h_wmt;
        cpi->rtcd.variance.halfpixvar16x16_v = vp8_variance_halfpixvar16x16_v_wmt;
        cpi->rtcd.variance.halfpixvar16x16_hv = vp8_variance_halfpixvar16x16_hv_wmt;
        cpi->rtcd.variance.subpixmse16x16 = vp8_sub_pixel_mse16x16_wmt;

        cpi->rtcd.variance.mse16x16 = vp8_mse16x16_wmt;
        cpi->rtcd.variance.getmbss = vp8_get_mb_ss_sse2;

        cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2;
        cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2;
        cpi->rtcd.variance.get16x16var = vp8_get16x16var_sse2;
        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */

        cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.short8x4 = vp8_short_fdct8x4_sse2;
        cpi->rtcd.fdct.fast4x4 = vp8_short_fdct4x4_sse2;
        cpi->rtcd.fdct.fast8x4 = vp8_short_fdct8x4_sse2;

        cpi->rtcd.fdct.walsh_short4x4 = vp8_short_walsh4x4_sse2;

        cpi->rtcd.encodemb.berr = vp8_block_error_xmm;
        cpi->rtcd.encodemb.mberr = vp8_mbblock_error_xmm;
        cpi->rtcd.encodemb.mbuverr = vp8_mbuverror_xmm;
        cpi->rtcd.encodemb.subb = vp8_subtract_b_sse2;
        cpi->rtcd.encodemb.submby = vp8_subtract_mby_sse2;
        cpi->rtcd.encodemb.submbuv = vp8_subtract_mbuv_sse2;

        cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;

        cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2;

        cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_sse2;
    }

    if (SSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3;
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_sse3;
        cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_sse3;
        cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3;
        cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3;
        cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3;
#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search = vp8_full_search_sadx3;
#endif
        cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3;
        cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3;
        cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3;
        cpi->rtcd.variance.sad8x8x4d = vp8_sad8x8x4d_sse3;
        cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3;
        cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4;
    }

    if (SSSE3Enabled)
    {
        cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3;
        cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3;

        cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3;
    }

    if (SSE4_1Enabled)
    {
        cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4;
        cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4;
        cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4;
        cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4;
        cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4;
#if !(CONFIG_REALTIME_ONLY)
        cpi->rtcd.search.full_search = vp8_full_search_sadx8;
;