Merge "keep values in registers during quantization"
[libvpx.git] / vp8 / common / x86 / vp8_asm_stubs.c
blob79040060937a4122390cbab74396e37e40250a10
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
12 #include "vpx_ports/config.h"
13 #include "vpx_ports/mem.h"
14 #include "vp8/common/subpixel.h"
/* Filter coefficient tables defined elsewhere in the project.  The
 * predictors in this file index vp8_six_tap_mmx with their xoffset and
 * yoffset arguments to select the taps for each pass. */
extern const short vp8_six_tap_mmx[8][6*8];
extern const short vp8_bilinear_filters_mmx[8][2*8];

/* MMX six-tap kernel used as the first pass by the MMX predictors below.
 * Defined outside this file (presumably in assembly -- confirm).  Takes
 * unsigned char source pixels and an unsigned short output buffer. */
extern void vp8_filter_block1d_h6_mmx(
    unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter);

/* MMX six-tap kernel used as the second pass by the MMX predictors below.
 * Defined outside this file.  Takes the unsigned short intermediate data
 * and an unsigned char destination with its own pitch. */
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr,
    unsigned char *output_ptr,
    int output_pitch,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter);
/* SSE2 six-tap kernels for the two-pass path; all are defined outside this
 * file.  The h6 routines take unsigned char source pixels and an unsigned
 * short output buffer; the v6 routines take unsigned short input and an
 * unsigned char destination. */
extern void vp8_filter_block1d8_h6_sse2(
    unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter);

extern void vp8_filter_block1d16_h6_sse2(
    unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter);

extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr,
    unsigned char *output_ptr,
    int dst_pitch,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter);

extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr,
    unsigned char *output_ptr,
    int dst_pitch,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    const short *vp8_filter);
/* SSE2 helper that fills an unsigned short buffer from unsigned char source
 * pixels; it takes no filter argument.  Defined outside this file. */
extern void vp8_unpack_block1d16_h6_sse2(
    unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    unsigned int output_width);

/* SSE2 single-pass six-tap kernels: unsigned char in, unsigned char out,
 * no intermediate buffer argument.  Defined outside this file. */
extern void vp8_filter_block1d8_h6_only_sse2(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    int dst_pitch,
    unsigned int output_height,
    const short *vp8_filter);

extern void vp8_filter_block1d16_h6_only_sse2(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    int dst_pitch,
    unsigned int output_height,
    const short *vp8_filter);

extern void vp8_filter_block1d8_v6_only_sse2(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    int dst_pitch,
    unsigned int output_height,
    const short *vp8_filter);

/* 8x8 bilinear predictor, declared through the project's prototype macro;
 * vp8_bilinear_predict16x16_mmx calls it once per 8x8 quadrant. */
extern prototype_subpixel_predict(vp8_bilinear_predict8x8_mmx);
120 #if HAVE_MMX
121 void vp8_sixtap_predict4x4_mmx
123 unsigned char *src_ptr,
124 int src_pixels_per_line,
125 int xoffset,
126 int yoffset,
127 unsigned char *dst_ptr,
128 int dst_pitch
131 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */
132 const short *HFilter, *VFilter;
133 HFilter = vp8_six_tap_mmx[xoffset];
134 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter);
135 VFilter = vp8_six_tap_mmx[yoffset];
136 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter);
141 void vp8_sixtap_predict16x16_mmx
143 unsigned char *src_ptr,
144 int src_pixels_per_line,
145 int xoffset,
146 int yoffset,
147 unsigned char *dst_ptr,
148 int dst_pitch
152 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */
154 const short *HFilter, *VFilter;
157 HFilter = vp8_six_tap_mmx[xoffset];
159 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
160 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter);
161 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter);
162 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter);
164 VFilter = vp8_six_tap_mmx[yoffset];
165 vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter);
166 vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter);
167 vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter);
168 vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter);
173 void vp8_sixtap_predict8x8_mmx
175 unsigned char *src_ptr,
176 int src_pixels_per_line,
177 int xoffset,
178 int yoffset,
179 unsigned char *dst_ptr,
180 int dst_pitch
184 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
186 const short *HFilter, *VFilter;
188 HFilter = vp8_six_tap_mmx[xoffset];
189 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
190 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter);
192 VFilter = vp8_six_tap_mmx[yoffset];
193 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter);
194 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter);
199 void vp8_sixtap_predict8x4_mmx
201 unsigned char *src_ptr,
202 int src_pixels_per_line,
203 int xoffset,
204 int yoffset,
205 unsigned char *dst_ptr,
206 int dst_pitch
210 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
212 const short *HFilter, *VFilter;
214 HFilter = vp8_six_tap_mmx[xoffset];
215 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
216 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter);
218 VFilter = vp8_six_tap_mmx[yoffset];
219 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter);
220 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter);
/*
 * Bilinear prediction of a 16x16 block, built from four calls to
 * vp8_bilinear_predict8x8_mmx: one per 8x8 quadrant, in the order
 * top-left, top-right, bottom-left, bottom-right.
 *
 *   src_ptr, src_pixels_per_line  source pixels and the per-row step
 *   xoffset, yoffset              forwarded unchanged to every 8x8 call
 *   dst_ptr, dst_pitch            destination and its pitch
 */
void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                   int src_pixels_per_line,
                                   int xoffset,
                                   int yoffset,
                                   unsigned char *dst_ptr,
                                   int dst_pitch)
{
    int row;
    int col;

    for (row = 0; row < 16; row += 8)
    {
        for (col = 0; col < 16; col += 8)
        {
            vp8_bilinear_predict8x8_mmx(
                src_ptr + row * src_pixels_per_line + col,
                src_pixels_per_line, xoffset, yoffset,
                dst_ptr + dst_pitch * row + col, dst_pitch);
        }
    }
}
241 #endif
244 #if HAVE_SSE2
245 void vp8_sixtap_predict16x16_sse2
247 unsigned char *src_ptr,
248 int src_pixels_per_line,
249 int xoffset,
250 int yoffset,
251 unsigned char *dst_ptr,
252 int dst_pitch
256 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */
258 const short *HFilter, *VFilter;
260 if (xoffset)
262 if (yoffset)
264 HFilter = vp8_six_tap_mmx[xoffset];
265 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter);
266 VFilter = vp8_six_tap_mmx[yoffset];
267 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
269 else
271 /* First-pass only */
272 HFilter = vp8_six_tap_mmx[xoffset];
273 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter);
276 else
278 /* Second-pass only */
279 VFilter = vp8_six_tap_mmx[yoffset];
280 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32);
281 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter);
286 void vp8_sixtap_predict8x8_sse2
288 unsigned char *src_ptr,
289 int src_pixels_per_line,
290 int xoffset,
291 int yoffset,
292 unsigned char *dst_ptr,
293 int dst_pitch
296 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
297 const short *HFilter, *VFilter;
299 if (xoffset)
301 if (yoffset)
303 HFilter = vp8_six_tap_mmx[xoffset];
304 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter);
305 VFilter = vp8_six_tap_mmx[yoffset];
306 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter);
308 else
310 /* First-pass only */
311 HFilter = vp8_six_tap_mmx[xoffset];
312 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter);
315 else
317 /* Second-pass only */
318 VFilter = vp8_six_tap_mmx[yoffset];
319 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter);
324 void vp8_sixtap_predict8x4_sse2
326 unsigned char *src_ptr,
327 int src_pixels_per_line,
328 int xoffset,
329 int yoffset,
330 unsigned char *dst_ptr,
331 int dst_pitch
334 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */
335 const short *HFilter, *VFilter;
337 if (xoffset)
339 if (yoffset)
341 HFilter = vp8_six_tap_mmx[xoffset];
342 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter);
343 VFilter = vp8_six_tap_mmx[yoffset];
344 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter);
346 else
348 /* First-pass only */
349 HFilter = vp8_six_tap_mmx[xoffset];
350 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter);
353 else
355 /* Second-pass only */
356 VFilter = vp8_six_tap_mmx[yoffset];
357 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter);
361 #endif
363 #if HAVE_SSSE3
/* SSSE3 six-tap kernels, defined outside this file.  Unlike the MMX/SSE2
 * kernels they take unsigned char for both source and output, and receive
 * a filter index instead of a pointer to the taps. */
extern void vp8_filter_block1d8_h6_ssse3(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(
    unsigned char *src_ptr,
    unsigned int src_pitch,
    unsigned char *output_ptr,
    unsigned int out_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(
    unsigned char *src_ptr,
    unsigned int src_pitch,
    unsigned char *output_ptr,
    unsigned int out_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_h6_ssse3(
    unsigned char *src_ptr,
    unsigned int src_pixels_per_line,
    unsigned char *output_ptr,
    unsigned int output_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_v6_ssse3(
    unsigned char *src_ptr,
    unsigned int src_pitch,
    unsigned char *output_ptr,
    unsigned int out_pitch,
    unsigned int output_height,
    unsigned int vp8_filter_index);
425 void vp8_sixtap_predict16x16_ssse3
427 unsigned char *src_ptr,
428 int src_pixels_per_line,
429 int xoffset,
430 int yoffset,
431 unsigned char *dst_ptr,
432 int dst_pitch
436 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24);
438 if (xoffset)
440 if (yoffset)
442 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 16, 21, xoffset);
443 vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 16, yoffset);
445 else
447 /* First-pass only */
448 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, xoffset);
451 else
453 /* Second-pass only */
454 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line) , src_pixels_per_line, dst_ptr, dst_pitch, 16, yoffset);
458 void vp8_sixtap_predict8x8_ssse3
460 unsigned char *src_ptr,
461 int src_pixels_per_line,
462 int xoffset,
463 int yoffset,
464 unsigned char *dst_ptr,
465 int dst_pitch
468 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
470 if (xoffset)
472 if (yoffset)
474 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 13, xoffset);
475 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
477 else
479 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, xoffset);
482 else
484 /* Second-pass only */
485 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, yoffset);
490 void vp8_sixtap_predict8x4_ssse3
492 unsigned char *src_ptr,
493 int src_pixels_per_line,
494 int xoffset,
495 int yoffset,
496 unsigned char *dst_ptr,
497 int dst_pitch
500 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256);
502 if (xoffset)
504 if (yoffset)
506 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 8, 9, xoffset);
507 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
509 else
511 /* First-pass only */
512 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
515 else
517 /* Second-pass only */
518 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
522 void vp8_sixtap_predict4x4_ssse3
524 unsigned char *src_ptr,
525 int src_pixels_per_line,
526 int xoffset,
527 int yoffset,
528 unsigned char *dst_ptr,
529 int dst_pitch
532 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9);
534 if (xoffset)
536 if (yoffset)
538 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, FData2, 4, 9, xoffset);
539 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
541 else
543 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, xoffset);
546 else
548 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, yoffset);
553 #endif