/* theora-old/lib/x86_64/recon_mmx.c */
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id$

 ********************************************************************/
#include "codec_internal.h"

/* Local 64-bit type; 32-bit strides are widened to it for 64-bit addressing. */
typedef unsigned long long ogg_uint64_t;

/* 0x80 in every byte: XORing packed signed bytes with this flips them to
   unsigned, i.e. adds 128 to each byte. */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128 = 0x8080808080808080LL;
static void copy8x8__mmx (unsigned char *src,
                          unsigned char *dest,
                          ogg_uint32_t stride)
{
  __asm__ __volatile__ (
    "  .balign 16                      \n\t"

    "  lea         (%2, %2, 2), %%rdi  \n\t" /* rdi = 3*stride */

    "  movq        (%1), %%mm0         \n\t" /* load source rows 0-3 */
    "  movq        (%1, %2), %%mm1     \n\t"
    "  movq        (%1, %2, 2), %%mm2  \n\t"
    "  movq        (%1, %%rdi), %%mm3  \n\t"

    "  lea         (%1, %2, 4), %1     \n\t" /* src += 4*stride */

    "  movq        %%mm0, (%0)         \n\t" /* store rows 0-3 */
    "  movq        %%mm1, (%0, %2)     \n\t"
    "  movq        %%mm2, (%0, %2, 2)  \n\t"
    "  movq        %%mm3, (%0, %%rdi)  \n\t"

    "  lea         (%0, %2, 4), %0     \n\t" /* dest += 4*stride */

    "  movq        (%1), %%mm0         \n\t" /* load source rows 4-7 */
    "  movq        (%1, %2), %%mm1     \n\t"
    "  movq        (%1, %2, 2), %%mm2  \n\t"
    "  movq        (%1, %%rdi), %%mm3  \n\t"

    "  movq        %%mm0, (%0)         \n\t" /* store rows 4-7 */
    "  movq        %%mm1, (%0, %2)     \n\t"
    "  movq        %%mm2, (%0, %2, 2)  \n\t"
    "  movq        %%mm3, (%0, %%rdi)  \n\t"
      : "+a" (dest),  /* both pointers are advanced inside the asm, */
        "+c" (src)    /* so they must be in-out operands            */
      : "d" ((ogg_uint64_t)stride)
      : "memory", "rdi"
  );
}
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
                                 ogg_uint32_t LineStep)
{
  __asm__ __volatile__ (
    "  .balign 16                 \n\t"

    "  movq        %[V128], %%mm0 \n\t" /* Set mm0 to 0x8080808080808080 */

    "  lea         128(%1), %%rdi \n\t" /* Endpoint in input buffer */
    "1:                           \n\t"
    "  movq        (%1), %%mm2    \n\t" /* First four input values */

    "  packsswb    8(%1), %%mm2   \n\t" /* pack with next (high) four values */
    "  por         %%mm0, %%mm0   \n\t" /* no-op */
    "  pxor        %%mm0, %%mm2   \n\t" /* Convert result to unsigned (same as add 128) */
    "  lea         16(%1), %1     \n\t" /* Step source buffer */
    "  cmp         %%rdi, %1      \n\t" /* are we done? */

    "  movq        %%mm2, (%0)    \n\t" /* store results */

    "  lea         (%0, %2), %0   \n\t" /* Step output buffer */
    "  jc          1b             \n\t" /* Loop back if we are not done */
      : "+r" (ReconPtr),
        "+r" (ChangePtr)          /* advanced inside the asm: in-out */
      : "r" ((ogg_uint64_t)LineStep),
        [V128] "m" (V128)
      : "memory", "rdi"
  );
}
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
                                 ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
  __asm__ __volatile__ (
    "  .balign 16                 \n\t"

    "  pxor        %%mm0, %%mm0   \n\t" /* zero mm0 for byte->word unpacking */
    "  lea         128(%1), %%rdi \n\t" /* endpoint in change buffer */

    "1:                           \n\t"
    "  movq        (%2), %%mm2    \n\t" /* (+3 misaligned) 8 reference pixels */

    "  movq        (%1), %%mm4    \n\t" /* first 4 changes */
    "  movq        %%mm2, %%mm3   \n\t"
    "  movq        8(%1), %%mm5   \n\t" /* last 4 changes */
    "  punpcklbw   %%mm0, %%mm2   \n\t" /* turn first 4 refs into positive 16-bit #s */
    "  paddsw      %%mm4, %%mm2   \n\t" /* add in first 4 changes */
    "  punpckhbw   %%mm0, %%mm3   \n\t" /* turn last 4 refs into positive 16-bit #s */
    "  paddsw      %%mm5, %%mm3   \n\t" /* add in last 4 changes */
    "  add         %3, %2         \n\t" /* next row of reference pixels */
    "  packuswb    %%mm3, %%mm2   \n\t" /* pack result to unsigned 8-bit values */
    "  lea         16(%1), %1     \n\t" /* next row of changes */
    "  cmp         %%rdi, %1      \n\t" /* are we done? */

    "  movq        %%mm2, (%0)    \n\t" /* store result */

    "  lea         (%0, %3), %0   \n\t" /* next row of output */
    "  jc          1b             \n\t"
      : "+r" (ReconPtr),
        "+r" (ChangePtr),         /* advanced inside the asm: in-out */
        "+r" (RefPtr)
      : "r" ((ogg_uint64_t)LineStep)
      : "memory", "rdi"
  );
}
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
                                      unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
                                      ogg_uint32_t LineStep)
{
  __asm__ __volatile__ (
    "  .balign 16                 \n\t"

    "  pxor        %%mm0, %%mm0   \n\t" /* zero mm0 for byte->word unpacking */
    "  lea         128(%1), %%rdi \n\t" /* endpoint in change buffer */

    "1:                           \n\t"
    "  movq        (%2), %%mm2    \n\t" /* (+3 misaligned) 8 reference pixels */
    "  movq        (%3), %%mm4    \n\t" /* (+3 misaligned) 8 reference pixels */

    "  movq        %%mm2, %%mm3   \n\t"
    "  punpcklbw   %%mm0, %%mm2   \n\t" /* mm2 = start ref1 as positive 16-bit #s */
    "  movq        %%mm4, %%mm5   \n\t"
    "  movq        (%1), %%mm6    \n\t" /* first 4 changes */
    "  punpckhbw   %%mm0, %%mm3   \n\t" /* mm3 = end ref1 as positive 16-bit #s */
    "  movq        8(%1), %%mm7   \n\t" /* last 4 changes */
    "  punpcklbw   %%mm0, %%mm4   \n\t" /* mm4 = start ref2 as positive 16-bit #s */
    "  punpckhbw   %%mm0, %%mm5   \n\t" /* mm5 = end ref2 as positive 16-bit #s */
    "  paddw       %%mm4, %%mm2   \n\t" /* mm2 = start (ref1 + ref2) */
    "  paddw       %%mm5, %%mm3   \n\t" /* mm3 = end (ref1 + ref2) */
    "  psrlw       $1, %%mm2      \n\t" /* mm2 = start (ref1 + ref2)/2 */
    "  psrlw       $1, %%mm3      \n\t" /* mm3 = end (ref1 + ref2)/2 */
    "  paddw       %%mm6, %%mm2   \n\t" /* add changes to start */
    "  paddw       %%mm7, %%mm3   \n\t" /* add changes to end */
    "  lea         16(%1), %1     \n\t" /* next row of changes */
    "  packuswb    %%mm3, %%mm2   \n\t" /* pack start|end to unsigned 8-bit */
    "  add         %4, %2         \n\t" /* next row of reference pixels */
    "  add         %4, %3         \n\t" /* next row of reference pixels */
    "  movq        %%mm2, (%0)    \n\t" /* store result */
    "  add         %4, %0         \n\t" /* next row of output */
    "  cmp         %%rdi, %1      \n\t" /* are we done? */
    "  jc          1b             \n\t"
      : "+r" (ReconPtr),
        "+r" (ChangePtr),         /* advanced inside the asm: in-out */
        "+r" (RefPtr1),
        "+r" (RefPtr2)
      : "r" ((ogg_uint64_t)LineStep)
      : "memory", "rdi"
  );
}
void dsp_mmx_recon_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_64 mmx recon functions.\n");
  funcs->copy8x8 = copy8x8__mmx;
  funcs->recon_intra8x8 = recon_intra8x8__mmx;
  funcs->recon_inter8x8 = recon_inter8x8__mmx;
  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
}
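
/*
 * Usage sketch (hypothetical; everything here other than dsp_mmx_recon_init
 * and DspFunctions is an assumption, including the dsp_init and
 * cpu_has_mmx helpers): a caller fills a DspFunctions table with scalar
 * defaults and then lets this init routine override the recon entries once
 * MMX support has been detected.
 *
 *   DspFunctions funcs;
 *   dsp_init(&funcs);             // hypothetical: install scalar defaults
 *   if (cpu_has_mmx())            // hypothetical CPU-feature check
 *     dsp_mmx_recon_init(&funcs);
 */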