/* apps/codecs/libtremor/asm_arm.h
   (scraped from gitweb: kugel-rb/myfork.git,
    blob 5a8109841f3b073ce7e4f7a385e1bf54d76564b5,
    page title "FS#8961 - Anti-Aliased Fonts" — kept for provenance) */
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggVorbis 'TREMOR' CODEC SOURCE CODE.   *
 *                                                                  *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE OggVorbis 'TREMOR' SOURCE CODE IS (C) COPYRIGHT 1994-2002    *
 * BY THE Xiph.Org FOUNDATION http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

 function: arm7 and later wide math functions

 ********************************************************************/
18 #ifdef _ARM_ASSEM_
20 #if !defined(_V_WIDE_MATH) && !defined(_LOW_ACCURACY_)
21 #define _V_WIDE_MATH
23 static inline ogg_int32_t MULT32(ogg_int32_t x, ogg_int32_t y) {
24 int lo,hi;
25 asm volatile("smull\t%0, %1, %2, %3"
26 : "=&r"(lo),"=&r"(hi)
27 : "%r"(x),"r"(y)
28 : "cc");
29 return(hi);
32 static inline ogg_int32_t MULT31(ogg_int32_t x, ogg_int32_t y) {
33 return MULT32(x,y)<<1;
36 static inline ogg_int32_t MULT31_SHIFT15(ogg_int32_t x, ogg_int32_t y) {
37 int lo,hi;
38 asm volatile("smull %0, %1, %2, %3\n\t"
39 "movs %0, %0, lsr #15\n\t"
40 "adc %1, %0, %1, lsl #17\n\t"
41 : "=&r"(lo),"=&r"(hi)
42 : "%r"(x),"r"(y)
43 : "cc");
44 return(hi);
/* Compiler-level memory barrier: the "memory" clobber stops the
   compiler from reordering or caching memory accesses across this
   point (no hardware barrier instruction is emitted). */
#define MB() asm volatile ("" : : : "memory")
/* Cross product: x = (a*t + b*v)>>32, y = (b*t - a*v)>>32.
   The RSB negates a in-place (operand %3/%4 share a register via the
   "3" matching constraint) so the second pair of multiplies computes
   b*t + (-a)*v with SMULL/SMLAL. */
#define XPROD32(a, b, t, v, x, y) \
{ \
  long l; \
  asm( "smull %0, %1, %4, %6\n\t" \
       "smlal %0, %1, %5, %7\n\t" \
       "rsb   %3, %4, #0\n\t" \
       "smull %0, %2, %5, %6\n\t" \
       "smlal %0, %2, %3, %7" \
       : "=&r" (l), "=&r" (x), "=&r" (y), "=r" ((a)) \
       : "3" ((a)), "r" ((b)), "r" ((t)), "r" ((v)) \
       : "cc" ); \
}
62 static inline void XPROD31(ogg_int32_t a, ogg_int32_t b,
63 ogg_int32_t t, ogg_int32_t v,
64 ogg_int32_t *x, ogg_int32_t *y)
66 int x1, y1, l;
67 asm( "smull %0, %1, %4, %6\n\t"
68 "smlal %0, %1, %5, %7\n\t"
69 "rsb %3, %4, #0\n\t"
70 "smull %0, %2, %5, %6\n\t"
71 "smlal %0, %2, %3, %7"
72 : "=&r" (l), "=&r" (x1), "=&r" (y1), "=r" (a)
73 : "3" (a), "r" (b), "r" (t), "r" (v)
74 : "cc" );
75 *x = x1 << 1;
76 MB();
77 *y = y1 << 1;
80 static inline void XNPROD31(ogg_int32_t a, ogg_int32_t b,
81 ogg_int32_t t, ogg_int32_t v,
82 ogg_int32_t *x, ogg_int32_t *y)
84 int x1, y1, l;
85 asm( "rsb %2, %4, #0\n\t"
86 "smull %0, %1, %3, %5\n\t"
87 "smlal %0, %1, %2, %6\n\t"
88 "smull %0, %2, %4, %5\n\t"
89 "smlal %0, %2, %3, %6"
90 : "=&r" (l), "=&r" (x1), "=&r" (y1)
91 : "r" (a), "r" (b), "r" (t), "r" (v)
92 : "cc" );
93 *x = x1 << 1;
94 MB();
95 *y = y1 << 1;
98 #ifndef _V_VECT_OPS
99 #define _V_VECT_OPS
101 /* asm versions of vector operations for block.c, window.c */
102 /* SOME IMPORTANT NOTES: this implementation of vect_mult_bw does
103 NOT do a final shift, meaning that the result of vect_mult_bw is
104 only 31 bits not 32. This is so that we can do the shift in-place
105 in vect_add_xxxx instead to save one instruction for each mult on arm */
106 static inline
107 void vect_add_right_left(ogg_int32_t *x, const ogg_int32_t *y, int n)
109 /* first arg is right subframe of previous frame and second arg
110 is left subframe of current frame. overlap left onto right overwriting
111 the right subframe */
114 asm volatile (
115 "ldmia %[x], {r0, r1, r2, r3};"
116 "ldmia %[y]!, {r4, r5, r6, r7};"
117 "add r0, r4, r0, lsl #1;"
118 "add r1, r5, r1, lsl #1;"
119 "add r2, r6, r2, lsl #1;"
120 "add r3, r7, r3, lsl #1;"
121 "stmia %[x]!, {r0, r1, r2, r3};"
122 "ldmia %[x], {r0, r1, r2, r3};"
123 "ldmia %[y]!, {r4, r5, r6, r7};"
124 "add r0, r4, r0, lsl #1;"
125 "add r1, r5, r1, lsl #1;"
126 "add r2, r6, r2, lsl #1;"
127 "add r3, r7, r3, lsl #1;"
128 "stmia %[x]!, {r0, r1, r2, r3};"
129 : [x] "+r" (x), [y] "+r" (y)
130 : : "r0", "r1", "r2", "r3",
131 "r4", "r5", "r6", "r7",
132 "memory");
133 n -= 8;
134 } while (n);
137 static inline
138 void vect_add_left_right(ogg_int32_t *x, const ogg_int32_t *y, int n)
140 /* first arg is left subframe of current frame and second arg
141 is right subframe of previous frame. overlap right onto left overwriting
142 the LEFT subframe */
144 asm volatile (
145 "ldmia %[x], {r0, r1, r2, r3};"
146 "ldmia %[y]!, {r4, r5, r6, r7};"
147 "add r0, r0, r4, lsl #1;"
148 "add r1, r1, r5, lsl #1;"
149 "add r2, r2, r6, lsl #1;"
150 "add r3, r3, r7, lsl #1;"
151 "stmia %[x]!, {r0, r1, r2, r3};"
152 "ldmia %[x], {r0, r1, r2, r3};"
153 "ldmia %[y]!, {r4, r5, r6, r7};"
154 "add r0, r0, r4, lsl #1;"
155 "add r1, r1, r5, lsl #1;"
156 "add r2, r2, r6, lsl #1;"
157 "add r3, r3, r7, lsl #1;"
158 "stmia %[x]!, {r0, r1, r2, r3};"
159 : [x] "+r" (x), [y] "+r" (y)
160 : : "r0", "r1", "r2", "r3",
161 "r4", "r5", "r6", "r7",
162 "memory");
163 n -= 8;
164 } while (n);
167 static inline
168 void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n)
170 /* Note, mult_fw uses MULT31 */
172 asm volatile (
173 "ldmia %[d], {r0, r1, r2, r3};"
174 "ldmia %[w]!, {r4, r5, r6, r7};"
175 "smull r8, r0, r4, r0;"
176 "mov r0, r0, lsl #1;"
177 "smull r8, r1, r5, r1;"
178 "mov r1, r1, lsl #1;"
179 "smull r8, r2, r6, r2;"
180 "mov r2, r2, lsl #1;"
181 "smull r8, r3, r7, r3;"
182 "mov r3, r3, lsl #1;"
183 "stmia %[d]!, {r0, r1, r2, r3};"
184 : [d] "+r" (data), [w] "+r" (window)
185 : : "r0", "r1", "r2", "r3",
186 "r4", "r5", "r6", "r7", "r8",
187 "memory", "cc");
188 n -= 4;
189 } while (n);
192 static inline
193 void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n)
195 /* NOTE mult_bw uses MULT_32 i.e. doesn't shift result left at end */
196 /* On ARM, we can do the shift at the same time as the overlap-add */
198 asm volatile ("ldmia %[d], {r0, r1, r2, r3};"
199 "ldmda %[w]!, {r4, r5, r6, r7};"
200 "smull r8, r0, r7, r0;"
201 "smull r7, r1, r6, r1;"
202 "smull r6, r2, r5, r2;"
203 "smull r5, r3, r4, r3;"
204 "stmia %[d]!, {r0, r1, r2, r3};"
205 : [d] "+r" (data), [w] "+r" (window)
206 : : "r0", "r1", "r2", "r3",
207 "r4", "r5", "r6", "r7", "r8",
208 "memory", "cc");
209 n -= 4;
210 } while (n);
213 static inline void vect_copy(ogg_int32_t *x, const ogg_int32_t *y, int n)
215 memcpy(x,y,n*sizeof(ogg_int32_t));
218 #endif
220 #endif
222 #ifndef _V_CLIP_MATH
223 #define _V_CLIP_MATH
225 static inline ogg_int32_t CLIP_TO_15(ogg_int32_t x) {
226 int tmp;
227 asm volatile("subs %1, %0, #32768\n\t"
228 "movpl %0, #0x7f00\n\t"
229 "orrpl %0, %0, #0xff\n"
230 "adds %1, %0, #32768\n\t"
231 "movmi %0, #0x8000"
232 : "+r"(x),"=r"(tmp)
234 : "cc");
235 return(x);
238 #endif
240 #ifndef _V_LSP_MATH_ASM
241 #define _V_LSP_MATH_ASM
243 static inline void lsp_loop_asm(ogg_uint32_t *qip,ogg_uint32_t *pip,
244 ogg_int32_t *qexpp,
245 ogg_int32_t *ilsp,ogg_int32_t wi,
246 ogg_int32_t m){
248 ogg_uint32_t qi=*qip,pi=*pip;
249 ogg_int32_t qexp=*qexpp;
251 asm("mov r0,%3;"
252 "mov r1,%5,asr#1;"
253 "add r0,r0,r1,lsl#3;"
254 "1:"
256 "ldmdb r0!,{r1,r3};"
257 "subs r1,r1,%4;" //ilsp[j]-wi
258 "rsbmi r1,r1,#0;" //labs(ilsp[j]-wi)
259 "umull %0,r2,r1,%0;" //qi*=labs(ilsp[j]-wi)
261 "subs r1,r3,%4;" //ilsp[j+1]-wi
262 "rsbmi r1,r1,#0;" //labs(ilsp[j+1]-wi)
263 "umull %1,r3,r1,%1;" //pi*=labs(ilsp[j+1]-wi)
265 "cmn r2,r3;" // shift down 16?
266 "beq 0f;"
267 "add %2,%2,#16;"
268 "mov %0,%0,lsr #16;"
269 "orr %0,%0,r2,lsl #16;"
270 "mov %1,%1,lsr #16;"
271 "orr %1,%1,r3,lsl #16;"
272 "0:"
273 "cmp r0,%3;\n"
274 "bhi 1b;\n"
276 // odd filter assymetry
277 "ands r0,%5,#1;\n"
278 "beq 2f;\n"
279 "add r0,%3,%5,lsl#2;\n"
281 "ldr r1,[r0,#-4];\n"
282 "mov r0,#0x4000;\n"
284 "subs r1,r1,%4;\n" //ilsp[j]-wi
285 "rsbmi r1,r1,#0;\n" //labs(ilsp[j]-wi)
286 "umull %0,r2,r1,%0;\n" //qi*=labs(ilsp[j]-wi)
287 "umull %1,r3,r0,%1;\n" //pi*=labs(ilsp[j+1]-wi)
289 "cmn r2,r3;\n" // shift down 16?
290 "beq 2f;\n"
291 "add %2,%2,#16;\n"
292 "mov %0,%0,lsr #16;\n"
293 "orr %0,%0,r2,lsl #16;\n"
294 "mov %1,%1,lsr #16;\n"
295 "orr %1,%1,r3,lsl #16;\n"
297 //qi=(pi>>shift)*labs(ilsp[j]-wi);
298 //pi=(qi>>shift)*labs(ilsp[j+1]-wi);
299 //qexp+=shift;
303 /* normalize to max 16 sig figs */
304 "2:"
305 "mov r2,#0;"
306 "orr r1,%0,%1;"
307 "tst r1,#0xff000000;"
308 "addne r2,r2,#8;"
309 "movne r1,r1,lsr #8;"
310 "tst r1,#0x00f00000;"
311 "addne r2,r2,#4;"
312 "movne r1,r1,lsr #4;"
313 "tst r1,#0x000c0000;"
314 "addne r2,r2,#2;"
315 "movne r1,r1,lsr #2;"
316 "tst r1,#0x00020000;"
317 "addne r2,r2,#1;"
318 "movne r1,r1,lsr #1;"
319 "tst r1,#0x00010000;"
320 "addne r2,r2,#1;"
321 "mov %0,%0,lsr r2;"
322 "mov %1,%1,lsr r2;"
323 "add %2,%2,r2;"
325 : "+r"(qi),"+r"(pi),"+r"(qexp)
326 : "r"(ilsp),"r"(wi),"r"(m)
327 : "r0","r1","r2","r3","cc");
329 *qip=qi;
330 *pip=pi;
331 *qexpp=qexp;
334 static inline void lsp_norm_asm(ogg_uint32_t *qip,ogg_int32_t *qexpp){
336 ogg_uint32_t qi=*qip;
337 ogg_int32_t qexp=*qexpp;
339 asm("tst %0,#0x0000ff00;"
340 "moveq %0,%0,lsl #8;"
341 "subeq %1,%1,#8;"
342 "tst %0,#0x0000f000;"
343 "moveq %0,%0,lsl #4;"
344 "subeq %1,%1,#4;"
345 "tst %0,#0x0000c000;"
346 "moveq %0,%0,lsl #2;"
347 "subeq %1,%1,#2;"
348 "tst %0,#0x00008000;"
349 "moveq %0,%0,lsl #1;"
350 "subeq %1,%1,#1;"
351 : "+r"(qi),"+r"(qexp)
353 : "cc");
354 *qip=qi;
355 *qexpp=qexp;
358 #endif
359 #endif