kbuild: fix silentoldconfig with make O=
[linux-2.6/verdex.git] / arch / i386 / lib / mmx.c
blob2afda94dffd340b4097b3819b7770c1bdfb263e5
1 #include <linux/config.h>
2 #include <linux/types.h>
3 #include <linux/string.h>
4 #include <linux/sched.h>
5 #include <linux/hardirq.h>
6 #include <linux/module.h>
8 #include <asm/i387.h>
12 * MMX 3DNow! library helper functions
14 * To do:
15 * We can use MMX just for prefetch in IRQ's. This may be a win.
16 * (reported so on K6-III)
17 * We should use a better code neutral filler for the short jump
18 * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
19 * We also want to clobber the filler register so we don't get any
20 * register forwarding stalls on the filler.
22 * Add *user handling. Checksums are not a win with MMX on any CPU
23 * tested so far for any MMX solution figured.
25 * 22/09/2000 - Arjan van de Ven
26 * Improved for non-engineering-sample Athlons
30 void *_mmx_memcpy(void *to, const void *from, size_t len)
32 void *p;
33 int i;
35 if (unlikely(in_interrupt()))
36 return __memcpy(to, from, len);
38 p = to;
39 i = len >> 6; /* len/64 */
41 kernel_fpu_begin();
43 __asm__ __volatile__ (
44 "1: prefetch (%0)\n" /* This set is 28 bytes */
45 " prefetch 64(%0)\n"
46 " prefetch 128(%0)\n"
47 " prefetch 192(%0)\n"
48 " prefetch 256(%0)\n"
49 "2: \n"
50 ".section .fixup, \"ax\"\n"
51 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
52 " jmp 2b\n"
53 ".previous\n"
54 ".section __ex_table,\"a\"\n"
55 " .align 4\n"
56 " .long 1b, 3b\n"
57 ".previous"
58 : : "r" (from) );
61 for(; i>5; i--)
63 __asm__ __volatile__ (
64 "1: prefetch 320(%0)\n"
65 "2: movq (%0), %%mm0\n"
66 " movq 8(%0), %%mm1\n"
67 " movq 16(%0), %%mm2\n"
68 " movq 24(%0), %%mm3\n"
69 " movq %%mm0, (%1)\n"
70 " movq %%mm1, 8(%1)\n"
71 " movq %%mm2, 16(%1)\n"
72 " movq %%mm3, 24(%1)\n"
73 " movq 32(%0), %%mm0\n"
74 " movq 40(%0), %%mm1\n"
75 " movq 48(%0), %%mm2\n"
76 " movq 56(%0), %%mm3\n"
77 " movq %%mm0, 32(%1)\n"
78 " movq %%mm1, 40(%1)\n"
79 " movq %%mm2, 48(%1)\n"
80 " movq %%mm3, 56(%1)\n"
81 ".section .fixup, \"ax\"\n"
82 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
83 " jmp 2b\n"
84 ".previous\n"
85 ".section __ex_table,\"a\"\n"
86 " .align 4\n"
87 " .long 1b, 3b\n"
88 ".previous"
89 : : "r" (from), "r" (to) : "memory");
90 from+=64;
91 to+=64;
94 for(; i>0; i--)
96 __asm__ __volatile__ (
97 " movq (%0), %%mm0\n"
98 " movq 8(%0), %%mm1\n"
99 " movq 16(%0), %%mm2\n"
100 " movq 24(%0), %%mm3\n"
101 " movq %%mm0, (%1)\n"
102 " movq %%mm1, 8(%1)\n"
103 " movq %%mm2, 16(%1)\n"
104 " movq %%mm3, 24(%1)\n"
105 " movq 32(%0), %%mm0\n"
106 " movq 40(%0), %%mm1\n"
107 " movq 48(%0), %%mm2\n"
108 " movq 56(%0), %%mm3\n"
109 " movq %%mm0, 32(%1)\n"
110 " movq %%mm1, 40(%1)\n"
111 " movq %%mm2, 48(%1)\n"
112 " movq %%mm3, 56(%1)\n"
113 : : "r" (from), "r" (to) : "memory");
114 from+=64;
115 to+=64;
118 * Now do the tail of the block
120 __memcpy(to, from, len&63);
121 kernel_fpu_end();
122 return p;
125 #ifdef CONFIG_MK7
128 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
129 * other MMX using processors do not.
132 static void fast_clear_page(void *page)
134 int i;
136 kernel_fpu_begin();
138 __asm__ __volatile__ (
139 " pxor %%mm0, %%mm0\n" : :
142 for(i=0;i<4096/64;i++)
144 __asm__ __volatile__ (
145 " movntq %%mm0, (%0)\n"
146 " movntq %%mm0, 8(%0)\n"
147 " movntq %%mm0, 16(%0)\n"
148 " movntq %%mm0, 24(%0)\n"
149 " movntq %%mm0, 32(%0)\n"
150 " movntq %%mm0, 40(%0)\n"
151 " movntq %%mm0, 48(%0)\n"
152 " movntq %%mm0, 56(%0)\n"
153 : : "r" (page) : "memory");
154 page+=64;
156 /* since movntq is weakly-ordered, a "sfence" is needed to become
157 * ordered again.
159 __asm__ __volatile__ (
160 " sfence \n" : :
162 kernel_fpu_end();
165 static void fast_copy_page(void *to, void *from)
167 int i;
169 kernel_fpu_begin();
171 /* maybe the prefetch stuff can go before the expensive fnsave...
172 * but that is for later. -AV
174 __asm__ __volatile__ (
175 "1: prefetch (%0)\n"
176 " prefetch 64(%0)\n"
177 " prefetch 128(%0)\n"
178 " prefetch 192(%0)\n"
179 " prefetch 256(%0)\n"
180 "2: \n"
181 ".section .fixup, \"ax\"\n"
182 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
183 " jmp 2b\n"
184 ".previous\n"
185 ".section __ex_table,\"a\"\n"
186 " .align 4\n"
187 " .long 1b, 3b\n"
188 ".previous"
189 : : "r" (from) );
191 for(i=0; i<(4096-320)/64; i++)
193 __asm__ __volatile__ (
194 "1: prefetch 320(%0)\n"
195 "2: movq (%0), %%mm0\n"
196 " movntq %%mm0, (%1)\n"
197 " movq 8(%0), %%mm1\n"
198 " movntq %%mm1, 8(%1)\n"
199 " movq 16(%0), %%mm2\n"
200 " movntq %%mm2, 16(%1)\n"
201 " movq 24(%0), %%mm3\n"
202 " movntq %%mm3, 24(%1)\n"
203 " movq 32(%0), %%mm4\n"
204 " movntq %%mm4, 32(%1)\n"
205 " movq 40(%0), %%mm5\n"
206 " movntq %%mm5, 40(%1)\n"
207 " movq 48(%0), %%mm6\n"
208 " movntq %%mm6, 48(%1)\n"
209 " movq 56(%0), %%mm7\n"
210 " movntq %%mm7, 56(%1)\n"
211 ".section .fixup, \"ax\"\n"
212 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
213 " jmp 2b\n"
214 ".previous\n"
215 ".section __ex_table,\"a\"\n"
216 " .align 4\n"
217 " .long 1b, 3b\n"
218 ".previous"
219 : : "r" (from), "r" (to) : "memory");
220 from+=64;
221 to+=64;
223 for(i=(4096-320)/64; i<4096/64; i++)
225 __asm__ __volatile__ (
226 "2: movq (%0), %%mm0\n"
227 " movntq %%mm0, (%1)\n"
228 " movq 8(%0), %%mm1\n"
229 " movntq %%mm1, 8(%1)\n"
230 " movq 16(%0), %%mm2\n"
231 " movntq %%mm2, 16(%1)\n"
232 " movq 24(%0), %%mm3\n"
233 " movntq %%mm3, 24(%1)\n"
234 " movq 32(%0), %%mm4\n"
235 " movntq %%mm4, 32(%1)\n"
236 " movq 40(%0), %%mm5\n"
237 " movntq %%mm5, 40(%1)\n"
238 " movq 48(%0), %%mm6\n"
239 " movntq %%mm6, 48(%1)\n"
240 " movq 56(%0), %%mm7\n"
241 " movntq %%mm7, 56(%1)\n"
242 : : "r" (from), "r" (to) : "memory");
243 from+=64;
244 to+=64;
246 /* since movntq is weakly-ordered, a "sfence" is needed to become
247 * ordered again.
249 __asm__ __volatile__ (
250 " sfence \n" : :
252 kernel_fpu_end();
255 #else
258 * Generic MMX implementation without K7 specific streaming
261 static void fast_clear_page(void *page)
263 int i;
265 kernel_fpu_begin();
267 __asm__ __volatile__ (
268 " pxor %%mm0, %%mm0\n" : :
271 for(i=0;i<4096/128;i++)
273 __asm__ __volatile__ (
274 " movq %%mm0, (%0)\n"
275 " movq %%mm0, 8(%0)\n"
276 " movq %%mm0, 16(%0)\n"
277 " movq %%mm0, 24(%0)\n"
278 " movq %%mm0, 32(%0)\n"
279 " movq %%mm0, 40(%0)\n"
280 " movq %%mm0, 48(%0)\n"
281 " movq %%mm0, 56(%0)\n"
282 " movq %%mm0, 64(%0)\n"
283 " movq %%mm0, 72(%0)\n"
284 " movq %%mm0, 80(%0)\n"
285 " movq %%mm0, 88(%0)\n"
286 " movq %%mm0, 96(%0)\n"
287 " movq %%mm0, 104(%0)\n"
288 " movq %%mm0, 112(%0)\n"
289 " movq %%mm0, 120(%0)\n"
290 : : "r" (page) : "memory");
291 page+=128;
294 kernel_fpu_end();
297 static void fast_copy_page(void *to, void *from)
299 int i;
302 kernel_fpu_begin();
304 __asm__ __volatile__ (
305 "1: prefetch (%0)\n"
306 " prefetch 64(%0)\n"
307 " prefetch 128(%0)\n"
308 " prefetch 192(%0)\n"
309 " prefetch 256(%0)\n"
310 "2: \n"
311 ".section .fixup, \"ax\"\n"
312 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
313 " jmp 2b\n"
314 ".previous\n"
315 ".section __ex_table,\"a\"\n"
316 " .align 4\n"
317 " .long 1b, 3b\n"
318 ".previous"
319 : : "r" (from) );
321 for(i=0; i<4096/64; i++)
323 __asm__ __volatile__ (
324 "1: prefetch 320(%0)\n"
325 "2: movq (%0), %%mm0\n"
326 " movq 8(%0), %%mm1\n"
327 " movq 16(%0), %%mm2\n"
328 " movq 24(%0), %%mm3\n"
329 " movq %%mm0, (%1)\n"
330 " movq %%mm1, 8(%1)\n"
331 " movq %%mm2, 16(%1)\n"
332 " movq %%mm3, 24(%1)\n"
333 " movq 32(%0), %%mm0\n"
334 " movq 40(%0), %%mm1\n"
335 " movq 48(%0), %%mm2\n"
336 " movq 56(%0), %%mm3\n"
337 " movq %%mm0, 32(%1)\n"
338 " movq %%mm1, 40(%1)\n"
339 " movq %%mm2, 48(%1)\n"
340 " movq %%mm3, 56(%1)\n"
341 ".section .fixup, \"ax\"\n"
342 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
343 " jmp 2b\n"
344 ".previous\n"
345 ".section __ex_table,\"a\"\n"
346 " .align 4\n"
347 " .long 1b, 3b\n"
348 ".previous"
349 : : "r" (from), "r" (to) : "memory");
350 from+=64;
351 to+=64;
353 kernel_fpu_end();
357 #endif
360 * Favour MMX for page clear and copy.
363 static void slow_zero_page(void * page)
365 int d0, d1;
366 __asm__ __volatile__( \
367 "cld\n\t" \
368 "rep ; stosl" \
369 : "=&c" (d0), "=&D" (d1)
370 :"a" (0),"1" (page),"0" (1024)
371 :"memory");
374 void mmx_clear_page(void * page)
376 if(unlikely(in_interrupt()))
377 slow_zero_page(page);
378 else
379 fast_clear_page(page);
382 static void slow_copy_page(void *to, void *from)
384 int d0, d1, d2;
385 __asm__ __volatile__( \
386 "cld\n\t" \
387 "rep ; movsl" \
388 : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
389 : "0" (1024),"1" ((long) to),"2" ((long) from) \
390 : "memory");
394 void mmx_copy_page(void *to, void *from)
396 if(unlikely(in_interrupt()))
397 slow_copy_page(to, from);
398 else
399 fast_copy_page(to, from);
402 EXPORT_SYMBOL(_mmx_memcpy);
403 EXPORT_SYMBOL(mmx_clear_page);
404 EXPORT_SYMBOL(mmx_copy_page);