1 #include <linux/config.h>
2 #include <linux/types.h>
3 #include <linux/string.h>
4 #include <linux/sched.h>
5 #include <linux/hardirq.h>
6 #include <linux/module.h>
12 * MMX 3DNow! library helper functions
15 * We can use MMX just for prefetch in IRQ's. This may be a win.
16 * (reported so on K6-III)
17 * We should use a better code neutral filler for the short jump
18 * leal ebx, [ebx] is apparently best for K6-2, but Cyrix ??
19 * We also want to clobber the filler register so we don't get any
20 * register forwarding stalls on the filler.
22 * Add *user handling. Checksums are not a win with MMX on any CPU
23 * tested so far for any MMX solution figured.
25 * 22/09/2000 - Arjan van de Ven
26 * Improved for non-engineering-sample Athlons
/*
 * _mmx_memcpy - copy len bytes from 'from' to 'to', using MMX registers
 * for the bulk of the data.
 *
 * Visible behaviour:
 *   - if in_interrupt() is true, the result of __memcpy(to, from, len) is
 *     returned and none of the MMX code below is reached;
 *   - otherwise i is set to len >> 6, the number of whole 64-byte blocks;
 *   - the asm statements prefetch from the source and move 64-byte blocks
 *     with movq through %mm0-%mm3 (operand %0 is 'from', operand %1 is 'to');
 *   - the final __memcpy() call is passed len & 63, the bytes left over
 *     after the 64-byte blocks.
 *
 * NOTE(review): this listing is missing lines (the embedded numbering
 * skips, e.g. 30 to 35, 68 to 70, 96 to 98). Braces, local declarations,
 * loop headers, pointer advancement, any FPU state handling and the final
 * return statement are not visible. Confirm against the complete file
 * before relying on this description.
 */
30 void *_mmx_memcpy(void *to
, const void *from
, size_t len
)
/* Interrupt context: take the plain __memcpy() path instead of MMX. */
35 if (unlikely(in_interrupt()))
36 return __memcpy(to
, from
, len
);
/* Count of whole 64-byte blocks to move with the MMX asm below. */
39 i
= len
>> 6; /* len/64 */
/* Warm-up prefetch of the source (only part of this asm is visible). */
43 __asm__
__volatile__ (
44 "1: prefetch (%0)\n" /* This set is 28 bytes */
50 ".section .fixup, \"ax\"\n"
51 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
54 ".section __ex_table,\"a\"\n"
/*
 * 64-byte block copy with a prefetch 320 bytes ahead of the source.
 * The fixup code at label 3 stores the two bytes EB 05 over label 1,
 * i.e. a short jump per the existing comment; presumably this patches
 * out a faulting prefetch - confirm against the missing __ex_table lines.
 */
63 __asm__
__volatile__ (
64 "1: prefetch 320(%0)\n"
65 "2: movq (%0), %%mm0\n"
66 " movq 8(%0), %%mm1\n"
67 " movq 16(%0), %%mm2\n"
68 " movq 24(%0), %%mm3\n"
70 " movq %%mm1, 8(%1)\n"
71 " movq %%mm2, 16(%1)\n"
72 " movq %%mm3, 24(%1)\n"
73 " movq 32(%0), %%mm0\n"
74 " movq 40(%0), %%mm1\n"
75 " movq 48(%0), %%mm2\n"
76 " movq 56(%0), %%mm3\n"
77 " movq %%mm0, 32(%1)\n"
78 " movq %%mm1, 40(%1)\n"
79 " movq %%mm2, 48(%1)\n"
80 " movq %%mm3, 56(%1)\n"
81 ".section .fixup, \"ax\"\n"
82 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
85 ".section __ex_table,\"a\"\n"
89 : : "r" (from
), "r" (to
) : "memory");
/* Same 64-byte block copy, but with no prefetch instruction visible. */
96 __asm__
__volatile__ (
98 " movq 8(%0), %%mm1\n"
99 " movq 16(%0), %%mm2\n"
100 " movq 24(%0), %%mm3\n"
101 " movq %%mm0, (%1)\n"
102 " movq %%mm1, 8(%1)\n"
103 " movq %%mm2, 16(%1)\n"
104 " movq %%mm3, 24(%1)\n"
105 " movq 32(%0), %%mm0\n"
106 " movq 40(%0), %%mm1\n"
107 " movq 48(%0), %%mm2\n"
108 " movq 56(%0), %%mm3\n"
109 " movq %%mm0, 32(%1)\n"
110 " movq %%mm1, 40(%1)\n"
111 " movq %%mm2, 48(%1)\n"
112 " movq %%mm3, 56(%1)\n"
113 : : "r" (from
), "r" (to
) : "memory");
118 * Now do the tail of the block
120 __memcpy(to
, from
, len
&63);
128 * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
129 * other MMX using processors do not.
/*
 * fast_clear_page (movntq variant) - zero one 4096-byte page.
 *
 * %mm0 is cleared with pxor, then the loop runs 4096/64 times; each
 * iteration issues eight movntq stores of %mm0 covering the 64 bytes at
 * offsets 0..56 from 'page'. The existing comment further down says an
 * sfence is needed after these stores because movntq is weakly ordered.
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declaration of i, the advance of 'page' between iterations, and the
 * body of the last asm statement). A second fast_clear_page definition
 * appears later in the file; presumably a preprocessor conditional that
 * is not visible here selects one of them - confirm.
 */
132 static void fast_clear_page(void *page
)
138 __asm__
__volatile__ (
139 " pxor %%mm0, %%mm0\n" : :
/* 64 iterations of 64 bytes each = 4096 bytes. */
142 for(i
=0;i
<4096/64;i
++)
144 __asm__
__volatile__ (
145 " movntq %%mm0, (%0)\n"
146 " movntq %%mm0, 8(%0)\n"
147 " movntq %%mm0, 16(%0)\n"
148 " movntq %%mm0, 24(%0)\n"
149 " movntq %%mm0, 32(%0)\n"
150 " movntq %%mm0, 40(%0)\n"
151 " movntq %%mm0, 48(%0)\n"
152 " movntq %%mm0, 56(%0)\n"
153 : : "r" (page
) : "memory");
156 /* since movntq is weakly-ordered, a "sfence" is needed to become
159 __asm__
__volatile__ (
/*
 * fast_copy_page (movntq variant) - copy one 4096-byte page from 'from'
 * to 'to'.
 *
 * Each 64-byte block is loaded with movq into %mm0-%mm7 and written with
 * movntq (operand %0 is 'from', operand %1 is 'to'). The work is split
 * into two loops:
 *   - the first (4096-320)/64 iterations also prefetch 320 bytes ahead of
 *     the current source position;
 *   - the remaining iterations, up to 4096/64, copy without a prefetch.
 *     Presumably this keeps the prefetch from reaching past the end of
 *     the source page - confirm.
 * The existing comment near the end says an sfence is needed after the
 * movntq stores because they are weakly ordered.
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declaration of i, pointer advancement, parts of the first and last asm
 * statements). A second fast_copy_page definition appears later in the
 * file; presumably a preprocessor conditional that is not visible here
 * selects one of them - confirm.
 */
165 static void fast_copy_page(void *to
, void *from
)
171 /* maybe the prefetch stuff can go before the expensive fnsave...
172 * but that is for later. -AV
174 __asm__
__volatile__ (
177 " prefetch 128(%0)\n"
178 " prefetch 192(%0)\n"
179 " prefetch 256(%0)\n"
181 ".section .fixup, \"ax\"\n"
182 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
185 ".section __ex_table,\"a\"\n"
/* Main loop: all blocks except the last 320 bytes, prefetching ahead. */
191 for(i
=0; i
<(4096-320)/64; i
++)
193 __asm__
__volatile__ (
194 "1: prefetch 320(%0)\n"
195 "2: movq (%0), %%mm0\n"
196 " movntq %%mm0, (%1)\n"
197 " movq 8(%0), %%mm1\n"
198 " movntq %%mm1, 8(%1)\n"
199 " movq 16(%0), %%mm2\n"
200 " movntq %%mm2, 16(%1)\n"
201 " movq 24(%0), %%mm3\n"
202 " movntq %%mm3, 24(%1)\n"
203 " movq 32(%0), %%mm4\n"
204 " movntq %%mm4, 32(%1)\n"
205 " movq 40(%0), %%mm5\n"
206 " movntq %%mm5, 40(%1)\n"
207 " movq 48(%0), %%mm6\n"
208 " movntq %%mm6, 48(%1)\n"
209 " movq 56(%0), %%mm7\n"
210 " movntq %%mm7, 56(%1)\n"
211 ".section .fixup, \"ax\"\n"
212 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
215 ".section __ex_table,\"a\"\n"
219 : : "r" (from
), "r" (to
) : "memory");
/* Tail loop: the final 320 bytes, same copy sequence without prefetch. */
223 for(i
=(4096-320)/64; i
<4096/64; i
++)
225 __asm__
__volatile__ (
226 "2: movq (%0), %%mm0\n"
227 " movntq %%mm0, (%1)\n"
228 " movq 8(%0), %%mm1\n"
229 " movntq %%mm1, 8(%1)\n"
230 " movq 16(%0), %%mm2\n"
231 " movntq %%mm2, 16(%1)\n"
232 " movq 24(%0), %%mm3\n"
233 " movntq %%mm3, 24(%1)\n"
234 " movq 32(%0), %%mm4\n"
235 " movntq %%mm4, 32(%1)\n"
236 " movq 40(%0), %%mm5\n"
237 " movntq %%mm5, 40(%1)\n"
238 " movq 48(%0), %%mm6\n"
239 " movntq %%mm6, 48(%1)\n"
240 " movq 56(%0), %%mm7\n"
241 " movntq %%mm7, 56(%1)\n"
242 : : "r" (from
), "r" (to
) : "memory");
246 /* since movntq is weakly-ordered, a "sfence" is needed to become
249 __asm__
__volatile__ (
258 * Generic MMX implementation without K7 specific streaming
/*
 * fast_clear_page (plain movq variant) - zero one 4096-byte page.
 *
 * %mm0 is cleared with pxor, then the loop runs 4096/128 times; each
 * iteration issues sixteen movq stores of %mm0 covering the 128 bytes at
 * offsets 0..120 from 'page'.
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declaration of i, the advance of 'page' between iterations). An
 * earlier fast_clear_page definition exists in the file; presumably a
 * preprocessor conditional that is not visible here selects one of
 * them - confirm.
 */
261 static void fast_clear_page(void *page
)
267 __asm__
__volatile__ (
268 " pxor %%mm0, %%mm0\n" : :
/* 32 iterations of 128 bytes each = 4096 bytes. */
271 for(i
=0;i
<4096/128;i
++)
273 __asm__
__volatile__ (
274 " movq %%mm0, (%0)\n"
275 " movq %%mm0, 8(%0)\n"
276 " movq %%mm0, 16(%0)\n"
277 " movq %%mm0, 24(%0)\n"
278 " movq %%mm0, 32(%0)\n"
279 " movq %%mm0, 40(%0)\n"
280 " movq %%mm0, 48(%0)\n"
281 " movq %%mm0, 56(%0)\n"
282 " movq %%mm0, 64(%0)\n"
283 " movq %%mm0, 72(%0)\n"
284 " movq %%mm0, 80(%0)\n"
285 " movq %%mm0, 88(%0)\n"
286 " movq %%mm0, 96(%0)\n"
287 " movq %%mm0, 104(%0)\n"
288 " movq %%mm0, 112(%0)\n"
289 " movq %%mm0, 120(%0)\n"
290 : : "r" (page
) : "memory");
/*
 * fast_copy_page (plain movq variant) - copy one 4096-byte page from
 * 'from' to 'to'.
 *
 * After an initial group of prefetches, the loop runs 4096/64 times.
 * Each iteration prefetches 320 bytes ahead of the current source
 * position and moves 64 bytes in two 32-byte halves through %mm0-%mm3
 * using movq for both loads and stores (operand %0 is 'from', operand %1
 * is 'to').
 *
 * NOTE(review): lines are missing from this listing (braces, the
 * declaration of i, pointer advancement, parts of the first asm
 * statement). An earlier fast_copy_page definition exists in the file;
 * presumably a preprocessor conditional that is not visible here selects
 * one of them - confirm.
 */
297 static void fast_copy_page(void *to
, void *from
)
304 __asm__
__volatile__ (
307 " prefetch 128(%0)\n"
308 " prefetch 192(%0)\n"
309 " prefetch 256(%0)\n"
311 ".section .fixup, \"ax\"\n"
312 "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
315 ".section __ex_table,\"a\"\n"
/* 64 iterations of 64 bytes each = 4096 bytes. */
321 for(i
=0; i
<4096/64; i
++)
323 __asm__
__volatile__ (
324 "1: prefetch 320(%0)\n"
325 "2: movq (%0), %%mm0\n"
326 " movq 8(%0), %%mm1\n"
327 " movq 16(%0), %%mm2\n"
328 " movq 24(%0), %%mm3\n"
329 " movq %%mm0, (%1)\n"
330 " movq %%mm1, 8(%1)\n"
331 " movq %%mm2, 16(%1)\n"
332 " movq %%mm3, 24(%1)\n"
333 " movq 32(%0), %%mm0\n"
334 " movq 40(%0), %%mm1\n"
335 " movq 48(%0), %%mm2\n"
336 " movq 56(%0), %%mm3\n"
337 " movq %%mm0, 32(%1)\n"
338 " movq %%mm1, 40(%1)\n"
339 " movq %%mm2, 48(%1)\n"
340 " movq %%mm3, 56(%1)\n"
341 ".section .fixup, \"ax\"\n"
342 "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
345 ".section __ex_table,\"a\"\n"
349 : : "r" (from
), "r" (to
) : "memory");
360 * Favour MMX for page clear and copy.
/*
 * slow_zero_page - non-MMX page clear used by mmx_clear_page() when
 * in_interrupt() is true.
 *
 * Visible asm operands: ecx ("c") starts at 1024, edi ("D") starts at
 * 'page', eax ("a") is 0; d0 and d1 receive the final ecx and edi values
 * as early-clobber outputs.
 *
 * NOTE(review): the asm template string, the declarations of d0/d1 and
 * the clobber list are not visible in this listing. The operands suggest
 * a string store of 1024 four-byte words (4096 bytes) - TODO confirm
 * against the complete file.
 */
363 static void slow_zero_page(void * page
)
366 __asm__
__volatile__( \
369 : "=&c" (d0
), "=&D" (d1
)
370 :"a" (0),"1" (page
),"0" (1024)
/*
 * mmx_clear_page - exported entry point for clearing a page.
 *
 * Calls slow_zero_page(page) when in_interrupt() is true; the
 * fast_clear_page(page) call follows.
 *
 * NOTE(review): the embedded numbering jumps from 377 to 379, so one
 * line is absent between the two calls - presumably an "else" that makes
 * them mutually exclusive. Confirm against the complete file.
 */
374 void mmx_clear_page(void * page
)
376 if(unlikely(in_interrupt()))
377 slow_zero_page(page
);
379 fast_clear_page(page
);
/*
 * slow_copy_page - non-MMX page copy used by mmx_copy_page() when
 * in_interrupt() is true.
 *
 * Visible asm operands: ecx ("c") starts at 1024, edi ("D") starts at
 * 'to', esi ("S") starts at 'from'; d0, d1 and d2 receive the final
 * register values as early-clobber outputs.
 *
 * NOTE(review): the asm template string, the declarations of d0/d1/d2
 * and the clobber list are not visible in this listing. The operands
 * suggest a string move of 1024 four-byte words (4096 bytes) - TODO
 * confirm against the complete file.
 */
382 static void slow_copy_page(void *to
, void *from
)
385 __asm__
__volatile__( \
388 : "=&c" (d0
), "=&D" (d1
), "=&S" (d2
) \
389 : "0" (1024),"1" ((long) to
),"2" ((long) from
) \
/*
 * mmx_copy_page - exported entry point for copying a page from 'from'
 * to 'to'.
 *
 * Calls slow_copy_page(to, from) when in_interrupt() is true; the
 * fast_copy_page(to, from) call follows.
 *
 * NOTE(review): the embedded numbering jumps from 397 to 399, so one
 * line is absent between the two calls - presumably an "else" that makes
 * them mutually exclusive. Confirm against the complete file.
 */
394 void mmx_copy_page(void *to
, void *from
)
396 if(unlikely(in_interrupt()))
397 slow_copy_page(to
, from
);
399 fast_copy_page(to
, from
);
/*
 * Export the three non-static entry points defined in this file.
 * The static helpers (fast_ and slow_ variants) are not exported.
 */
402 EXPORT_SYMBOL(_mmx_memcpy
);
403 EXPORT_SYMBOL(mmx_clear_page
);
404 EXPORT_SYMBOL(mmx_copy_page
);