Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
[wrt350n-kernel.git] / include / asm-x86 / xor_64.h
blob1eee7fcb2420682f613eb7f2528a7056f38d7e04
/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 *
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 *
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */
/*
 * Save slot for one XMM register: two unsigned longs forced to 16-byte
 * alignment.  The xor_sse_* routines keep an array of four of these on the
 * stack as the spill area used by XMMS_SAVE / XMMS_RESTORE.
 */
typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;
/*
 * Save CR0 and %xmm0-%xmm3 so the xor loops may use those registers.
 *
 * Doesn't use gcc to save the XMM registers, because there is no easy way to
 * tell it to do a clts before the register saving.
 *
 * The macro refers to two locals of the expanding function by name:
 *   cr0      - receives the CR0 value read before clts is executed
 *   xmm_save - spill area; the four registers are stored at byte offsets
 *              0x00, 0x10, 0x20 and 0x30 from it
 *
 * Preemption is disabled first and is only re-enabled by XMMS_RESTORE, so
 * the two macros must always be used as a pair.
 */
#define XMMS_SAVE do {					\
	preempt_disable();				\
	asm volatile (					\
		"movq %%cr0,%0 ;\n\t"			\
		"clts ;\n\t"				\
		"movups %%xmm0,(%1) ;\n\t"		\
		"movups %%xmm1,0x10(%1) ;\n\t"		\
		"movups %%xmm2,0x20(%1) ;\n\t"		\
		"movups %%xmm3,0x30(%1) ;\n\t"		\
		: "=&r" (cr0)				\
		: "r" (xmm_save)			\
		: "memory");				\
} while(0)
/*
 * Counterpart of XMMS_SAVE: issue an sfence, reload %xmm0-%xmm3 from
 * xmm_save, write the saved value back into CR0 and re-enable preemption.
 *
 * Uses the same `cr0` and `xmm_save` locals that XMMS_SAVE filled in, so it
 * must be expanded in the same scope, after XMMS_SAVE.
 */
#define XMMS_RESTORE do {				\
	asm volatile (					\
		"sfence ;\n\t"				\
		"movups (%1),%%xmm0 ;\n\t"		\
		"movups 0x10(%1),%%xmm1 ;\n\t"		\
		"movups 0x20(%1),%%xmm2 ;\n\t"		\
		"movups 0x30(%1),%%xmm3 ;\n\t"		\
		"movq %0,%%cr0 ;\n\t"			\
		:					\
		: "r" (cr0), "r" (xmm_save)		\
		: "memory");				\
	preempt_enable();				\
} while(0)
/*
 * String-building helpers for the inline-asm bodies of the xor_sse_*
 * routines.  Each one expands to a string literal holding a single
 * instruction; the argument x is pasted in textually (e.g. "i+1"), so the
 * arithmetic is left to the assembler.
 *
 *   OFFS(x)     byte offset of the x-th 16-byte chunk
 *   PF_OFFS(x)  the same offset, 256 bytes further on (prefetch distance)
 *   LD(x,y)     load chunk x of the p1 buffer into %xmm<y>
 *   ST(x,y)     store %xmm<y> into chunk x of the p1 buffer
 *   PFn(x)      prefetchnta on the p<n+1> buffer (PF0 -> p1 ... PF5 -> p6)
 *   XOn(x,y)    xor chunk x of the p<n+1> buffer into %xmm<y>
 *
 * The %[pN] names are asm operand names that the expanding asm statement
 * has to supply.
 */
#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		" prefetchnta " PF_OFFS(x) "(%[p1]) ;\n"
#define LD(x,y)		" movaps " OFFS(x) "(%[p1]), %%xmm"#y" ;\n"
#define ST(x,y)		" movaps %%xmm"#y", " OFFS(x) "(%[p1]) ;\n"
#define PF1(x)		" prefetchnta " PF_OFFS(x) "(%[p2]) ;\n"
#define PF2(x)		" prefetchnta " PF_OFFS(x) "(%[p3]) ;\n"
#define PF3(x)		" prefetchnta " PF_OFFS(x) "(%[p4]) ;\n"
#define PF4(x)		" prefetchnta " PF_OFFS(x) "(%[p5]) ;\n"
#define PF5(x)		" prefetchnta " PF_OFFS(x) "(%[p6]) ;\n"
#define XO1(x,y)	" xorps " OFFS(x) "(%[p2]), %%xmm"#y" ;\n"
#define XO2(x,y)	" xorps " OFFS(x) "(%[p3]), %%xmm"#y" ;\n"
#define XO3(x,y)	" xorps " OFFS(x) "(%[p4]), %%xmm"#y" ;\n"
#define XO4(x,y)	" xorps " OFFS(x) "(%[p5]), %%xmm"#y" ;\n"
#define XO5(x,y)	" xorps " OFFS(x) "(%[p6]), %%xmm"#y" ;\n"
83 static void
84 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
86 unsigned int lines = bytes >> 8;
87 unsigned long cr0;
88 xmm_store_t xmm_save[4];
90 XMMS_SAVE;
92 asm volatile (
93 #undef BLOCK
94 #define BLOCK(i) \
95 LD(i,0) \
96 LD(i+1,1) \
97 PF1(i) \
98 PF1(i+2) \
99 LD(i+2,2) \
100 LD(i+3,3) \
101 PF0(i+4) \
102 PF0(i+6) \
103 XO1(i,0) \
104 XO1(i+1,1) \
105 XO1(i+2,2) \
106 XO1(i+3,3) \
107 ST(i,0) \
108 ST(i+1,1) \
109 ST(i+2,2) \
110 ST(i+3,3) \
113 PF0(0)
114 PF0(2)
116 " .align 32 ;\n"
117 " 1: ;\n"
119 BLOCK(0)
120 BLOCK(4)
121 BLOCK(8)
122 BLOCK(12)
124 " addq %[inc], %[p1] ;\n"
125 " addq %[inc], %[p2] ;\n"
126 " decl %[cnt] ; jnz 1b"
127 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
128 : [inc] "r" (256UL)
129 : "memory");
131 XMMS_RESTORE;
134 static void
135 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
136 unsigned long *p3)
138 unsigned int lines = bytes >> 8;
139 xmm_store_t xmm_save[4];
140 unsigned long cr0;
142 XMMS_SAVE;
144 __asm__ __volatile__ (
145 #undef BLOCK
146 #define BLOCK(i) \
147 PF1(i) \
148 PF1(i+2) \
149 LD(i,0) \
150 LD(i+1,1) \
151 LD(i+2,2) \
152 LD(i+3,3) \
153 PF2(i) \
154 PF2(i+2) \
155 PF0(i+4) \
156 PF0(i+6) \
157 XO1(i,0) \
158 XO1(i+1,1) \
159 XO1(i+2,2) \
160 XO1(i+3,3) \
161 XO2(i,0) \
162 XO2(i+1,1) \
163 XO2(i+2,2) \
164 XO2(i+3,3) \
165 ST(i,0) \
166 ST(i+1,1) \
167 ST(i+2,2) \
168 ST(i+3,3) \
171 PF0(0)
172 PF0(2)
174 " .align 32 ;\n"
175 " 1: ;\n"
177 BLOCK(0)
178 BLOCK(4)
179 BLOCK(8)
180 BLOCK(12)
182 " addq %[inc], %[p1] ;\n"
183 " addq %[inc], %[p2] ;\n"
184 " addq %[inc], %[p3] ;\n"
185 " decl %[cnt] ; jnz 1b"
186 : [cnt] "+r" (lines),
187 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
188 : [inc] "r" (256UL)
189 : "memory");
190 XMMS_RESTORE;
193 static void
194 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
195 unsigned long *p3, unsigned long *p4)
197 unsigned int lines = bytes >> 8;
198 xmm_store_t xmm_save[4];
199 unsigned long cr0;
201 XMMS_SAVE;
203 __asm__ __volatile__ (
204 #undef BLOCK
205 #define BLOCK(i) \
206 PF1(i) \
207 PF1(i+2) \
208 LD(i,0) \
209 LD(i+1,1) \
210 LD(i+2,2) \
211 LD(i+3,3) \
212 PF2(i) \
213 PF2(i+2) \
214 XO1(i,0) \
215 XO1(i+1,1) \
216 XO1(i+2,2) \
217 XO1(i+3,3) \
218 PF3(i) \
219 PF3(i+2) \
220 PF0(i+4) \
221 PF0(i+6) \
222 XO2(i,0) \
223 XO2(i+1,1) \
224 XO2(i+2,2) \
225 XO2(i+3,3) \
226 XO3(i,0) \
227 XO3(i+1,1) \
228 XO3(i+2,2) \
229 XO3(i+3,3) \
230 ST(i,0) \
231 ST(i+1,1) \
232 ST(i+2,2) \
233 ST(i+3,3) \
236 PF0(0)
237 PF0(2)
239 " .align 32 ;\n"
240 " 1: ;\n"
242 BLOCK(0)
243 BLOCK(4)
244 BLOCK(8)
245 BLOCK(12)
247 " addq %[inc], %[p1] ;\n"
248 " addq %[inc], %[p2] ;\n"
249 " addq %[inc], %[p3] ;\n"
250 " addq %[inc], %[p4] ;\n"
251 " decl %[cnt] ; jnz 1b"
252 : [cnt] "+c" (lines),
253 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
254 : [inc] "r" (256UL)
255 : "memory" );
257 XMMS_RESTORE;
260 static void
261 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
262 unsigned long *p3, unsigned long *p4, unsigned long *p5)
264 unsigned int lines = bytes >> 8;
265 xmm_store_t xmm_save[4];
266 unsigned long cr0;
268 XMMS_SAVE;
270 __asm__ __volatile__ (
271 #undef BLOCK
272 #define BLOCK(i) \
273 PF1(i) \
274 PF1(i+2) \
275 LD(i,0) \
276 LD(i+1,1) \
277 LD(i+2,2) \
278 LD(i+3,3) \
279 PF2(i) \
280 PF2(i+2) \
281 XO1(i,0) \
282 XO1(i+1,1) \
283 XO1(i+2,2) \
284 XO1(i+3,3) \
285 PF3(i) \
286 PF3(i+2) \
287 XO2(i,0) \
288 XO2(i+1,1) \
289 XO2(i+2,2) \
290 XO2(i+3,3) \
291 PF4(i) \
292 PF4(i+2) \
293 PF0(i+4) \
294 PF0(i+6) \
295 XO3(i,0) \
296 XO3(i+1,1) \
297 XO3(i+2,2) \
298 XO3(i+3,3) \
299 XO4(i,0) \
300 XO4(i+1,1) \
301 XO4(i+2,2) \
302 XO4(i+3,3) \
303 ST(i,0) \
304 ST(i+1,1) \
305 ST(i+2,2) \
306 ST(i+3,3) \
309 PF0(0)
310 PF0(2)
312 " .align 32 ;\n"
313 " 1: ;\n"
315 BLOCK(0)
316 BLOCK(4)
317 BLOCK(8)
318 BLOCK(12)
320 " addq %[inc], %[p1] ;\n"
321 " addq %[inc], %[p2] ;\n"
322 " addq %[inc], %[p3] ;\n"
323 " addq %[inc], %[p4] ;\n"
324 " addq %[inc], %[p5] ;\n"
325 " decl %[cnt] ; jnz 1b"
326 : [cnt] "+c" (lines),
327 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
328 [p5] "+r" (p5)
329 : [inc] "r" (256UL)
330 : "memory");
332 XMMS_RESTORE;
335 static struct xor_block_template xor_block_sse = {
336 .name = "generic_sse",
337 .do_2 = xor_sse_2,
338 .do_3 = xor_sse_3,
339 .do_4 = xor_sse_4,
340 .do_5 = xor_sse_5,
/*
 * Replace any earlier XOR_TRY_TEMPLATES with one that hands only the SSE
 * template to xor_speed().
 * NOTE(review): xor_speed() is defined outside this file; presumably it is
 * the benchmarking hook of the generic xor code - confirm against the caller.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	xor_speed(&xor_block_sse);		\
} while (0)
/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only depending on how the cpu
 * deals with a load to a line that is being prefetched.
 *
 * The FASTEST argument is therefore ignored: the macro always yields the
 * address of xor_block_sse.
 */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)