1 /* $Id: VIScsumcopy.S,v 1.6 1999/05/25 16:53:03 jj Exp $
2 * VIScsumcopy.S: High bandwidth IP checksumming with simultaneous
3 * copying utilizing the UltraSparc Visual Instruction Set.
5 * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
7 * Based on older sparc32/sparc64 checksum.S, which is:
9 * Copyright(C) 1995 Linus Torvalds
10 * Copyright(C) 1995 Miguel de Icaza
11 * Copyright(C) 1996,1997 David S. Miller
13 * Linux/Alpha checksum c-code
14 * Linux/ix86 inline checksum assembly
15 * RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
16 * David Mosberger-Tang for optimized reference c-code
17 * BSD4.4 portable checksum routine
/* Offset from %sp of the 8-byte scratch slot that END_THE_TRICK2 writes with
 * `std` and reads back with `ldx` to move a VIS register into an integer
 * register.  NOTE(review): 0x7ff looks like the SPARC V9 stack bias (2047) --
 * confirm.  The expansion is unparenthesised; it is only used as
 * `[%sp + STACKOFF]` in the visible code. */
21 #define STACKOFF 0x7ff+128
30 #include <asm/visasm.h>
/* ASI helper constants.  ASI_P is not defined in the visible lines.
 * ASI_BLK_XOR1 is xor-ed into the saved %asi value at label `ett`;
 * ASI_BLK_OR is or-ed into the saved %asi value before the dispatch jump;
 * ASI_BLK_XOR is the immediate of the `wr %g2, ASI_BLK_XOR, %asi` at every
 * visNs entry (wr writes rs1 XOR operand2). */
32 #define ASI_BLK_XOR1 (ASI_BLK_P ^ (ASI_BLK_P >> 3) ^ ASI_P)
33 #define ASI_BLK_OR (ASI_BLK_P & ~ASI_P)
36 #define ASI_BLK_P 0xf0
40 #define ASI_BLK_XOR (ASI_BLK_P ^ ASI_P)
56 /* Dobrou noc, SunSoft engineers. Spete sladce.
 * (Czech: "Good night, SunSoft engineers. Sleep sweetly.")
57 * This code uses a couple of tricks, and those
58 * tricks are UltraLinux trade secrets :))
59 * Once AGAIN, the SunSoft engineers are caught
60 * asleep at the keyboard :)).
61 * The main loop takes about 20 superscalar cycles
62 * per 64 bytes checksummed/copied.
66 ldda [%src] %asi, %O0 /* Load Group */
69 stda %f48, [%dst] ASI_BLK_P /* Store */
72 std %fx, [%dst + off] /* Store */
/*
 * DO_THE_TRICK - one 64-byte step of the software-pipelined checksum/copy.
 *
 *   f0..f14    names (without '%') of the eight double registers holding the
 *              previous partial sums.
 *   F0..F14    the eight double registers holding the 64 bytes being folded
 *              in; after `fpadd32 %Fn, %fn, %Fn` they hold the new sums.
 *   A0..A14    destinations of faligndata, i.e. the realigned data to store.
 *              A14 is also read by the first faligndata as the doubleword
 *              carried over from the previous step.
 *   B14        receives a raw copy of F14 (carry-over for the next step).
 *   LOAD       statement emitted first (LDBLK(...) or SYNC at the call sites).
 *   STORE1..8  optional store statements, one per scheduling slot.
 *   BRANCH     control transfer emitted after `subcc %len, 64, %len`; the
 *              final fcmpgt32 sits in its delay slot.  It is variadic because
 *              the call sites pass text containing commas.
 *   DUMMY1, DYMMY2, DUMMY3  unused; the call sites pass empty arguments.
 *
 * Carry handling: `fcmpgt32 old, new, %xN` leaves a per-32-bit-lane mask in
 * %xN.  Each mask is later turned into a lane count with `inc; srl 1`
 * ((mask + 1) >> 1 maps 0,1,2,3 to 0,1,1,2) and added to %sum.  Masks %x5..%x8
 * produced at the end of one step are consumed at the start of the next
 * (or by END_THE_TRICK).  %x4 must already be in "count" form on entry.
 *
 * Fixes relative to the previous text of this macro:
 *   - BRANCH was accepted but never expanded, so the loop/exit branches passed
 *     by every caller were silently dropped;
 *   - %x5, %x6 and %x7 were halved without the preceding `inc`, unlike
 *     %x8 and %x1..%x4, losing a carry when only the low lane carried;
 *   - the last line no longer ends in a line continuation.
 */
78 #define DO_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,F0,F2,F4,F6,F8,F10,F12,F14,DUMMY1,A0,A2,A4,A6,A8,A10,A12,A14,B14,DYMMY2,LOAD,STORE1,STORE2,STORE3,STORE4,STORE5,STORE6,STORE7,STORE8,DUMMY3,BRANCH...) \
79 LOAD /* Load Group */; \
80 faligndata %A14, %F0, %A14 /* FPA Group */; \
81 inc %x5 /* bias carry mask before halving */; \
82 STORE1 /* Store (optional) */; \
83 faligndata %F0, %F2, %A0 /* FPA Group */; \
84 srl %x5, 1, %x5 /* IEU0 */; \
85 add %sum, %x4, %sum /* IEU1 */; \
86 fpadd32 %F0, %f0, %F0 /* FPA Group */; \
87 inc %x6 /* bias carry mask before halving */; \
88 STORE2 /* Store (optional) */; \
89 faligndata %F2, %F4, %A2 /* FPA Group */; \
90 srl %x6, 1, %x6 /* IEU0 */; \
91 add %sum, %x5, %sum /* IEU1 */; \
92 fpadd32 %F2, %f2, %F2 /* FPA Group */; \
93 add %src, 64, %src /* IEU0 */; \
94 add %dst, 64, %dst /* IEU1 */; \
95 fcmpgt32 %f0, %F0, %x1 /* FPM Group */; \
96 inc %x7 /* bias carry mask before halving */; \
97 STORE3 /* Store (optional) */; \
98 faligndata %F4, %F6, %A4 /* FPA */; \
99 srl %x7, 1, %x7 /* IEU0 Group */; \
100 add %sum, %x6, %sum /* IEU1 */; \
101 fpadd32 %F4, %f4, %F4 /* FPA */; \
102 fcmpgt32 %f2, %F2, %x2 /* FPM Group */; \
103 inc %x8 /* IEU0 */; \
104 STORE4 /* Store (optional) */; \
105 faligndata %F6, %F8, %A6 /* FPA */; \
106 srl %x8, 1, %x8 /* IEU0 Group */; \
107 add %sum, %x7, %sum /* IEU1 */; \
108 fpadd32 %F6, %f6, %F6 /* FPA */; \
109 fcmpgt32 %f4, %F4, %x3 /* FPM Group */; \
110 inc %x1 /* IEU0 */; \
111 STORE5 /* Store (optional) */; \
112 faligndata %F8, %F10, %A8 /* FPA */; \
113 srl %x1, 1, %x1 /* IEU0 Group */; \
114 add %sum, %x8, %sum /* IEU1 */; \
115 fpadd32 %F8, %f8, %F8 /* FPA */; \
116 fcmpgt32 %f6, %F6, %x4 /* FPM Group */; \
117 inc %x2 /* IEU0 */; \
118 STORE6 /* Store (optional) */; \
119 faligndata %F10, %F12, %A10 /* FPA */; \
120 srl %x2, 1, %x2 /* IEU0 Group */; \
121 add %sum, %x1, %sum /* IEU1 */; \
122 fpadd32 %F10, %f10, %F10 /* FPA */; \
123 fcmpgt32 %f8, %F8, %x5 /* FPM Group */; \
124 inc %x3 /* IEU0 */; \
125 STORE7 /* Store (optional) */; \
126 faligndata %F12, %F14, %A12 /* FPA */; \
127 srl %x3, 1, %x3 /* IEU0 Group */; \
128 add %sum, %x2, %sum /* IEU1 */; \
129 fpadd32 %F12, %f12, %F12 /* FPA */; \
130 fcmpgt32 %f10, %F10, %x6 /* FPM Group */; \
131 inc %x4 /* IEU0 */; \
132 STORE8 /* Store (optional) */; \
133 fmovd %F14, %B14 /* FPA */; \
134 srl %x4, 1, %x4 /* IEU0 Group */; \
135 add %sum, %x3, %sum /* IEU1 */; \
136 fpadd32 %F14, %f14, %F14 /* FPA */; \
137 fcmpgt32 %f12, %F12, %x7 /* FPM Group */; \
138 subcc %len, 64, %len /* IEU1 */; \
139 BRANCH /* CTI */; \
140 fcmpgt32 %f14, %F14, %x8 /* FPM Group, delay slot of BRANCH */;
/*
 * END_THE_TRICK - first half of the final reduction after the block loop.
 *
 *   f0..f14   the eight double registers holding the per-column partial sums.
 *   FA, FB    FA is copied to FB by the `fmovd` in the delay slot of the
 *             final branch.
 *   S0..S3, T0, T1, U0   scratch double registers for the pairwise fpadd32
 *             reduction: S = f+f, T = S+S, U0 = T0+T1.
 *   fz        double register cleared with fzero and used as the left operand
 *             of the `fcmpgt32 %fz, X` compares.
 *
 * The macro first finishes the inc/srl/add sequence for the masks left in
 * %x4..%x8 by the last DO_THE_TRICK, then interleaves the reduction with the
 * same inc/srl/add handling of each new fcmpgt32 mask.  Masks still pending
 * at the end (%x6 biased-not-halved-not-added state and %x7, %x8, %x1, %x2)
 * are consumed by END_THE_TRICK2.  Control always leaves through
 * `ba,pt %xcc, ett`.
 */
142 #define END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,S0,S1,S2,S3,T0,T1,U0,fz) \
143 inc %x5 /* IEU0 Group */; \
144 fpadd32 %f2, %f0, %S0 /* FPA */; \
145 srl %x5, 1, %x5 /* IEU0 Group */; \
146 add %sum, %x4, %sum /* IEU1 */; \
147 fpadd32 %f6, %f4, %S1 /* FPA */; \
148 inc %x6 /* IEU0 Group */; \
149 add %sum, %x5, %sum /* IEU1 */; \
150 fcmpgt32 %f0, %S0, %x1 /* FPM Group */; \
151 srl %x6, 1, %x6 /* IEU0 */; \
152 inc %x7 /* IEU1 */; \
153 fpadd32 %f10, %f8, %S2 /* FPA */; \
154 fcmpgt32 %f4, %S1, %x2 /* FPM Group */; \
155 srl %x7, 1, %x7 /* IEU0 */; \
156 add %sum, %x6, %sum /* IEU1 */; \
157 fpadd32 %f14, %f12, %S3 /* FPA */; \
158 inc %x8 /* IEU0 Group */; \
159 add %sum, %x7, %sum /* IEU1 */; \
160 fzero %fz /* FPA */; \
161 fcmpgt32 %f8, %S2, %x3 /* FPM Group */; \
162 srl %x8, 1, %x8 /* IEU0 */; \
163 inc %x1 /* IEU1 */; \
164 fpadd32 %S0, %S1, %T0 /* FPA */; \
165 fcmpgt32 %f12, %S3, %x4 /* FPM Group */; \
166 srl %x1, 1, %x1 /* IEU0 */; \
167 add %sum, %x8, %sum /* IEU1 */; \
168 fpadd32 %S2, %S3, %T1 /* FPA */; \
169 inc %x2 /* IEU0 Group */; \
170 add %sum, %x1, %sum /* IEU1 */; \
171 fcmpgt32 %S0, %T0, %x5 /* FPM Group */; \
172 srl %x2, 1, %x2 /* IEU0 */; \
173 inc %x3 /* IEU1 */; \
174 fcmpgt32 %S2, %T1, %x6 /* FPM Group */; \
175 srl %x3, 1, %x3 /* IEU0 */; \
176 add %sum, %x2, %sum /* IEU1 */; \
177 inc %x4 /* IEU0 Group */; \
178 add %sum, %x3, %sum /* IEU1 */; \
179 fcmpgt32 %fz, %f2, %x7 /* FPM Group */; \
180 srl %x4, 1, %x4 /* IEU0 */; \
181 inc %x5 /* IEU1 */; \
182 fpadd32 %T0, %T1, %U0 /* FPA */; \
183 fcmpgt32 %fz, %f6, %x8 /* FPM Group */; \
184 srl %x5, 1, %x5 /* IEU0 */; \
185 add %sum, %x4, %sum /* IEU1 */; \
186 inc %x6 /* IEU0 Group */; \
187 add %sum, %x5, %sum /* IEU1 */; \
188 fcmpgt32 %fz, %f10, %x1 /* FPM Group */; \
189 srl %x6, 1, %x6 /* IEU0 */; \
190 inc %x7 /* IEU1 */; \
191 fcmpgt32 %fz, %f14, %x2 /* FPM Group */; \
192 ba,pt %xcc, ett /* CTI */; \
193 fmovd %FA, %FB /* FPA */; \
/* END_THE_TRICK1 - END_THE_TRICK with the scratch registers fixed to
 * S0..S3 = f48,f50,f52,f54, T0/T1 = f56,f58, U0 = f60 and fz = f62. */
195 #define END_THE_TRICK1(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB) \
196 END_THE_TRICK(f0,f2,f4,f6,f8,f10,f12,f14,FA,FB,f48,f50,f52,f54,f56,f58,f60,f62)
/*
 * END_THE_TRICK2 - second half of the final reduction (continues where
 * END_THE_TRICK left off, so it relies on the masks pending in %x6, %x7, %x8,
 * %x1 and %x2 on entry).
 *
 *   S0..S3, T0, T1, U0   the intermediate sums produced by END_THE_TRICK;
 *             only S1, S3, T0, T1 and U0 are read here.
 *   U1        a further partial sum to fold in (the tail accumulator at the
 *             call site).
 *   V0        receives U0 + U1; it is spilled with `std` to
 *             [%sp + STACKOFF] and reloaded with `ldx` into %x8 so the two
 *             32-bit lanes reach the integer unit as one 64-bit value.
 *   fz        register expected to hold zero (END_THE_TRICK clears it).
 *
 * Masks from `fcmpgt32 old, new` compares are added to %sum; masks from
 * `fcmpgt32 %fz, X` compares are subtracted.  Every mask goes through the
 * usual inc / srl 1 first.  Finally the reloaded 64-bit value is added with
 * `addcc`, and a carry out of bit 63 is wrapped around by the annulled
 * `add %sum, 1, %sum` in the delay slot of `bcs,a,pn`.
 *
 * Fix relative to the previous text of this macro: the branch target `33f`
 * was not defined anywhere and the macro ended on a dangling line
 * continuation.  The `33:` landing label is now part of the macro.
 */
198 #define END_THE_TRICK2(S0,S1,S2,S3,T0,T1,U0,U1,V0,fz) \
199 fpadd32 %U0, %U1, %V0 /* FPA Group */; \
200 srl %x7, 1, %x7 /* IEU0 */; \
201 add %sum, %x6, %sum /* IEU1 */; \
202 std %V0, [%sp + STACKOFF] /* Store Group */; \
203 inc %x8 /* IEU0 */; \
204 sub %sum, %x7, %sum /* IEU1 */; \
205 fcmpgt32 %fz, %S1, %x3 /* FPM Group */; \
206 srl %x8, 1, %x8 /* IEU0 */; \
207 inc %x1 /* IEU1 */; \
208 fcmpgt32 %fz, %S3, %x4 /* FPM Group */; \
209 srl %x1, 1, %x1 /* IEU0 */; \
210 sub %sum, %x8, %sum /* IEU1 */; \
211 ldx [%sp + STACKOFF], %x8 /* Load Group */; \
212 inc %x2 /* IEU0 */; \
213 sub %sum, %x1, %sum /* IEU1 */; \
214 fcmpgt32 %fz, %T1, %x5 /* FPM Group */; \
215 srl %x2, 1, %x2 /* IEU0 */; \
216 inc %x3 /* IEU1 */; \
217 fcmpgt32 %T0, %U0, %x6 /* FPM Group */; \
218 srl %x3, 1, %x3 /* IEU0 */; \
219 sub %sum, %x2, %sum /* IEU1 */; \
220 inc %x4 /* IEU0 Group */; \
221 sub %sum, %x3, %sum /* IEU1 */; \
222 fcmpgt32 %fz, %U1, %x7 /* FPM Group */; \
223 srl %x4, 1, %x4 /* IEU0 */; \
224 inc %x5 /* IEU1 */; \
225 fcmpgt32 %U0, %V0, %x1 /* FPM Group */; \
226 srl %x5, 1, %x5 /* IEU0 */; \
227 sub %sum, %x4, %sum /* IEU1 */; \
228 fcmpgt32 %fz, %V0, %x2 /* FPM Group */; \
229 inc %x6 /* IEU0 */; \
230 sub %sum, %x5, %sum /* IEU1 */; \
231 srl %x6, 1, %x6 /* IEU0 Group */; \
232 inc %x7 /* IEU1 */; \
233 srl %x7, 1, %x7 /* IEU0 Group */; \
234 add %sum, %x6, %sum /* IEU1 */; \
235 inc %x1 /* IEU0 Group */; \
236 sub %sum, %x7, %sum /* IEU1 */; \
237 srl %x1, 1, %x1 /* IEU0 Group */; \
238 inc %x2 /* IEU1 */; \
239 srl %x2, 1, %x2 /* IEU0 Group */; \
240 add %sum, %x1, %sum /* IEU1 */; \
241 sub %sum, %x2, %sum /* IEU0 Group */; \
242 addcc %sum, %x8, %sum /* IEU Group */; \
243 bcs,a,pn %xcc, 33f /* CTI */; \
244 add %sum, 1, %sum /* IEU0, executed only when the carry branch is taken */; \
245 33: /* end of reduction */;
/*
 * csum_partial_copy_vis - copy %len bytes from %src to %dst while folding the
 * data into the checksum accumulator %sum.  %src, %dst, %len, %sum and
 * %x1..%x8 are register aliases whose definitions are not in the visible
 * lines.
 *
 * This part covers the entry, the scalar/VIS prologue that brings %dst up to
 * a 64-byte boundary, and the computed jump into one of the visNs entry
 * points selected by (%src & 0x38).
 *
 * NOTE(review): the listing numbers embedded in the lines skip values inside
 * this routine, and several things look incomplete as a result:
 *   - the `4f` branches in the first part have no `4:` label before the one
 *     in the dispatch sequence;
 *   - %f48 is read as the running sum but never written in the visible lines;
 *   - some carry masks are halved without the `inc` used elsewhere;
 *   - the two `4:` lines and the two `jmpl` lines near the end look like the
 *     two arms of a dropped preprocessor conditional.
 * Confirm against the complete file.
 */
248 .globl csum_partial_copy_vis
250 /* %asi should be either ASI_P or ASI_AIUS for csum_partial_copy resp. csum_partial_copy_from_user */
251 /* This assumes that !((%src^%dst)&3) && !((%src|%dst)&1) && %len >= 256 */
252 csum_partial_copy_vis:
/* If %dst is already 8-byte aligned, skip the scalar fix-up; the delay slot
 * computes %o4 = %dst & 0x38 (distance into the 64-byte block) either way. */
253 andcc %dst, 7, %g0 /* IEU1 Group */
254 be,pt %icc, 4f /* CTI */
255 and %dst, 0x38, %o4 /* IEU0 */
/* %dst & 2: copy one halfword and add (halfword << 16) to %sum.  %g5 is
 * 1 << 16 on this path and is added when the 32-bit add carries. */
256 mov 1, %g5 /* IEU0 Group */
257 andcc %dst, 2, %g0 /* IEU1 */
258 be,pt %icc, 1f /* CTI */
259 and %dst, 4, %g7 /* IEU0 Group */
260 lduha [%src] %asi, %g2 /* Load */
261 sub %len, 2, %len /* IEU0 Group */
262 add %dst, 2, %dst /* IEU1 */
263 andcc %dst, 4, %g7 /* IEU1 Group */
264 sll %g5, 16, %g5 /* IEU0 */
265 sth %g2, [%dst - 2] /* Store Group */
266 sll %g2, 16, %g2 /* IEU0 */
267 add %src, 2, %src /* IEU1 */
268 addcc %g2, %sum, %sum /* IEU1 Group */
269 bcs,a,pn %icc, 1f /* CTI */
270 add %sum, %g5, %sum /* IEU0 */
/* %dst & 4 (held in %g7): copy one word and add it to %sum with end-around
 * carry.  The word is loaded before the test; if %g7 is zero the branch
 * leaves with %o4 recomputed in its annulled delay slot. */
271 1: lduwa [%src] %asi, %g2 /* Load */
272 brz,a,pn %g7, 4f /* CTI+IEU1 Group */
273 and %dst, 0x38, %o4 /* IEU0 */
274 add %dst, 4, %dst /* IEU0 Group */
275 sub %len, 4, %len /* IEU1 */
276 addcc %g2, %sum, %sum /* IEU1 Group */
277 bcs,a,pn %icc, 1f /* CTI */
278 add %sum, 1, %sum /* IEU0 */
279 1: and %dst, 0x38, %o4 /* IEU0 Group */
280 stw %g2, [%dst - 4] /* Store */
281 add %src, 4, %src /* IEU1 */
/* Round %src down to a doubleword with alignaddr (which also records the low
 * address bits in %gsr for faligndata).  %g7 = bytes dropped by the rounding.
 * If it is non-zero, the word at the rounded-down address is subtracted from
 * %sum with borrow handling -- presumably to cancel the leading bytes that
 * the doubleword-based summation below will include; confirm.
 * %g1 is preset to 0x40 in the delay slot. */
286 mov %src, %g7 /* IEU1 Group */
288 alignaddr %src, %g0, %src /* Single Group */
289 subcc %g7, %src, %g7 /* IEU1 Group */
290 be,pt %xcc, 1f /* CTI */
291 mov 0x40, %g1 /* IEU0 */
292 lduwa [%src] %asi, %g2 /* Load Group */
293 subcc %sum, %g2, %sum /* IEU1 Group+load stall */
294 bcs,a,pn %icc, 1f /* CTI */
295 sub %sum, 1, %sum /* IEU0 */
/* Keep only the low 32 bits of %sum. */
296 1: srl %sum, 0, %sum /* IEU0 Group */
/* %o4 == 0 means %dst is already 64-byte aligned: go to 3.  Otherwise
 * %g1 = 0x40 - %o4 = bytes needed to reach the next 64-byte boundary, and
 * the 8-, 16- and 32-byte steps below are taken according to its bits. */
298 brz,pn %o4, 3f /* CTI+IEU1 Group */
299 sub %g1, %o4, %g1 /* IEU0 */
300 ldda [%src] %asi, %f0 /* Load */
301 clr %o4 /* IEU0 Group */
/* 8-byte step, taken when %dst & 8.  %f48 is the running VIS sum; the
 * fcmpgt32 of old sum against new sum leaves the carry mask in %o4. */
302 andcc %dst, 8, %g0 /* IEU1 */
303 be,pn %icc, 1f /* CTI */
304 ldda [%src + 8] %asi, %f2 /* Load Group */
305 add %src, 8, %src /* IEU0 */
306 sub %len, 8, %len /* IEU1 */
307 fpadd32 %f0, %f48, %f50 /* FPA */
308 addcc %dst, 8, %dst /* IEU1 Group */
309 faligndata %f0, %f2, %f16 /* FPA */
310 fcmpgt32 %f48, %f50, %o4 /* FPM Group */
311 fmovd %f2, %f0 /* FPA Group */
312 ldda [%src + 8] %asi, %f2 /* Load */
313 std %f16, [%dst - 8] /* Store */
314 fmovd %f50, %f48 /* FPA */
/* 16-byte step, taken when %g1 & 0x10.  The delay slot reduces %g1 to its
 * 0x20 bit for the test that follows this step. */
315 1: andcc %g1, 0x10, %g0 /* IEU1 Group */
316 be,pn %icc, 1f /* CTI */
317 and %g1, 0x20, %g1 /* IEU0 */
318 fpadd32 %f0, %f48, %f50 /* FPA */
319 ldda [%src + 16] %asi, %f4 /* Load Group */
320 add %src, 16, %src /* IEU0 */
321 add %dst, 16, %dst /* IEU1 */
322 faligndata %f0, %f2, %f16 /* FPA */
323 fcmpgt32 %f48, %f50, %g5 /* FPM Group */
324 sub %len, 16, %len /* IEU0 */
326 std %f16, [%dst - 16] /* Store Group */
327 fpadd32 %f2, %f50, %f48 /* FPA */
328 srl %o4, 1, %o5 /* IEU0 */
329 faligndata %f2, %f4, %f18 /* FPA Group */
330 std %f18, [%dst - 8] /* Store */
331 fcmpgt32 %f50, %f48, %o4 /* FPM Group */
332 add %o5, %sum, %sum /* IEU0 */
333 ldda [%src + 8] %asi, %f2 /* Load */
334 fmovd %f4, %f0 /* FPA */
/* 32-byte step, taken when the 0x20 bit is left in %g1; otherwise jump to
 * the dispatch code at 4 with %g2 = %asi read in the annulled delay slot. */
335 1: brz,a,pn %g1, 4f /* CTI+IEU1 Group */
336 rd %asi, %g2 /* LSU Group + 4 bubbles */
338 fpadd32 %f0, %f48, %f50 /* FPA */
339 ldda [%src + 16] %asi, %f4 /* Load Group */
340 srl %g5, 1, %g5 /* IEU0 */
341 add %dst, 32, %dst /* IEU1 */
342 faligndata %f0, %f2, %f16 /* FPA */
343 fcmpgt32 %f48, %f50, %o5 /* FPM Group */
345 ldda [%src + 24] %asi, %f6 /* Load */
346 srl %o4, 1, %o4 /* IEU0 Group */
347 add %g5, %sum, %sum /* IEU1 */
348 ldda [%src + 32] %asi, %f8 /* Load */
349 fpadd32 %f2, %f50, %f48 /* FPA */
350 faligndata %f2, %f4, %f18 /* FPA Group */
351 sub %len, 32, %len /* IEU0 */
352 std %f16, [%dst - 32] /* Store */
353 fcmpgt32 %f50, %f48, %g3 /* FPM Group */
355 add %o4, %sum, %sum /* IEU1 */
356 fpadd32 %f4, %f48, %f50 /* FPA */
357 faligndata %f4, %f6, %f20 /* FPA Group */
358 srl %o5, 1, %o5 /* IEU0 */
359 fcmpgt32 %f48, %f50, %g5 /* FPM Group */
360 add %o5, %sum, %sum /* IEU0 */
361 std %f18, [%dst - 24] /* Store */
362 fpadd32 %f6, %f50, %f48 /* FPA */
363 inc %g3 /* IEU0 Group */
364 std %f20, [%dst - 16] /* Store */
365 add %src, 32, %src /* IEU1 */
366 faligndata %f6, %f8, %f22 /* FPA */
367 fcmpgt32 %f50, %f48, %o4 /* FPM Group */
368 srl %g3, 1, %g3 /* IEU0 */
369 std %f22, [%dst - 8] /* Store */
370 add %g3, %sum, %sum /* IEU0 Group */
/* Dispatch.  %g2 = current %asi with ASI_BLK_OR or-ed in (each visNs entry
 * then writes %g2 ^ ASI_BLK_XOR to %asi).  %g7 = base for the jump,
 * %g3 = (%src & 0x38) << 8 = table offset, so the entry points are expected
 * 2048 bytes apart.  The pending masks in %g5 and %o4 are folded into %sum
 * and %len is reduced by 0xc0 before the jump.
 * NOTE(review): the sethi/%lo pair and the rd %pc/(vis0s - 4b) pair are two
 * different ways of forming the same address; both appear here back to back
 * under the same label `4:` -- confirm which one is meant to be assembled. */
371 3: rd %asi, %g2 /* LSU Group + 4 bubbles */
373 4: sethi %hi(vis0s), %g7 /* IEU0 Group */
374 or %g2, ASI_BLK_OR, %g2 /* IEU1 */
376 4: rd %pc, %g7 /* LSU Group + 4 bubbles */
378 inc %g5 /* IEU0 Group */
379 and %src, 0x38, %g3 /* IEU1 */
380 membar #StoreLoad /* LSU Group */
381 srl %g5, 1, %g5 /* IEU0 */
383 sll %g3, 8, %g3 /* IEU0 Group */
384 sub %len, 0xc0, %len /* IEU1 */
385 addcc %g5, %sum, %sum /* IEU1 Group */
386 srl %o4, 1, %o4 /* IEU0 */
387 add %g7, %g3, %g7 /* IEU0 Group */
388 add %o4, %sum, %sum /* IEU1 */
390 jmpl %g7 + %lo(vis0s), %g0 /* CTI+IEU1 Group */
392 jmpl %g7 + (vis0s - 4b), %g0 /* CTI+IEU1 Group */
/*
 * vis0s - dispatch entry 0, reached when (%src & 0x38) == 0.
 *
 * Prologue: switch %asi (%g2 ^ ASI_BLK_XOR), block-load the first two 64-byte
 * blocks into %f0.. and %f16.., move the running sum from %f48 to %f62 and
 * fold it into %f0, prime the store registers %f48..%f60 with faligndata and
 * leave the first carry masks in %x1..%x8 in the state DO_THE_TRICK expects
 * (%x4 already halved, %x5..%x8 raw).  %f32 is used as the left operand of
 * the priming compares; it is expected to be zero (not set in this section).
 *
 * vis0 is the three-way unrolled main loop; each step exits to vis0e1/2/3
 * when `subcc %len, 64, %len` borrows.  The exit steps issue SYNC, store the
 * remaining doublewords, fix up %dst/%len and continue at e1/e2/e3.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added: the dispatcher jumps to vis0s + ((%src & 0x38) << 8),
 *     so consecutive entries must be 2048 bytes apart (this assumes each
 *     section assembles to at most 2048 bytes);
 *   - `inc` restored before each `srl %xN, 1, %xN` in the prologue, matching
 *     the (mask + 1) >> 1 conversion used in the macros;
 *   - the third loop step was an unterminated macro invocation; its
 *     loop-back branch and closing parenthesis are restored.
 */
396 .align 2048
397 vis0s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
398 add %src, 128, %src /* IEU0 Group */
399 ldda [%src-128] %asi, %f0 /* Load Group */
400 ldda [%src-64] %asi, %f16 /* Load Group */
401 fmovd %f48, %f62 /* FPA Group f0 available */
402 faligndata %f0, %f2, %f48 /* FPA Group f2 available */
403 fcmpgt32 %f32, %f2, %x1 /* FPM Group f4 available */
404 fpadd32 %f0, %f62, %f0 /* FPA */
405 fcmpgt32 %f32, %f4, %x2 /* FPM Group f6 available */
406 faligndata %f2, %f4, %f50 /* FPA */
407 fcmpgt32 %f62, %f0, %x3 /* FPM Group f8 available */
408 faligndata %f4, %f6, %f52 /* FPA */
409 fcmpgt32 %f32, %f6, %x4 /* FPM Group f10 available */
410 inc %x1 /* bias mask before halving */
411 faligndata %f6, %f8, %f54 /* FPA */
412 fcmpgt32 %f32, %f8, %x5 /* FPM Group f12 available */
413 srl %x1, 1, %x1 /* IEU0 */
414 inc %x2 /* bias mask before halving */
415 faligndata %f8, %f10, %f56 /* FPA */
416 fcmpgt32 %f32, %f10, %x6 /* FPM Group f14 available */
417 srl %x2, 1, %x2 /* IEU0 */
418 add %sum, %x1, %sum /* IEU1 */
419 faligndata %f10, %f12, %f58 /* FPA */
420 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
421 inc %x3 /* bias mask before halving */
422 add %sum, %x2, %sum /* IEU1 */
423 faligndata %f12, %f14, %f60 /* FPA */
424 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
425 srl %x3, 1, %x3 /* IEU0 */
426 inc %x4 /* bias mask before halving */
427 fmovd %f14, %f62 /* FPA */
428 srl %x4, 1, %x4 /* IEU0 Group */
429 add %sum, %x3, %sum /* IEU1 */
430 vis0: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
431 ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
432 ,LDBLK(f32), STBLK,,,,,,,,
433 ,bcs,pn %icc, vis0e1)
434 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
435 ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
436 ,LDBLK(f0), STBLK,,,,,,,,
437 ,bcs,pn %icc, vis0e2)
438 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
439 ,f48,f50,f52,f54,f56,f58,f60,f62,f62,
440 ,LDBLK(f16), STBLK,,,,,,,,
441 ,bcc,pt %icc, vis0)
442 vis0e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
443 ,f48,f50,f52,f54,f56,f58,f60,f62,f32,
444 ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
445 ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e2)
446 vis0e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
447 ,f48,f50,f52,f54,f56,f58,f60,f62,f0,
448 ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
449 ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e3)
450 vis0e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
451 ,f48,f50,f52,f54,f56,f58,f60,f62,f16,
452 ,SYNC, STBLK,ST(f48,64),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),ST(f60,48),
453 ,add %dst, 56, %dst; add %len, 192 - 8*8, %len; ba,pt %icc, e1)
/*
 * vis1s - dispatch entry 1, reached when (%src & 0x38) == 0x08.
 *
 * Same structure as entry 0, shifted by one doubleword: %src is advanced by
 * 128 - 8 so the block loads start at the enclosing 64-byte block, the first
 * doubleword (%f0, before the source start) is only kept as faligndata
 * carry-over in %f58 and is replaced in %f0 by the running sum from %f48,
 * and the store registers rotate to %f62,%f48..%f60.  STBLK is issued in
 * store slot 2 of each loop step.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `inc` restored before each `srl %xN, 1, %xN` in the prologue;
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
454 .align 2048
455 vis1s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
456 add %src, 128 - 8, %src /* IEU0 Group */
457 ldda [%src-128] %asi, %f0 /* Load Group */
458 ldda [%src-64] %asi, %f16 /* Load Group */
459 fmovd %f0, %f58 /* FPA Group */
460 fmovd %f48, %f0 /* FPA Group */
461 fcmpgt32 %f32, %f2, %x2 /* FPM Group */
462 faligndata %f2, %f4, %f48 /* FPA */
463 fcmpgt32 %f32, %f4, %x3 /* FPM Group */
464 faligndata %f4, %f6, %f50 /* FPA */
465 fcmpgt32 %f32, %f6, %x4 /* FPM Group */
466 faligndata %f6, %f8, %f52 /* FPA */
467 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
468 inc %x2 /* bias mask before halving */
469 faligndata %f8, %f10, %f54 /* FPA */
470 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
471 srl %x2, 1, %x2 /* IEU0 */
472 faligndata %f10, %f12, %f56 /* FPA */
473 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
474 inc %x3 /* bias mask before halving */
475 add %sum, %x2, %sum /* IEU1 */
476 faligndata %f12, %f14, %f58 /* FPA */
477 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
478 srl %x3, 1, %x3 /* IEU0 */
479 inc %x4 /* bias mask before halving */
480 fmovd %f14, %f60 /* FPA */
481 srl %x4, 1, %x4 /* IEU0 Group */
482 add %sum, %x3, %sum /* IEU1 */
483 vis1: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
484 ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
485 ,LDBLK(f32), ,STBLK,,,,,,,
486 ,bcs,pn %icc, vis1e1)
487 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
488 ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
489 ,LDBLK(f0), ,STBLK,,,,,,,
490 ,bcs,pn %icc, vis1e2)
491 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
492 ,f62,f48,f50,f52,f54,f56,f58,f60,f60,
493 ,LDBLK(f16), ,STBLK,,,,,,,
494 ,bcc,pt %icc, vis1)
495 vis1e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
496 ,f62,f48,f50,f52,f54,f56,f58,f60,f32,
497 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
498 ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e2)
499 vis1e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
500 ,f62,f48,f50,f52,f54,f56,f58,f60,f0,
501 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
502 ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e3)
503 vis1e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
504 ,f62,f48,f50,f52,f54,f56,f58,f60,f16,
505 ,SYNC, ,STBLK,ST(f48,0),ST(f50,8),ST(f52,16),ST(f54,24),ST(f56,32),ST(f58,40),
506 ,add %dst, 48, %dst; add %len, 192 - 7*8, %len; ba,pt %icc, e1)
/*
 * vis2s - dispatch entry 2, reached when (%src & 0x38) == 0x10.
 *
 * %src is advanced by 128 - 16.  Of the first block, %f0 is kept only as
 * faligndata carry-over (%f56) and then replaced by the running sum, and %f2
 * is cleared with fpsub32 so it adds nothing.  %dst is moved back by 64
 * because the first STBLK (store slot 3) happens one loop step later than
 * the `add %dst, 64, %dst` inside DO_THE_TRICK.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `inc` restored before each `srl %xN, 1, %xN` in the prologue;
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
507 .align 2048
508 vis2s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
509 add %src, 128 - 16, %src /* IEU0 Group */
510 ldda [%src-128] %asi, %f0 /* Load Group */
511 ldda [%src-64] %asi, %f16 /* Load Group */
512 fmovd %f0, %f56 /* FPA Group */
513 fmovd %f48, %f0 /* FPA Group */
514 sub %dst, 64, %dst /* IEU0 */
515 fpsub32 %f2, %f2, %f2 /* FPA Group */
516 fcmpgt32 %f32, %f4, %x3 /* FPM Group */
517 faligndata %f4, %f6, %f48 /* FPA */
518 fcmpgt32 %f32, %f6, %x4 /* FPM Group */
519 faligndata %f6, %f8, %f50 /* FPA */
520 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
521 faligndata %f8, %f10, %f52 /* FPA */
522 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
523 faligndata %f10, %f12, %f54 /* FPA */
524 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
525 inc %x3 /* bias mask before halving */
526 faligndata %f12, %f14, %f56 /* FPA */
527 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
528 srl %x3, 1, %x3 /* IEU0 */
529 inc %x4 /* bias mask before halving */
530 fmovd %f14, %f58 /* FPA */
531 srl %x4, 1, %x4 /* IEU0 Group */
532 add %sum, %x3, %sum /* IEU1 */
533 vis2: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
534 ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
535 ,LDBLK(f32), ,,STBLK,,,,,,
536 ,bcs,pn %icc, vis2e1)
537 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
538 ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
539 ,LDBLK(f0), ,,STBLK,,,,,,
540 ,bcs,pn %icc, vis2e2)
541 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
542 ,f60,f62,f48,f50,f52,f54,f56,f58,f58,
543 ,LDBLK(f16), ,,STBLK,,,,,,
544 ,bcc,pt %icc, vis2)
545 vis2e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
546 ,f60,f62,f48,f50,f52,f54,f56,f58,f32,
547 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
548 ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e2)
549 vis2e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
550 ,f60,f62,f48,f50,f52,f54,f56,f58,f0,
551 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
552 ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e3)
553 vis2e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
554 ,f60,f62,f48,f50,f52,f54,f56,f58,f16,
555 ,SYNC, ,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),ST(f56,96),
556 ,add %dst, 104, %dst; add %len, 192 - 6*8, %len; ba,pt %icc, e1)
/*
 * vis3s - dispatch entry 3, reached when (%src & 0x38) == 0x18.
 *
 * %src is advanced by 128 - 24.  %f0 is kept as faligndata carry-over (%f54)
 * and then replaced by the running sum; %f2 and %f4 are cleared with fpsub32.
 * %dst is moved back by 64 as in entry 2.  STBLK is issued in store slot 4.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `inc %x4` restored before `srl %x4, 1, %x4` in the prologue;
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
557 .align 2048
558 vis3s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
559 add %src, 128 - 24, %src /* IEU0 Group */
560 ldda [%src-128] %asi, %f0 /* Load Group */
561 ldda [%src-64] %asi, %f16 /* Load Group */
562 fmovd %f0, %f54 /* FPA Group */
563 fmovd %f48, %f0 /* FPA Group */
564 sub %dst, 64, %dst /* IEU0 */
565 fpsub32 %f2, %f2, %f2 /* FPA Group */
566 fpsub32 %f4, %f4, %f4 /* FPA Group */
567 fcmpgt32 %f32, %f6, %x4 /* FPM Group */
568 faligndata %f6, %f8, %f48 /* FPA */
569 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
570 faligndata %f8, %f10, %f50 /* FPA */
571 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
572 faligndata %f10, %f12, %f52 /* FPA */
573 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
574 faligndata %f12, %f14, %f54 /* FPA */
575 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
576 fmovd %f14, %f56 /* FPA */
577 inc %x4 /* bias mask before halving */
578 srl %x4, 1, %x4 /* IEU0 Group */
579 vis3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
580 ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
581 ,LDBLK(f32), ,,,STBLK,,,,,
582 ,bcs,pn %icc, vis3e1)
583 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
584 ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
585 ,LDBLK(f0), ,,,STBLK,,,,,
586 ,bcs,pn %icc, vis3e2)
587 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
588 ,f58,f60,f62,f48,f50,f52,f54,f56,f56,
589 ,LDBLK(f16), ,,,STBLK,,,,,
590 ,bcc,pt %icc, vis3)
591 vis3e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
592 ,f58,f60,f62,f48,f50,f52,f54,f56,f32,
593 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
594 ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e2)
595 vis3e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
596 ,f58,f60,f62,f48,f50,f52,f54,f56,f0,
597 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
598 ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e3)
599 vis3e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
600 ,f58,f60,f62,f48,f50,f52,f54,f56,f16,
601 ,SYNC, ,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),ST(f54,88),
602 ,add %dst, 96, %dst; add %len, 192 - 5*8, %len; ba,pt %icc, e1)
/*
 * vis4s - dispatch entry 4, reached when (%src & 0x38) == 0x20.
 *
 * %src is advanced by 128 - 32.  %f0 is kept as faligndata carry-over (%f52)
 * and then replaced by the running sum; %f2..%f6 are cleared with fpsub32.
 * %dst is moved back by 64 as in entry 2.  STBLK is issued in store slot 5.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `clr %x4` restored: the first DO_THE_TRICK adds %x4 to %sum
 *     unconditionally, and nothing in this prologue sets it (entries 0-3
 *     derive it from an fcmpgt32);
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
603 .align 2048
604 vis4s: wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
605 add %src, 128 - 32, %src /* IEU0 Group */
606 ldda [%src-128] %asi, %f0 /* Load Group */
607 ldda [%src-64] %asi, %f16 /* Load Group */
608 fmovd %f0, %f52 /* FPA Group */
609 fmovd %f48, %f0 /* FPA Group */
610 sub %dst, 64, %dst /* IEU0 */
611 fpsub32 %f2, %f2, %f2 /* FPA Group */
612 fpsub32 %f4, %f4, %f4 /* FPA Group */
613 fpsub32 %f6, %f6, %f6 /* FPA Group */
614 clr %x4 /* no pending carry count */
615 fcmpgt32 %f32, %f8, %x5 /* FPM Group */
616 faligndata %f8, %f10, %f48 /* FPA */
617 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
618 faligndata %f10, %f12, %f50 /* FPA */
619 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
620 faligndata %f12, %f14, %f52 /* FPA */
621 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
622 fmovd %f14, %f54 /* FPA */
623 vis4: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
624 ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
625 ,LDBLK(f32), ,,,,STBLK,,,,
626 ,bcs,pn %icc, vis4e1)
627 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
628 ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
629 ,LDBLK(f0), ,,,,STBLK,,,,
630 ,bcs,pn %icc, vis4e2)
631 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
632 ,f56,f58,f60,f62,f48,f50,f52,f54,f54,
633 ,LDBLK(f16), ,,,,STBLK,,,,
634 ,bcc,pt %icc, vis4)
635 vis4e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
636 ,f56,f58,f60,f62,f48,f50,f52,f54,f32,
637 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
638 ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e2)
639 vis4e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
640 ,f56,f58,f60,f62,f48,f50,f52,f54,f0,
641 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
642 ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e3)
643 vis4e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
644 ,f56,f58,f60,f62,f48,f50,f52,f54,f16,
645 ,SYNC, ,,,,STBLK,ST(f48,64),ST(f50,72),ST(f52,80),
646 ,add %dst, 88, %dst; add %len, 192 - 4*8, %len; ba,pt %icc, e1)
/*
 * vis5s - dispatch entry 5, reached when (%src & 0x38) == 0x28.
 *
 * Only three doublewords of the first 64-byte block belong to the source, so
 * they are loaded individually into %f10..%f14 with the caller's %asi before
 * %asi is switched for block access and the second block is loaded.  %f0
 * takes the running sum; %f2..%f8 are produced from %f32 with fmuld/faddd
 * (%f32 is expected to be zero here -- it is not set in this section).
 * %dst is moved back by 64 as in entry 2.  STBLK is issued in store slot 6.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `clr %x4` / `clr %x5` restored: the first DO_THE_TRICK consumes both,
 *     and this prologue only produces masks for %x6..%x8;
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
647 .align 2048
648 vis5s: add %src, 128 - 40, %src /* IEU0 Group */
649 ldda [%src-88] %asi, %f10 /* Load Group */
650 ldda [%src-80] %asi, %f12 /* Load Group */
651 ldda [%src-72] %asi, %f14 /* Load Group */
652 wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
653 ldda [%src-64] %asi, %f16 /* Load Group */
654 fmovd %f48, %f0 /* FPA Group */
655 fmuld %f32, %f32, %f2 /* FPM */
656 clr %x4 /* no pending carry count */
657 faddd %f32, %f32, %f4 /* FPA Group */
658 fmuld %f32, %f32, %f6 /* FPM */
659 clr %x5 /* no pending carry mask */
660 faddd %f32, %f32, %f8 /* FPA Group */
661 fcmpgt32 %f32, %f10, %x6 /* FPM Group */
662 sub %dst, 64, %dst /* IEU0 */
663 faligndata %f10, %f12, %f48 /* FPA */
664 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
665 faligndata %f12, %f14, %f50 /* FPA */
666 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
667 fmovd %f14, %f52 /* FPA */
668 vis5: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
669 ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
670 ,LDBLK(f32), ,,,,,STBLK,,,
671 ,bcs,pn %icc, vis5e1)
672 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
673 ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
674 ,LDBLK(f0), ,,,,,STBLK,,,
675 ,bcs,pn %icc, vis5e2)
676 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
677 ,f54,f56,f58,f60,f62,f48,f50,f52,f52,
678 ,LDBLK(f16), ,,,,,STBLK,,,
679 ,bcc,pt %icc, vis5)
680 vis5e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
681 ,f54,f56,f58,f60,f62,f48,f50,f52,f32,
682 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
683 ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e2)
684 vis5e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
685 ,f54,f56,f58,f60,f62,f48,f50,f52,f0,
686 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
687 ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e3)
688 vis5e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
689 ,f54,f56,f58,f60,f62,f48,f50,f52,f16,
690 ,SYNC, ,,,,,STBLK,ST(f48,64),ST(f50,72),
691 ,add %dst, 80, %dst; add %len, 192 - 3*8, %len; ba,pt %icc, e1)
/*
 * vis6s - dispatch entry 6, reached when (%src & 0x38) == 0x30.
 *
 * Two doublewords of the first 64-byte block belong to the source; they are
 * loaded individually into %f12/%f14 before %asi is switched for block
 * access.  %f0 takes the running sum; %f2..%f10 are produced from %f32 with
 * fmuld/faddd (%f32 is expected to be zero here -- it is not set in this
 * section).  %dst is moved back by 64 as in entry 2.  STBLK is issued in
 * store slot 7.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `clr %x4` / `clr %x5` / `clr %x6` restored: the first DO_THE_TRICK
 *     consumes all three, and this prologue only produces masks for %x7/%x8;
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
692 .align 2048
693 vis6s: add %src, 128 - 48, %src /* IEU0 Group */
694 ldda [%src-80] %asi, %f12 /* Load Group */
695 ldda [%src-72] %asi, %f14 /* Load Group */
696 wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
697 ldda [%src-64] %asi, %f16 /* Load Group */
698 fmovd %f48, %f0 /* FPA Group */
699 fmuld %f32, %f32, %f2 /* FPM */
700 clr %x4 /* no pending carry count */
701 faddd %f32, %f32, %f4 /* FPA Group */
702 fmuld %f32, %f32, %f6 /* FPM */
703 clr %x5 /* no pending carry mask */
704 faddd %f32, %f32, %f8 /* FPA Group */
705 fmuld %f32, %f32, %f10 /* FPM */
706 clr %x6 /* no pending carry mask */
707 fcmpgt32 %f32, %f12, %x7 /* FPM Group */
708 sub %dst, 64, %dst /* IEU0 */
709 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
710 faligndata %f12, %f14, %f48 /* FPA */
711 fmovd %f14, %f50 /* FPA Group */
712 vis6: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
713 ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
714 ,LDBLK(f32), ,,,,,,STBLK,,
715 ,bcs,pn %icc, vis6e1)
716 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
717 ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
718 ,LDBLK(f0), ,,,,,,STBLK,,
719 ,bcs,pn %icc, vis6e2)
720 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
721 ,f52,f54,f56,f58,f60,f62,f48,f50,f50,
722 ,LDBLK(f16), ,,,,,,STBLK,,
723 ,bcc,pt %icc, vis6)
724 vis6e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
725 ,f52,f54,f56,f58,f60,f62,f48,f50,f32,
726 ,SYNC, ,,,,,,STBLK,ST(f48,64),
727 ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e2)
728 vis6e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
729 ,f52,f54,f56,f58,f60,f62,f48,f50,f0,
730 ,SYNC, ,,,,,,STBLK,ST(f48,64),
731 ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e3)
732 vis6e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
733 ,f52,f54,f56,f58,f60,f62,f48,f50,f16,
734 ,SYNC, ,,,,,,STBLK,ST(f48,64),
735 ,add %dst, 72, %dst; add %len, 192 - 2*8, %len; ba,pt %icc, e1)
/*
 * vis7s - dispatch entry 7, reached when (%src & 0x38) == 0x38.
 *
 * Only the last doubleword of the first 64-byte block belongs to the source;
 * it is loaded individually into %f14 before %asi is switched for block
 * access.  %f0 takes the running sum; %f2..%f12 are produced from %f32 with
 * fmuld/faddd (%f32 is expected to be zero here -- it is not set in this
 * section).  %dst is moved back by 64 as in entry 2.  STBLK is issued in
 * store slot 8.
 *
 * Fixes relative to the previous text of this section:
 *   - `.align 2048` added (dispatcher stride is (%src & 0x38) << 8; assumes
 *     each section assembles to at most 2048 bytes);
 *   - `clr %x4` .. `clr %x7` restored: the first DO_THE_TRICK consumes all
 *     four, and this prologue only produces a mask for %x8;
 *   - loop-back branch and closing parenthesis of the third loop step
 *     restored.
 */
736 .align 2048
737 vis7s: add %src, 128 - 56, %src /* IEU0 Group */
738 ldda [%src-72] %asi, %f14 /* Load Group */
739 wr %g2, ASI_BLK_XOR, %asi /* LSU Group */
740 ldda [%src-64] %asi, %f16 /* Load Group */
741 fmovd %f48, %f0 /* FPA Group */
742 fmuld %f32, %f32, %f2 /* FPM */
743 clr %x4 /* no pending carry count */
744 faddd %f32, %f32, %f4 /* FPA Group */
745 fmuld %f32, %f32, %f6 /* FPM */
746 clr %x5 /* no pending carry mask */
747 faddd %f32, %f32, %f8 /* FPA Group */
748 fmuld %f32, %f32, %f10 /* FPM */
749 clr %x6 /* no pending carry mask */
750 faddd %f32, %f32, %f12 /* FPA Group */
751 clr %x7 /* no pending carry mask */
752 fcmpgt32 %f32, %f14, %x8 /* FPM Group */
753 sub %dst, 64, %dst /* IEU0 */
754 fmovd %f14, %f48 /* FPA */
755 vis7: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
756 ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
757 ,LDBLK(f32), ,,,,,,,STBLK,
758 ,bcs,pn %icc, vis7e1)
759 DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
760 ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
761 ,LDBLK(f0), ,,,,,,,STBLK,
762 ,bcs,pn %icc, vis7e2)
763 DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
764 ,f50,f52,f54,f56,f58,f60,f62,f48,f48,
765 ,LDBLK(f16), ,,,,,,,STBLK,
766 ,bcc,pt %icc, vis7)
767 vis7e3: DO_THE_TRICK( f0,f2,f4,f6,f8,f10,f12,f14,f16,f18,f20,f22,f24,f26,f28,f30,
768 ,f50,f52,f54,f56,f58,f60,f62,f48,f32,
769 ,SYNC, ,,,,,,,STBLK,
770 ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e2)
771 vis7e1: DO_THE_TRICK( f16,f18,f20,f22,f24,f26,f28,f30,f32,f34,f36,f38,f40,f42,f44,f46,
772 ,f50,f52,f54,f56,f58,f60,f62,f48,f0,
773 ,SYNC, ,,,,,,,STBLK,
774 ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e3)
775 vis7e2: DO_THE_TRICK( f32,f34,f36,f38,f40,f42,f44,f46,f0,f2,f4,f6,f8,f10,f12,f14,
776 ,f50,f52,f54,f56,f58,f60,f62,f48,f16,
777 ,SYNC, ,,,,,,,STBLK,
778 ,add %dst, 64, %dst; add %len, 192 - 1*8, %len; ba,pt %icc, e1)
/* Common exits of the visNe1/visNe2/visNe3 steps.  They differ only in which
 * bank of eight double registers holds the partial sums and in which register
 * is copied to %f6 by the delay-slot fmovd (the last argument pair FA, FB).
 * Each expansion ends with a branch to `ett`. */
779 e1: END_THE_TRICK1( f0,f2,f4,f6,f8,f10,f12,f14,f16,f6)
780 e2: END_THE_TRICK1( f16,f18,f20,f22,f24,f26,f28,f30,f32,f6)
781 e3: END_THE_TRICK1( f32,f34,f36,f38,f40,f42,f44,f46,f0,f6)
/*
 * ett - tail after the 64-byte block loop: restore %asi, copy and sum the
 * remaining whole doublewords, run the final reduction and return the folded
 * checksum.
 *
 * NOTE(review): the listing numbers embedded in the lines skip values inside
 * this section and several things look incomplete as a result:
 *   - two different `wr ..., %asi` sequences follow each other;
 *   - `bne,pn` is directly followed by another branch, so its delay slot
 *     holds a control transfer;
 *   - %f10 is read as the tail accumulator without being set in this section;
 *   - `srl %x5, 1, %x5` in the loop has no preceding `inc`;
 *   - the `2f` branches have no `2:` label in the visible lines;
 *   - two return sequences both labelled `1:` follow each other.
 * These look like dropped lines / dropped preprocessor conditionals -- confirm
 * against the complete file.
 */
782 ett: rd %asi, %x4 /* LSU Group+4bubbles */
783 rd %gsr, %x3 /* LSU Group+4bubbles */
785 srl %x4, 3, %x5 /* IEU0 Group */
786 xor %x4, ASI_BLK_XOR1, %x4 /* IEU1 */
787 wr %x4, %x5, %asi /* LSU Group+4bubbles */
789 wr %x4, ASI_BLK_XOR, %asi /* LSU Group+4bubbles */
/* %x3 = low three bits of %gsr, i.e. the byte offset recorded by alignaddr.
 * %dst is advanced by 8 so the stores below can address [%dst - 8]. */
791 andcc %x3, 7, %x3 /* IEU1 Group */
792 add %dst, 8, %dst /* IEU0 */
793 bne,pn %icc, 1f /* CTI */
/* Offset 0 and nothing left: store the carried-over doubleword and finish. */
795 brz,a,pn %len, 2f /* CTI+IEU1 Group */
796 std %f6, [%dst - 8] /* Store */
797 1: cmp %len, 8 /* IEU1 */
798 blu,pn %icc, 3f /* CTI */
799 sub %src, 64, %src /* IEU0 Group */
/* Doubleword loop: load %f2, add it to the accumulator %f10 (result %f12,
 * carry mask in %x5), merge it with the carried-over %f6 through faligndata
 * and store the merged doubleword.  The two fzero instructions are marked as
 * FPU no-ops by the original comments. */
800 1: ldda [%src] %asi, %f2 /* Load Group */
801 fpadd32 %f10, %f2, %f12 /* FPA Group+load stall */
802 add %src, 8, %src /* IEU0 */
803 add %dst, 8, %dst /* IEU1 */
804 faligndata %f6, %f2, %f14 /* FPA Group */
805 fcmpgt32 %f10, %f12, %x5 /* FPM Group */
806 std %f14, [%dst - 16] /* Store */
807 fmovd %f2, %f6 /* FPA */
808 fmovd %f12, %f10 /* FPA Group */
809 sub %len, 8, %len /* IEU1 */
810 fzero %f16 /* FPA Group - FPU nop */
811 fzero %f18 /* FPA Group - FPU nop */
813 srl %x5, 1, %x5 /* IEU0 Group (regdep) */
814 cmp %len, 8 /* IEU1 */
815 bgeu,pt %icc, 1b /* CTI */
816 add %x5, %sum, %sum /* IEU0 Group */
/* Flush the carried-over doubleword: all 8 bytes when the recorded offset is
 * zero, otherwise only its second word (%f7), after which %dst/%len are
 * adjusted by 4. */
817 3: brz,a,pt %x3, 2f /* CTI+IEU1 */
818 std %f6, [%dst - 8] /* Store Group */
819 st %f7, [%dst - 8] /* Store Group */
820 sub %dst, 4, %dst /* IEU0 */
821 add %len, 4, %len /* IEU1 */
/* Final reduction.  %sp is lowered by 8 around END_THE_TRICK2, which spills
 * to [%sp + STACKOFF]. */
824 sub %sp, 8, %sp /* IEU0 Group */
826 END_THE_TRICK2( f48,f50,f52,f54,f56,f58,f60,f10,f12,f62)
827 membar #Sync /* LSU Group */
830 add %sp, 8, %sp /* IEU0 Group */
/* Bytes still left (fewer than 8 at this point on the paths above) are
 * handled at 26; that code comes back to 25 with %g1 = %sum << 32. */
832 23: brnz,pn %len, 26f /* CTI+IEU1 Group */
833 24: sllx %sum, 32, %g1 /* IEU0 */
/* Fold the 64-bit sum to 32 bits: add the low half (shifted up) to the whole
 * value, take the upper 32 bits of the result, and add 1 if the 64-bit add
 * carried.  The result is left in %src. */
834 25: addcc %sum, %g1, %src /* IEU1 Group */
835 srlx %src, 32, %src /* IEU0 Group (regdep) */
836 bcs,a,pn %xcc, 1f /* CTI */
837 add %src, 1, %src /* IEU1 */
839 1: retl /* CTI Group brk forced */
840 srl %src, 0, %src /* IEU0 */
/* Alternative return sequence that rebuilds %g4 from PAGE_OFFSET. */
842 1: sethi %uhi(PAGE_OFFSET), %g4 /* IEU0 Group */
843 retl /* CTI Group brk forced */
844 sllx %g4, 32, %g4 /* IEU0 */
/*
 * 26 - copy and sum the last %len & 15 bytes with integer loads/stores.
 *
 * Bit 8 of %len: two words are copied and added to %sum as one 64-bit value
 *                (first word in the upper half), with end-around carry.
 * Bits 4, 2, 1:  a word, a halfword and a byte are copied if present and
 *                collected as %g2 = word << 32, %g3 = halfword << 16,
 *                %o5 = byte << 8; an absent piece is represented by 0, set
 *                in the annulled delay slot of the branch that skips it.
 * The three pieces are or-ed together, added to %sum with end-around carry,
 * and control returns to 25 with %g1 = %sum << 32 for the final fold.
 *
 * Fix relative to the previous text of this section: the `be,a,pn` that
 * skips the word copy had lost its delay-slot instruction, so the `lduwa`
 * became the annulled delay slot (executed only when there is NO word to
 * copy) and %g2 kept a stale value.  `clr %g2` is restored as the delay
 * slot, matching the `clr %g3` / `clr %o5` of the halfword and byte cases.
 */
846 26: andcc %len, 8, %g0 /* IEU1 Group */
847 be,pn %icc, 1f /* CTI */
848 lduwa [%src] %asi, %o4 /* Load */
849 lduwa [%src+4] %asi, %g2 /* Load Group */
850 add %src, 8, %src /* IEU0 */
851 add %dst, 8, %dst /* IEU1 */
852 sllx %o4, 32, %g5 /* IEU0 Group */
853 stw %o4, [%dst - 8] /* Store */
854 or %g5, %g2, %g5 /* IEU0 Group */
855 stw %g2, [%dst - 4] /* Store */
856 addcc %g5, %sum, %sum /* IEU1 Group */
857 bcs,a,pn %xcc, 1f /* CTI */
858 add %sum, 1, %sum /* IEU0 */
859 1: andcc %len, 4, %g0 /* IEU1 Group */
860 be,a,pn %icc, 1f /* CTI */
861 clr %g2 /* no word: contributes 0 */
862 lduwa [%src] %asi, %g7 /* Load */
863 add %src, 4, %src /* IEU0 Group */
864 add %dst, 4, %dst /* IEU1 */
865 sllx %g7, 32, %g2 /* IEU0 Group */
866 stw %g7, [%dst - 4] /* Store */
867 1: andcc %len, 2, %g0 /* IEU1 */
868 be,a,pn %icc, 1f /* CTI */
869 clr %g3 /* IEU0 Group */
870 lduha [%src] %asi, %g7 /* Load */
871 add %src, 2, %src /* IEU1 */
872 add %dst, 2, %dst /* IEU0 Group */
873 sll %g7, 16, %g3 /* IEU0 Group */
874 sth %g7, [%dst - 2] /* Store */
875 1: andcc %len, 1, %g0 /* IEU1 */
876 be,a,pn %icc, 1f /* CTI */
877 clr %o5 /* IEU0 Group */
878 lduba [%src] %asi, %g7 /* Load */
879 sll %g7, 8, %o5 /* IEU0 Group */
880 stb %g7, [%dst] /* Store */
881 1: or %g2, %g3, %g3 /* IEU1 */
882 or %o5, %g3, %g3 /* IEU0 Group (regdep) */
883 addcc %g3, %sum, %sum /* IEU1 Group (regdep) */
884 bcs,a,pn %xcc, 1f /* CTI */
885 add %sum, 1, %sum /* IEU0 */
886 1: ba,pt %xcc, 25b /* CTI Group */
887 sllx %sum, 32, %g1 /* IEU0 */
/* Table entry naming the routine start, a zero word, the label `end` and
 * `cpc_handler`.  NOTE(review): presumably a range entry of the kernel
 * exception table (fault handler for the user-space loads above); neither the
 * section directive nor the `end` label is in the visible lines -- confirm. */
894 .word csum_partial_copy_vis, 0, end, cpc_handler