2 * This file contains assembly-language implementations
3 * of IP-style 1's complement checksum routines.
5 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
12 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
15 #include <linux/sys.h>
16 #include <asm/processor.h>
17 #include <asm/errno.h>
18 #include <asm/ppc_asm.h>
21 * Computes the checksum of a memory block at buff, length len,
22 * and adds in "sum" (32-bit).
24 * __csum_partial(r3=buff, r4=len, r5=sum)
/*
 * Register use visible in this routine: r3 = buff, r4 = len, r5 = sum on
 * entry. r0 holds the running sum (initialised from r5; the final carry is
 * added by addze near the end). r6 and r9 are scratch.
 */
26 _GLOBAL(__csum_partial)
27 addic r0,r5,0 /* r0 = sum + 0; adding zero also leaves the carry bit clear */
29 srdi. r6,r4,3 /* r6 = len >> 3; CR0 "eq" means less than 8 bytes */
33 * If only halfword aligned, align to a double word. Since odd
34 * aligned addresses should be rare and they would require more
35 * work to calculate the correct checksum, we ignore that case
36 * and take the potential slowdown of unaligned loads.
38 rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
46 lhz r6,0(r3) /* load the halfword at 0(r3), zero-extended, while aligning to a doubleword */
54 * We unroll the loop such that each iteration is 64 bytes with an
55 * entry and exit limb of 64 bytes, meaning a minimum size of
/* The minimum size for the unrolled loop is 128 bytes (64-byte entry limb plus 64-byte exit limb). */
59 beq .Lcsum_tail_doublewords /* len < 128 */
/*
 * Allocate a stack frame and save r14-r16 in it; they are reloaded and the
 * frame is released before the tail code. r14-r16 are callee-saved under the
 * 64-bit PowerPC ELF ABI.
 */
65 stdu r1,-STACKFRAMESIZE(r1) /* r1 -= STACKFRAMESIZE, old r1 stored at the new frame base */
66 std r14,STK_REG(R14)(r1)
67 std r15,STK_REG(R15)(r1)
68 std r16,STK_REG(R16)(r1)
77 * On POWER6 and POWER7 back to back addes take 2 cycles because of
78 * the XER dependency. This means the fastest this loop can go is
79 * 16 cycles per iteration. The scheduling of the loop below has
80 * been shown to hit this on both POWER6 and POWER7.
/* Reload r14-r16 and release the stack frame. */
127 ld r14,STK_REG(R14)(r1)
128 ld r15,STK_REG(R15)(r1)
129 ld r16,STK_REG(R16)(r1)
130 addi r1,r1,STACKFRAMESIZE
134 .Lcsum_tail_doublewords: /* Up to 127 bytes to go */
147 .Lcsum_tail_word: /* Up to 7 bytes to go */
149 beq .Lcsum_tail_halfword
156 .Lcsum_tail_halfword: /* Up to 3 bytes to go */
165 .Lcsum_tail_byte: /* Up to 1 byte to go */
170 sldi r9,r6,8 /* r9 = r6 << 8: pad the byte out to 16 bits */
174 addze r0,r0 /* r0 += carry bit: add in final carry */
175 rldicl r4,r0,32,0 /* r4 = r0 rotated by 32 bits (halves swapped), used to fold the two 32 bit halves together */
/*
 * Each pair below emits two 64-bit values into the "__ex_table" section:
 * the address of the nearest preceding numeric local label and the address
 * of a named handler label.
 * NOTE(review): presumably these are the fault-address/fixup pairs used when
 * an access to src or dst faults - confirm against the enclosing macros.
 */
183 .section __ex_table,"a" /* "a" = allocatable section */
185 .llong 100b,.Lsrc_error_nr /* (label 100 looking backwards, .Lsrc_error_nr) */
191 .section __ex_table,"a" /* "a" = allocatable section */
193 .llong 150b,.Lsrc_error /* (label 150 looking backwards, .Lsrc_error) */
199 .section __ex_table,"a" /* "a" = allocatable section */
201 .llong 200b,.Ldest_error_nr /* (label 200 looking backwards, .Ldest_error_nr) */
207 .section __ex_table,"a" /* "a" = allocatable section */
209 .llong 250b,.Ldest_error /* (label 250 looking backwards, .Ldest_error) */
214 * Computes the checksum of a memory block at src, length len,
215 * and adds in "sum" (32-bit), while copying the block to dst.
216 * If an access exception occurs on src or dst, it stores -EFAULT
217 * to *src_err or *dst_err respectively. The caller must take any action
218 * required in this case (zeroing memory, recalculating partial checksum etc).
220 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
/*
 * Register use visible in this routine: r3 = src, r4 = dst, r5 = len,
 * r6 = sum, r7 = src_err, r8 = dst_err on entry. r0 holds the running sum
 * (initialised from r6; the final carry is added by addze near the end).
 * NOTE(review): "source;" and "srcnr;" prefixing the loads are assembler
 * macros; presumably they attach an exception-table entry to the load that
 * follows - confirm against the macro definitions.
 */
222 _GLOBAL(csum_partial_copy_generic)
223 addic r0,r6,0 /* r0 = sum + 0; adding zero also leaves the carry bit clear */
225 srdi. r6,r5,3 /* r6 = len >> 3; CR0 "eq" means less than 8 bytes */
229 * If only halfword aligned, align to a double word. Since odd
230 * aligned addresses should be rare and they would require more
231 * work to calculate the correct checksum, we ignore that case
232 * and take the potential slowdown of unaligned loads.
234 * If the source and destination are relatively unaligned we only
235 * align the source. This keeps things simple.
237 rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
245 srcnr; lhz r6,0(r3) /* load the halfword at 0(r3), zero-extended, while aligning to a doubleword */
255 * We unroll the loop such that each iteration is 64 bytes with an
256 * entry and exit limb of 64 bytes, meaning a minimum size of
/* The minimum size for the unrolled loop is 128 bytes (64-byte entry limb plus 64-byte exit limb). */
260 beq .Lcopy_tail_doublewords /* len < 128 */
/*
 * Allocate a stack frame and save r14-r16 in it; r14-r16 are used as load
 * targets below and are callee-saved under the 64-bit PowerPC ELF ABI.
 */
266 stdu r1,-STACKFRAMESIZE(r1) /* r1 -= STACKFRAMESIZE, old r1 stored at the new frame base */
267 std r14,STK_REG(R14)(r1)
268 std r15,STK_REG(R15)(r1)
269 std r16,STK_REG(R16)(r1)
/* Doubleword loads from offsets 16..56 off r3 into r10-r12 and r14-r16. */
274 source; ld r10,16(r3)
275 source; ld r11,24(r3)
278 * On POWER6 and POWER7 back to back addes take 2 cycles because of
279 * the XER dependency. This means the fastest this loop can go is
280 * 16 cycles per iteration. The scheduling of the loop below has
281 * been shown to hit this on both POWER6 and POWER7.
286 source; ld r12,32(r3)
287 source; ld r14,40(r3)
290 source; ld r15,48(r3)
291 source; ld r16,56(r3)
/* Second group of the same six loads (offsets 16..56 off r3). */
316 source; ld r10,16(r3)
317 source; ld r11,24(r3)
322 source; ld r12,32(r3)
323 source; ld r14,40(r3)
326 source; ld r15,48(r3)
327 source; ld r16,56(r3)
/* Reload r14-r16 and release the stack frame. */
350 ld r14,STK_REG(R14)(r1)
351 ld r15,STK_REG(R15)(r1)
352 ld r16,STK_REG(R16)(r1)
353 addi r1,r1,STACKFRAMESIZE
357 .Lcopy_tail_doublewords: /* Up to 127 bytes to go */
372 .Lcopy_tail_word: /* Up to 7 bytes to go */
374 beq .Lcopy_tail_halfword
383 .Lcopy_tail_halfword: /* Up to 3 bytes to go */
394 .Lcopy_tail_byte: /* Up to 1 byte to go */
399 sldi r9,r6,8 /* r9 = r6 << 8: pad the byte out to 16 bits */
404 addze r0,r0 /* r0 += carry bit: add in final carry */
405 rldicl r4,r0,32,0 /* r4 = r0 rotated by 32 bits (halves swapped), used to fold the two 32 bit halves together */
/*
 * NOTE(review): two further frame-teardown sequences follow. Presumably they
 * belong to the src and dst fault handlers, which must reload r14-r16 and
 * release the frame when the fault happened while the frame was live -
 * confirm against the handler labels.
 */
411 ld r14,STK_REG(R14)(r1)
412 ld r15,STK_REG(R15)(r1)
413 ld r16,STK_REG(R16)(r1)
414 addi r1,r1,STACKFRAMESIZE
/* Same teardown sequence again: reload r14-r16 and release the frame. */
423 ld r14,STK_REG(R14)(r1)
424 ld r15,STK_REG(R15)(r1)
425 ld r16,STK_REG(R16)(r1)
426 addi r1,r1,STACKFRAMESIZE