sys/arch/mvme68k/stand/sboot/oc_cksum.s

   1 |       $NetBSD: oc_cksum.s,v 1.1.1.1 1995/07/25 23:12:31 chuck Exp $
   2
   3 | Copyright (c) 1988 Regents of the University of California.
   4 | All rights reserved.
   5 |
   6 | Redistribution and use in source and binary forms, with or without
   7 | modification, are permitted provided that the following conditions
   8 | are met:
   9 | 1. Redistributions of source code must retain the above copyright
  10 |    notice, this list of conditions and the following disclaimer.
  11 | 2. Redistributions in binary form must reproduce the above copyright
  12 |    notice, this list of conditions and the following disclaimer in the
  13 |    documentation and/or other materials provided with the distribution.
  14 | 3. All advertising materials mentioning features or use of this software
  15 |    must display the following acknowledgement:
  16 |       This product includes software developed by the University of
  17 |       California, Berkeley and its contributors.
  18 | 4. Neither the name of the University nor the names of its contributors
  19 |    may be used to endorse or promote products derived from this software
  20 |    without specific prior written permission.
  21 |
  22 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  23 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25 | ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  26 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  27 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  28 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  29 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  31 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32 | SUCH DAMAGE.
  33 |
  34 |       @(#)oc_cksum.s  7.2 (Berkeley) 11/3/90
  35 |
  36 |
   37 | oc_cksum: one's complement 16 bit checksum for MC68020.
  38 |
  39 | oc_cksum (buffer, count, strtval)
  40 |
  41 | Do a 16 bit one's complement sum of 'count' bytes from 'buffer'.
  42 | 'strtval' is the starting value of the sum (usually zero).
  43 |
  44 | It simplifies life in in_cksum if strtval can be >= 2^16.
  45 | This routine will work as long as strtval is < 2^31.
  46 |
  47 | Performance
  48 | -----------
  49 | This routine is intended for MC 68020s but should also work
  50 | for 68030s.  It (deliberately) doesn't worry about the alignment
  51 | of the buffer so will only work on a 68010 if the buffer is
  52 | aligned on an even address.  (Also, a routine written to use
  53 | 68010 "loop mode" would almost certainly be faster than this
  54 | code on a 68010).
  55 |
  56 | We don't worry about alignment because this routine is frequently
  57 | called with small counts: 20 bytes for IP header checksums and 40
  58 | bytes for TCP ack checksums.  For these small counts, testing for
  59 | bad alignment adds ~10% to the per-call cost.  Since, by the nature
  60 | of the kernel's allocator, the data we're called with is almost
  61 | always longword aligned, there is no benefit to this added cost
  62 | and we're better off letting the loop take a big performance hit
  63 | in the rare cases where we're handed an unaligned buffer.
  64 |
  65 | Loop unrolling constants of 2, 4, 8, 16, 32 and 64 times were
  66 | tested on random data on four different types of processors (see
  67 | list below -- 64 was the largest unrolling because anything more
  68 | overflows the 68020 Icache).  On all the processors, the
  69 | throughput asymptote was located between 8 and 16 (closer to 8).
  70 | However, 16 was substantially better than 8 for small counts.
  71 | (It's clear why this happens for a count of 40: unroll-8 pays a
  72 | loop branch cost and unroll-16 doesn't.  But the tests also showed
  73 | that 16 was better than 8 for a count of 20.  It's not obvious to
  74 | me why.)  So, since 16 was good for both large and small counts,
  75 | the loop below is unrolled 16 times.
  76 |
  77 | The processors tested and their average time to checksum 1024 bytes
  78 | of random data were:
  79 |       Sun 3/50 (15MHz)        190 us/KB
  80 |       Sun 3/180 (16.6MHz)     175 us/KB
  81 |       Sun 3/60 (20MHz)        134 us/KB
  82 |       Sun 3/280 (25MHz)        95 us/KB
  83 |
  84 | The cost of calling this routine was typically 10% of the per-
  85 | kilobyte cost.  E.g., checksumming zero bytes on a 3/60 cost 9us
  86 | and each additional byte cost 125ns.  With the high fixed cost,
  87 | it would clearly be a gain to "inline" this routine -- the
  88 | subroutine call adds 400% overhead to an IP header checksum.
  89 | However, in absolute terms, inlining would only gain 10us per
  90 | packet -- a 1% effect for a 1ms ethernet packet.  This is not
  91 | enough gain to be worth the effort.
  92
  93 #include <m68k/asm.h>
  94
  95         .text
  96         .even
  97
   98 ENTRY_NOPROFILE(oc_cksum)
              | Sum the buffer as 16 bit one's complement words and return
              | the folded 16 bit result in d0.
              |
              | Stack arguments (read just below):
              |   sp@(4)  = buffer pointer
              |   sp@(8)  = byte count
              |   sp@(12) = starting value of the sum
              |
              | Register use:
              |   a0 = read pointer, advanced one longword per load
              |   d0 = 32 bit running sum, folded to 16 bits before return
              |   d1 = byte count, then the 64 byte chunk counter for dbra,
              |        then scratch for the final fold
              |   d2 = scratch; its entry value is pushed here and popped
              |        again just before rts
   99         movl    %sp@(4),%a0     | get buffer ptr
  100         movl    %sp@(8),%d1     | get byte count
  101         movl    %sp@(12),%d0    | get starting value
  102         movl    %d2,%sp@-       | free a reg
  103
  104         | test for possible 1, 2 or 3 bytes of excess at end
  105         | of buffer.  The usual case is no excess (the usual
  106         | case is header checksums) so we give that the faster
  107         | 'not taken' leg of the compare.  (We do the excess
  108         | first because we're about to trash the low order
  109         | bits of the count in d1.)
  110
  111         btst    #0,%d1          | count odd?
  112         jne     L5              | if one or three bytes excess
  113         btst    #1,%d1          | count even: is bit 1 set?
  114         jne     L7              | if two bytes excess
  115 L1:
              | Any excess bytes have been added to d0 by now.  Split the
              | count into whole 64 byte chunks (d1) and the bytes of whole
              | longwords left over (d2 = count & 0x3c: a multiple of 4
              | between 0 and 60).  The low two bits of the count are
              | dropped by the mask and the shift.
  116         movl    %d1,%d2         | d2 = copy of byte count
  117         lsrl    #6,%d1          | make cnt into # of 64 byte chunks
  118         andl    #0x3c,%d2       | then find fractions of a chunk
  119         negl    %d2             | d2 = -(leftover bytes): offset back from L3
  120         andb    #0xf,%ccr               | clear X
              | Enter the unrolled loop part way through, at L3 minus the
              | leftover byte count, so the first pass sums only the
              | leftover longwords (none at all when d2 is 0, in which
              | case we land directly on the dbra at L3).  This relies on
              | every movl/addxl pair below occupying 4 bytes of code,
              | the same as the longword of data it consumes: do not
              | insert or resize instructions between L2 and L3.
              | NOTE(review): the "-2" in the displacement presumably
              | corrects for the PC base being 2 bytes past the start of
              | the jmp -- confirm against the 68020 addressing docs.
  121         jmp     %pc@(L3-.-2:b,%d2)
  122 L2:
              | 16 copies of: load the next longword, then add it to d0
              | together with the carry (X) left by the previous add.
              | Nothing between the "clear X" above and the final fold
              | may disturb X, or the carry chain is broken.
  123         movl    %a0@+,%d2       | d2 = next longword, a0 += 4
  124         addxl   %d2,%d0         | d0 += d2 + X; carry out goes to X
  125         movl    %a0@+,%d2
  126         addxl   %d2,%d0
  127         movl    %a0@+,%d2
  128         addxl   %d2,%d0
  129         movl    %a0@+,%d2
  130         addxl   %d2,%d0
  131         movl    %a0@+,%d2
  132         addxl   %d2,%d0
  133         movl    %a0@+,%d2
  134         addxl   %d2,%d0
  135         movl    %a0@+,%d2
  136         addxl   %d2,%d0
  137         movl    %a0@+,%d2
  138         addxl   %d2,%d0
  139         movl    %a0@+,%d2
  140         addxl   %d2,%d0
  141         movl    %a0@+,%d2
  142         addxl   %d2,%d0
  143         movl    %a0@+,%d2
  144         addxl   %d2,%d0
  145         movl    %a0@+,%d2
  146         addxl   %d2,%d0
  147         movl    %a0@+,%d2
  148         addxl   %d2,%d0
  149         movl    %a0@+,%d2
  150         addxl   %d2,%d0
  151         movl    %a0@+,%d2
  152         addxl   %d2,%d0
  153         movl    %a0@+,%d2
  154         addxl   %d2,%d0
  155 L3:
              | One full pass through L2 for each whole 64 byte chunk.
              | NOTE(review): dbra decrements only the low word of d1, so
              | this appears to assume count >> 6 fits in 16 bits (count
              | below 4MB) -- confirm callers never exceed that.
  156         dbra    %d1,L2          | (NB- dbra doesn't affect X)
  157
  158         movl    %d0,%d1         | fold 32 bit sum to 16 bits
  159         swap    %d1             | (NB- swap doesn't affect X)
  160         addxw   %d1,%d0         | low word += high word + pending X
  161         jcc     L4              | no carry out of the 16 bit add
  162         addw    #1,%d0          | end-around carry
  163 L4:
  164         andl    #0xffff,%d0     | result is the low 16 bits only
  165         movl    %sp@+,%d2       | restore the d2 saved on entry
  166         rts
  167
              | The excess handlers below run before L1, so a0 still
              | points at the start of the buffer and d1 still holds the
              | full byte count: a0@(-k,d1:l) addresses the byte k from
              | the end of the buffer.  They use plain addl, so any carry
              | out is not kept (X is cleared at L1 anyway); the file
              | header bounds strtval below 2^31, which leaves room.
  168 L5:     | deal with 1 or 3 excess bytes at the end of the buffer.
  169         btst    #1,%d1
  170         jeq     L6              | if 1 excess
  171
  172         | 3 bytes excess
  173         clrl    %d2             | so the word load below is zero extended
  174         movw    %a0@(-3,%d1:l),%d2      | add in last full word then drop
  175         addl    %d2,%d0         |  through to pick up last byte
  176
  177 L6:     | 1 byte excess
  178         clrl    %d2             | so the byte load below is zero extended
  179         movb    %a0@(-1,%d1:l),%d2      | d2 = last byte of the buffer
  180         lsll    #8,%d2          | make it the high byte of a zero padded word
  181         addl    %d2,%d0
  182         jra     L1              | go sum the whole longwords
  183
  184 L7:     | 2 bytes excess
  185         clrl    %d2             | so the word load below is zero extended
  186         movw    %a0@(-2,%d1:l),%d2      | d2 = last word of the buffer
  187         addl    %d2,%d0
  188         jra     L1              | go sum the whole longwords