arch/sparc/lib/checksum_32.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /* checksum.S: Sparc optimized checksum code.
   3  *
   4  *  Copyright(C) 1995 Linus Torvalds
   5  *  Copyright(C) 1995 Miguel de Icaza
   6  *  Copyright(C) 1996 David S. Miller
   7  *  Copyright(C) 1997 Jakub Jelinek
   8  *
   9  * derived from:
  10  *      Linux/Alpha checksum c-code
  11  *      Linux/ix86 inline checksum assembly
  12  *      RFC1071 Computing the Internet Checksum (esp. Jacobsons m68k code)
  13  *      David Mosberger-Tang for optimized reference c-code
  14  *      BSD4.4 portable checksum routine
  15  */
  16
  17 #include <asm/errno.h>
  18 #include <asm/export.h>
  19
  20 #define CSUM_BIGCHUNK(buf, offset, sum, t0, t1, t2, t3, t4, t5) \
  21         ldd     [buf + offset + 0x00], t0;                      \
  22         ldd     [buf + offset + 0x08], t2;                      \
  23         addxcc  t0, sum, sum;                                   \
  24         addxcc  t1, sum, sum;                                   \
  25         ldd     [buf + offset + 0x10], t4;                      \
  26         addxcc  t2, sum, sum;                                   \
  27         addxcc  t3, sum, sum;                                   \
  28         ldd     [buf + offset + 0x18], t0;                      \
  29         addxcc  t4, sum, sum;                                   \
  30         addxcc  t5, sum, sum;                                   \
  31         addxcc  t0, sum, sum;                                   \
  32         addxcc  t1, sum, sum;
  33
  34 #define CSUM_LASTCHUNK(buf, offset, sum, t0, t1, t2, t3)        \
  35         ldd     [buf - offset - 0x08], t0;                      \
  36         ldd     [buf - offset - 0x00], t2;                      \
  37         addxcc  t0, sum, sum;                                   \
  38         addxcc  t1, sum, sum;                                   \
  39         addxcc  t2, sum, sum;                                   \
  40         addxcc  t3, sum, sum;
  41
  42         /* Do end cruft out of band to get better cache patterns. */
  43 csum_partial_end_cruft:
  44         be      1f                              ! caller asks %o1 & 0x8
  45          andcc  %o1, 4, %g0                     ! nope, check for word remaining
  46         ldd     [%o0], %g2                      ! load two
  47         addcc   %g2, %o2, %o2                   ! add first word to sum
  48         addxcc  %g3, %o2, %o2                   ! add second word as well
  49         add     %o0, 8, %o0                     ! advance buf ptr
  50         addx    %g0, %o2, %o2                   ! add in final carry
  51         andcc   %o1, 4, %g0                     ! check again for word remaining
  52 1:      be      1f                              ! nope, skip this code
  53          andcc  %o1, 3, %o1                     ! check for trailing bytes
  54         ld      [%o0], %g2                      ! load it
  55         addcc   %g2, %o2, %o2                   ! add to sum
  56         add     %o0, 4, %o0                     ! advance buf ptr
  57         addx    %g0, %o2, %o2                   ! add in final carry
  58         andcc   %o1, 3, %g0                     ! check again for trailing bytes
  59 1:      be      1f                              ! no trailing bytes, return
  60          addcc  %o1, -1, %g0                    ! only one byte remains?
  61         bne     2f                              ! at least two bytes more
  62          subcc  %o1, 2, %o1                     ! only two bytes more?
  63         b       4f                              ! only one byte remains
  64          or     %g0, %g0, %o4                   ! clear fake hword value
  65 2:      lduh    [%o0], %o4                      ! get hword
  66         be      6f                              ! jmp if only hword remains
  67          add    %o0, 2, %o0                     ! advance buf ptr either way
  68         sll     %o4, 16, %o4                    ! create upper hword
  69 4:      ldub    [%o0], %o5                      ! get final byte
  70         sll     %o5, 8, %o5                     ! put into place
  71         or      %o5, %o4, %o4                   ! coalese with hword (if any)
  72 6:      addcc   %o4, %o2, %o2                   ! add to sum
  73 1:      retl                                    ! get outta here
  74          addx   %g0, %o2, %o0                   ! add final carry into retval
  75
  76         /* Also do alignment out of band to get better cache patterns. */
  77 csum_partial_fix_alignment:
  78         cmp     %o1, 6
  79         bl      cpte - 0x4
  80          andcc  %o0, 0x2, %g0
  81         be      1f
  82          andcc  %o0, 0x4, %g0
  83         lduh    [%o0 + 0x00], %g2
  84         sub     %o1, 2, %o1
  85         add     %o0, 2, %o0
  86         sll     %g2, 16, %g2
  87         addcc   %g2, %o2, %o2
  88         srl     %o2, 16, %g3
  89         addx    %g0, %g3, %g2
  90         sll     %o2, 16, %o2
  91         sll     %g2, 16, %g3
  92         srl     %o2, 16, %o2
  93         andcc   %o0, 0x4, %g0
  94         or      %g3, %o2, %o2
  95 1:      be      cpa
  96          andcc  %o1, 0xffffff80, %o3
  97         ld      [%o0 + 0x00], %g2
  98         sub     %o1, 4, %o1
  99         addcc   %g2, %o2, %o2
 100         add     %o0, 4, %o0
 101         addx    %g0, %o2, %o2
 102         b       cpa
 103          andcc  %o1, 0xffffff80, %o3
 104
 105         /* The common case is to get called with a nicely aligned
 106          * buffer of size 0x20.  Follow the code path for that case.
 107          */
 108         .globl  csum_partial
 109         EXPORT_SYMBOL(csum_partial)
 110 csum_partial:                   /* %o0=buf, %o1=len, %o2=sum */
 111         andcc   %o0, 0x7, %g0                           ! alignment problems?
 112         bne     csum_partial_fix_alignment              ! yep, handle it
 113          sethi  %hi(cpte - 8), %g7                      ! prepare table jmp ptr
 114         andcc   %o1, 0xffffff80, %o3                    ! num loop iterations
 115 cpa:    be      3f                                      ! none to do
 116          andcc  %o1, 0x70, %g1                          ! clears carry flag too
 117 5:      CSUM_BIGCHUNK(%o0, 0x00, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
 118         CSUM_BIGCHUNK(%o0, 0x20, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
 119         CSUM_BIGCHUNK(%o0, 0x40, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
 120         CSUM_BIGCHUNK(%o0, 0x60, %o2, %o4, %o5, %g2, %g3, %g4, %g5)
 121         addx    %g0, %o2, %o2                           ! sink in final carry
 122         subcc   %o3, 128, %o3                           ! detract from loop iters
 123         bne     5b                                      ! more to do
 124          add    %o0, 128, %o0                           ! advance buf ptr
 125         andcc   %o1, 0x70, %g1                          ! clears carry flag too
 126 3:      be      cpte                                    ! nope
 127          andcc  %o1, 0xf, %g0                           ! anything left at all?
 128         srl     %g1, 1, %o4                             ! compute offset
 129         sub     %g7, %g1, %g7                           ! adjust jmp ptr
 130         sub     %g7, %o4, %g7                           ! final jmp ptr adjust
 131         jmp     %g7 + %lo(cpte - 8)                     ! enter the table
 132          add    %o0, %g1, %o0                           ! advance buf ptr
 133 cptbl:  CSUM_LASTCHUNK(%o0, 0x68, %o2, %g2, %g3, %g4, %g5)
 134         CSUM_LASTCHUNK(%o0, 0x58, %o2, %g2, %g3, %g4, %g5)
 135         CSUM_LASTCHUNK(%o0, 0x48, %o2, %g2, %g3, %g4, %g5)
 136         CSUM_LASTCHUNK(%o0, 0x38, %o2, %g2, %g3, %g4, %g5)
 137         CSUM_LASTCHUNK(%o0, 0x28, %o2, %g2, %g3, %g4, %g5)
 138         CSUM_LASTCHUNK(%o0, 0x18, %o2, %g2, %g3, %g4, %g5)
 139         CSUM_LASTCHUNK(%o0, 0x08, %o2, %g2, %g3, %g4, %g5)
 140         addx    %g0, %o2, %o2                           ! fetch final carry
 141         andcc   %o1, 0xf, %g0                           ! anything left at all?
 142 cpte:   bne     csum_partial_end_cruft                  ! yep, handle it
 143          andcc  %o1, 8, %g0                             ! check how much
 144 cpout:  retl                                            ! get outta here
 145          mov    %o2, %o0                                ! return computed csum
 146
 147 /* Work around cpp -rob */
 148 #define ALLOC #alloc
 149 #define EXECINSTR #execinstr
 150 #define EX(x,y)                                 \
 151 98:     x,y;                                    \
 152         .section __ex_table,ALLOC;              \
 153         .align  4;                              \
 154         .word   98b, cc_fault;                   \
 155         .text;                                  \
 156         .align  4
 157
 158 #define EXT(start,end)                          \
 159         .section __ex_table,ALLOC;              \
 160         .align  4;                              \
 161         .word   start, 0, end, cc_fault;         \
 162         .text;                                  \
 163         .align  4
 164
 165         /* This aligned version executes typically in 8.5 superscalar cycles, this
 166          * is the best I can do.  I say 8.5 because the final add will pair with
 167          * the next ldd in the main unrolled loop.  Thus the pipe is always full.
 168          * If you change these macros (including order of instructions),
 169          * please check the fixup code below as well.
 170          */
 171 #define CSUMCOPY_BIGCHUNK_ALIGNED(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
 172         ldd     [src + off + 0x00], t0;                                                 \
 173         ldd     [src + off + 0x08], t2;                                                 \
 174         addxcc  t0, sum, sum;                                                           \
 175         ldd     [src + off + 0x10], t4;                                                 \
 176         addxcc  t1, sum, sum;                                                           \
 177         ldd     [src + off + 0x18], t6;                                                 \
 178         addxcc  t2, sum, sum;                                                           \
 179         std     t0, [dst + off + 0x00];                                                 \
 180         addxcc  t3, sum, sum;                                                           \
 181         std     t2, [dst + off + 0x08];                                                 \
 182         addxcc  t4, sum, sum;                                                           \
 183         std     t4, [dst + off + 0x10];                                                 \
 184         addxcc  t5, sum, sum;                                                           \
 185         std     t6, [dst + off + 0x18];                                                 \
 186         addxcc  t6, sum, sum;                                                           \
 187         addxcc  t7, sum, sum;
 188
 189         /* 12 superscalar cycles seems to be the limit for this case,
 190          * because of this we thus do all the ldd's together to get
 191          * Viking MXCC into streaming mode.  Ho hum...
 192          */
 193 #define CSUMCOPY_BIGCHUNK(src, dst, sum, off, t0, t1, t2, t3, t4, t5, t6, t7)   \
 194         ldd     [src + off + 0x00], t0;                                         \
 195         ldd     [src + off + 0x08], t2;                                         \
 196         ldd     [src + off + 0x10], t4;                                         \
 197         ldd     [src + off + 0x18], t6;                                         \
 198         st      t0, [dst + off + 0x00];                                         \
 199         addxcc  t0, sum, sum;                                                   \
 200         st      t1, [dst + off + 0x04];                                         \
 201         addxcc  t1, sum, sum;                                                   \
 202         st      t2, [dst + off + 0x08];                                         \
 203         addxcc  t2, sum, sum;                                                   \
 204         st      t3, [dst + off + 0x0c];                                         \
 205         addxcc  t3, sum, sum;                                                   \
 206         st      t4, [dst + off + 0x10];                                         \
 207         addxcc  t4, sum, sum;                                                   \
 208         st      t5, [dst + off + 0x14];                                         \
 209         addxcc  t5, sum, sum;                                                   \
 210         st      t6, [dst + off + 0x18];                                         \
 211         addxcc  t6, sum, sum;                                                   \
 212         st      t7, [dst + off + 0x1c];                                         \
 213         addxcc  t7, sum, sum;
 214
 215         /* Yuck, 6 superscalar cycles... */
 216 #define CSUMCOPY_LASTCHUNK(src, dst, sum, off, t0, t1, t2, t3)  \
 217         ldd     [src - off - 0x08], t0;                         \
 218         ldd     [src - off - 0x00], t2;                         \
 219         addxcc  t0, sum, sum;                                   \
 220         st      t0, [dst - off - 0x08];                         \
 221         addxcc  t1, sum, sum;                                   \
 222         st      t1, [dst - off - 0x04];                         \
 223         addxcc  t2, sum, sum;                                   \
 224         st      t2, [dst - off - 0x00];                         \
 225         addxcc  t3, sum, sum;                                   \
 226         st      t3, [dst - off + 0x04];
 227
 228         /* Handle the end cruft code out of band for better cache patterns. */
 229 cc_end_cruft:
 230         be      1f
 231          andcc  %o3, 4, %g0
 232         EX(ldd  [%o0 + 0x00], %g2)
 233         add     %o1, 8, %o1
 234         addcc   %g2, %g7, %g7
 235         add     %o0, 8, %o0
 236         addxcc  %g3, %g7, %g7
 237         EX(st   %g2, [%o1 - 0x08])
 238         addx    %g0, %g7, %g7
 239         andcc   %o3, 4, %g0
 240         EX(st   %g3, [%o1 - 0x04])
 241 1:      be      1f
 242          andcc  %o3, 3, %o3
 243         EX(ld   [%o0 + 0x00], %g2)
 244         add     %o1, 4, %o1
 245         addcc   %g2, %g7, %g7
 246         EX(st   %g2, [%o1 - 0x04])
 247         addx    %g0, %g7, %g7
 248         andcc   %o3, 3, %g0
 249         add     %o0, 4, %o0
 250 1:      be      1f
 251          addcc  %o3, -1, %g0
 252         bne     2f
 253          subcc  %o3, 2, %o3
 254         b       4f
 255          or     %g0, %g0, %o4
 256 2:      EX(lduh [%o0 + 0x00], %o4)
 257         add     %o0, 2, %o0
 258         EX(sth  %o4, [%o1 + 0x00])
 259         be      6f
 260          add    %o1, 2, %o1
 261         sll     %o4, 16, %o4
 262 4:      EX(ldub [%o0 + 0x00], %o5)
 263         EX(stb  %o5, [%o1 + 0x00])
 264         sll     %o5, 8, %o5
 265         or      %o5, %o4, %o4
 266 6:      addcc   %o4, %g7, %g7
 267 1:      retl
 268          addx   %g0, %g7, %o0
 269
 270         /* Also, handle the alignment code out of band. */
 271 cc_dword_align:
 272         cmp     %g1, 16
 273         bge     1f
 274          srl    %g1, 1, %o3
 275 2:      cmp     %o3, 0
 276         be,a    ccte
 277          andcc  %g1, 0xf, %o3
 278         andcc   %o3, %o0, %g0   ! Check %o0 only (%o1 has the same last 2 bits)
 279         be,a    2b
 280          srl    %o3, 1, %o3
 281 1:      andcc   %o0, 0x1, %g0
 282         bne     ccslow
 283          andcc  %o0, 0x2, %g0
 284         be      1f
 285          andcc  %o0, 0x4, %g0
 286         EX(lduh [%o0 + 0x00], %g4)
 287         sub     %g1, 2, %g1
 288         EX(sth  %g4, [%o1 + 0x00])
 289         add     %o0, 2, %o0
 290         sll     %g4, 16, %g4
 291         addcc   %g4, %g7, %g7
 292         add     %o1, 2, %o1
 293         srl     %g7, 16, %g3
 294         addx    %g0, %g3, %g4
 295         sll     %g7, 16, %g7
 296         sll     %g4, 16, %g3
 297         srl     %g7, 16, %g7
 298         andcc   %o0, 0x4, %g0
 299         or      %g3, %g7, %g7
 300 1:      be      3f
 301          andcc  %g1, 0xffffff80, %g0
 302         EX(ld   [%o0 + 0x00], %g4)
 303         sub     %g1, 4, %g1
 304         EX(st   %g4, [%o1 + 0x00])
 305         add     %o0, 4, %o0
 306         addcc   %g4, %g7, %g7
 307         add     %o1, 4, %o1
 308         addx    %g0, %g7, %g7
 309         b       3f
 310          andcc  %g1, 0xffffff80, %g0
 311
 312         /* Sun, you just can't beat me, you just can't.  Stop trying,
 313          * give up.  I'm serious, I am going to kick the living shit
 314          * out of you, game over, lights out.
 315          */
 316         .align  8
 317         .globl  __csum_partial_copy_sparc_generic
 318         EXPORT_SYMBOL(__csum_partial_copy_sparc_generic)
 319 __csum_partial_copy_sparc_generic:
 320                                         /* %o0=src, %o1=dest, %g1=len, %g7=sum */
 321         xor     %o0, %o1, %o4           ! get changing bits
 322         andcc   %o4, 3, %g0             ! check for mismatched alignment
 323         bne     ccslow                  ! better this than unaligned/fixups
 324          andcc  %o0, 7, %g0             ! need to align things?
 325         bne     cc_dword_align          ! yes, we check for short lengths there
 326          andcc  %g1, 0xffffff80, %g0    ! can we use unrolled loop?
 327 3:      be      3f                      ! nope, less than one loop remains
 328          andcc  %o1, 4, %g0             ! dest aligned on 4 or 8 byte boundary?
 329         be      ccdbl + 4               ! 8 byte aligned, kick ass
 330 5:      CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 331         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 332         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 333         CSUMCOPY_BIGCHUNK(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 334 10:     EXT(5b, 10b)                    ! note for exception handling
 335         sub     %g1, 128, %g1           ! detract from length
 336         addx    %g0, %g7, %g7           ! add in last carry bit
 337         andcc   %g1, 0xffffff80, %g0    ! more to csum?
 338         add     %o0, 128, %o0           ! advance src ptr
 339         bne     5b                      ! we did not go negative, continue looping
 340          add    %o1, 128, %o1           ! advance dest ptr
 341 3:      andcc   %g1, 0x70, %o2          ! can use table?
 342 ccmerge:be      ccte                    ! nope, go and check for end cruft
 343          andcc  %g1, 0xf, %o3           ! get low bits of length (clears carry btw)
 344         srl     %o2, 1, %o4             ! begin negative offset computation
 345         sethi   %hi(12f), %o5           ! set up table ptr end
 346         add     %o0, %o2, %o0           ! advance src ptr
 347         sub     %o5, %o4, %o5           ! continue table calculation
 348         sll     %o2, 1, %g2             ! constant multiplies are fun...
 349         sub     %o5, %g2, %o5           ! some more adjustments
 350         jmp     %o5 + %lo(12f)          ! jump into it, duff style, wheee...
 351          add    %o1, %o2, %o1           ! advance dest ptr (carry is clear btw)
 352 cctbl:  CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x68,%g2,%g3,%g4,%g5)
 353         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x58,%g2,%g3,%g4,%g5)
 354         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x48,%g2,%g3,%g4,%g5)
 355         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x38,%g2,%g3,%g4,%g5)
 356         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x28,%g2,%g3,%g4,%g5)
 357         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x18,%g2,%g3,%g4,%g5)
 358         CSUMCOPY_LASTCHUNK(%o0,%o1,%g7,0x08,%g2,%g3,%g4,%g5)
 359 12:     EXT(cctbl, 12b)                 ! note for exception table handling
 360         addx    %g0, %g7, %g7
 361         andcc   %o3, 0xf, %g0           ! check for low bits set
 362 ccte:   bne     cc_end_cruft            ! something left, handle it out of band
 363          andcc  %o3, 8, %g0             ! begin checks for that code
 364         retl                            ! return
 365          mov    %g7, %o0                ! give em the computed checksum
 366 ccdbl:  CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x00,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 367         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x20,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 368         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x40,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 369         CSUMCOPY_BIGCHUNK_ALIGNED(%o0,%o1,%g7,0x60,%o4,%o5,%g2,%g3,%g4,%g5,%o2,%o3)
 370 11:     EXT(ccdbl, 11b)                 ! note for exception table handling
 371         sub     %g1, 128, %g1           ! detract from length
 372         addx    %g0, %g7, %g7           ! add in last carry bit
 373         andcc   %g1, 0xffffff80, %g0    ! more to csum?
 374         add     %o0, 128, %o0           ! advance src ptr
 375         bne     ccdbl                   ! we did not go negative, continue looping
 376          add    %o1, 128, %o1           ! advance dest ptr
 377         b       ccmerge                 ! finish it off, above
 378          andcc  %g1, 0x70, %o2          ! can use table? (clears carry btw)
 379
 380 ccslow: cmp     %g1, 0
 381         mov     0, %g5
 382         bleu    4f
 383          andcc  %o0, 1, %o5
 384         be,a    1f
 385          srl    %g1, 1, %g4
 386         sub     %g1, 1, %g1
 387         EX(ldub [%o0], %g5)
 388         add     %o0, 1, %o0
 389         EX(stb  %g5, [%o1])
 390         srl     %g1, 1, %g4
 391         add     %o1, 1, %o1
 392 1:      cmp     %g4, 0
 393         be,a    3f
 394          andcc  %g1, 1, %g0
 395         andcc   %o0, 2, %g0
 396         be,a    1f
 397          srl    %g4, 1, %g4
 398         EX(lduh [%o0], %o4)
 399         sub     %g1, 2, %g1
 400         srl     %o4, 8, %g2
 401         sub     %g4, 1, %g4
 402         EX(stb  %g2, [%o1])
 403         add     %o4, %g5, %g5
 404         EX(stb  %o4, [%o1 + 1])
 405         add     %o0, 2, %o0
 406         srl     %g4, 1, %g4
 407         add     %o1, 2, %o1
 408 1:      cmp     %g4, 0
 409         be,a    2f
 410          andcc  %g1, 2, %g0
 411         EX(ld   [%o0], %o4)
 412 5:      srl     %o4, 24, %g2
 413         srl     %o4, 16, %g3
 414         EX(stb  %g2, [%o1])
 415         srl     %o4, 8, %g2
 416         EX(stb  %g3, [%o1 + 1])
 417         add     %o0, 4, %o0
 418         EX(stb  %g2, [%o1 + 2])
 419         addcc   %o4, %g5, %g5
 420         EX(stb  %o4, [%o1 + 3])
 421         addx    %g5, %g0, %g5   ! I am now to lazy to optimize this (question it
 422         add     %o1, 4, %o1     ! is worthy). Maybe some day - with the sll/srl
 423         subcc   %g4, 1, %g4     ! tricks
 424         bne,a   5b
 425          EX(ld  [%o0], %o4)
 426         sll     %g5, 16, %g2
 427         srl     %g5, 16, %g5
 428         srl     %g2, 16, %g2
 429         andcc   %g1, 2, %g0
 430         add     %g2, %g5, %g5
 431 2:      be,a    3f
 432          andcc  %g1, 1, %g0
 433         EX(lduh [%o0], %o4)
 434         andcc   %g1, 1, %g0
 435         srl     %o4, 8, %g2
 436         add     %o0, 2, %o0
 437         EX(stb  %g2, [%o1])
 438         add     %g5, %o4, %g5
 439         EX(stb  %o4, [%o1 + 1])
 440         add     %o1, 2, %o1
 441 3:      be,a    1f
 442          sll    %g5, 16, %o4
 443         EX(ldub [%o0], %g2)
 444         sll     %g2, 8, %o4
 445         EX(stb  %g2, [%o1])
 446         add     %g5, %o4, %g5
 447         sll     %g5, 16, %o4
 448 1:      addcc   %o4, %g5, %g5
 449         srl     %g5, 16, %o4
 450         addx    %g0, %o4, %g5
 451         orcc    %o5, %g0, %g0
 452         be      4f
 453          srl    %g5, 8, %o4
 454         and     %g5, 0xff, %g2
 455         and     %o4, 0xff, %o4
 456         sll     %g2, 8, %g2
 457         or      %g2, %o4, %g5
 458 4:      addcc   %g7, %g5, %g7
 459         retl
 460          addx   %g0, %g7, %o0
 461
 462 /* We do these strange calculations for the csum_*_from_user case only, ie.
 463  * we only bother with faults on loads... */
 464
 465 cc_fault:
 466         ret
 467          clr    %o0