arch/xtensa/lib/checksum.S

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              IP/TCP/UDP checksumming routines
   7  *
   8  * Xtensa version:  Copyright (C) 2001 Tensilica, Inc. by Kevin Chea
   9  *                  Optimized by Joe Taylor
  10  *
  11  *              This program is free software; you can redistribute it and/or
  12  *              modify it under the terms of the GNU General Public License
  13  *              as published by the Free Software Foundation; either version
  14  *              2 of the License, or (at your option) any later version.
  15  */
  16
  17 #include <asm/errno.h>
  18 #include <linux/linkage.h>
  19 #include <asm/variant/core.h>
  20
  21 /*
  22  * computes a partial checksum, e.g. for TCP/UDP fragments
  23  */
  24
/*
 * unsigned int csum_partial(const unsigned char *buf, int len,
 *                           unsigned int sum);
 *    a2 = buf
 *    a3 = len
 *    a4 = sum
 *
 * This function is optimized for 2- or 4-byte aligned buffers.  Odd
 * addresses are handled as well, but only by a much slower path.
 */
  34
  35 /* ONES_ADD converts twos-complement math to ones-complement. */
  36 #define ONES_ADD(sum, val)        \
  37         add     sum, sum, val   ; \
  38         bgeu    sum, val, 99f   ; \
  39         addi    sum, sum, 1     ; \
  40 99:                             ;
  41
  42 .text
  43 ENTRY(csum_partial)
  44           /*
  45            * Experiments with Ethernet and SLIP connections show that buf
  46            * is aligned on either a 2-byte or 4-byte boundary.
  47            */
  48         entry   sp, 32
  49         extui   a5, a2, 0, 2
  50         bnez    a5, 8f          /* branch if 2-byte aligned */
  51         /* Fall-through on common case, 4-byte alignment */
  52 1:
  53         srli    a5, a3, 5       /* 32-byte chunks */
  54 #if XCHAL_HAVE_LOOPS
  55         loopgtz a5, 2f
  56 #else
  57         beqz    a5, 2f
  58         slli    a5, a5, 5
  59         add     a5, a5, a2      /* a5 = end of last 32-byte chunk */
  60 .Loop1:
  61 #endif
  62         l32i    a6, a2, 0
  63         l32i    a7, a2, 4
  64         ONES_ADD(a4, a6)
  65         ONES_ADD(a4, a7)
  66         l32i    a6, a2, 8
  67         l32i    a7, a2, 12
  68         ONES_ADD(a4, a6)
  69         ONES_ADD(a4, a7)
  70         l32i    a6, a2, 16
  71         l32i    a7, a2, 20
  72         ONES_ADD(a4, a6)
  73         ONES_ADD(a4, a7)
  74         l32i    a6, a2, 24
  75         l32i    a7, a2, 28
  76         ONES_ADD(a4, a6)
  77         ONES_ADD(a4, a7)
  78         addi    a2, a2, 4*8
  79 #if !XCHAL_HAVE_LOOPS
  80         blt     a2, a5, .Loop1
  81 #endif
  82 2:
  83         extui   a5, a3, 2, 3    /* remaining 4-byte chunks */
  84 #if XCHAL_HAVE_LOOPS
  85         loopgtz a5, 3f
  86 #else
  87         beqz    a5, 3f
  88         slli    a5, a5, 2
  89         add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
  90 .Loop2:
  91 #endif
  92         l32i    a6, a2, 0
  93         ONES_ADD(a4, a6)
  94         addi    a2, a2, 4
  95 #if !XCHAL_HAVE_LOOPS
  96         blt     a2, a5, .Loop2
  97 #endif
  98 3:
  99         _bbci.l a3, 1, 5f       /* remaining 2-byte chunk */
 100         l16ui   a6, a2, 0
 101         ONES_ADD(a4, a6)
 102         addi    a2, a2, 2
 103 5:
 104         _bbci.l a3, 0, 7f       /* remaining 1-byte chunk */
 105 6:      l8ui    a6, a2, 0
 106 #ifdef __XTENSA_EB__
 107         slli    a6, a6, 8       /* load byte into bits 8..15 */
 108 #endif
 109         ONES_ADD(a4, a6)
 110 7:
 111         mov     a2, a4
 112         retw
 113
 114         /* uncommon case, buf is 2-byte aligned */
 115 8:
 116         beqz    a3, 7b          /* branch if len == 0 */
 117         beqi    a3, 1, 6b       /* branch if len == 1 */
 118
 119         extui   a5, a2, 0, 1
 120         bnez    a5, 8f          /* branch if 1-byte aligned */
 121
 122         l16ui   a6, a2, 0       /* common case, len >= 2 */
 123         ONES_ADD(a4, a6)
 124         addi    a2, a2, 2       /* adjust buf */
 125         addi    a3, a3, -2      /* adjust len */
 126         j       1b              /* now buf is 4-byte aligned */
 127
 128         /* case: odd-byte aligned, len > 1
 129          * This case is dog slow, so don't give us an odd address.
 130          * (I don't think this ever happens, but just in case.)
 131          */
 132 8:
 133         srli    a5, a3, 2       /* 4-byte chunks */
 134 #if XCHAL_HAVE_LOOPS
 135         loopgtz a5, 2f
 136 #else
 137         beqz    a5, 2f
 138         slli    a5, a5, 2
 139         add     a5, a5, a2      /* a5 = end of last 4-byte chunk */
 140 .Loop3:
 141 #endif
 142         l8ui    a6, a2, 0       /* bits 24..31 */
 143         l16ui   a7, a2, 1       /* bits  8..23 */
 144         l8ui    a8, a2, 3       /* bits  0.. 8 */
 145 #ifdef  __XTENSA_EB__
 146         slli    a6, a6, 24
 147 #else
 148         slli    a8, a8, 24
 149 #endif
 150         slli    a7, a7, 8
 151         or      a7, a7, a6
 152         or      a7, a7, a8
 153         ONES_ADD(a4, a7)
 154         addi    a2, a2, 4
 155 #if !XCHAL_HAVE_LOOPS
 156         blt     a2, a5, .Loop3
 157 #endif
 158 2:
 159         _bbci.l a3, 1, 3f       /* remaining 2-byte chunk, still odd addr */
 160         l8ui    a6, a2, 0
 161         l8ui    a7, a2, 1
 162 #ifdef  __XTENSA_EB__
 163         slli    a6, a6, 8
 164 #else
 165         slli    a7, a7, 8
 166 #endif
 167         or      a7, a7, a6
 168         ONES_ADD(a4, a7)
 169         addi    a2, a2, 2
 170 3:
 171         j       5b              /* branch to handle the remaining byte */
 172
 173
 174
 175 /*
 176  * Copy from ds while checksumming, otherwise like csum_partial
 177  *
 178  * The macros SRC and DST specify the type of access for the instruction.
 179  * thus we can call a custom exception handler for each access type.
 180  */
 181
 182 #define SRC(y...)                       \
 183         9999: y;                        \
 184         .section __ex_table, "a";       \
 185         .long 9999b, 6001f      ;       \
 186         .previous
 187
 188 #define DST(y...)                       \
 189         9999: y;                        \
 190         .section __ex_table, "a";       \
 191         .long 9999b, 6002f      ;       \
 192         .previous
 193
 194 /*
 195 unsigned int csum_partial_copy_generic (const char *src, char *dst, int len,
 196                                         int sum, int *src_err_ptr, int *dst_err_ptr)
 197         a2  = src
 198         a3  = dst
 199         a4  = len
 200         a5  = sum
 201         a6  = src_err_ptr
 202         a7  = dst_err_ptr
 203         a8  = temp
 204         a9  = temp
 205         a10 = temp
 206         a11 = original len for exception handling
 207         a12 = original dst for exception handling
 208
 209     This function is optimized for 4-byte aligned addresses.  Other
 210     alignments work, but not nearly as efficiently.
 211  */
 212
 213 ENTRY(csum_partial_copy_generic)
 214         entry   sp, 32
 215         mov     a12, a3
 216         mov     a11, a4
 217         or      a10, a2, a3
 218
 219         /* We optimize the following alignment tests for the 4-byte
 220         aligned case.  Two bbsi.l instructions might seem more optimal
 221         (commented out below).  However, both labels 5: and 3: are out
 222         of the imm8 range, so the assembler relaxes them into
 223         equivalent bbci.l, j combinations, which is actually
 224         slower. */
 225
 226         extui   a9, a10, 0, 2
 227         beqz    a9, 1f          /* branch if both are 4-byte aligned */
 228         bbsi.l  a10, 0, 5f      /* branch if one address is odd */
 229         j       3f              /* one address is 2-byte aligned */
 230
 231 /*      _bbsi.l a10, 0, 5f */   /* branch if odd address */
 232 /*      _bbsi.l a10, 1, 3f */   /* branch if 2-byte-aligned address */
 233
 234 1:
 235         /* src and dst are both 4-byte aligned */
 236         srli    a10, a4, 5      /* 32-byte chunks */
 237 #if XCHAL_HAVE_LOOPS
 238         loopgtz a10, 2f
 239 #else
 240         beqz    a10, 2f
 241         slli    a10, a10, 5
 242         add     a10, a10, a2    /* a10 = end of last 32-byte src chunk */
 243 .Loop5:
 244 #endif
 245 SRC(    l32i    a9, a2, 0       )
 246 SRC(    l32i    a8, a2, 4       )
 247 DST(    s32i    a9, a3, 0       )
 248 DST(    s32i    a8, a3, 4       )
 249         ONES_ADD(a5, a9)
 250         ONES_ADD(a5, a8)
 251 SRC(    l32i    a9, a2, 8       )
 252 SRC(    l32i    a8, a2, 12      )
 253 DST(    s32i    a9, a3, 8       )
 254 DST(    s32i    a8, a3, 12      )
 255         ONES_ADD(a5, a9)
 256         ONES_ADD(a5, a8)
 257 SRC(    l32i    a9, a2, 16      )
 258 SRC(    l32i    a8, a2, 20      )
 259 DST(    s32i    a9, a3, 16      )
 260 DST(    s32i    a8, a3, 20      )
 261         ONES_ADD(a5, a9)
 262         ONES_ADD(a5, a8)
 263 SRC(    l32i    a9, a2, 24      )
 264 SRC(    l32i    a8, a2, 28      )
 265 DST(    s32i    a9, a3, 24      )
 266 DST(    s32i    a8, a3, 28      )
 267         ONES_ADD(a5, a9)
 268         ONES_ADD(a5, a8)
 269         addi    a2, a2, 32
 270         addi    a3, a3, 32
 271 #if !XCHAL_HAVE_LOOPS
 272         blt     a2, a10, .Loop5
 273 #endif
 274 2:
 275         extui   a10, a4, 2, 3   /* remaining 4-byte chunks */
 276         extui   a4, a4, 0, 2    /* reset len for general-case, 2-byte chunks */
 277 #if XCHAL_HAVE_LOOPS
 278         loopgtz a10, 3f
 279 #else
 280         beqz    a10, 3f
 281         slli    a10, a10, 2
 282         add     a10, a10, a2    /* a10 = end of last 4-byte src chunk */
 283 .Loop6:
 284 #endif
 285 SRC(    l32i    a9, a2, 0       )
 286 DST(    s32i    a9, a3, 0       )
 287         ONES_ADD(a5, a9)
 288         addi    a2, a2, 4
 289         addi    a3, a3, 4
 290 #if !XCHAL_HAVE_LOOPS
 291         blt     a2, a10, .Loop6
 292 #endif
 293 3:
 294         /*
 295         Control comes to here in two cases: (1) It may fall through
 296         to here from the 4-byte alignment case to process, at most,
 297         one 2-byte chunk.  (2) It branches to here from above if
 298         either src or dst is 2-byte aligned, and we process all bytes
 299         here, except for perhaps a trailing odd byte.  It's
 300         inefficient, so align your addresses to 4-byte boundaries.
 301
 302         a2 = src
 303         a3 = dst
 304         a4 = len
 305         a5 = sum
 306         */
 307         srli    a10, a4, 1      /* 2-byte chunks */
 308 #if XCHAL_HAVE_LOOPS
 309         loopgtz a10, 4f
 310 #else
 311         beqz    a10, 4f
 312         slli    a10, a10, 1
 313         add     a10, a10, a2    /* a10 = end of last 2-byte src chunk */
 314 .Loop7:
 315 #endif
 316 SRC(    l16ui   a9, a2, 0       )
 317 DST(    s16i    a9, a3, 0       )
 318         ONES_ADD(a5, a9)
 319         addi    a2, a2, 2
 320         addi    a3, a3, 2
 321 #if !XCHAL_HAVE_LOOPS
 322         blt     a2, a10, .Loop7
 323 #endif
 324 4:
 325         /* This section processes a possible trailing odd byte. */
 326         _bbci.l a4, 0, 8f       /* 1-byte chunk */
 327 SRC(    l8ui    a9, a2, 0       )
 328 DST(    s8i     a9, a3, 0       )
 329 #ifdef __XTENSA_EB__
 330         slli    a9, a9, 8       /* shift byte to bits 8..15 */
 331 #endif
 332         ONES_ADD(a5, a9)
 333 8:
 334         mov     a2, a5
 335         retw
 336
 337 5:
 338         /* Control branch to here when either src or dst is odd.  We
 339         process all bytes using 8-bit accesses.  Grossly inefficient,
 340         so don't feed us an odd address. */
 341
 342         srli    a10, a4, 1      /* handle in pairs for 16-bit csum */
 343 #if XCHAL_HAVE_LOOPS
 344         loopgtz a10, 6f
 345 #else
 346         beqz    a10, 6f
 347         slli    a10, a10, 1
 348         add     a10, a10, a2    /* a10 = end of last odd-aligned, 2-byte src chunk */
 349 .Loop8:
 350 #endif
 351 SRC(    l8ui    a9, a2, 0       )
 352 SRC(    l8ui    a8, a2, 1       )
 353 DST(    s8i     a9, a3, 0       )
 354 DST(    s8i     a8, a3, 1       )
 355 #ifdef __XTENSA_EB__
 356         slli    a9, a9, 8       /* combine into a single 16-bit value */
 357 #else                           /* for checksum computation */
 358         slli    a8, a8, 8
 359 #endif
 360         or      a9, a9, a8
 361         ONES_ADD(a5, a9)
 362         addi    a2, a2, 2
 363         addi    a3, a3, 2
 364 #if !XCHAL_HAVE_LOOPS
 365         blt     a2, a10, .Loop8
 366 #endif
 367 6:
 368         j       4b              /* process the possible trailing odd byte */
 369
 370
 371 # Exception handler:
 372 .section .fixup, "ax"
 373 /*
 374         a6  = src_err_ptr
 375         a7  = dst_err_ptr
 376         a11 = original len for exception handling
 377         a12 = original dst for exception handling
 378 */
 379
 380 6001:
 381         _movi   a2, -EFAULT
 382         s32i    a2, a6, 0       /* src_err_ptr */
 383
 384         # clear the complete destination - computing the rest
 385         # is too much work
 386         movi    a2, 0
 387 #if XCHAL_HAVE_LOOPS
 388         loopgtz a11, 2f
 389 #else
 390         beqz    a11, 2f
 391         add     a11, a11, a12   /* a11 = ending address */
 392 .Leloop:
 393 #endif
 394         s8i     a2, a12, 0
 395         addi    a12, a12, 1
 396 #if !XCHAL_HAVE_LOOPS
 397         blt     a12, a11, .Leloop
 398 #endif
 399 2:
 400         retw
 401
 402 6002:
 403         movi    a2, -EFAULT
 404         s32i    a2, a7, 0       /* dst_err_ptr */
 405         movi    a2, 0
 406         retw
 407
 408 .previous
 409