usr/src/lib/libc/sparc/gen/ascii_strcasecmp.s

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  24  */
  25
  26 /*
  27  * The ascii_strcasecmp() function is a case insensitive versions of strcmp().
  28  * It assumes the ASCII character set and ignores differences in case
  29  * when comparing lower and upper case characters. In other words, it
  30  * behaves as if both strings had been converted to lower case using
  31  * tolower() in the "C" locale on each byte, and the results had then
  32  * been compared using strcmp().
  33  *
  34  * The assembly code below is an optimized version of the following C
  35  * reference:
  36  *
  37  * static const char charmap[] = {
  38  *      '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  39  *      '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  40  *      '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  41  *      '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  42  *      '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  43  *      '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  44  *      '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  45  *      '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  46  *      '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  47  *      '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  48  *      '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  49  *      '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  50  *      '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  51  *      '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  52  *      '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  53  *      '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  54  *      '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  55  *      '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  56  *      '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  57  *      '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  58  *      '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  59  *      '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  60  *      '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  61  *      '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  62  *      '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  63  *      '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  64  *      '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  65  *      '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  66  *      '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  67  *      '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  68  *      '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  69  *      '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
  70  * };
  71  *
  72  * int
  73  * ascii_strcasecmp(const char *s1, const char *s2)
  74  * {
  75  *      const unsigned char     *cm = (const unsigned char *)charmap;
  76  *      const unsigned char     *us1 = (const unsigned char *)s1;
  77  *      const unsigned char     *us2 = (const unsigned char *)s2;
  78  *
  79  *      while (cm[*us1] == cm[*us2++])
  80  *              if (*us1++ == '\0')
  81  *                      return (0);
  82  *      return (cm[*us1] - cm[*(us2 - 1)]);
  83  * }
  84  *
  85  * The following algorithm, from a 1987 news posting by Alan Mycroft, is
  86  * used for finding null bytes in a word:
  87  *
  88  * #define has_null(word) ((word - 0x01010101) & (~word & 0x80808080))
  89  *
  90  * The following algorithm is used for a wordwise tolower() operation:
  91  *
  92  * unsigned int
  93  * parallel_tolower (unsigned int x)
  94  * {
  95  *      unsigned int p;
  96  *      unsigned int q;
  97  *
  98  *      unsigned int m1 = 0x80808080;
  99  *      unsigned int m2 = 0x3f3f3f3f;
 100  *      unsigned int m3 = 0x25252525;
 101  *
 102  *      q = x & ~m1;// newb = byte & 0x7F
 103  *      p = q + m2; // newb > 0x5A --> MSB set
 104  *      q = q + m3; // newb < 0x41 --> MSB clear
 105  *      p = p & ~q; // newb > 0x40 && newb < 0x5B --> MSB set
 106  *      q = m1 & ~x;//  byte < 0x80 --> 0x80
 107  *      q = p & q;  // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x80,else 0
 108  *      q = q >> 2; // newb > 0x40 && newb < 0x5B && byte < 0x80 -> 0x20,else 0
 109  *      return (x + q); // translate uppercase characters to lowercase
 110  * }
 111  *
 112  * Both algorithms have been tested exhaustively for all possible 2^32 inputs.
 113  */
 114
 115 #include <sys/asm_linkage.h>
 116
 117         ! The first part of this algorithm walks through the beginning of
 118         ! both strings a byte at a time until the source ptr is  aligned to
 119         ! a word boundary. During these steps, the bytes are translated to
 120         ! lower-case if they are upper-case, and are checked against
 121         ! the source string.
 122
 123         ENTRY(ascii_strcasecmp)
 124
 125         .align 32
 126
 127         save    %sp, -SA(WINDOWSIZE), %sp
 128         subcc   %i0, %i1, %i2           ! s1 == s2 ?
 129         bz      .stringsequal           ! yup, done, strings equal
 130         andcc   %i0, 3, %i3             ! s1 word-aligned ?
 131         bz      .s1aligned1             ! yup
 132         sethi   %hi(0x80808080), %i4    ! start loading Mycroft's magic1
 133
 134         ldub    [%i1 + %i2], %i0        ! s1[0]
 135         ldub    [%i1], %g1              ! s2[0]
 136         sub     %i0, 'A', %l0           ! transform for faster uppercase check
 137         sub     %g1, 'A', %l1           ! transform for faster uppercase check
 138         cmp     %l0, ('Z' - 'A')        ! s1[0] uppercase?
 139         bleu,a  .noxlate11              ! yes
 140         add     %i0, ('a' - 'A'), %i0   ! s1[0] = tolower(s1[0])
 141 .noxlate11:
 142         cmp     %l1, ('Z' - 'A')        ! s2[0] uppercase?
 143         bleu,a  .noxlate12              ! yes
 144         add     %g1, ('a' - 'A'), %g1   ! s2[0] = tolower(s2[0])
 145 .noxlate12:
 146         subcc   %i0, %g1, %i0           ! tolower(s1[0]) != tolower(s2[0]) ?
 147         bne     .done                   ! yup, done
 148         inc     %i1                     ! s1++, s2++
 149         addcc   %i0, %g1, %i0           ! s1[0] == 0 ?
 150         bz      .done                   ! yup, done, strings equal
 151         cmp     %i3, 3                  ! s1 aligned now?
 152         bz      .s1aligned2             ! yup
 153         sethi   %hi(0x01010101), %i5    ! start loading Mycroft's magic2
 154
 155         ldub    [%i1 + %i2], %i0        ! s1[1]
 156         ldub    [%i1], %g1              ! s2[1]
 157         sub     %i0, 'A', %l0           ! transform for faster uppercase check
 158         sub     %g1, 'A', %l1           ! transform for faster uppercase check
 159         cmp     %l0, ('Z' - 'A')        ! s1[1] uppercase?
 160         bleu,a  .noxlate21              ! yes
 161         add     %i0, ('a' - 'A'), %i0   ! s1[1] = tolower(s1[1])
 162 .noxlate21:
 163         cmp     %l1, ('Z' - 'A')        ! s2[1] uppercase?
 164         bleu,a  .noxlate22              ! yes
 165         add     %g1, ('a' - 'A'), %g1   ! s2[1] = tolower(s2[1])
 166 .noxlate22:
 167         subcc   %i0, %g1, %i0           ! tolower(s1[1]) != tolower(s2[1]) ?
 168         bne     .done                   ! yup, done
 169         inc     %i1                     ! s1++, s2++
 170         addcc   %i0, %g1, %i0           ! s1[1] == 0 ?
 171         bz      .done                   ! yup, done, strings equal
 172         cmp     %i3, 2                  ! s1 aligned now?
 173         bz      .s1aligned3             ! yup
 174         or      %i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
 175
 176         ldub    [%i1 + %i2], %i0        ! s1[2]
 177         ldub    [%i1], %g1              ! s2[2]
 178         sub     %i0, 'A', %l0           ! transform for faster uppercase check
 179         sub     %g1, 'A', %l1           ! transform for faster uppercase check
 180         cmp     %l0, ('Z' - 'A')        ! s1[2] uppercase?
 181         bleu,a  .noxlate31              ! yes
 182         add     %i0, ('a' - 'A'), %i0   ! s1[2] = tolower(s1[2])
 183 .noxlate31:
 184         cmp     %l1, ('Z' - 'A')        ! s2[2] uppercase?
 185         bleu,a  .noxlate32              ! yes
 186         add     %g1, ('a' - 'A'), %g1   ! s2[2] = tolower(s2[2])
 187 .noxlate32:
 188         subcc   %i0, %g1, %i0           ! tolower(s1[2]) != tolower(s2[2]) ?
 189         bne     .done                   ! yup, done
 190         inc     %i1                     ! s1++, s2++
 191         addcc   %i0, %g1, %i0           ! s1[2] == 0 ?
 192         bz      .done                   ! yup, done, strings equal
 193         or      %i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
 194         ba      .s1aligned4             ! s1 aligned now
 195         andcc   %i1, 3, %i3             ! s2 word-aligned ?
 196
 197         ! Here, we initialize our checks for a zero byte and decide
 198         ! whether or not we can optimize further if we're fortunate
 199         ! enough to have a word aligned desintation
 200
 201 .s1aligned1:
 202         sethi   %hi(0x01010101), %i5    ! start loading Mycroft's magic2
 203 .s1aligned2:
 204         or      %i4, %lo(0x80808080),%i4! finish loading Mycroft's magic1
 205 .s1aligned3:
 206         or      %i5, %lo(0x01010101),%i5! finish loading Mycroft's magic2
 207         andcc   %i1, 3, %i3             ! s2 word aligned ?
 208 .s1aligned4:
 209         sethi   %hi(0x3f3f3f3f), %l2    ! load m2 for parallel tolower()
 210         sethi   %hi(0x25252525), %l3    ! load m3 for parallel tolower()
 211         or      %l2, %lo(0x3f3f3f3f),%l2! finish loading m2
 212         bz      .word4                  ! yup, s2 word-aligned
 213         or      %l3, %lo(0x25252525),%l3! finish loading m3
 214
 215         add     %i2, %i3, %i2           ! start adjusting offset s1-s2
 216         sll     %i3, 3, %l6             ! shift factor for left shifts
 217         andn    %i1, 3, %i1             ! round s1 pointer down to next word
 218         sub     %g0, %l6, %l7           ! shift factor for right shifts
 219         orn     %i3, %g0, %i3           ! generate all ones
 220         lduw    [%i1], %i0              ! new lower word from s2
 221         srl     %i3, %l6, %i3           ! mask for fixing up bytes
 222         sll     %i0, %l6, %g1           ! partial unaligned word from s2
 223         orn     %i0, %i3, %i0           ! force start bytes to non-zero
 224         nop                             ! pad to align loop to 16-byte boundary
 225         nop                             ! pad to align loop to 16-byte boundary
 226
 227         ! This is the comparision procedure used if the destination is not
 228         ! word aligned, if it is, we use word4 & cmp4
 229
 230 .cmp:
 231         andn    %i4, %i0, %l4           ! ~word & 0x80808080
 232         sub     %i0, %i5, %l5           ! word - 0x01010101
 233         andcc   %l5, %l4, %g0           ! (word - 0x01010101) & ~word & 0x80808080
 234         bz,a    .doload                 ! null byte in previous aligned s2 word
 235         lduw    [%i1 + 4], %i0          ! load next aligned word from s2
 236 .doload:
 237         srl     %i0, %l7, %i3           ! byte(s) from new aligned word from s2
 238         or      %g1, %i3, %g1           ! merge to get unaligned word from s2
 239         lduw    [%i1 + %i2], %i3        ! x1 = word from s1
 240         andn    %i3, %i4, %l0           ! q1 = x1 & ~m1
 241         andn    %g1, %i4, %l4           ! q2 = x2 & ~m1
 242         add     %l0, %l2, %l1           ! p1 = q1 + m2
 243         add     %l4, %l2, %l5           ! p2 = q2 + m2
 244         add     %l0, %l3, %l0           ! q1 = q1 + m3
 245         add     %l4, %l3, %l4           ! q2 = q2 + m3
 246         andn    %l1, %l0, %l1           ! p1 = p1 & ~q1
 247         andn    %l5, %l4, %l5           ! p2 = p2 & ~q2
 248         andn    %i4, %i3, %l0           ! q1 = m1 & ~x1
 249         andn    %i4, %g1, %l4           ! q2 = m1 & ~x2
 250         and     %l0, %l1, %l0           ! q1 = p1 & q1
 251         and     %l4, %l5, %l4           ! q2 = p2 & q2
 252         srl     %l0, 2, %l0             ! q1 = q1 >> 2
 253         srl     %l4, 2, %l4             ! q2 = q2 >> 2
 254         add     %l0, %i3, %i3           ! lowercase word from s1
 255         add     %l4, %g1, %g1           ! lowercase word from s2
 256         cmp     %i3, %g1                ! tolower(*s1) != tolower(*s2) ?
 257         bne     .wordsdiffer            ! yup, now find byte that is different
 258         add     %i1, 4, %i1             ! s1+=4, s2+=4
 259         andn    %i4, %i3, %l4           ! ~word & 0x80808080
 260         sub     %i3, %i5, %l5           ! word - 0x01010101
 261         andcc   %l5, %l4, %g0           ! (word - 0x01010101) & ~word & 0x80808080
 262         bz      .cmp                    ! no null-byte in s1 yet
 263         sll     %i0, %l6, %g1           ! bytes from old aligned word from s2
 264
 265         ! words are equal but the end of s1 has been reached
 266         ! this means the strings must be equal
 267 .stringsequal:
 268         ret                             ! return
 269         restore %g0, %g0, %o0           ! return 0, i.e. strings are equal
 270         nop                             ! pad
 271
 272         ! we have a word aligned source and destination!  This means
 273         ! things get to go fast!
 274
 275 .word4:
 276         lduw    [%i1 + %i2], %i3        ! x1 = word from s1
 277
 278 .cmp4:
 279         andn    %i3, %i4, %l0           ! q1 = x1 & ~m1
 280         lduw    [%i1], %g1              ! x2 = word from s2
 281         andn    %g1, %i4, %l4           ! q2 = x2 & ~m1
 282         add     %l0, %l2, %l1           ! p1 = q1 + m2
 283         add     %l4, %l2, %l5           ! p2 = q2 + m2
 284         add     %l0, %l3, %l0           ! q1 = q1 + m3
 285         add     %l4, %l3, %l4           ! q2 = q2 + m3
 286         andn    %l1, %l0, %l1           ! p1 = p1 & ~q1
 287         andn    %l5, %l4, %l5           ! p2 = p2 & ~q2
 288         andn    %i4, %i3, %l0           ! q1 = m1 & ~x1
 289         andn    %i4, %g1, %l4           ! q2 = m1 & ~x2
 290         and     %l0, %l1, %l0           ! q1 = p1 & q1
 291         and     %l4, %l5, %l4           ! q2 = p2 & q2
 292         srl     %l0, 2, %l0             ! q1 = q1 >> 2
 293         srl     %l4, 2, %l4             ! q2 = q2 >> 2
 294         add     %l0, %i3, %i3           ! lowercase word from s1
 295         add     %l4, %g1, %g1           ! lowercase word from s2
 296         cmp     %i3, %g1                ! tolower(*s1) != tolower(*s2) ?
 297         bne     .wordsdiffer            ! yup, now find mismatching character
 298         add     %i1, 4, %i1             ! s1+=4, s2+=4
 299         andn    %i4, %i3, %l4           ! ~word & 0x80808080
 300         sub     %i3, %i5, %l5           ! word - 0x01010101
 301         andcc   %l5, %l4, %g0           ! (word - 0x01010101) & ~word & 0x80808080
 302         bz,a    .cmp4                   ! no null-byte in s1 yet
 303         lduw    [%i1 + %i2], %i3        ! load word from s1
 304
 305         ! words are equal but the end of s1 has been reached
 306         ! this means the strings must be equal
 307 .stringsequal4:
 308         ret                             ! return
 309         restore %g0, %g0, %o0           ! return 0, i.e. strings are equal
 310
 311 .wordsdiffer:
 312         srl     %g1, 24, %i2            ! first byte of mismatching word in s2
 313         srl     %i3, 24, %i1            ! first byte of mismatching word in s1
 314         subcc   %i1, %i2, %i0           ! *s1-*s2
 315         bnz     .done                   ! bytes differ, return difference
 316         srl     %g1, 16, %i2            ! second byte of mismatching word in s2
 317         andcc   %i1, 0xff, %i0          ! *s1 == 0 ?
 318         bz      .done                   ! yup, done, strings equal
 319
 320         ! we know byte 1 is equal, so can compare bytes 1,2 as a group
 321
 322         srl     %i3, 16, %i1            ! second byte of mismatching word in s1
 323         subcc   %i1, %i2, %i0           ! *s1-*s2
 324         bnz     .done                   ! bytes differ, return difference
 325         srl     %g1, 8, %i2             ! third byte of mismatching word in s2
 326         andcc   %i1, 0xff, %i0          ! *s1 == 0 ?
 327         bz      .done                   ! yup, done, strings equal
 328
 329         ! we know bytes 1, 2 are equal, so can compare bytes 1,2,3 as a group
 330
 331         srl     %i3, 8, %i1             ! third byte of mismatching word in s1
 332         subcc   %i1, %i2, %i0           ! *s1-*s2
 333         bnz     .done                   ! bytes differ, return difference
 334         andcc   %i1, 0xff, %g0          ! *s1 == 0 ?
 335         bz      .stringsequal           ! yup, done, strings equal
 336
 337         ! we know bytes 1,2,3 are equal, so can compare bytes 1,2,3,4 as group
 338
 339         subcc   %i3, %g1, %i0           ! *s1-*s2
 340         bz,a    .done                   ! bytes differ, return difference
 341         andcc   %i3, 0xff, %i0          ! *s1 == 0, strings equal
 342
 343 .done:
 344         ret                             ! return
 345         restore %i0, %g0, %o0           ! return 0 or byte difference
 346
 347         SET_SIZE(ascii_strcasecmp)