sysdeps/alpha/stxncpy.S

   1 /* Copyright (C) 1996, 1997, 2002 Free Software Foundation, Inc.
   2    Contributed by Richard Henderson (rth@tamu.edu)
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 /* Copy no more than COUNT bytes of the null-terminated string from
  21    SRC to DST.
  22
  23    This is an internal routine used by strncpy, stpncpy, and strncat.
  24    As such, it uses special linkage conventions to make implementation
  25    of these public functions more efficient.
  26
  27    On input:
  28         t9 = return address
  29         a0 = DST
  30         a1 = SRC
  31         a2 = COUNT
  32
  33    Furthermore, COUNT may not be zero.
  34
  35    On output:
  36         t0  = last word written
  37         t8  = bitmask (with one bit set) indicating the last byte written
  38         t10 = bitmask (with one bit set) indicating the byte position of
  39               the end of the range specified by COUNT
  40         a0  = unaligned address of the last *word* written
  41         a2  = the number of full words left in COUNT
  42
  43    Furthermore, v0, a3-a5, t11, and t12 are untouched.
  44 */
  45
  46
  47 /* This is generally scheduled for the EV5, but should still be pretty
  48    good for the EV4 too.  */
  49
  50 #include <sysdep.h>
  51
  52         .set noat
  53         .set noreorder
  54
  55         .text
  56
  57 /* Either gdb (as of 4.16) or gas (as of 2.7) has a problem with placing
  58    the entry point for a procedure somewhere in the middle of the
  59    procedure descriptor.  Work around this by putting the aligned copy
  60    in its own procedure descriptor.  */
  61
  62         .ent stxncpy_aligned
  63         .align 3
  64 stxncpy_aligned:
  65         .frame sp, 0, t9, 0
  66         .prologue 0
  67
  68         /* Co-aligned copy path.  On entry to this basic block:
  69            t0 == the first destination word for masking back in
  70            t1 == the first source word.  */
  71
  72         /* Create the 1st output word and detect 0's in the 1st input word.  */
  73         lda     t2, -1          # e1    : build a mask against false zero
  74         mskqh   t2, a1, t2      # e0    :   detection in the src word
  75         mskqh   t1, a1, t3      # e0    : t3 = src bytes at or above the start offset
  76         ornot   t1, t2, t2      # .. e1 : force bytes below the start offset nonzero
  77         mskql   t0, a1, t0      # e0    : assemble the first output word
  78         cmpbge  zero, t2, t7    # .. e1 : bits set iff null found
  79         or      t0, t3, t0      # e0    : merge kept dst bytes with src bytes
  80         beq     a2, $a_eoc      # .. e1 : COUNT ends within this first word
  81         bne     t7, $a_eos      # .. e1 : null found in this first word
  82
  83         /* On entry to this basic block:
  84            t0 == a source word not containing a null.  */
  85
  86 $a_loop:
  87         stq_u   t0, 0(a0)       # e0    : store the completed word
  88         addq    a0, 8, a0       # .. e1 : advance dst to the next word
  89         ldq_u   t0, 0(a1)       # e0    : fetch the next src word
  90         addq    a1, 8, a1       # .. e1 : advance src to the next word
  91         subq    a2, 1, a2       # e0    : one fewer word left in COUNT
  92         cmpbge  zero, t0, t7    # .. e1 (stall)
  93         beq     a2, $a_eoc      # e1    : COUNT ends within this word
  94         beq     t7, $a_loop     # e1    : no null yet, keep copying
  95
  96         /* Take care of the final (partial) word store.  At this point
  97            the end-of-count bit is set in t7 iff it applies.
  98
  99            On entry to this basic block we have:
 100            t0 == the source word containing the null
 101            t7 == the cmpbge mask that found it.  */
 102
 103 $a_eos:
 104         negq    t7, t8          # e0    : find low bit set
 105         and     t7, t8, t8      # e1 (stall)
 106
 107         /* For the sake of the cache, don't read a destination word
 108            if we're not going to need it.  */
 109         and     t8, 0x80, t6    # e0    : is the last byte the top byte of the word?
 110         bne     t6, 1f          # .. e1 (zdb)
 111
 112         /* We're doing a partial word store and so need to combine
 113            our source and original destination words.  */
 114         ldq_u   t1, 0(a0)       # e0    : fetch the original dst word
 115         subq    t8, 1, t6       # .. e1 : t6 = bits below the last-byte bit
 116         or      t8, t6, t7      # e0    : t7 = byte mask up to and including it
 117         unop                    #
 118         zapnot  t0, t7, t0      # e0    : clear src bytes > null
 119         zap     t1, t7, t1      # .. e1 : clear dst bytes <= null
 120         or      t0, t1, t0      # e1    : combine src and dst bytes
 121
 122 1:      stq_u   t0, 0(a0)       # e0    : write the final word
 123         ret     (t9)            # e1    : return through t9
 124
 125         /* Add the end-of-count bit to the eos detection bitmask.  */
 126 $a_eoc:
 127         or      t10, t7, t7     # t7 |= end-of-count bit
 128         br      $a_eos          # finish with the common final-word code
 129
 130         .end stxncpy_aligned
 131
 132         .align 3
 133         .ent __stxncpy
 134         .globl __stxncpy
 135 __stxncpy:
 136         .frame sp, 0, t9, 0
 137         .prologue 0
 138
 139         /* Are source and destination co-aligned?  */
 140         xor     a0, a1, t1      # e0    : differing address bits of dst and src
 141         and     a0, 7, t0       # .. e1 : find dest misalignment
 142         and     t1, 7, t1       # e0    : nonzero iff the low 3 bits differ
 143         addq    a2, t0, a2      # .. e1 : bias count by dest misalignment
 144         subq    a2, 1, a2       # e0    : biased count minus one
 145         and     a2, 7, t2       # e1    : byte position of last count byte
 146         srl     a2, 3, a2       # e0    : a2 = loop counter = (count - 1)/8
 147         addq    zero, 1, t10    # .. e1 : t10 = 1
 148         sll     t10, t2, t10    # e0    : t10 = bitmask of last count byte
 149         bne     t1, $unaligned  # .. e1 : not co-aligned
 150
 151         /* We are co-aligned; take care of a partial first word.  */
 152
 153         ldq_u   t1, 0(a1)       # e0    : load first src word
 154         addq    a1, 8, a1       # .. e1 : advance src to the next word
 155
 156         beq     t0, stxncpy_aligned     # avoid loading dest word if not needed
 157         ldq_u   t0, 0(a0)       # e0    : load first dest word
 158         br      stxncpy_aligned # .. e1 : continue in the co-aligned copy
 159
 160
 161 /* The source and destination are not co-aligned.  Align the destination
 162    and cope.  We have to be very careful about not reading too much and
 163    causing a SEGV.  */
 164
 165         .align 3
 166 $u_head:
 167         /* We know just enough now to be able to assemble the first
 168            full source word.  We can still find a zero at the end of it
 169            that prevents us from outputting the whole thing.
 170
 171            On entry to this basic block:
 172            t0 == the first dest word, unmasked
 173            t1 == the shifted low bits of the first source word
 174            t6 == bytemask that is -1 in dest word bytes */
 175
 176         ldq_u   t2, 8(a1)       # e0    : load second src word
 177         addq    a1, 8, a1       # .. e1 : advance src to that word
 178         mskql   t0, a0, t0      # e0    : mask trailing garbage in dst
 179         extqh   t2, a1, t4      # e0    : position lo-bits of second src word
 180         or      t1, t4, t1      # e1    : first aligned src word complete
 181         mskqh   t1, a0, t1      # e0    : mask leading garbage in src
 182         or      t0, t1, t0      # e0    : first output word complete
 183         or      t0, t6, t6      # e1    : mask original data for zero test
 184         cmpbge  zero, t6, t7    # e0    : bits set iff null found
 185         beq     a2, $u_eocfin   # .. e1 : COUNT ends within this first word
 186         lda     t6, -1          # e0    : all ones, for the mask built below
 187         bne     t7, $u_final    # .. e1 : null found in this first word
 188
 189         mskql   t6, a1, t6              # e0    : mask out bits already seen
 190         nop                             # .. e1 :
 191         stq_u   t0, 0(a0)               # e0    : store first output word
 192         or      t6, t2, t2              # .. e1 : force bytes already seen nonzero
 193         cmpbge  zero, t2, t7            # e0    : find nulls in second partial
 194         addq    a0, 8, a0               # .. e1 : advance dst to the next word
 195         subq    a2, 1, a2               # e0    : one fewer word left in COUNT
 196         bne     t7, $u_late_head_exit   # .. e1 : null in the rest of that word
 197
 198         /* Finally, we've got all the stupid leading edge cases taken care
 199            of and we can set up to enter the main loop.  */
 200
 201         extql   t2, a1, t1      # e0    : position hi-bits of lo word
 202         beq     a2, $u_eoc      # .. e1 : COUNT ends within the next output word
 203         ldq_u   t2, 8(a1)       # e0    : read next high-order source word
 204         addq    a1, 8, a1       # .. e1 : advance src to that word
 205         extqh   t2, a1, t0      # e0    : position lo-bits of hi word
 206         cmpbge  zero, t2, t7    # .. e1 : test new word for eos
 207         nop                     # e0    :
 208         bne     t7, $u_eos      # .. e1 : null found, leave through the tail
 209
 210         /* Unaligned copy main loop.  In order to avoid reading too much,
 211            the loop is structured to detect zeros in aligned source words.
 212            This has, unfortunately, effectively pulled half of a loop
 213            iteration out into the head and half into the tail, but it does
 214            prevent nastiness from accumulating in the very thing we want
 215            to run as fast as possible.
 216
 217            On entry to this basic block:
 218            t0 == the shifted low-order bits from the current source word
 219            t1 == the shifted high-order bits from the previous source word
 220            t2 == the unshifted current source word
 221
 222            We further know that t2 does not contain a null terminator.  */
 223
 224         .align 3
 225 $u_loop:
 226         or      t0, t1, t0      # e0    : current dst word now complete
 227         subq    a2, 1, a2       # .. e1 : decrement word count
 228         stq_u   t0, 0(a0)       # e0    : save the current word
 229         addq    a0, 8, a0       # .. e1 : advance dst to the next word
 230         extql   t2, a1, t1      # e0    : extract high bits for next time
 231         beq     a2, $u_eoc      # .. e1 : COUNT ends within the next output word
 232         ldq_u   t2, 8(a1)       # e0    : load high word for next time
 233         addq    a1, 8, a1       # .. e1 : advance src to that word
 234         nop                     # e0    :
 235         cmpbge  zero, t2, t7    # .. e1 : test new word for eos
 236         extqh   t2, a1, t0      # e0    : extract low bits for current word
 237         beq     t7, $u_loop     # .. e1 : no null yet, keep copying
 238
 239         /* We've found a zero somewhere in the source word we just read.
 240            If it resides in the lower half, we have one (probably partial)
 241            word to write out, and if it resides in the upper half, we
 242            have one full and one partial word left to write out.
 243
 244            On entry to this basic block:
 245            t0 == the shifted low-order bits from the current source word
 246            t1 == the shifted high-order bits from the previous source word
 247            t2 == the unshifted current source word.  */
 248 $u_eos:
 249         or      t0, t1, t0      # e0    : first (partial) source word complete
 250         cmpbge  zero, t0, t7    # e0    : is the null in this first bit?
 251         bne     t7, $u_final    # .. e1 (zdb)
 252
 253         stq_u   t0, 0(a0)       # e0    : the null was in the high-order bits
 254         addq    a0, 8, a0       # .. e1 : advance dst to the next word
 255         subq    a2, 1, a2       # e0    : one fewer word left in COUNT
 256
 257 $u_late_head_exit:
 258         extql   t2, a1, t0      # e0    : shift the rest of the src word into place
 259         cmpbge  zero, t0, t7    # e0    : locate the null within it
 260         or      t7, t10, t6     # e1    : t6 = null mask plus end-of-count bit
 261         cmoveq  a2, t6, t7      # e0    : use t6 when no full words remain
 262
 263         /* Take care of a final (probably partial) result word.
 264            On entry to this basic block:
 265            t0 == assembled source word
 266            t7 == cmpbge mask that found the null.  */
 267 $u_final:
 268         negq    t7, t6          # e0    : isolate low bit set
 269         and     t6, t7, t8      # e1    : t8 = bit for the last byte to write
 270
 271         and     t8, 0x80, t6    # e0    : avoid dest word load if we can
 272         bne     t6, 1f          # .. e1 (zdb)
 273
 274         ldq_u   t1, 0(a0)       # e0    : fetch the original dst word
 275         subq    t8, 1, t6       # .. e1 : t6 = bits below the last-byte bit
 276         or      t6, t8, t7      # e0    : t7 = byte mask up to and including it
 277         zapnot  t0, t7, t0      # .. e1 : kill source bytes > null
 278         zap     t1, t7, t1      # e0    : kill dest bytes <= null
 279         or      t0, t1, t0      # e1    : combine src and dst bytes
 280
 281 1:      stq_u   t0, 0(a0)       # e0    : write the final word
 282         ret     (t9)            # .. e1 : return through t9
 283
 284         /* Got to end-of-count before end of string.
 285            On entry to this basic block:
 286            t1 == the shifted high-order bits from the previous source word  */
 287 $u_eoc:
 288         and     a1, 7, t6       # e1    : low 3 bits of the src pointer
 289         sll     t10, t6, t6     # e0    : shift the end-of-count bit up by that much
 290         and     t6, 0xff, t6    # e0    : nonzero iff it still fits in 8 bits
 291         bne     t6, 1f          # e1    : avoid src word load if we can
 292
 293         ldq_u   t2, 8(a1)       # e0    : load final src word
 294         nop                     # .. e1 :
 295         extqh   t2, a1, t0      # e0    : extract high bits for last word
 296         or      t1, t0, t1      # e1    : last source word complete
 297
 298 1:      cmpbge  zero, t1, t7    # look for a null in the last word
 299         mov     t1, t0          # $u_final expects the word in t0
 300
 301 $u_eocfin:                      # end-of-count, final word
 302         or      t10, t7, t7     # t7 |= end-of-count bit
 303         br      $u_final        # finish with the common final-word code
 304
 305         /* Unaligned copy entry point.  */
 306         .align 3
 307 $unaligned:
 308
 309         ldq_u   t1, 0(a1)       # e0    : load first source word
 310
 311         and     a0, 7, t4       # .. e1 : find dest misalignment
 312         and     a1, 7, t5       # e0    : find src misalignment
 313
 314         /* Conditionally load the first destination word and a bytemask
 315            with 0xff indicating that the destination byte is sacrosanct.  */
 316
 317         mov     zero, t0        # .. e1 : default: no dest word loaded
 318         mov     zero, t6        # e0    : default: no sacrosanct bytes
 319         beq     t4, 1f          # .. e1 : dest is aligned, keep the defaults
 320         ldq_u   t0, 0(a0)       # e0    : load first dest word
 321         lda     t6, -1          # .. e1 : all ones
 322         mskql   t6, a0, t6      # e0    : keep 0xff only below the dest offset
 323 1:
 324         subq    a1, t4, a1      # .. e1 : sub dest misalignment from src addr
 325
 326         /* If source misalignment is larger than dest misalignment, we need
 327            extra startup checks to avoid SEGV.  */
 328
 329         cmplt   t4, t5, t8      # e1    : t8 = (dest misalign < src misalign)
 330         extql   t1, a1, t1      # .. e0 : shift src into place
 331         lda     t2, -1          # e0    : for creating masks later
 332         beq     t8, $u_head     # e1    : no extra checks needed
 333
 334         mskqh   t2, t5, t2      # e0    : begin src byte validity mask
 335         cmpbge  zero, t1, t7    # .. e1 : is there a zero?
 336         extql   t2, a1, t2      # e0    : shift the validity mask like the data
 337         or      t7, t10, t5     # .. e1 : test for end-of-count too
 338         cmpbge  zero, t2, t3    # e0    : t3 = bits for bytes that are not valid src
 339         cmoveq  a2, t5, t7      # .. e1 : use t5 when no full words remain
 340         andnot  t7, t3, t7      # e0    : ignore hits in bytes that are not valid
 341         beq     t7, $u_head     # .. e1 (zdb)
 342
 343         /* At this point we've found a zero in the first partial word of
 344            the source.  We need to isolate the valid source data and mask
 345            it into the original destination data.  (Incidentally, we know
 346            that we'll need at least one byte of that original dest word.) */
 347
 348         ldq_u   t0, 0(a0)       # e0    : load the original dest word
 349         negq    t7, t6          # .. e1 : build bitmask of bytes <= zero
 350         mskqh   t1, t4, t1      # e0    : clear src bytes below the dest offset
 351         and     t6, t7, t8      # .. e1 : t8 = bit for the last byte to write
 352         subq    t8, 1, t6       # e0    : t6 = bits below the last-byte bit
 353         or      t6, t8, t7      # e1    : t7 = byte mask up to and including it
 354
 355         zapnot  t2, t7, t2      # e0    : prepare source word; mirror changes
 356         zapnot  t1, t7, t1      # .. e1 : to source validity mask
 357
 358         andnot  t0, t2, t0      # e0    : zero place for source to reside
 359         or      t0, t1, t0      # e1    : and put it there
 360         stq_u   t0, 0(a0)       # e0    : write the only output word
 361         ret     (t9)            # .. e1 : return through t9
 362
 363         .end __stxncpy