arch/alpha/lib/ev6-memset.S

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 /*
   3  * arch/alpha/lib/ev6-memset.S
   4  *
   5  * This is an efficient (and relatively small) implementation of the C library
   6  * "memset()" function for the 21264 implementation of Alpha.
   7  *
   8  * 21264 version  contributed by Rick Gorton <rick.gorton@alpha-processor.com>
   9  *
  10  * Much of the information about 21264 scheduling/coding comes from:
  11  *      Compiler Writer's Guide for the Alpha 21264
  12  *      abbreviated as 'CWG' in other comments here
  13  *      ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  14  * Scheduling notation:
  15  *      E       - either cluster
  16  *      U       - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  17  *      L       - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  18  * The algorithm for the leading and trailing quadwords remains the same,
  19  * however the loop has been unrolled to enable better memory throughput,
  20  * and the code has been replicated for each of the entry points: __memset
  21  * and __memset16 to permit better scheduling to eliminate the stalling
  22  * encountered during the mask replication.
  23  * A future enhancement might be to put in a byte store loop for really
  24  * small (say < 32 bytes) memset()s.  Whether or not that change would be
  25  * a win in the kernel would depend upon the contextual usage.
  26  * WARNING: Maintaining this is going to be more work than maintaining a
  27  * single shared body, as fixes will need to be made in each replicated
  28  * copy.  The performance gain is worth it.
  29  */
  30 #include <asm/export.h>
  31         .set noat               # assembler may not use the $at temporary implicitly
  32         .set noreorder          # keep the hand-scheduled instruction order as written
  33 .text
  34         .globl memset           # alias of ___memset, assigned at the end of the file
  35         .globl __memset         # alias of ___memset, assigned at the end of the file
  36         .globl ___memset        # byte fill entry point
  37         .globl __memset16       # 16-bit pattern fill entry point
  38         .globl __constant_c_memset # entry point that stores $17 without replicating it
  39
  40         .ent ___memset
  41 .align 5
  42 ___memset:
  43         .frame $30,0,$26,0
  44         .prologue 0
  45
  46         /*
              * ___memset: store one byte value into every byte of a memory range.
              *
              * Register usage, as read from the code below:
              *   $16  in  : destination address; advanced to the first aligned
              *              quadword when the start is misaligned
              *   $17  in  : fill value; only the low byte is used, and it is
              *              replicated into all eight bytes of $17 before any store
              *   $18  in  : length in bytes; a signed value <= 0 returns at once
              *   $0   out : the original destination address
              *   $6       : end address ($16 + $18, one past the last byte)
              *   $1-$5,$7 : scratch
              *   $26      : return address
              *
              * Paths: within_quad_b (start and end in one quadword), an optional
              * misaligned head, then aligned_b (whole quadwords, unrolled with wh64
              * when at least 16 quads remain) and no_quad_b (0..7 trailing bytes).
              *
  47          * Serious stalling happens.  The only way to mitigate this is to
  48          * undertake a major re-write to interleave the constant materialization
  49          * with other parts of the fall-through code.  This is important, even
  50          * though it makes maintenance tougher.
  51          * Do this later.
  52          */
  53         and $17,255,$1          # E : 00000000000000ch (isolate the fill byte)
  54         insbl $17,1,$2          # U : 000000000000ch00
  55         bis $16,$16,$0          # E : return value (original destination)
  56         ble $18,end_b           # U : zero (or negative) length requested?
  57
  58         addq $18,$16,$6         # E : end address, one past the last byte to write
  59         bis     $1,$2,$17       # E : 000000000000chch
  60         insbl   $1,2,$3         # U : 0000000000ch0000
  61         insbl   $1,3,$4         # U : 00000000ch000000
  62
  63         or      $3,$4,$3        # E : 00000000chch0000
  64         inswl   $17,4,$5        # U : 0000chch00000000
  65         xor     $16,$6,$1       # E : bits that differ between start and end address
  66         inswl   $17,6,$2        # U : chch000000000000
  67
  68         or      $17,$3,$17      # E : 00000000chchchch
  69         or      $2,$5,$2        # E : chchchch00000000
  70         bic     $1,7,$1         # E : zero when start and end share one quadword
  71         and     $16,7,$3        # E : Target addr misalignment
  72
  73         or      $17,$2,$17      # E : chchchchchchchch
  74         beq     $1,within_quad_b # U : whole write fits inside a single quadword
  75         nop                     # E : no-op (presumably issue-group padding)
  76         beq     $3,aligned_b    # U : target is 0mod8
  77
  78         /*
  79          * Target address is misaligned, and won't fit within a quadword.
              * Merge the new bytes into the first quadword with a read-modify-write,
              * then step $16 up to the next quadword boundary and shrink $18 by the
              * number of head bytes just written.
  80          */
  81         ldq_u $4,0($16)         # L : Fetch first partial
  82         bis $16,$16,$5          # E : Save the address
  83         insql $17,$16,$2        # U : Insert new bytes (pattern shifted up to the start offset)
  84         subq $3,8,$3            # E : Invert (for addressing uses): $3 = misalignment - 8
  85
  86         addq $18,$3,$18         # E : $18 is new count ($3 is negative)
  87         mskql $4,$16,$4         # U : keep only the old bytes below the start offset
  88         subq $16,$3,$16         # E : $16 is new aligned destination
  89         bis $2,$4,$1            # E : Final bytes
  90
  91         nop
  92         stq_u $1,0($5)          # L : Store result
  93         nop
  94         nop                     # falls through to aligned_b
  95
  96 .align 4
  97 aligned_b:
  98         /*
  99          * We are now guaranteed to be quad aligned, with at least
 100          * one partial quad to write.
 101          */
 102
 103         sra $18,3,$3            # U : Number of remaining quads to write
 104         and $18,7,$18           # E : Number of trailing bytes to write
 105         bis $16,$16,$5          # E : Save dest address
 106         beq $3,no_quad_b        # U : tail stuff only
 107
 108         /*
 109          * it's worth the effort to unroll this and use wh64 if possible
 110          * Lifted a bunch of code from clear_user.S
 111          * At this point, entry values are:
 112          * $16  Current destination address
 113          * $5   A copy of $16
 114          * $6   The end address (one past the last byte to write)
 115          * $18  Number trailer bytes
 116          * $3   Number quads to write
 117          */
 118
 119         and     $16, 0x3f, $2   # E : dest offset within its 64-byte block (only useful for unrolled loop)
 120         subq    $3, 16, $4      # E : Only try to unroll if >= 16 quads (128 bytes)
 121         subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64): offset - 64, always negative
 122         blt     $4, loop_b      # U : fewer than 16 quads, use the simple loop
 123
 124         /*
 125          * We know we've got at least 16 quads, minimum of one trip
 126          * through unrolled loop.  Do a quad at a time to get us 0mod64
 127          * aligned.
              *
              * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-0x40, -8] here, so
              * the beq below looks never-taken; a destination that is already
              * 0mod64 makes eight trips through $alignmod64_b instead.  This also
              * means $4 always holds an address before the first wh64.
 128          */
 129
 130         nop                     # E :
 131         nop                     # E :
 132         nop                     # E :
 133         beq     $1, $bigalign_b # U :
 134
 135 $alignmod64_b:
 136         stq     $17, 0($5)      # L : store one quad of the pattern
 137         subq    $3, 1, $3       # E : For consistency later (one quad fewer to go)
 138         addq    $1, 8, $1       # E : Increment towards zero for alignment
 139         addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 140
 141         nop
 142         nop
 143         addq    $5, 8, $5       # E : Inc address
 144         blt     $1, $alignmod64_b # U : loop until $5 is 0mod64
 145
 146 $bigalign_b:
 147         /*
 148          * $3 - number of quads left to go
 149          * $5 - target address (aligned 0mod64)
 150          * $17 - mask of stuff to store
              * $4 - address for the first wh64; the $alignmod64_b loop leaves it
              *      equal to $5
 151          * Scratch registers available: $7, $2, $4, $1
 152          * We know that we'll be taking a minimum of one trip through the loop.
 153          * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 154          * Assumes the wh64 needs to be for 2 trips through the loop in the future.
 155          * The wh64 is issued for the starting destination address of trip +2
 156          * through the loop, and if there are fewer than two trips left, the target
 157          * address will be for the current trip.
 158          */
 159
 160 $do_wh64_b:
 161         wh64    ($4)            # L1 : memory subsystem write hint
 162         subq    $3, 24, $2      # E : For determining future wh64 addresses (negative if < 24 quads left)
 163         stq     $17, 0($5)      # L :
 164         nop                     # E :
 165
 166         addq    $5, 128, $4     # E : speculative target of next wh64
 167         stq     $17, 8($5)      # L :
 168         stq     $17, 16($5)     # L :
 169         addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 170
 171         stq     $17, 24($5)     # L :
 172         stq     $17, 32($5)     # L :
 173         cmovlt  $2, $7, $4      # E : use the fallback when $2 < 0; Latency 2, extra mapping cycle
 174         nop
 175
 176         stq     $17, 40($5)     # L :
 177         stq     $17, 48($5)     # L :
 178         subq    $3, 16, $2      # E : Repeat the loop at least once more?
 179         nop
 180
 181         stq     $17, 56($5)     # L : eighth and last quad of this 64-byte trip
 182         addq    $5, 64, $5      # E : advance to the next 64-byte block
 183         subq    $3, 8, $3       # E : eight quads written this trip
 184         bge     $2, $do_wh64_b  # U : another trip if 8 or more quads still remain
 185
 186         nop
 187         nop
 188         nop
 189         beq     $3, no_quad_b   # U : Might have finished already
 190
 191 .align 4
 192         /*
 193          * Simple loop for trailing quadwords, or for small amounts
 194          * of data (where we can't use an unrolled loop and wh64)
 195          */
 196 loop_b:
 197         stq $17,0($5)           # L : store one quad of the pattern
 198         subq $3,1,$3            # E : Decrement number quads left
 199         addq $5,8,$5            # E : Inc address
 200         bne $3,loop_b           # U : more?
 201
 202 no_quad_b:
 203         /*
 204          * Write 0..7 trailing bytes.
              * $5 is quad aligned here and $6 is the end address, so the low three
              * bits of $6 equal the trailing byte count in $18.
 205          */
 206         nop                     # E :
 207         beq $18,end_b           # U : All done?
 208         ldq $7,0($5)            # L : fetch the final quadword
 209         mskqh $7,$6,$2          # U : Mask final quad (clear its low bytes, below the end offset)
 210
 211         insqh $17,$6,$4         # U : New bits (pattern in the bytes below the end offset)
 212         bis $2,$4,$1            # E : Put it all together
 213         stq $1,0($5)            # L : And back to memory
 214         ret $31,($26),1         # L0 :
 215
 216 within_quad_b:
              /*
               * Start and end address lie in the same quadword: read it, replace
               * only the bytes from the start offset up to the end offset, write
               * it back, then fall through to end_b.
               */
 217         ldq_u $1,0($16)         # L : fetch the old quadword
 218         insql $17,$16,$2        # U : New bits
 219         mskql $1,$16,$4         # U : Clear old (keep bytes below the start offset)
 220         bis $2,$4,$2            # E : New result
 221
 222         mskql $2,$6,$4          # U : keep the merged bytes below the end offset
 223         mskqh $1,$6,$2          # U : keep the old bytes at and above the end offset
 224         bis $2,$4,$1            # E : combine both halves
 225         stq_u $1,0($16)         # L : write the quadword back
 226
 227 end_b:
 228         nop
 229         nop
 230         nop
 231         ret $31,($26),1         # L0 :
 232         .end ___memset
 233         EXPORT_SYMBOL(___memset)
 234
 235         /*
 236          * This is the original body of code, prior to replication and
 237          * rescheduling.  Leave it here, as there may be calls to this
 238          * entry point.
              *
              * __constant_c_memset: fill a memory range from a ready-made quadword.
              *
              * Register usage, as read from the code below:
              *   $16  in  : destination address; advanced to the first aligned
              *              quadword when the start is misaligned
              *   $17  in  : 64-bit store pattern, used as-is; nothing in this
              *              routine replicates a byte into it, so the caller is
              *              presumably expected to pass it already replicated
              *   $18  in  : length in bytes; a signed value <= 0 returns at once
              *   $0   out : the original destination address
              *   $6       : end address ($16 + $18, one past the last byte)
              *   $1-$5,$7 : scratch
              *   $26      : return address
 239          */
 240 .align 4
 241         .ent __constant_c_memset
 242 __constant_c_memset:
 243         .frame $30,0,$26,0
 244         .prologue 0
 245
 246         addq $18,$16,$6         # E : end address, one past the last byte to write
 247         bis $16,$16,$0          # E : return value (original destination)
 248         xor $16,$6,$1           # E : bits that differ between start and end address
 249         ble $18,end             # U : zero (or negative) length requested?
 250
 251         bic $1,7,$1             # E : zero when start and end share one quadword
 252         beq $1,within_one_quad  # U : whole write fits inside a single quadword
 253         and $16,7,$3            # E : Target addr misalignment
 254         beq $3,aligned          # U : target is 0mod8
 255
 256         /*
 257          * Target address is misaligned, and won't fit within a quadword.
              * Merge the new bytes into the first quadword with a read-modify-write,
              * then step $16 up to the next quadword boundary and shrink $18 by the
              * number of head bytes just written.
 258          */
 259         ldq_u $4,0($16)         # L : Fetch first partial
 260         bis $16,$16,$5          # E : Save the address
 261         insql $17,$16,$2        # U : Insert new bytes (pattern shifted up to the start offset)
 262         subq $3,8,$3            # E : Invert (for addressing uses): $3 = misalignment - 8
 263
 264         addq $18,$3,$18         # E : $18 is new count ($3 is negative)
 265         mskql $4,$16,$4         # U : keep only the old bytes below the start offset
 266         subq $16,$3,$16         # E : $16 is new aligned destination
 267         bis $2,$4,$1            # E : Final bytes
 268
 269         nop
 270         stq_u $1,0($5)          # L : Store result
 271         nop
 272         nop                     # falls through to aligned
 273
 274 .align 4
 275 aligned:
 276         /*
 277          * We are now guaranteed to be quad aligned, with at least
 278          * one partial quad to write.
 279          */
 280
 281         sra $18,3,$3            # U : Number of remaining quads to write
 282         and $18,7,$18           # E : Number of trailing bytes to write
 283         bis $16,$16,$5          # E : Save dest address
 284         beq $3,no_quad          # U : tail stuff only
 285
 286         /*
 287          * it's worth the effort to unroll this and use wh64 if possible
 288          * Lifted a bunch of code from clear_user.S
 289          * At this point, entry values are:
 290          * $16  Current destination address
 291          * $5   A copy of $16
 292          * $6   The end address (one past the last byte to write)
 293          * $18  Number trailer bytes
 294          * $3   Number quads to write
 295          */
 296
 297         and     $16, 0x3f, $2   # E : dest offset within its 64-byte block (only useful for unrolled loop)
 298         subq    $3, 16, $4      # E : Only try to unroll if >= 16 quads (128 bytes)
 299         subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64): offset - 64, always negative
 300         blt     $4, loop        # U : fewer than 16 quads, use the simple loop
 301
 302         /*
 303          * We know we've got at least 16 quads, minimum of one trip
 304          * through unrolled loop.  Do a quad at a time to get us 0mod64
 305          * aligned.
              *
              * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-0x40, -8] here, so
              * the beq below looks never-taken; a destination that is already
              * 0mod64 makes eight trips through $alignmod64 instead.  This also
              * means $4 always holds an address before the first wh64.
 306          */
 307
 308         nop                     # E :
 309         nop                     # E :
 310         nop                     # E :
 311         beq     $1, $bigalign   # U :
 312
 313 $alignmod64:
 314         stq     $17, 0($5)      # L : store one quad of the pattern
 315         subq    $3, 1, $3       # E : For consistency later (one quad fewer to go)
 316         addq    $1, 8, $1       # E : Increment towards zero for alignment
 317         addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 318
 319         nop
 320         nop
 321         addq    $5, 8, $5       # E : Inc address
 322         blt     $1, $alignmod64 # U : loop until $5 is 0mod64
 323
 324 $bigalign:
 325         /*
 326          * $3 - number of quads left to go
 327          * $5 - target address (aligned 0mod64)
 328          * $17 - mask of stuff to store
              * $4 - address for the first wh64; the $alignmod64 loop leaves it
              *      equal to $5
 329          * Scratch registers available: $7, $2, $4, $1
 330          * We know that we'll be taking a minimum of one trip through the loop.
 331          * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 332          * Assumes the wh64 needs to be for 2 trips through the loop in the future.
 333          * The wh64 is issued for the starting destination address of trip +2
 334          * through the loop, and if there are fewer than two trips left, the target
 335          * address will be for the current trip.
 336          */
 337
 338 $do_wh64:
 339         wh64    ($4)            # L1 : memory subsystem write hint
 340         subq    $3, 24, $2      # E : For determining future wh64 addresses (negative if < 24 quads left)
 341         stq     $17, 0($5)      # L :
 342         nop                     # E :
 343
 344         addq    $5, 128, $4     # E : speculative target of next wh64
 345         stq     $17, 8($5)      # L :
 346         stq     $17, 16($5)     # L :
 347         addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 348
 349         stq     $17, 24($5)     # L :
 350         stq     $17, 32($5)     # L :
 351         cmovlt  $2, $7, $4      # E : use the fallback when $2 < 0; Latency 2, extra mapping cycle
 352         nop
 353
 354         stq     $17, 40($5)     # L :
 355         stq     $17, 48($5)     # L :
 356         subq    $3, 16, $2      # E : Repeat the loop at least once more?
 357         nop
 358
 359         stq     $17, 56($5)     # L : eighth and last quad of this 64-byte trip
 360         addq    $5, 64, $5      # E : advance to the next 64-byte block
 361         subq    $3, 8, $3       # E : eight quads written this trip
 362         bge     $2, $do_wh64    # U : another trip if 8 or more quads still remain
 363
 364         nop
 365         nop
 366         nop
 367         beq     $3, no_quad     # U : Might have finished already
 368
 369 .align 4
 370         /*
 371          * Simple loop for trailing quadwords, or for small amounts
 372          * of data (where we can't use an unrolled loop and wh64)
 373          */
 374 loop:
 375         stq $17,0($5)           # L : store one quad of the pattern
 376         subq $3,1,$3            # E : Decrement number quads left
 377         addq $5,8,$5            # E : Inc address
 378         bne $3,loop             # U : more?
 379
 380 no_quad:
 381         /*
 382          * Write 0..7 trailing bytes.
              * $5 is quad aligned here and $6 is the end address, so the low three
              * bits of $6 equal the trailing byte count in $18.
 383          */
 384         nop                     # E :
 385         beq $18,end             # U : All done?
 386         ldq $7,0($5)            # L : fetch the final quadword
 387         mskqh $7,$6,$2          # U : Mask final quad (clear its low bytes, below the end offset)
 388
 389         insqh $17,$6,$4         # U : New bits (pattern in the bytes below the end offset)
 390         bis $2,$4,$1            # E : Put it all together
 391         stq $1,0($5)            # L : And back to memory
 392         ret $31,($26),1         # L0 :
 393
 394 within_one_quad:
              /*
               * Start and end address lie in the same quadword: read it, replace
               * only the bytes from the start offset up to the end offset, write
               * it back, then fall through to end.
               */
 395         ldq_u $1,0($16)         # L : fetch the old quadword
 396         insql $17,$16,$2        # U : New bits
 397         mskql $1,$16,$4         # U : Clear old (keep bytes below the start offset)
 398         bis $2,$4,$2            # E : New result
 399
 400         mskql $2,$6,$4          # U : keep the merged bytes below the end offset
 401         mskqh $1,$6,$2          # U : keep the old bytes at and above the end offset
 402         bis $2,$4,$1            # E : combine both halves
 403         stq_u $1,0($16)         # L : write the quadword back
 404
 405 end:
 406         nop
 407         nop
 408         nop
 409         ret $31,($26),1         # L0 :
 410         .end __constant_c_memset
 411         EXPORT_SYMBOL(__constant_c_memset)
 412
 413         /*
 414          * This is a replica of the __constant_c_memset code, rescheduled
 415          * to mask stalls.  Note that the label names also had to change
              * (they carry a _w suffix here).
              *
              * __memset16: fill a memory range with a repeated 16-bit value.
              *
              * Register usage, as read from the code below:
              *   $16  in  : destination address; advanced to the first aligned
              *              quadword when the start is misaligned
              *   $17  in  : fill value; only the low 16 bits are used, and they are
              *              replicated into all four 16-bit lanes of $17
              *   $18  in  : length in bytes (it is added to $16 to form the end
              *              address); a signed value <= 0 returns at once
              *   $0   out : the original destination address
              *   $6       : end address ($16 + $18, one past the last byte)
              *   $1-$5,$7 : scratch
              *   $26      : return address
              *
              * NOTE(review): the head and tail code shifts the replicated pattern by
              * byte offsets, so an odd destination address would presumably break
              * the 16-bit phase -- confirm callers pass a 2-byte aligned destination.
 416          */
 417         .align 5
 418         .ent __memset16
 419
 420 __memset16:
 421         .frame $30,0,$26,0
 422         .prologue 0
 423
 424         inswl $17,0,$5          # U : 000000000000c1c2
 425         inswl $17,2,$2          # U : 00000000c1c20000
 426         bis $16,$16,$0          # E : return value (original destination)
 427         addq    $18,$16,$6      # E : end address, one past the last byte to write
 428
 429         ble $18, end_w          # U : zero (or negative) length requested?
 430         inswl   $17,4,$3        # U : 0000c1c200000000
 431         inswl   $17,6,$4        # U : c1c2000000000000
 432         xor     $16,$6,$1       # E : bits that differ between start and end address
 433
 434         or      $2,$5,$2        # E : 00000000c1c2c1c2
 435         or      $3,$4,$17       # E : c1c2c1c200000000
 436         bic     $1,7,$1         # E : zero when start and end share one quadword
 437         and     $16,7,$3        # E : Target addr misalignment
 438
 439         or      $17,$2,$17      # E : c1c2c1c2c1c2c1c2
 440         beq $1,within_quad_w    # U : whole write fits inside a single quadword
 441         nop
 442         beq $3,aligned_w        # U : target is 0mod8
 443
 444         /*
 445          * Target address is misaligned, and won't fit within a quadword.
              * Merge the new bytes into the first quadword with a read-modify-write,
              * then step $16 up to the next quadword boundary and shrink $18 by the
              * number of head bytes just written.
 446          */
 447         ldq_u $4,0($16)         # L : Fetch first partial
 448         bis $16,$16,$5          # E : Save the address
 449         insql $17,$16,$2        # U : Insert new bytes (pattern shifted up to the start offset)
 450         subq $3,8,$3            # E : Invert (for addressing uses): $3 = misalignment - 8
 451
 452         addq $18,$3,$18         # E : $18 is new count ($3 is negative)
 453         mskql $4,$16,$4         # U : keep only the old bytes below the start offset
 454         subq $16,$3,$16         # E : $16 is new aligned destination
 455         bis $2,$4,$1            # E : Final bytes
 456
 457         nop
 458         stq_u $1,0($5)          # L : Store result
 459         nop
 460         nop                     # falls through to aligned_w
 461
 462 .align 4
 463 aligned_w:
 464         /*
 465          * We are now guaranteed to be quad aligned, with at least
 466          * one partial quad to write.
 467          */
 468
 469         sra $18,3,$3            # U : Number of remaining quads to write
 470         and $18,7,$18           # E : Number of trailing bytes to write
 471         bis $16,$16,$5          # E : Save dest address
 472         beq $3,no_quad_w        # U : tail stuff only
 473
 474         /*
 475          * it's worth the effort to unroll this and use wh64 if possible
 476          * Lifted a bunch of code from clear_user.S
 477          * At this point, entry values are:
 478          * $16  Current destination address
 479          * $5   A copy of $16
 480          * $6   The end address (one past the last byte to write)
 481          * $18  Number trailer bytes
 482          * $3   Number quads to write
 483          */
 484
 485         and     $16, 0x3f, $2   # E : dest offset within its 64-byte block (only useful for unrolled loop)
 486         subq    $3, 16, $4      # E : Only try to unroll if >= 16 quads (128 bytes)
 487         subq    $2, 0x40, $1    # E : bias counter (aligning stuff 0mod64): offset - 64, always negative
 488         blt     $4, loop_w      # U : fewer than 16 quads, use the simple loop
 489
 490         /*
 491          * We know we've got at least 16 quads, minimum of one trip
 492          * through unrolled loop.  Do a quad at a time to get us 0mod64
 493          * aligned.
              *
              * NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-0x40, -8] here, so
              * the beq below looks never-taken; a destination that is already
              * 0mod64 makes eight trips through $alignmod64_w instead.  This also
              * means $4 always holds an address before the first wh64.
 494          */
 495
 496         nop                     # E :
 497         nop                     # E :
 498         nop                     # E :
 499         beq     $1, $bigalign_w # U :
 500
 501 $alignmod64_w:
 502         stq     $17, 0($5)      # L : store one quad of the pattern
 503         subq    $3, 1, $3       # E : For consistency later (one quad fewer to go)
 504         addq    $1, 8, $1       # E : Increment towards zero for alignment
 505         addq    $5, 8, $4       # E : Initial wh64 address (filler instruction)
 506
 507         nop
 508         nop
 509         addq    $5, 8, $5       # E : Inc address
 510         blt     $1, $alignmod64_w       # U : loop until $5 is 0mod64
 511
 512 $bigalign_w:
 513         /*
 514          * $3 - number of quads left to go
 515          * $5 - target address (aligned 0mod64)
 516          * $17 - mask of stuff to store
              * $4 - address for the first wh64; the $alignmod64_w loop leaves it
              *      equal to $5
 517          * Scratch registers available: $7, $2, $4, $1
 518          * We know that we'll be taking a minimum of one trip through the loop.
 519          * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
 520          * Assumes the wh64 needs to be for 2 trips through the loop in the future.
 521          * The wh64 is issued for the starting destination address of trip +2
 522          * through the loop, and if there are fewer than two trips left, the target
 523          * address will be for the current trip.
 524          */
 525
 526 $do_wh64_w:
 527         wh64    ($4)            # L1 : memory subsystem write hint
 528         subq    $3, 24, $2      # E : For determining future wh64 addresses (negative if < 24 quads left)
 529         stq     $17, 0($5)      # L :
 530         nop                     # E :
 531
 532         addq    $5, 128, $4     # E : speculative target of next wh64
 533         stq     $17, 8($5)      # L :
 534         stq     $17, 16($5)     # L :
 535         addq    $5, 64, $7      # E : Fallback address for wh64 (== next trip addr)
 536
 537         stq     $17, 24($5)     # L :
 538         stq     $17, 32($5)     # L :
 539         cmovlt  $2, $7, $4      # E : use the fallback when $2 < 0; Latency 2, extra mapping cycle
 540         nop
 541
 542         stq     $17, 40($5)     # L :
 543         stq     $17, 48($5)     # L :
 544         subq    $3, 16, $2      # E : Repeat the loop at least once more?
 545         nop
 546
 547         stq     $17, 56($5)     # L : eighth and last quad of this 64-byte trip
 548         addq    $5, 64, $5      # E : advance to the next 64-byte block
 549         subq    $3, 8, $3       # E : eight quads written this trip
 550         bge     $2, $do_wh64_w  # U : another trip if 8 or more quads still remain
 551
 552         nop
 553         nop
 554         nop
 555         beq     $3, no_quad_w   # U : Might have finished already
 556
 557 .align 4
 558         /*
 559          * Simple loop for trailing quadwords, or for small amounts
 560          * of data (where we can't use an unrolled loop and wh64)
 561          */
 562 loop_w:
 563         stq $17,0($5)           # L : store one quad of the pattern
 564         subq $3,1,$3            # E : Decrement number quads left
 565         addq $5,8,$5            # E : Inc address
 566         bne $3,loop_w           # U : more?
 567
 568 no_quad_w:
 569         /*
 570          * Write 0..7 trailing bytes.
              * $5 is quad aligned here and $6 is the end address, so the low three
              * bits of $6 equal the trailing byte count in $18.
 571          */
 572         nop                     # E :
 573         beq $18,end_w           # U : All done?
 574         ldq $7,0($5)            # L : fetch the final quadword
 575         mskqh $7,$6,$2          # U : Mask final quad (clear its low bytes, below the end offset)
 576
 577         insqh $17,$6,$4         # U : New bits (pattern in the bytes below the end offset)
 578         bis $2,$4,$1            # E : Put it all together
 579         stq $1,0($5)            # L : And back to memory
 580         ret $31,($26),1         # L0 :
 581
 582 within_quad_w:
              /*
               * Start and end address lie in the same quadword: read it, replace
               * only the bytes from the start offset up to the end offset, write
               * it back, then fall through to end_w.
               */
 583         ldq_u $1,0($16)         # L : fetch the old quadword
 584         insql $17,$16,$2        # U : New bits
 585         mskql $1,$16,$4         # U : Clear old (keep bytes below the start offset)
 586         bis $2,$4,$2            # E : New result
 587
 588         mskql $2,$6,$4          # U : keep the merged bytes below the end offset
 589         mskqh $1,$6,$2          # U : keep the old bytes at and above the end offset
 590         bis $2,$4,$1            # E : combine both halves
 591         stq_u $1,0($16)         # L : write the quadword back
 592
 593 end_w:
 594         nop
 595         nop
 596         nop
 597         ret $31,($26),1         # L0 :
 598
 599         .end __memset16
 600         EXPORT_SYMBOL(__memset16)
 601
 602 memset = ___memset              # memset is the same address as the ___memset entry point
 603 __memset = ___memset            # __memset is the same address as the ___memset entry point
 604         EXPORT_SYMBOL(memset)
 605         EXPORT_SYMBOL(__memset)