arch/powerpc/lib/memcmp_64.S

   1 /*
   2  * Author: Anton Blanchard <anton@au.ibm.com>
   3  * Copyright 2015 IBM Corporation.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License
   7  * as published by the Free Software Foundation; either version
   8  * 2 of the License, or (at your option) any later version.
   9  */
  10 #include <asm/ppc_asm.h>
  11 #include <asm/export.h>
  12 #include <asm/ppc-opcode.h>
  13
   14 #define off8    r6      /* index register; loaded with 8 before it is used as an offset */
   15 #define off16   r7      /* index register; loaded with 16 before it is used as an offset */
   16 #define off24   r8      /* index register; loaded with 24 before it is used as an offset */
   17
   18 #define rA      r9      /* rA/rB: the s1/s2 doubleword pair being compared */
   19 #define rB      r10
   20 #define rC      r11     /* also receives the byte difference in .Lshort */
   21 #define rD      r27     /* rD-rH are r27-r31: the 32-byte loop stores them */
   22 #define rE      r28     /* below r1 before using them and reloads them at   */
   23 #define rF      r29     /* .Ltail / .Lout                                   */
   24 #define rG      r30
   25 #define rH      r31
  26
   27 #ifdef __LITTLE_ENDIAN__
   28 #define LH      lhbrx   /* byte-reversing loads: the lowest-addressed byte */
   29 #define LW      lwbrx   /* ends up most significant, so the unsigned       */
   30 #define LD      ldbrx   /* compares order data by its first differing byte */
   31 #define LVS     lvsr
   32 #define VPERM(_VRT,_VRA,_VRB,_VRC) \
   33         vperm _VRT,_VRB,_VRA,_VRC   /* A and B are swapped relative to the big-endian variant */
   34 #else
   35 #define LH      lhzx    /* plain indexed loads on big endian */
   36 #define LW      lwzx
   37 #define LD      ldx
   38 #define LVS     lvsl
   39 #define VPERM(_VRT,_VRA,_VRB,_VRC) \
   40         vperm _VRT,_VRA,_VRB,_VRC
   41 #endif
  42
   43 #define VMX_THRESH 4096 /* lengths of at least this many bytes try the VMX paths */
   44 #define ENTER_VMX_OPS   /* call enter_vmx_ops; cr1 = (result == 0); r3-r5 and LR are preserved */ \
   45         mflr    r0;     /* r0 = LR, which the bl below overwrites */ \
   46         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); /* park r3-r5 in slots of the frame pushed below */ \
   47         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
   48         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
   49         std     r0,16(r1); /* LR is kept at 16(r1) of the current frame */ \
   50         stdu    r1,-STACKFRAMESIZE(r1); /* push a new frame */ \
   51         bl      enter_vmx_ops; \
   52         cmpwi   cr1,r3,0; /* test the return value before r3 is reloaded */ \
   53         ld      r0,STACKFRAMESIZE+16(r1); \
   54         ld      r3,STK_REG(R31)(r1); \
   55         ld      r4,STK_REG(R30)(r1); \
   56         ld      r5,STK_REG(R29)(r1); \
   57         addi    r1,r1,STACKFRAMESIZE; /* pop the frame */ \
   58         mtlr    r0
  59
   60 #define EXIT_VMX_OPS /* call exit_vmx_ops; r3-r5 and LR are preserved, the return value is ignored */ \
   61         mflr    r0; /* r0 = LR, which the bl below overwrites */ \
   62         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1); /* park r3-r5 in slots of the frame pushed below */ \
   63         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
   64         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
   65         std     r0,16(r1); /* LR is kept at 16(r1) of the current frame */ \
   66         stdu    r1,-STACKFRAMESIZE(r1); /* push a new frame */ \
   67         bl      exit_vmx_ops; \
   68         ld      r0,STACKFRAMESIZE+16(r1); \
   69         ld      r3,STK_REG(R31)(r1); \
   70         ld      r4,STK_REG(R30)(r1); \
   71         ld      r5,STK_REG(R29)(r1); \
   72         addi    r1,r1,STACKFRAMESIZE; /* pop the frame */ \
   73         mtlr    r0
  74
   75 /*
   76  * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned to
   77  * a 16 bytes boundary, and permutes the result with the 1st 16 bytes.
   78  *
   79  *    |  y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
   80  *    ^                                  ^                                 ^
   81  * 0xbbbb10                          0xbbbb20                          0xbbbb30
   82  *                                 ^
   83  *                                _vaddr
   84  *
   85  * _vaddr is the unaligned address; off16 must already hold 16.
   86  * _vmask is the mask generated by LVS
   87  * _v1st_qw is the 1st aligned QW of current addr which is already loaded.
   88  *   for example: 0xyyyyyyyyyyyyy012 for big endian
   89  * _v2nd_qw is the 2nd aligned QW of cur _vaddr to be loaded.
   90  *   for example: 0x3456789abcdefzzz for big endian
   91  * The permute result is saved in _v_res.
   92  *   for example: 0x0123456789abcdef for big endian.
   93  */
   94 #define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
   95         lvx     _v2nd_qw,_vaddr,off16; /* overwrites _v2nd_qw with the QW at _vaddr + 16 */ \
   96         VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
  97
   98 /*
   99  * memcmp(s1 = r3, s2 = r4, n = r5): r3 returns 0 when the buffers are equal,
  100  * otherwise a positive/negative value decided by the first differing byte.
  101  * There are 2 categories of handlers:
  102  * 1) src/dst have the same offset to the 8 bytes boundary (.Lsameoffset_xxxx)
  103  * 2) src/dst have different offsets to the 8 bytes boundary (.Ldiffoffset_xxxx)
  104  */
  105 _GLOBAL_TOC(memcmp)
  106         cmpdi   cr1,r5,0        /* cr1: is the length zero? */
  107
  108         /* cr0 records whether the src/dst addresses share the same
  109          * offset to the 8 bytes boundary; it is consumed at .Lno_short.
  110          */
  111         xor     r6,r3,r4
  112         andi.   r6,r6,7
  113
  114         /* Lengths of 1 ~ 7 bytes fall through into the byte loop
  115          * at .Lshort whatever the alignment is.
  116          */
  117         cmpdi   cr6,r5,7
  118
  119         beq     cr1,.Lzero
  120         bgt     cr6,.Lno_short
 121
  122 .Lshort:                /* byte-by-byte compare of r5 bytes, unrolled four times */
  123         mtctr   r5              /* CTR = number of bytes left to compare */
  124 1:      lbz     rA,0(r3)
  125         lbz     rB,0(r4)
  126         subf.   rC,rB,rA        /* rC = s1 byte - s2 byte, cr0 set from it */
  127         bne     .Lnon_zero
  128         bdz     .Lzero          /* CTR reached 0: every byte matched */
  129
  130         lbz     rA,1(r3)
  131         lbz     rB,1(r4)
  132         subf.   rC,rB,rA
  133         bne     .Lnon_zero
  134         bdz     .Lzero
  135
  136         lbz     rA,2(r3)
  137         lbz     rB,2(r4)
  138         subf.   rC,rB,rA
  139         bne     .Lnon_zero
  140         bdz     .Lzero
  141
  142         lbz     rA,3(r3)
  143         lbz     rB,3(r4)
  144         subf.   rC,rB,rA
  145         bne     .Lnon_zero
  146
  147         addi    r3,r3,4         /* advance both pointers past the four bytes just compared */
  148         addi    r4,r4,4
  149
  150         bdnz    1b              /* the fourth byte's CTR decrement happens here */
  151
  152 .Lzero:                 /* buffers are equal: return 0 */
  153         li      r3,0
  154         blr
 155
  156 .Lno_short:             /* length is 8 bytes or more */
  157         dcbt    0,r3            /* cache-touch hints for both buffers */
  158         dcbt    0,r4
  159         bne     .Ldiffoffset_8bytes_make_align_start   /* cr0 still holds (r3 ^ r4) & 7 from the entry code */
  160
  161
  162 .Lsameoffset_8bytes_make_align_start:
  163         /* attempt to compare bytes not aligned with 8 bytes so that
  164          * rest comparison can run based on 8 bytes alignment.
  165          */
  166         andi.   r6,r3,7         /* cr0: is s1 (and therefore s2) already 8 bytes aligned? */
  167
  168         /* Try to compare the first double word which is not 8 bytes aligned:
  169          * load the first double word at (src & ~7UL) and shift left appropriate
  170          * bits before comparison.
  171          */
  172         rlwinm  r6,r3,3,26,28   /* r6 = (r3 & 7) * 8, the offset as a bit count; cr0 untouched */
  173         beq     .Lsameoffset_8bytes_aligned
  174         clrrdi  r3,r3,3         /* round both pointers down to the 8 bytes boundary */
  175         clrrdi  r4,r4,3
  176         LD      rA,0,r3
  177         LD      rB,0,r4
  178         sld     rA,rA,r6        /* shift out the bytes that precede the buffers */
  179         sld     rB,rB,r6
  180         cmpld   cr0,rA,rB
  181         srwi    r6,r6,3         /* r6 back to a byte offset (0..7) */
  182         bne     cr0,.LcmpAB_lightweight
  183         subfic  r6,r6,8         /* r6 = 8 - offset = bytes just compared */
  184         subf.   r5,r6,r5        /* r5 -= r6, cr0 set from the remaining length */
  185         addi    r3,r3,8
  186         addi    r4,r4,8
  187         beq     .Lzero          /* nothing left to compare */
 188
  189 .Lsameoffset_8bytes_aligned:
  190         /* now we are aligned with 8 bytes.
  191          * Use .Llong loop if left cmp bytes are equal or greater than 32B.
  192          */
  193         cmpdi   cr6,r5,31
  194         bgt     cr6,.Llong
  195
  196 .Lcmp_lt32bytes:
  197         /* compare 1 ~ 31 bytes, at least r3 addr is 8 bytes aligned now */
  198         cmpdi   cr5,r5,7
  199         srdi    r0,r5,3         /* r0 = number of whole doublewords */
  200         ble     cr5,.Lcmp_rest_lt8bytes
  201
  202         /* handle 8 ~ 31 bytes */
  203         clrldi  r5,r5,61        /* r5 = r5 & 7, bytes left after the doubleword loop */
  204         mtctr   r0
  205 2:
  206         LD      rA,0,r3
  207         LD      rB,0,r4
  208         cmpld   cr0,rA,rB
  209         addi    r3,r3,8
  210         addi    r4,r4,8
  211         bne     cr0,.LcmpAB_lightweight
  212         bdnz    2b
  213
  214         cmpwi   r5,0            /* no partial doubleword left: the buffers are equal */
  215         beq     .Lzero
 216
 217 .Lcmp_rest_lt8bytes:
 218         /* Here we have only less than 8 bytes to compare with. at least s1
 219          * Address is aligned with 8 bytes.
 220          * The next double words are load and shift right with appropriate
 221          * bits.
 222          */
 223         subfic  r6,r5,8
 224         slwi    r6,r6,3
 225         LD      rA,0,r3
 226         LD      rB,0,r4
 227         srd     rA,rA,r6
 228         srd     rB,rB,r6
 229         cmpld   cr0,rA,rB
 230         bne     cr0,.LcmpAB_lightweight
 231         b       .Lzero
 232
  233 .Lnon_zero:             /* exit of the byte loop on a mismatch */
  234         mr      r3,rC           /* return the byte difference computed by subf. */
  235         blr
 236
  237 .Llong:
  238 #ifdef CONFIG_ALTIVEC
  239 BEGIN_FTR_SECTION
  240         /* Try to use vmx loop if length is equal or greater than 4K */
  241         cmpldi  cr6,r5,VMX_THRESH
  242         bge     cr6,.Lsameoffset_vmx_cmp
  243 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  244
  245 .Llong_novmx_cmp:
  246 #endif
  247         /* At least s1 addr is aligned with 8 bytes; each iteration handles 32 bytes as the pairs A/B, C/D, E/F and G/H */
  248         li      off8,8
  249         li      off16,16
  250         li      off24,24
  251         /* rD-rH are r27-r31: store them below r1 first (no stack frame is created) */
  252         std     r31,-8(r1)
  253         std     r30,-16(r1)
  254         std     r29,-24(r1)
  255         std     r28,-32(r1)
  256         std     r27,-40(r1)
  257
  258         srdi    r0,r5,5         /* CTR = number of whole 32-byte groups */
  259         mtctr   r0
  260         andi.   r5,r5,31        /* r5 = leftover bytes (0..31), handled at .Ltail */
  261
  262         LD      rA,0,r3         /* load the first 32-byte group */
  263         LD      rB,0,r4
  264
  265         LD      rC,off8,r3
  266         LD      rD,off8,r4
  267
  268         LD      rE,off16,r3
  269         LD      rF,off16,r4
  270
  271         LD      rG,off24,r3
  272         LD      rH,off24,r4
  273         cmpld   cr0,rA,rB       /* result fields: cr0 = A/B, cr1 = C/D, cr6 = E/F, cr7 = G/H */
  274
  275         addi    r3,r3,32
  276         addi    r4,r4,32
  277
  278         bdz     .Lfirst32       /* only one group: its remaining compares are done there */
  279
  280         LD      rA,0,r3         /* load group 2 while the compares of group 1 are issued */
  281         LD      rB,0,r4
  282         cmpld   cr1,rC,rD
  283
  284         LD      rC,off8,r3
  285         LD      rD,off8,r4
  286         cmpld   cr6,rE,rF
  287
  288         LD      rE,off16,r3
  289         LD      rF,off16,r4
  290         cmpld   cr7,rG,rH
  291         bne     cr0,.LcmpAB     /* group 1 A/B differ */
  292
  293         LD      rG,off24,r3
  294         LD      rH,off24,r4
  295         cmpld   cr0,rA,rB       /* group 2 A/B */
  296         bne     cr1,.LcmpCD     /* group 1 C/D differ */
  297
  298         addi    r3,r3,32
  299         addi    r4,r4,32
  300
  301         bdz     .Lsecond32      /* two groups only: drain the pending compares */
  302
  303         .balign 16
  304         /* steady state: each load is paired with a compare of data loaded earlier and a branch on an older result */
  305 1:      LD      rA,0,r3
  306         LD      rB,0,r4
  307         cmpld   cr1,rC,rD
  308         bne     cr6,.LcmpEF
  309
  310         LD      rC,off8,r3
  311         LD      rD,off8,r4
  312         cmpld   cr6,rE,rF
  313         bne     cr7,.LcmpGH
  314
  315         LD      rE,off16,r3
  316         LD      rF,off16,r4
  317         cmpld   cr7,rG,rH
  318         bne     cr0,.LcmpAB
  319
  320         LD      rG,off24,r3
  321         LD      rH,off24,r4
  322         cmpld   cr0,rA,rB
  323         bne     cr1,.LcmpCD
  324
  325         addi    r3,r3,32
  326         addi    r4,r4,32
  327
  328         bdnz    1b
 329
  330 .Lsecond32:             /* drain: finish the compares of the last group and test every pending result */
  331         cmpld   cr1,rC,rD
  332         bne     cr6,.LcmpEF     /* E/F of the previous group */
  333
  334         cmpld   cr6,rE,rF
  335         bne     cr7,.LcmpGH     /* G/H of the previous group */
  336
  337         cmpld   cr7,rG,rH
  338         bne     cr0,.LcmpAB     /* from here on: results of the last group */
  339
  340         bne     cr1,.LcmpCD
  341         bne     cr6,.LcmpEF
  342         bne     cr7,.LcmpGH
  343
  344 .Ltail:                 /* all 32-byte groups matched: reload r27-r31, then handle the leftover bytes */
  345         ld      r31,-8(r1)
  346         ld      r30,-16(r1)
  347         ld      r29,-24(r1)
  348         ld      r28,-32(r1)
  349         ld      r27,-40(r1)
  350
  351         cmpdi   r5,0
  352         beq     .Lzero
  353         b       .Lshort         /* r5 = 1..31 leftover bytes go through the byte loop */
  354
  355 .Lfirst32:              /* exactly one 32-byte group: A/B is already in cr0 */
  356         cmpld   cr1,rC,rD
  357         cmpld   cr6,rE,rF
  358         cmpld   cr7,rG,rH
  359
  360         bne     cr0,.LcmpAB
  361         bne     cr1,.LcmpCD
  362         bne     cr6,.LcmpEF
  363         bne     cr7,.LcmpGH
  364
  365         b       .Ltail
 366
  367 .LcmpAB:                /* mismatch recorded in cr0: return 1 if s1 > s2 (unsigned), else -1 */
  368         li      r3,1
  369         bgt     cr0,.Lout
  370         li      r3,-1
  371         b       .Lout
  372
  373 .LcmpCD:                /* same for the result held in cr1 */
  374         li      r3,1
  375         bgt     cr1,.Lout
  376         li      r3,-1
  377         b       .Lout
  378
  379 .LcmpEF:                /* same for the result held in cr6 */
  380         li      r3,1
  381         bgt     cr6,.Lout
  382         li      r3,-1
  383         b       .Lout
  384
  385 .LcmpGH:                /* same for the result held in cr7; falls through into .Lout */
  386         li      r3,1
  387         bgt     cr7,.Lout
  388         li      r3,-1
  389
  390 .Lout:                  /* reload r27-r31 stored by the 32-byte loop, then return r3 */
  391         ld      r31,-8(r1)
  392         ld      r30,-16(r1)
  393         ld      r29,-24(r1)
  394         ld      r28,-32(r1)
  395         ld      r27,-40(r1)
  396         blr
  397
  398 .LcmpAB_lightweight:   /* skip NV GPRS restore */
  399         li      r3,1
  400         bgtlr                   /* cr0 says s1 > s2: return 1 */
  401         li      r3,-1
  402         blr
 403
  404 #ifdef CONFIG_ALTIVEC
  405 .Lsameoffset_vmx_cmp:
  406         /* Enter with src/dst addrs having the same offset to the 8 bytes
  407          * align boundary.
  408          *
  409          * There is an optimization based on following fact: memcmp()
  410          * is prone to fail early at the first 32 bytes.
  411          * Before applying VMX instructions which will lead to 32x128bits
  412          * VMX regs load/restore penalty, we compare the first 32 bytes
  413          * so that we can catch the ~80% fail cases.
  414          */
  415
  416         li      r0,4            /* 4 doublewords = the 32 bytes pre-check */
  417         mtctr   r0
  418 .Lsameoffset_prechk_32B_loop:
  419         LD      rA,0,r3
  420         LD      rB,0,r4
  421         cmpld   cr0,rA,rB
  422         addi    r3,r3,8
  423         addi    r4,r4,8
  424         bne     cr0,.LcmpAB_lightweight
  425         addi    r5,r5,-8
  426         bdnz    .Lsameoffset_prechk_32B_loop
  427
  428         ENTER_VMX_OPS
  429         beq     cr1,.Llong_novmx_cmp    /* enter_vmx_ops returned 0: use the non-VMX loop */
  430
  431 3:
  432         /* need to check whether r4 has the same offset with r3
  433          * for 16 bytes boundary.
  434          */
  435         xor     r0,r3,r4
  436         andi.   r0,r0,0xf
  437         bne     .Ldiffoffset_vmx_cmp_start
  438
  439         /* len is no less than 4KB. Need to align with 16 bytes further.
  440          */
  441         andi.   rA,r3,8         /* cr0 eq: r3 is already 16 bytes aligned (it is 8 bytes aligned here) */
  442         LD      rA,0,r3         /* loaded ahead of the branch; does not change cr0 */
  443         beq     4f
  444         LD      rB,0,r4
  445         cmpld   cr0,rA,rB
  446         addi    r3,r3,8
  447         addi    r4,r4,8
  448         addi    r5,r5,-8
  449
  450         beq     cr0,4f
  451         /* save and restore cr0 */
  452         mfocrf  r5,128          /* r5 (preserved by EXIT_VMX_OPS) carries cr0; the length is no longer needed */
  453         EXIT_VMX_OPS
  454         mtocrf  128,r5
  455         b       .LcmpAB_lightweight
  456
  457 4:
  458         /* compare 32 bytes for each loop */
  459         srdi    r0,r5,5         /* CTR = number of 32-byte iterations */
  460         mtctr   r0
  461         clrldi  r5,r5,59        /* r5 = r5 & 31, bytes left after the loop */
  462         li      off16,16
  463
  464 .balign 16
  465 5:
  466         lvx     v0,0,r3
  467         lvx     v1,0,r4
  468         VCMPEQUD_RC(v0,v0,v1)   /* presumably the record form of vcmpequd, reporting into cr6 - macro is defined outside this file */
  469         bnl     cr6,7f          /* taken unless cr6 reports the two quadwords as fully equal */
  470         lvx     v0,off16,r3
  471         lvx     v1,off16,r4
  472         VCMPEQUD_RC(v0,v0,v1)
  473         bnl     cr6,6f
  474         addi    r3,r3,32
  475         addi    r4,r4,32
  476         bdnz    5b
  477
  478         EXIT_VMX_OPS
  479         cmpdi   r5,0
  480         beq     .Lzero
  481         b       .Lcmp_lt32bytes
  482
  483 6:                      /* the difference is in the second 16 bytes of the iteration */
  484         addi    r3,r3,16
  485         addi    r4,r4,16
  486
  487 7:
  488         /* diff the last 16 bytes */
  489         EXIT_VMX_OPS
  490         LD      rA,0,r3
  491         LD      rB,0,r4
  492         cmpld   cr0,rA,rB
  493         li      off8,8
  494         bne     cr0,.LcmpAB_lightweight
  495
  496         LD      rA,off8,r3
  497         LD      rB,off8,r4
  498         cmpld   cr0,rA,rB
  499         bne     cr0,.LcmpAB_lightweight
  500         b       .Lzero
  501 #endif
 502
 503 .Ldiffoffset_8bytes_make_align_start:
 504         /* now try to align s1 with 8 bytes */
 505         rlwinm  r6,r3,3,26,28
 506         beq     .Ldiffoffset_align_s1_8bytes
 507
 508         clrrdi  r3,r3,3
 509         LD      rA,0,r3
 510         LD      rB,0,r4  /* unaligned load */
 511         sld     rA,rA,r6
 512         srd     rA,rA,r6
 513         srd     rB,rB,r6
 514         cmpld   cr0,rA,rB
 515         srwi    r6,r6,3
 516         bne     cr0,.LcmpAB_lightweight
 517
 518         subfic  r6,r6,8
 519         subf.   r5,r6,r5
 520         addi    r3,r3,8
 521         add     r4,r4,r6
 522
 523         beq     .Lzero
 524
  525 .Ldiffoffset_align_s1_8bytes:
  526         /* now s1 is aligned with 8 bytes. */
  527 #ifdef CONFIG_ALTIVEC
  528 BEGIN_FTR_SECTION
  529         /* only do vmx ops when the size equal or greater than 4K bytes */
  530         cmpdi   cr5,r5,VMX_THRESH
  531         bge     cr5,.Ldiffoffset_vmx_cmp
  532 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  533
  534 .Ldiffoffset_novmx_cmp:
  535 #endif
  536
  537         /* 1 ~ 31 bytes use the shared short path; anything longer uses the 32 bytes per iteration loop */
  538         cmpdi   cr5,r5,31
  539         ble     cr5,.Lcmp_lt32bytes
  540
  541 #ifdef CONFIG_ALTIVEC
  542         b       .Llong_novmx_cmp
  543 #else
  544         b       .Llong
  545 #endif
 546
  547 #ifdef CONFIG_ALTIVEC
  548 .Ldiffoffset_vmx_cmp:
  549         /* perform a 32 bytes pre-checking before
  550          * enable VMX operations.
  551          */
  552         li      r0,4            /* 4 doublewords = 32 bytes */
  553         mtctr   r0
  554 .Ldiffoffset_prechk_32B_loop:
  555         LD      rA,0,r3
  556         LD      rB,0,r4
  557         cmpld   cr0,rA,rB
  558         addi    r3,r3,8
  559         addi    r4,r4,8
  560         bne     cr0,.LcmpAB_lightweight
  561         addi    r5,r5,-8
  562         bdnz    .Ldiffoffset_prechk_32B_loop
  563
  564         ENTER_VMX_OPS
  565         beq     cr1,.Ldiffoffset_novmx_cmp      /* enter_vmx_ops returned 0: use the non-VMX path */
  566
  567 .Ldiffoffset_vmx_cmp_start:
  568         /* Firstly try to align r3 with 16 bytes */
  569         andi.   r6,r3,0xf       /* r6 = offset of r3 inside its 16 bytes block */
  570         li      off16,16
  571         beq     .Ldiffoffset_vmx_s1_16bytes_align
  572
  573         LVS     v3,0,r3         /* permute masks for the alignment of r3 and of r4 */
  574         LVS     v4,0,r4
  575
  576         lvx     v5,0,r3         /* 1st aligned quadword of each buffer */
  577         lvx     v6,0,r4
  578         LD_VSR_CROSS16B(r3,v3,v5,v7,v9)         /* v9  = 16 bytes starting at r3 */
  579         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)        /* v10 = 16 bytes starting at r4 */
  580
  581         VCMPEQUB_RC(v7,v9,v10)  /* presumably the record form of vcmpequb, reporting into cr6 - macro is defined outside this file */
  582         bnl     cr6,.Ldiffoffset_vmx_diff_found
  583
  584         subfic  r6,r6,16        /* r6 = bytes needed to reach the next 16 bytes boundary of r3 */
  585         subf    r5,r6,r5
  586         add     r3,r3,r6
  587         add     r4,r4,r6
  588
  589 .Ldiffoffset_vmx_s1_16bytes_align:
  590         /* now s1 is aligned with 16 bytes */
  591         lvx     v6,0,r4         /* 1st aligned quadword covering r4 */
  592         LVS     v4,0,r4
  593         srdi    r6,r5,5  /* loop for 32 bytes each */
  594         clrldi  r5,r5,59        /* r5 = r5 & 31, bytes left after the loop */
  595         mtctr   r6
  596
  597 .balign 16
  598 .Ldiffoffset_vmx_32bytesloop:
  599         /* the first qw of r4 was saved in v6 */
  600         lvx     v9,0,r3
  601         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
  602         VCMPEQUB_RC(v7,v9,v10)
  603         vor     v6,v8,v8        /* v6 = v8: the quadword just loaded is the 1st quadword of the next step */
  604         bnl     cr6,.Ldiffoffset_vmx_diff_found
  605
  606         addi    r3,r3,16
  607         addi    r4,r4,16
  608
  609         lvx     v9,0,r3
  610         LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
  611         VCMPEQUB_RC(v7,v9,v10)
  612         vor     v6,v8,v8
  613         bnl     cr6,.Ldiffoffset_vmx_diff_found
  614
  615         addi    r3,r3,16
  616         addi    r4,r4,16
  617
  618         bdnz    .Ldiffoffset_vmx_32bytesloop
  619
  620         EXIT_VMX_OPS
  621
  622         cmpdi   r5,0
  623         beq     .Lzero
  624         b       .Lcmp_lt32bytes
  625
  626 .Ldiffoffset_vmx_diff_found:
  627         EXIT_VMX_OPS
  628         /* anyway, the diff will appear in next 16 bytes */
  629         li      r5,16           /* let the doubleword path re-compare these 16 bytes to produce the result */
  630         b       .Lcmp_lt32bytes
  631
  632 #endif
  633 EXPORT_SYMBOL(memcmp)