/* SPDX-License-Identifier: GPL-2.0-or-later */
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>
#ifdef __LITTLE_ENDIAN__
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#define VMX_THRESH 4096
#define ENTER_VMX_OPS \
	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	stdu r1,-STACKFRAMESIZE(r1); \
	ld r0,STACKFRAMESIZE+16(r1); \
	ld r3,STK_REG(R31)(r1); \
	ld r4,STK_REG(R30)(r1); \
	ld r5,STK_REG(R29)(r1); \
	addi r1,r1,STACKFRAMESIZE; \
#define EXIT_VMX_OPS \
	std r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	stdu r1,-STACKFRAMESIZE(r1); \
	ld r0,STACKFRAMESIZE+16(r1); \
	ld r3,STK_REG(R31)(r1); \
	ld r4,STK_REG(R30)(r1); \
	ld r5,STK_REG(R29)(r1); \
	addi r1,r1,STACKFRAMESIZE; \
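/*
 * A minimal C-level sketch of the pattern the two wrapper macros above
 * implement, purely for illustration. The helper names below are
 * hypothetical; the assumption is that ENTER_VMX_OPS reports (via cr1)
 * whether VMX may be used and EXIT_VMX_OPS hands the unit back:
 *
 *	static int memcmp_vmx_path(const unsigned char *s1,
 *				   const unsigned char *s2, unsigned long len)
 *	{
 *		int ret;
 *
 *		if (!vmx_begin())		// hypothetical ENTER_VMX_OPS analogue
 *			return memcmp_scalar(s1, s2, len);	// hypothetical fallback
 *		ret = memcmp_vmx_loop(s1, s2, len);		// hypothetical core loop
 *		vmx_end();			// hypothetical EXIT_VMX_OPS analogue
 *		return ret;
 *	}
 *
 * The std/ld pairs in the macros save and restore r3/r4/r5 because those
 * argument registers are volatile across the call made inside the macro.
 */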
 * LD_VSR_CROSS16B loads the 2nd 16 bytes of _vaddr, which is not aligned to
 * a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *    | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 * 0xbbbb10                          0xbbbb20                          0xbbbb30
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx _v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
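/*
 * A minimal C sketch of the idea behind LD_VSR_CROSS16B, for illustration
 * only (byte-oriented here, whereas the macro works on vector registers):
 * an unaligned 16-byte value is assembled from the two aligned 16-byte
 * blocks that contain it.
 *
 *	// assumes 'addr' is not 16-byte aligned and that both enclosing
 *	// aligned 16-byte blocks are safe to read
 *	static void load_cross16b(const unsigned char *addr,
 *				  unsigned char out[16])
 *	{
 *		const unsigned char *base = (const unsigned char *)
 *					    ((unsigned long)addr & ~15UL);
 *		unsigned long off = (unsigned long)addr & 15;
 *		int i;
 *
 *		for (i = 0; i < 16; i++)	// vperm does this in one step
 *			out[i] = base[off + i];	// spans block 0 and block 1
 *	}
 *
 * In the asm, _v1st_qw already holds the first aligned block, lvx fetches
 * the second one into _v2nd_qw, and VPERM selects the wanted 16 bytes using
 * the LVS-generated mask described above.
 */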
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named like .Ldiffoffset_xxxx
	/* Check whether the src/dst addresses have the same offset
	 * relative to an 8-byte boundary.
	/* Fall back to the short loop if there are fewer than 8 bytes
	 * to compare.
	bne .Ldiffoffset_8bytes_make_align_start
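/*
 * The dispatch performed above, as a C sketch (illustrative only, matching
 * the visible branches):
 *
 *	unsigned long off_diff = ((unsigned long)s1 ^ (unsigned long)s2) & 7;
 *
 *	if (len < 8)
 *		goto short_cmp;		// byte-by-byte loop
 *	if (off_diff)
 *		goto diffoffset_path;	// .Ldiffoffset_xxxx handlers
 *	goto sameoffset_path;		// .Lsameoffset_xxxx handlers
 *
 * "Same offset" means both pointers reach an 8-byte boundary after the same
 * number of bytes, so once the head is handled both streams can be compared
 * with aligned double-word loads.
 */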
.Lsameoffset_8bytes_make_align_start:
	/* Attempt to compare the leading bytes that are not 8-byte aligned so
	 * that the rest of the comparison can run on 8-byte aligned data.
	/* Try to compare the first double word, which is not 8-byte aligned:
	 * load the first double word at (src & ~7UL) and shift left the
	 * appropriate number of bits before the comparison.
	beq .Lsameoffset_8bytes_aligned
	bne cr0,.LcmpAB_lightweight
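/*
 * A C sketch of the head comparison just described (illustrative only;
 * big-endian byte order is assumed for readability, the real code hides
 * the endian handling behind the LD macro):
 *
 *	// both pointers share the same low 3 bits, which are non-zero here
 *	unsigned long junk_bits = ((unsigned long)s1 & 7) * 8;
 *	unsigned long a = *(const unsigned long *)((unsigned long)s1 & ~7UL);
 *	unsigned long b = *(const unsigned long *)((unsigned long)s2 & ~7UL);
 *
 *	a <<= junk_bits;	// discard the bytes that precede the buffers
 *	b <<= junk_bits;
 *	if (a != b)		// first difference is in the head bytes
 *		return a > b ? 1 : -1;
 *
 * If the head bytes match, the pointers and the length are adjusted and the
 * comparison continues on 8-byte aligned data.
 */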
.Lsameoffset_8bytes_aligned:
	/* now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes are left to compare.
	/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
	ble cr5,.Lcmp_rest_lt8bytes
	/* handle 8 ~ 31 bytes */
	bne cr0,.LcmpAB_lightweight
 * Here we have less than 8 bytes to compare. At least s1 is aligned to
 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
 * page boundary, otherwise we might read past the end of the buffer and
 * trigger a page fault. We use 4K as the conservative minimum page
 * size. If we detect that case we go to the byte-by-byte loop.
 *
 * Otherwise the next double word is loaded from s1 and s2, and shifted
 * right to compare the appropriate bits.
	clrldi r6,r4,(64-12)	// r6 = r4 & 0xfff
	bne cr0,.LcmpAB_lightweight
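/*
 * A C sketch of the guard and the tail compare described above (illustrative
 * only; load8_be() is a hypothetical helper that loads 8 bytes in big-endian
 * byte order, as the LD macro does):
 *
 *	unsigned long s2_off = (unsigned long)s2 & 0xfff;   // offset within a 4K page
 *
 *	if (s2_off > 0x1000 - 8)
 *		goto byte_by_byte;	// an 8-byte load from s2 could fault
 *
 *	unsigned long drop_bits = (8 - len) * 8;	// 1 <= len <= 7 here
 *	unsigned long a = load8_be(s1) >> drop_bits;	// keep only 'len' bytes
 *	unsigned long b = load8_be(s2) >> drop_bits;
 *
 *	if (a != b)
 *		return a > b ? 1 : -1;
 *	return 0;
 */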
#ifdef CONFIG_ALTIVEC
	/* Try to use the vmx loop if the length is greater than or equal to 4K */
	cmpldi cr6,r5,VMX_THRESH
	bge cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
	/* At least the s1 addr is 8-byte aligned */
.LcmpAB_lightweight:	/* skip restoring the non-volatile GPRs */
#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter here with src/dst addrs that have the same offset relative
	 * to an 8-byte boundary.
	 * There is an optimization based on the following fact: memcmp()
	 * tends to fail early, within the first 32 bytes.
	 * Before using VMX instructions, which incur the save/restore penalty
	 * of 32 x 128-bit VMX registers, we compare the first 32 bytes so
	 * that we can catch ~80% of the failing cases.
.Lsameoffset_prechk_32B_loop:
	bne cr0,.LcmpAB_lightweight
	bdnz .Lsameoffset_prechk_32B_loop
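/*
 * A C sketch of the 32-byte pre-check above (illustrative only; load8_be()
 * is the same hypothetical big-endian-order load as in the earlier sketch):
 *
 *	int i;
 *
 *	for (i = 0; i < 4; i++) {	// the asm drives this with the ctr
 *		unsigned long a = load8_be(s1 + 8 * i);
 *		unsigned long b = load8_be(s2 + 8 * i);
 *
 *		if (a != b)			// most failing calls return here,
 *			return a > b ? 1 : -1;	// before any VMX state is touched
 *	}
 *	// the first 32 bytes match: now it is worth paying the VMX entry cost
 */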
	beq cr1,.Llong_novmx_cmp
	/* need to check whether r4 has the same offset as r3
	 * relative to a 16-byte boundary.
	bne .Ldiffoffset_vmx_cmp_start
	/* len is at least 4KB. Need to align further, to a 16-byte boundary.
	/* save and restore cr0 */
	b .LcmpAB_lightweight
	/* compare 32 bytes in each loop iteration */
	VCMPEQUD_RC(v0,v0,v1)
	VCMPEQUD_RC(v0,v0,v1)
	/* check the last 16 bytes for the difference */
	bne cr0,.LcmpAB_lightweight
	bne cr0,.LcmpAB_lightweight
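/*
 * One iteration of the 32-bytes-per-loop VMX compare, sketched with
 * <altivec.h> intrinsics for illustration (the kernel code uses the
 * record-form compares and the CR6 result bit instead of intrinsics):
 *
 *	#include <altivec.h>
 *
 *	// s1 and s2 are 16-byte aligned on this path
 *	vector unsigned char a0 = vec_ld(0,  s1), b0 = vec_ld(0,  s2);
 *	vector unsigned char a1 = vec_ld(16, s1), b1 = vec_ld(16, s2);
 *
 *	if (!vec_all_eq(a0, b0) || !vec_all_eq(a1, b1))
 *		goto find_differing_bytes;	// drop back to scalar compares
 *	s1 += 32;
 *	s2 += 32;
 *	len -= 32;
 */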
.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to an 8-byte boundary */
	beq .Ldiffoffset_align_s1_8bytes
	LD rB,0,r4	/* unaligned load */
	bne cr0,.LcmpAB_lightweight
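/*
 * A C sketch of the s1-alignment step above (illustrative only; big-endian
 * byte order assumed, and load8_be() is the hypothetical unaligned
 * big-endian-order load used in the earlier sketches):
 *
 *	unsigned long junk_bits = ((unsigned long)s1 & 7) * 8;
 *	unsigned long head = 8 - junk_bits / 8;		// bytes compared here
 *	unsigned long a = *(const unsigned long *)((unsigned long)s1 & ~7UL);
 *	unsigned long b = load8_be(s2);			// the unaligned load
 *
 *	a = (a << junk_bits) >> junk_bits;	// keep the bytes from s1 onwards
 *	b >>= junk_bits;			// keep the first 'head' bytes of s2
 *	if (a != b)
 *		return a > b ? 1 : -1;
 *	s1 += head;
 *	s2 += head;
 *	len -= head;
 */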
.Ldiffoffset_align_s1_8bytes:
	/* now s1 is 8-byte aligned. */
#ifdef CONFIG_ALTIVEC
	/* only do vmx ops when the size is greater than or equal to 4K bytes */
	cmpdi cr5,r5,VMX_THRESH
	bge cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
.Ldiffoffset_novmx_cmp:
	ble cr5,.Lcmp_lt32bytes
#ifdef CONFIG_ALTIVEC
#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before
	 * enabling VMX operations.
.Ldiffoffset_prechk_32B_loop:
	bne cr0,.LcmpAB_lightweight
	bdnz .Ldiffoffset_prechk_32B_loop
	beq cr1,.Ldiffoffset_novmx_cmp
.Ldiffoffset_vmx_cmp_start:
	/* First try to align r3 to a 16-byte boundary */
	beq .Ldiffoffset_vmx_s1_16bytes_align
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	bnl cr6,.Ldiffoffset_vmx_diff_found
.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is 16-byte aligned */
	srdi r6,r5,5	/* loop count: 32 bytes per iteration */
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	bnl cr6,.Ldiffoffset_vmx_diff_found
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	bnl cr6,.Ldiffoffset_vmx_diff_found
	bdnz .Ldiffoffset_vmx_32bytesloop
.Ldiffoffset_vmx_diff_found:
	/* either way, the difference will appear in the next 16 bytes */
EXPORT_SYMBOL(memcmp)