/*
 * Author: Anton Blanchard <anton@au.ibm.com>
 * Copyright 2015 IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>
#ifdef __LITTLE_ENDIAN__
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif
#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \

#define EXIT_VMX_OPS \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *  | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *  |0xbbbb10                         |0xbbbb20                         |0xbbbb30
 *                              ^
 *                              _vaddr (points at byte '0')
 *
 * _vmask is the mask generated by LVS.
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *    for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *    for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *    for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
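
/*
 * Illustrative C sketch of what LD_VSR_CROSS16B accomplishes (not the
 * implementation; the helper name and the use of memcpy() are assumptions
 * made for illustration).  To read 16 bytes starting at an address that is
 * not 16-byte aligned, fetch the two aligned 16-byte blocks covering it and
 * select the wanted bytes; the byte selection is what vperm does with the
 * mask generated by lvsl/lvsr:
 *
 *	#include <string.h>
 *
 *	static void load_cross_16b(const unsigned char *addr,
 *				   unsigned char out[16])
 *	{
 *		const unsigned char *base =
 *			(const unsigned char *)((unsigned long)addr & ~15UL);
 *		unsigned long off = (unsigned long)addr & 15;
 *		unsigned char qw[32];
 *
 *		memcpy(qw, base, 16);		// 1st aligned QW (_v1st_qw)
 *		memcpy(qw + 16, base + 16, 16);	// 2nd aligned QW (_v2nd_qw)
 *		memcpy(out, qw + off, 16);	// the "permute" (_v_res)
 *	}
 */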
/*
 * There are 2 categories of memcmp() code paths:
 * 1) src/dst have the same offset relative to an 8-byte boundary. The
 *    handlers are named .Lsameoffset_xxxx.
 * 2) src/dst have different offsets relative to an 8-byte boundary. The
 *    handlers are named .Ldiffoffset_xxxx.
 */
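
/*
 * Illustrative C sketch of how the two categories are told apart (not the
 * kernel code itself; the helper name is made up).  The choice only depends
 * on whether the two pointers share the same offset within an 8-byte word,
 * because only then can both be brought to 8-byte alignment by skipping the
 * same number of leading bytes:
 *
 *	static int same_offset(const void *s1, const void *s2)
 *	{
 *		return (((unsigned long)s1 ^ (unsigned long)s2) & 7) == 0;
 *	}
 */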
	/* Use the short loop if the src/dst addresses do not have the
	 * same offset relative to an 8-byte alignment boundary.
	 */

	/* Fall back to the short loop if fewer than 8 bytes are to be
	 * compared at the aligned addresses.
	 */

	bne	.Ldiffoffset_8bytes_make_align_start

.Lsameoffset_8bytes_make_align_start:
	/* Compare the leading bytes that are not 8-byte aligned so that
	 * the rest of the comparison can run on 8-byte aligned data.
	 */

	/* Try to compare the first doubleword, which is not 8-byte aligned:
	 * load the doubleword at (src & ~7UL) and shift it left by the
	 * appropriate number of bits before the comparison.
	 */
	beq	.Lsameoffset_8bytes_aligned
	bne	cr0,.LcmpAB_lightweight
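
/*
 * Illustrative C sketch of the head comparison above (not the kernel code;
 * load_be64_aligned() is a made-up helper that loads 8 bytes at an aligned
 * address as a big-endian value, which is what the LD macro arranges, and
 * the "fewer than 8 bytes in total" case is assumed to have been sent to
 * the short loop already).  Shifting left by 8 * (addr & 7) bits discards
 * the unwanted leading bytes, and the zero-filled tails match on both
 * sides, so only the head bytes take part in the compare:
 *
 *	unsigned long load_be64_aligned(unsigned long addr);
 *
 *	static int cmp_head_to_8b(unsigned long s1, unsigned long s2)
 *	{
 *		unsigned long bits = (s1 & 7) * 8;	// same as (s2 & 7) * 8
 *		unsigned long a = load_be64_aligned(s1 & ~7UL) << bits;
 *		unsigned long b = load_be64_aligned(s2 & ~7UL) << bits;
 *
 *		if (a == b)
 *			return 0;
 *		return a > b ? 1 : -1;
 *	}
 */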
.Lsameoffset_8bytes_aligned:
	/* Now we are 8-byte aligned.
	 * Use the .Llong loop if 32 or more bytes are left to compare.
	 */

	/* Compare 1 to 31 bytes; at least the r3 address is 8-byte aligned now */
	ble	cr5,.Lcmp_rest_lt8bytes

	/* Handle 8 to 31 bytes */
	bne	cr0,.LcmpAB_lightweight

	/* Here fewer than 8 bytes are left to compare; at least the s1
	 * address is 8-byte aligned.
	 * The next doublewords are loaded and shifted right by the
	 * appropriate number of bits before the comparison.
	 */
	bne	cr0,.LcmpAB_lightweight
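
/*
 * Illustrative C sketch of the tail comparison above (not the kernel code;
 * load_be64() is a made-up helper standing in for the loads done by the
 * asm, returning the 8 bytes at an address as a big-endian value, and n is
 * assumed to be 1..7).  Shifting right by (8 - n) * 8 bits drops the bytes
 * beyond the requested length before comparing:
 *
 *	unsigned long load_be64(unsigned long addr);
 *
 *	static int cmp_tail_lt8(unsigned long s1, unsigned long s2,
 *				unsigned long n)
 *	{
 *		unsigned long bits = (8 - n) * 8;
 *		unsigned long a = load_be64(s1) >> bits;
 *		unsigned long b = load_be64(s2) >> bits;
 *
 *		if (a == b)
 *			return 0;
 *		return a > b ? 1 : -1;
 *	}
 */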
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Try the vmx loop if the length is 4K bytes or more */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

	/* At least the s1 address is 8-byte aligned */

.LcmpAB_lightweight:	/* skip restoring the non-volatile GPRs */

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Enter with src/dst addresses that have the same offset relative
	 * to an 8-byte alignment boundary.
	 *
	 * The optimization is based on the following fact: memcmp() tends
	 * to find a difference early, within the first 32 bytes.
	 * Before using VMX instructions, which cost a save/restore of the
	 * 32 x 128-bit VMX registers, compare the first 32 bytes so that
	 * the ~80% of calls which fail early are caught here.
	 */
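
/*
 * Illustrative C sketch of the pre-check above (not the kernel code;
 * cmp_8bytes() is a made-up helper standing in for the doubleword compare
 * done by the asm).  Only when all of the first 32 bytes match do we pay
 * for enabling VMX and saving/restoring the vector registers:
 *
 *	for (i = 0; i < 4; i++) {		// 4 x 8 = 32 bytes
 *		int rc = cmp_8bytes(s1 + 8 * i, s2 + 8 * i);
 *		if (rc)
 *			return rc;		// ~80% of calls end here
 *	}
 *	// only now is it worth entering the VMX loop
 */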
.Lsameoffset_prechk_32B_loop:
	bne	cr0,.LcmpAB_lightweight
	bdnz	.Lsameoffset_prechk_32B_loop

	beq	cr1,.Llong_novmx_cmp

	/* Need to check whether r4 has the same offset as r3 relative
	 * to a 16-byte boundary.
	 */
	bne	.Ldiffoffset_vmx_cmp_start

	/* The length is at least 4KB. Align further to 16 bytes. */

	/* Save and restore cr0 */
	b	.LcmpAB_lightweight

	/* Compare 32 bytes in each loop iteration */
	VCMPEQUD_RC(v0,v0,v1)
	VCMPEQUD_RC(v0,v0,v1)

	/* The difference is in the last 16 bytes */
	bne	cr0,.LcmpAB_lightweight
	bne	cr0,.LcmpAB_lightweight
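
/*
 * Illustrative C sketch of the 32-bytes-per-iteration compare body using
 * AltiVec/VMX intrinsics (an assumption for illustration; the asm checks
 * the vcmpequd. result in cr6 instead, and once a mismatch is seen it
 * recomputes the ordering with 8-byte compares).  Both pointers are
 * 16-byte aligned here, as the preceding alignment code arranges:
 *
 *	#include <altivec.h>
 *
 *	// Returns 0 if the next 32 bytes are equal, nonzero otherwise.
 *	static int cmp_32bytes_vmx(const unsigned char *a,
 *				   const unsigned char *b)
 *	{
 *		vector unsigned char va0 = vec_ld(0, a);
 *		vector unsigned char vb0 = vec_ld(0, b);
 *		vector unsigned char va1 = vec_ld(16, a);
 *		vector unsigned char vb1 = vec_ld(16, b);
 *
 *		return !(vec_all_eq(va0, vb0) && vec_all_eq(va1, vb1));
 *	}
 */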
.Ldiffoffset_8bytes_make_align_start:
	/* Now try to align s1 to 8 bytes */
	beq	.Ldiffoffset_align_s1_8bytes

	LD	rB,0,r4			/* unaligned load */
	bne	cr0,.LcmpAB_lightweight

.Ldiffoffset_align_s1_8bytes:
	/* Now s1 is 8-byte aligned. */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* Only do vmx ops when the size is 4K bytes or more */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
	ble	cr5,.Lcmp_lt32bytes
#ifdef CONFIG_ALTIVEC

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* Perform a 32-byte pre-check before
	 * enabling VMX operations.
	 */

.Ldiffoffset_prechk_32B_loop:
	bne	cr0,.LcmpAB_lightweight
	bdnz	.Ldiffoffset_prechk_32B_loop

	beq	cr1,.Ldiffoffset_novmx_cmp
.Ldiffoffset_vmx_cmp_start:
	/* First, try to align r3 to 16 bytes */
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

.Ldiffoffset_vmx_s1_16bytes_align:
	/* Now s1 is 16-byte aligned */

	srdi	r6,r5,5		/* 32 bytes per loop iteration */

.Ldiffoffset_vmx_32bytesloop:
	/* the first quadword of r4 was saved in v6 */

	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	bdnz	.Ldiffoffset_vmx_32bytesloop
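
/*
 * Illustrative C sketch of the software pipelining in the loop above (not
 * the kernel code; load16_aligned() is a made-up helper standing in for
 * lvx, and memcpy()/memcmp() stand in for the vperm/vcmpequb work).  Since
 * consecutive unaligned 16-byte reads from r4 share one aligned quadword,
 * each iteration loads only one new aligned block and reuses the previous
 * one, just as the newly loaded quadword becomes the next iteration's
 * "_v1st_qw" here:
 *
 *	// s1 is 16-byte aligned, s2 is not (off != 0).
 *	static int cmp_blocks_diffoffset(const unsigned char *s1,
 *					 const unsigned char *s2,
 *					 unsigned long nblocks)
 *	{
 *		unsigned long off = (unsigned long)s2 & 15;
 *		const unsigned char *base = s2 - off;
 *		unsigned char prev[16], next[16], cur[16];
 *		unsigned long i;
 *
 *		load16_aligned(prev, base);
 *		for (i = 0; i < nblocks; i++) {
 *			load16_aligned(next, base + 16 * (i + 1));
 *			memcpy(cur, prev + off, 16 - off);
 *			memcpy(cur + 16 - off, next, off);
 *			if (memcmp(s1 + 16 * i, cur, 16) != 0)
 *				return 1;	// difference in this block
 *			memcpy(prev, next, 16);
 *		}
 *		return 0;
 *	}
 */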
.Ldiffoffset_vmx_diff_found:

	/* In any case, the difference will be within the next 16 bytes */

EXPORT_SYMBOL(memcmp)