newlib/libc/machine/arc64/memcmp.S

   1 /*
   2    Copyright (c) 2024, Synopsys, Inc. All rights reserved.
   3
   4    Redistribution and use in source and binary forms, with or without
   5    modification, are permitted provided that the following conditions are met:
   6
   7    1) Redistributions of source code must retain the above copyright notice,
   8    this list of conditions and the following disclaimer.
   9
  10    2) Redistributions in binary form must reproduce the above copyright notice,
  11    this list of conditions and the following disclaimer in the documentation
  12    and/or other materials provided with the distribution.
  13
  14    3) Neither the name of the Synopsys, Inc., nor the names of its contributors
  15    may be used to endorse or promote products derived from this software
  16    without specific prior written permission.
  17
  18    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  22    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28    POSSIBILITY OF SUCH DAMAGE.
  29 */
  30
  31 #include <sys/asm.h>
  32
  33 #if defined (__ARC64_ARCH64__)
  34
  35 ; R0: lhs
  36 ; R1: rhs
  37 ; R2: count
  38 ; ret (R0):
  39 ;   - lhs < rhs: <0
  40 ;   - lhs = rhs:  0
  41 ;   - lhs > rhs: >0
  42 ENTRY (memcmp)
  43         cmpl      r2, 64
  44         bls.d     @.L_compare_1_bytes
  45         movl      r3, r0        ; "r0" will be used as return value
  46         ; If one is curious why the code below looks like the way it does,
  47         ; there is a documentation at the end of this file.
  48         lsrl      r12, r2, 5    ; counter for 32-byte chunks
  49         xor       r13, r13, r13 ; the mask showing inequal registers
  50         ldl.ab    r4,  [r3, +8]
  51         ldl.ab    r5,  [r1, +8]
  52 .L_compare_32_bytes:
  53         ldl.ab    r6,  [r3, +8]
  54         ldl.ab    r7,  [r1, +8]
  55         ldl.ab    r8,  [r3, +8]
  56         ldl.ab    r9,  [r1, +8]
  57         ldl.ab    r10, [r3, +8]
  58         ldl.ab    r11, [r1, +8]
  59         xorl.f    0, r4, r5
  60         xor.ne    r13, r13, 0b0001
  61         xorl.f    0, r6, r7
  62         xor.ne    r13, r13, 0b0010
  63         xorl.f    0, r8, r9
  64         xor.ne    r13, r13, 0b0100
  65         xorl.f    0, r10, r11
  66         xor.ne    r13, r13, 0b1000
  67         brne      r13, 0, @.L_unequal_find
  68         ldl.ab    r4,  [r3, +8]
  69         dbnz.d    r12, @.L_compare_32_bytes
  70         ldl.ab    r5,  [r1, +8]
  71         ; Adjusting the pointers because of the extra loads in the end
  72         subl      r1, r1, 8
  73         subl      r3, r3, 8
  74         bmsk_s    r2, r2, 4     ; any remaining bytes to compare
  75 .L_compare_1_bytes:
  76         cmp       r2, 0
  77         jeq.d     [blink]
  78         xor_s     r0, r0, r0
  79         ldb.ab    r4, [r3, +1]
  80         ldb.ab    r5, [r1, +1]
  81 2:
  82         sub.f     r0, r4, r5
  83         jne.d     [blink]
  84         ldb.ab    r4, [r3, +1]
  85         dbnz.d    r2, @2b
  86         ldb.ab    r5, [r1, +1]  ; this load may read beyond the "count".
  87         j_s       [blink]
  88 ; At this point, we want to find the _first_ comparison that marked the
  89 ; inequality of "lhs" and "rhs".  The rest acts like a multiplexer:
  90 ;
  91 ; if r4  was not equal to r5  --> r1=r4,  r2=r5
  92 ; if r6  was not equal to r7  --> r1=r6,  r2=r7
  93 ; if r8  was not equal to r9  --> r1=r8,  r2=r9
  94 ; if r10 was not equal to r11 --> r1=r10, r2=r11
  95 ; find_different_byte(r1, r2)
  96 ;
  97 ; About the "bi [n]" (branch index) instruction:  This instruction alters
  98 ; next PC (program counter):
  99 ;
 100 ; next_pc = current_pc + n*4                n*4 is the same as n<<2
 101 ;
 102 ; In other words, it tells the processor to execute the n'th instruction
 103 ; from where we are (assuming all the next instructions are 4 bytes long).
 104 ;
 105 ; We used this to our benefit.  We made each "case" (unequal_r4r5,
 106 ; unequal_r5r6, ...) 16 bytes long (power of 2) and fed "bi" an index
 107 ; that is already multiplied by 4 (asl r13, r13, 2).  This translates
 108 ; into "bi [n]" jumping to 16-bytes slots.  The last slot we did not
 109 ; make 16 bytes long with "nop" because we don't need to address after
 110 ; it.
 111 .L_unequal_find:
 112         ffs       r13, r13
 113         asl       r13, r13, 2
 114         bi        [r13]
 115 .L_unequal_r4r5:
 116         movl      r1, r4
 117         b.d       @.L_diff_byte_in_regs
 118         movl      r2, r5
 119         nop
 120 .L_unequal_r6r7:
 121         movl      r1, r6
 122         b.d       @.L_diff_byte_in_regs
 123         movl      r2, r7
 124         nop
 125 .L_unequal_r8r9:
 126         movl      r1, r8
 127         b.d       @.L_diff_byte_in_regs
 128         movl      r2, r9
 129         nop
 130 .L_unequal_r10r11:
 131         movl      r1, r10
 132         movl      r2, r11
 133         ; fall-through
 134 ; If we're here, that means the two operands are not equal.
 135 ; 1) First we have to get a mask of their inequality through "xor"
 136 ; 2) Then, find the first bit position that they're different: "ffs"
 137 ; 3) Depending on the bit position, we want the whole byte containing
 138 ;    that bit, in both operands, to become the very first byte (least
 139 ;    significant byte), so that we can subtract one from another.
 140 ;    Below is an illustration of bit positions and how much we should
 141 ;    shift the numbers right:
 142 ;    bit position range : (in binary)       | shift right by : (in binary)
 143 ;    -------------------+-------------------+----------------+------------
 144 ;    [ 0, 7]            : (000000 - 000111) | lsr  0         : 000000
 145 ;    [ 8,15]            : (001000 - 001111) | lsr  8         : 001000
 146 ;    [16,23]            : (010000 - 010111) | lsr 16         : 010000
 147 ;    [24,31]            : (011000 - 011111) | lsr 24         : 011000
 148 ;    ...                : ...               | ...            : ...
 149 ;    [56,63]            : (111000 - 111111) | lsr 56         : 111000
 150 ;    We need to ignore the least 3 bits of "position" to get "shift right"
 151 ;    amount: "and 0x38, ..."
 152 ; 4) When the bytes are positioned at byte #0, mask out the rest of the
 153 ;    bytes and subtract the two operands: lhs - rhs
 154 .L_diff_byte_in_regs:
 155         xorl      r0, r1, r2    ; (1)
 156         ffsl      r0, r0        ; (2)
 157         and       r0, r0, 0x38  ; (3)
 158         lsrl      r1, r1, r0    ; (3)
 159         lsrl      r2, r2, r0    ; (3)
 160         bmsk_s    r1, r1, 7     ; (4)
 161         bmsk_s    r2, r2, 7     ; (4)
 162         j_s.d     [blink]
 163         subl      r0, r1, r2    ; (4)
 164 ENDFUNC (memcmp)
 165
 166 ; __ARC64_ARCH64__
 167 #endif
 168
 169 ; The loop at the heart of the "memcmp" function follows some specific
 170 ; logic and has gone through a few optimisation filters.  Knowing them
 171 ; will help understand the code better.
 172 ;
 173 ; The comparison logic
 174 ; --------------------
 175 ; In each loop, we compare 32 bytes of data from "lhs" and "rhs".  Those
 176 ; comparisons takes place by using 8 sets of registers:
 177 ;
 178 ; r4  == r5        xor.f 0, r4,  r5       lhs[i+0]  == rhs[i+0]
 179 ; r6  == r7        xor.f 0, r6,  r7       lhs[i+8]  == rhs[i+8]
 180 ; r8  == r9        xor.f 0, r8,  r9       lhs[i+16] == rhs[i+16]
 181 ; r10 == r11       xor.f 0, r10, r11      lhs[i+24] == rhs[i+32]
 182 ;
 183 ; The idea is to set a corresponding bit in r13 register for each
 184 ; comparison that fails.  The relation between the bits and the
 185 ; comparisons are:
 186 ;
 187 ; r13[0..63] = 0
 188 ; r13[0]     = 1 if r4  != r5
 189 ; r13[1]     = 1 if r6  != r7
 190 ; r13[2]     = 1 if r8  != r9
 191 ; r13[3]     = 1 if r10 != r11
 192 ;
 193 ; If r13 remains 0, the next possible iteration of the loop begins.
 194 ; If it is not 0 anymore, the algorithm will be interested in the
 195 ; lowest bit that is set to 1.  That is achieved by the "ffs"
 196 ; (find first set) instruction.
 197 ;
 198 ; The loop transformation
 199 ; -----------------------
 200 ; 1) At first, the loop looks like below:
 201 ;
 202 ;    .loop
 203 ;      ldl.ab  r4,  [r3, +8]
 204 ;      ldl.ab  r5,  [r1, +8]
 205 ;      ...
 206 ;      ldl.ab  r10, [r3, +8]
 207 ;      ldl.ab  r11, [r1, +8]
 208 ;      xorl.f  0, r4, r5
 209 ;      xor.ne  r13, r13, 0b0001
 210 ;      ...
 211 ;      xorl.f  0, r10, r11
 212 ;      xor.ne  r13, r13, 0b1000
 213 ;      brne    r13, 0, @.unequal_find
 214 ;      dbnz    r12, @.loop
 215 ;
 216 ; 2) "dbnz" instruction has a delay slot.  To make the code more
 217 ;    efficient, we can bring the first 2 instructions of the loop
 218 ;    to the end (they will be executed just before the next iteration
 219 ;    begins).  To make the logic of the program sound, those 2
 220 ;    instructions need to be duplicated before the loop start as well:
 221 ;
 222 ;      ldl.ab  r4,  [r3, +8]
 223 ;      ldl.ab  r5,  [r1, +8]
 224 ;    .loop
 225 ;      ldl.ab  r6, [r3, +8]
 226 ;      ldl.ab  r7, [r1, +8]
 227 ;      ...
 228 ;      ldl.ab  r10, [r3, +8]
 229 ;      ldl.ab  r11, [r1, +8]
 230 ;      xorl.f  0, r4, r5
 231 ;      xor.ne  r13, r13, 0b0001
 232 ;      ...
 233 ;      xorl.f  0, r10, r11
 234 ;      xor.ne  r13, r13, 0b1000
 235 ;      brne    r13, 0, @.unequal_find
 236 ;      ldl.ab  r4,  [r3, +8]
 237 ;      dbnz.d  r12, @.loop
 238 ;      ldl.ab  r5,  [r1, +8]
 239 ;
 240 ;    There is one more loose end to take care of:  At the last iteration
 241 ;    of the loop, there is an extra load into r4 and r5 registers while
 242 ;    incrementing the pointers (r3 and r1).  We have to correct for that
 243 ;    after the loop:
 244 ;
 245 ;    .loop:
 246 ;      ..
 247 ;      brne    r13, 0, @.unequal_find
 248 ;      ldl.ab  r4,  [r3, +8]
 249 ;      dbnz.d  r12, @.loop
 250 ;      ldl.ab  r5,  [r1, +8]
 251 ;      subl    r1, r1, 8
 252 ;      subl    r3, r3, 8
 253 ;
 254 ;    One last remark about NOT filling the delay slot of "brne" with
 255 ;    "ldl.ab r4, ...".  If the branch is taken, the rest of code that
 256 ;    is responsible for finding the differentiating bytes relies that
 257 ;    all 8 registers hold the comparison data of the loop.  Putting
 258 ;    "ldl.ab r4, ..." into the delay slot of "brne ..." would clobber
 259 ;    the "r4" register:
 260 ;
 261 ;    .loop:
 262 ;      ..
 263 ;      brne.d  r13, 0, @.unequal_find    --> this branch might be taken
 264 ;      ldl.ab  r4,  [r3, +8]             --> clobbers r4
 265 ;      dbnz.d  r12, @.loop
 266 ;      ldl.ab  r5,  [r1, +8]
 267 ;
 268 ;    Having "ldl.ab r4, ..." between "brne" and "dbnz" as two control flow
 269 ;    altering instructions is good enough.