2 Copyright (c) 2024, Synopsys, Inc. All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1) Redistributions of source code must retain the above copyright notice,
8 this list of conditions and the following disclaimer.
10 2) Redistributions in binary form must reproduce the above copyright notice,
11 this list of conditions and the following disclaimer in the documentation
12 and/or other materials provided with the distribution.
14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors
15 may be used to endorse or promote products derived from this software
16 without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 POSSIBILITY OF SUCH DAMAGE.
33 #if defined (__ARC64_ARCH64__)
44 bls.d @.L_compare_1_bytes
45 movl r3, r0 ; "r0" will be used as return value
46 ; If one is curious why the code below looks like the way it does,
47 ; there is a documentation at the end of this file.
48 lsrl r12, r2, 5 ; counter for 32-byte chunks
49 xor r13, r13, r13 ; the mask showing inequal registers
60 xor.ne r13, r13, 0b0001
62 xor.ne r13, r13, 0b0010
64 xor.ne r13, r13, 0b0100
66 xor.ne r13, r13, 0b1000
67 brne r13, 0, @.L_unequal_find
69 dbnz.d r12, @.L_compare_32_bytes
71 ; Adjusting the pointers because of the extra loads in the end
74 bmsk_s r2, r2, 4 ; any remaining bytes to compare
86 ldb.ab r5, [r1, +1] ; this load may read beyond the "count".
88 ; At this point, we want to find the _first_ comparison that marked the
89 ; inequality of "lhs" and "rhs". The rest acts like a multiplexer:
91 ; if r4 was not equal to r5 --> r1=r4, r2=r5
92 ; if r6 was not equal to r7 --> r1=r6, r2=r7
93 ; if r8 was not equal to r9 --> r1=r8, r2=r9
94 ; if r10 was not equal to r11 --> r1=r10, r2=r11
95 ; find_different_byte(r1, r2)
97 ; About the "bi [n]" (branch index) instruction: This instruction alters
98 ; next PC (program counter):
100 ; next_pc = current_pc + n*4 n*4 is the same as n<<2
102 ; In other words, it tells the processor to execute the n'th instruction
103 ; from where we are (assuming all the next instructions are 4 bytes long).
105 ; We used this to our benefit. We made each "case" (unequal_r4r5,
106 ; unequal_r5r6, ...) 16 bytes long (power of 2) and fed "bi" an index
107 ; that is already multiplied by 4 (asl r13, r13, 2). This translates
108 ; into "bi [n]" jumping to 16-bytes slots. The last slot we did not
109 ; make 16 bytes long with "nop" because we don't need to address after
117 b.d @.L_diff_byte_in_regs
122 b.d @.L_diff_byte_in_regs
127 b.d @.L_diff_byte_in_regs
134 ; If we're here, that means the two operands are not equal.
135 ; 1) First we have to get a mask of their inequality through "xor"
136 ; 2) Then, find the first bit position that they're different: "ffs"
137 ; 3) Depending on the bit position, we want the whole byte containing
138 ; that bit, in both operands, to become the very first byte (least
139 ; significant byte), so that we can subtract one from another.
140 ; Below is an illustration of bit positions and how much we should
141 ; shift the numbers right:
142 ; bit position range : (in binary) | shift right by : (in binary)
143 ; -------------------+-------------------+----------------+------------
144 ; [ 0, 7] : (000000 - 000111) | lsr 0 : 000000
145 ; [ 8,15] : (001000 - 001111) | lsr 8 : 001000
146 ; [16,23] : (010000 - 010111) | lsr 16 : 010000
147 ; [24,31] : (011000 - 011111) | lsr 24 : 011000
148 ; ... : ... | ... : ...
149 ; [56,63] : (111000 - 111111) | lsr 56 : 111000
150 ; We need to ignore the least 3 bits of "position" to get "shift right"
151 ; amount: "and 0x38, ..."
152 ; 4) When the bytes are positioned at byte #0, mask out the rest of the
153 ; bytes and subtract the two operands: lhs - rhs
154 .L_diff_byte_in_regs:
155 xorl r0, r1, r2 ; (1)
157 and r0, r0, 0x38 ; (3)
158 lsrl r1, r1, r0 ; (3)
159 lsrl r2, r2, r0 ; (3)
160 bmsk_s r1, r1, 7 ; (4)
161 bmsk_s r2, r2, 7 ; (4)
163 subl r0, r1, r2 ; (4)
169 ; The loop at the heart of the "memcmp" function follows some specific
170 ; logic and has gone through a few optimisation filters. Knowing them
171 ; will help understand the code better.
173 ; The comparison logic
174 ; --------------------
175 ; In each loop, we compare 32 bytes of data from "lhs" and "rhs". Those
176 ; comparisons takes place by using 8 sets of registers:
178 ; r4 == r5 xor.f 0, r4, r5 lhs[i+0] == rhs[i+0]
179 ; r6 == r7 xor.f 0, r6, r7 lhs[i+8] == rhs[i+8]
180 ; r8 == r9 xor.f 0, r8, r9 lhs[i+16] == rhs[i+16]
181 ; r10 == r11 xor.f 0, r10, r11 lhs[i+24] == rhs[i+32]
183 ; The idea is to set a corresponding bit in r13 register for each
184 ; comparison that fails. The relation between the bits and the
188 ; r13[0] = 1 if r4 != r5
189 ; r13[1] = 1 if r6 != r7
190 ; r13[2] = 1 if r8 != r9
191 ; r13[3] = 1 if r10 != r11
193 ; If r13 remains 0, the next possible iteration of the loop begins.
194 ; If it is not 0 anymore, the algorithm will be interested in the
195 ; lowest bit that is set to 1. That is achieved by the "ffs"
196 ; (find first set) instruction.
198 ; The loop transformation
199 ; -----------------------
200 ; 1) At first, the loop looks like below:
203 ; ldl.ab r4, [r3, +8]
204 ; ldl.ab r5, [r1, +8]
206 ; ldl.ab r10, [r3, +8]
207 ; ldl.ab r11, [r1, +8]
209 ; xor.ne r13, r13, 0b0001
212 ; xor.ne r13, r13, 0b1000
213 ; brne r13, 0, @.unequal_find
216 ; 2) "dbnz" instruction has a delay slot. To make the code more
217 ; efficient, we can bring the first 2 instructions of the loop
218 ; to the end (they will be executed just before the next iteration
219 ; begins). To make the logic of the program sound, those 2
220 ; instructions need to be duplicated before the loop start as well:
222 ; ldl.ab r4, [r3, +8]
223 ; ldl.ab r5, [r1, +8]
225 ; ldl.ab r6, [r3, +8]
226 ; ldl.ab r7, [r1, +8]
228 ; ldl.ab r10, [r3, +8]
229 ; ldl.ab r11, [r1, +8]
231 ; xor.ne r13, r13, 0b0001
234 ; xor.ne r13, r13, 0b1000
235 ; brne r13, 0, @.unequal_find
236 ; ldl.ab r4, [r3, +8]
238 ; ldl.ab r5, [r1, +8]
240 ; There is one more loose end to take care of: At the last iteration
241 ; of the loop, there is an extra load into r4 and r5 registers while
242 ; incrementing the pointers (r3 and r1). We have to correct for that
247 ; brne r13, 0, @.unequal_find
248 ; ldl.ab r4, [r3, +8]
250 ; ldl.ab r5, [r1, +8]
254 ; One last remark about NOT filling the delay slot of "brne" with
255 ; "ldl.ab r4, ...". If the branch is taken, the rest of code that
256 ; is responsible for finding the differentiating bytes relies that
257 ; all 8 registers hold the comparison data of the loop. Putting
258 ; "ldl.ab r4, ..." into the delay slot of "brne ..." would clobber
263 ; brne.d r13, 0, @.unequal_find --> this branch might be taken
264 ; ldl.ab r4, [r3, +8] --> clobbers r4
266 ; ldl.ab r5, [r1, +8]
268 ; Having "ldl.ab r4, ..." between "brne" and "dbnz" as two control flow
269 ; altering instructions is good enough.