2 Copyright (c) 2024, Synopsys, Inc. All rights reserved.
4 Redistribution and use in source and binary forms, with or without
5 modification, are permitted provided that the following conditions are met:
7 1) Redistributions of source code must retain the above copyright notice,
8 this list of conditions and the following disclaimer.
10 2) Redistributions in binary form must reproduce the above copyright notice,
11 this list of conditions and the following disclaimer in the documentation
12 and/or other materials provided with the distribution.
14 3) Neither the name of the Synopsys, Inc., nor the names of its contributors
15 may be used to endorse or promote products derived from this software
16 without specific prior written permission.
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 POSSIBILITY OF SUCH DAMAGE.
37 ; dest and src MUST NOT overlap
40 ; Perform the same operation as strlen for finding the end of r0 string
43 ; Do 4 byte search until there are no more 4 byte chunks
44 ; Then, do 1 byte search
45 ; Otherwise, 1 byte search until alignment
46 ; Then, do 4 byte search as previously specified
48 ;; More in depth description at the end
50 ; R0 char* dest (destination string)
51 ; R1 const char* src (source string)
53 ; - char* (destination string)
56 #if defined (__ARC64_ARCH32__)
59 ; Find end of r0 string
60 ; ========================== STRLEN CODE START ==========================
62 ; Preserve r0 for size calculation when returning
66 ; Setup byte detector (more information below) [1]
72 #if defined (__ARC64_LL64__)
74 ldd.ab r2r3, [r13, +8]
75 ldd.ab r4r5, [r13, +8]
86 ; NULL byte position is detected and encoded in r6 [0] [9]
109 breq.d r6, 0, @.L_4_4B_search
113 ; Point r13 to first NULL byte containing double word [3]
116 ; Select appropriate register to analyze [4]
128 ; Point r13 to first NULL byte in selected double word
133 xbfu r2, r2, 0b0111000011 ; [7]
135 add r13, r13, r2 ; [8]
138 ; ========================== STRLEN CODE END >|< ==========================
144 #if defined (__ARC64_LL64__)
146 ldd.ab r2r3, [r1, +8]
147 ldd.ab r4r5, [r1, +8]
158 ; NULL byte position is detected and encoded in r6 [0] [9]
181 brne r6, 0, @.L_found_in_32B
183 #if defined (__ARC64_LL64__)
185 std.ab r2r3, [r13, +8]
186 std.ab r4r5, [r13, +8]
197 b @.L_4_4B_search_src
203 ; Point r1 to first NULL byte containing double word [3]
206 ;; Store the already loaded data
211 ; Invert so the biggest branch is at the end, and we don't need to increase
216 ; Condense the two subs here
224 b.d @.L_store_lastL32bits
230 b.d @.L_store_lastL32bits
236 b.d @.L_store_lastL32bits
244 ; r11 now contains the data to write
245 .L_store_lastL32bits:
248 and r10, r10, r9 ; [5]
253 xbfu r2, r2, 0b0111000011 ; [7]
255 mov r3, -1; Bitmask setup
257 ; If the NULL byte is in byte 3 (starting from the right)
258 ; we want to store 8-3 bytes
262 ; According to the target byte, setup masks
266 ; Obtain relevant data from destination
269 ; Get which data from dest is not to be overwritten and OR it
270 ; with the relevant data to write
286 ; Find end of r0 string
287 ; ========================== STRLEN CODE START ==========================
289 ; Preserve r0 for size calculation when returning
293 ; Setup byte detector (more information below) [1]
294 vpack2wl r8, NULL_32DT_1, NULL_32DT_1
299 ; Using 128-bit memory operations
300 #if defined (__ARC64_M128__)
302 lddl.ab r2r3, [r13, +16]
303 lddl.ab r4r5, [r13, +16]
305 ; The 64-bit crunching implementation.
306 #elif defined (__ARC64_ARCH64__)
314 # error Unknown configuration
317 ; NULL byte position is detected and encoded in r6 [0] [9]
340 breq.d r6, 0, @.L_4_8B_search
344 ; Point r13 to first NULL byte containing double word [3]
347 ; Select appropriate register to analyze [4]
359 ; Point r13 to first NULL byte in selected double word
360 andl r2, r2, r9 ; [5]
364 xbful r2, r2, 0b0111000011 ; [7]
366 addl r13, r13, r2 ; [8]
369 ; ========================== STRLEN CODE END >|< ==========================
374 #if defined (__ARC64_M128__)
376 lddl.ab r2r3, [r1, +16]
377 lddl.ab r4r5, [r1, +16]
379 #elif defined (__ARC64_ARCH64__)
387 # error Unknown configuration
390 ; NULL byte position is detected and encoded in r6 [0] [9]
413 brne r6, 0, @.L_found_in_32B
415 #if defined (__ARC64_M128__)
417 stdl.ab r2r3, [r13, +16]
418 stdl.ab r4r5, [r13, +16]
420 #elif defined (__ARC64_ARCH64__)
428 # error Unknown configuration
431 b @.L_4_8B_search_src
437 ; Point r1 to first NULL byte containing double word [3]
440 ;; Store the already loaded data
445 ; Invert so the biggest branch is at the end, and we don't need to increase
450 ; Condense the two subs here
458 b.d @.L_store_lastL64bits
464 b.d @.L_store_lastL64bits
470 b.d @.L_store_lastL64bits
478 ; r11 now contains the data to write
479 .L_store_lastL64bits:
483 andl r10, r10, r9 ; [5]
488 xbful r2, r2, 0b0111000011 ; [7]
490 movl r3, -1; Bitmask setup
492 ; If the NULL byte is in byte 3 (starting from the right)
493 ; we want to store 8-3 bytes
497 ; According to the target byte, setup masks
501 ; Obtain relevant data from destination
504 ; Get which data from dest is not to be overwritten and OR it
505 ; with the relevant data to write
519 ;; This code uses a common technique for NULL byte detection inside a word.
520 ;; Details on this technique can be found in:
521 ;; (https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord)
523 ; In sum, this technique allows for detecting a NULL byte inside any given
524 ; amount of bits by performing the following operation
525 ; DETECTNULL(X) (((X) - 0x01010101) & ~(X) & 0x80808080) [0]
527 ; The code above implements this by setting r8 to a 0x01010101... sequence and
528 ; r9 to a 0x80808080... sequence of appropriate length
529 ; As LIMM are 32 bit only, we need to perform MOVHL and ORL [1] operations to
530 ; have the appropriate 64 bit values in place
532 ;; Search is done 32 bytes at a time, either with 64 bit loads or 128 bit loads
533 ;; If a NULL byte is detected, the position of the double word is encoded
534 ;; in r6, which is then used to adjust r13 to the exact byte
536 ; r6 is set via bset, which means we can simply use a fls to obtain the first
537 ; match (or ffs depending on the values in bset) [2].
538 ; The reason for starting at 1 and not 0 is so r6 encodes how many double
539 ; words to go back, and it wouldn't make sense to go back 0 (the NULL would be
540 ; in the next loop iteration).
542 ; The first step to take is point r13 to the appropriate double word.
543 ; As the chosen encoded information is how many double words to go back,
544 ; we can simply multiply r6 by 8 and reduce r13 by that amount [3]
546 ; Then, we need to place the loaded double word containing the first NULL byte
547 ; into a "common" register we can operate on later [4].
549 ; To do this without any jumps, we can shift r6 and perform a conditional mov
550 ; based on the carry flag value.
551 ; The order is very important because the NULL byte can appear in several
552 ; double words, so we want to analyze from last to first.
554 ; We can ignore the first asr (which would be asr.f 2, as we started r6 on 1)
555 ; because if r7 doesn't contain the NULL byte, r2 will always be overwritten,
556 ; so we can just decide to start at r7, and overwrite it if needed.
558 ; Now comes the tricky part. In order to obtain the first NULL byte, we need to
559 ; understand the NULL byte detection operation. It is explained in depth in the
560 ; link above but in short, it works by first setting the highest bit of each
561 ; byte to 1, if the corresponding byte is either 0 or greater than 0x80.
562 ; Then, separately, it makes the highest bit of each byte 1, if the byte is
563 ; less than 0x80. The last step is to AND these two values (this operation is
564 ; simplified with the subl, bicl and tst instructions).
566 ; This means that in the evaluated equation result value [5] every byte below
567 ; the first NULL byte is zero, while the first NULL byte has its highest bit
568 ; set. Therefore, we can simply find the first non-zero bit (counting from
569 ; bit 0), which will be inside the position of the first NULL byte.
571 ; One thing to note is that ffs oddly returns 31 if no bit is found, setting
572 ; the zero flag. As r9 is never all 0s at this stage (would mean there is no
573 ; NULL byte and we wouldn't be here) we don't need to worry about that. [6]
575 ; We can then convert the bit position into the last byte position by looking
576 ; into bits 3 to 5, and shifting 3 bits to the right. This can be combined into
577 ; a single xbful operation. The bottom 000011 represents shift by 3 and the top
578 ; 0111 represents the mask (3 to 5 shifted by 3 is 0 to 2). We don't need to worry
579 ; about the case where ffs does not find a bit, because we know for sure there is
580 ; at least one NULL byte, and therefore one of the highest bits is set to 1 [7]
582 ; Finally, we can add the NULL byte position inside the loaded double word to
583 ; r13 and subtract r0 from r13 to obtain the string size [8]
585 ; Some operations are re-ordered such that register dependency is reduced,
586 ; allowing the CPU to run more instructions in parallel [9]
589 ; Some data was already read, and needs to be stored following the same read
590 ; order. To do this, we need to make the