/* Subroutines used to expand string operations for RISC-V.
   Copyright (C) 2023-2025 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published
by the Free Software Foundation; either version 3, or (at your
option) any later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "print-tree.h"
#include "riscv-protos.h"
#include "tm-constrs.h"
/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER2(name)						\
static rtx_insn *							\
do_## name ## 2(rtx dest, rtx src)					\
{									\
  rtx_insn *insn;							\
  if (GET_MODE (dest) == DImode)					\
    insn = emit_insn (gen_ ## name ## di2 (dest, src));			\
  else									\
    insn = emit_insn (gen_ ## name ## si2 (dest, src));			\
  return insn;								\
}

/* Emit proper instruction depending on mode of dest.  */

#define GEN_EMIT_HELPER3(name)						\
static rtx_insn *							\
do_## name ## 3(rtx dest, rtx src1, rtx src2)				\
{									\
  rtx_insn *insn;							\
  if (GET_MODE (dest) == DImode)					\
    insn = emit_insn (gen_ ## name ## di3 (dest, src1, src2));		\
  else									\
    insn = emit_insn (gen_ ## name ## si3 (dest, src1, src2));		\
  return insn;								\
}
GEN_EMIT_HELPER3(add) /* do_add3 */
GEN_EMIT_HELPER3(and) /* do_and3 */
GEN_EMIT_HELPER3(ashl) /* do_ashl3 */
GEN_EMIT_HELPER2(bswap) /* do_bswap2 */
GEN_EMIT_HELPER2(clz) /* do_clz2 */
GEN_EMIT_HELPER2(ctz) /* do_ctz2 */
GEN_EMIT_HELPER3(ior) /* do_ior3 */
GEN_EMIT_HELPER3(ior_not) /* do_ior_not3 */
GEN_EMIT_HELPER3(lshr) /* do_lshr3 */
GEN_EMIT_HELPER2(neg) /* do_neg2 */
GEN_EMIT_HELPER2(orcb) /* do_orcb2 */
GEN_EMIT_HELPER2(one_cmpl) /* do_one_cmpl2 */
GEN_EMIT_HELPER3(rotr) /* do_rotr3 */
GEN_EMIT_HELPER3(sub) /* do_sub3 */
GEN_EMIT_HELPER2(th_rev) /* do_th_rev2 */
GEN_EMIT_HELPER2(th_tstnbz) /* do_th_tstnbz2 */
GEN_EMIT_HELPER3(xor) /* do_xor3 */
GEN_EMIT_HELPER2(zero_extendqi) /* do_zero_extendqi2 */
GEN_EMIT_HELPER2(zero_extendhi) /* do_zero_extendhi2 */
#undef GEN_EMIT_HELPER2
#undef GEN_EMIT_HELPER3
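/* For illustration, GEN_EMIT_HELPER3(add) above expands to roughly the
   following helper, which picks the DImode or SImode add pattern based on
   the destination's mode:

     static rtx_insn *
     do_add3 (rtx dest, rtx src1, rtx src2)
     {
       rtx_insn *insn;
       if (GET_MODE (dest) == DImode)
         insn = emit_insn (gen_adddi3 (dest, src1, src2));
       else
         insn = emit_insn (gen_addsi3 (dest, src1, src2));
       return insn;
     }  */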
/* Helper function to emit zero-extended loads.

   MODE is the mode to use for the load.
   DEST is the destination register for the data.
   MEM is the source to load from.  */

static void
do_load (machine_mode mode, rtx dest, rtx mem)
{
  if (mode == QImode)
    do_zero_extendqi2 (dest, mem);
  else if (mode == HImode)
    do_zero_extendhi2 (dest, mem);
  else if (mode == SImode && TARGET_64BIT)
    emit_insn (gen_zero_extendsidi2 (dest, mem));
  else if (mode == Xmode)
    emit_move_insn (dest, mem);
  else
    gcc_unreachable ();
}
/* Helper function to emit zero-extended loads.

   MODE is the mode to use for the load (QImode or Pmode).
   DEST is the destination register for the data.
   ADDR_REG is the register that holds the address.
   ADDR is the address expression to load from.  */

static void
do_load_from_addr (machine_mode mode, rtx dest, rtx addr_reg, rtx addr)
{
  rtx mem = gen_rtx_MEM (mode, addr_reg);
  MEM_COPY_ATTRIBUTES (mem, addr);
  set_mem_size (mem, GET_MODE_SIZE (mode));

  do_load (mode, dest, mem);
}
/* Generate a sequence to compare single characters in data1 and data2.

   RESULT is the register where the return value of str(n)cmp will be stored.
   DATA1 is a register which contains character1.
   DATA2 is a register which contains character2.
   FINAL_LABEL is the location after the calculation of the return value.  */

static void
emit_strcmp_scalar_compare_byte (rtx result, rtx data1, rtx data2,
				 rtx final_label)
{
  do_sub3 (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */
}
/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   CMP_BYTES is the length of the strings.
   END_LABEL is the location of the code that calculates the return value.  */

static void
emit_strcmp_scalar_compare_subword (rtx data1, rtx data2, rtx orc1,
				    unsigned HOST_WIDE_INT cmp_bytes,
				    rtx end_label)
{
  /* Set a NUL-byte after the relevant data (behind the string).  */
  long long im = -256ll;
  rtx imask = gen_rtx_CONST_INT (Xmode, im);
  rtx m_reg = gen_reg_rtx (Xmode);
  emit_insn (gen_rtx_SET (m_reg, imask));
  do_rotr3 (m_reg, m_reg,
	    GEN_INT (BITS_PER_WORD - cmp_bytes * BITS_PER_UNIT));
  do_and3 (data1, m_reg, data1);
  do_and3 (data2, m_reg, data2);
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  emit_jump_insn (gen_jump (end_label));
  emit_barrier (); /* No fall-through.  */
}
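/* For example, with XLEN == 64 and cmp_bytes == 3 the rotation above turns
   the initial mask 0xffffffffffffff00 (-256) into 0xffffffff00ffffff, whose
   only zero byte sits at byte index 3: ANDing the loaded words with it plants
   a NUL byte directly behind the cmp_bytes relevant bytes, so the result
   calculation stops there.  */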
/* Generate a sequence to compare two strings in data1 and data2.

   DATA1 is a register which contains string1.
   DATA2 is a register which contains string2.
   ORC1 is a register where orc.b(data1) will be stored.
   TESTVAL is the value to test ORC1 against.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return value
   in case the first string does not contain a NUL-byte.  */

static void
emit_strcmp_scalar_compare_word (rtx data1, rtx data2, rtx orc1, rtx testval,
				 rtx end_label, rtx nonul_end_label)
{
  /* Check if data1 contains a NUL character.  */
  if (TARGET_ZBB)
    do_orcb2 (orc1, data1);
  else
    do_th_tstnbz2 (orc1, data1);
  rtx cond1 = gen_rtx_NE (VOIDmode, orc1, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond1, orc1, testval,
					 end_label));
  /* Break out if data1 != data2.  */
  rtx cond2 = gen_rtx_NE (VOIDmode, data1, data2);
  emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond2, data1,
					 data2, nonul_end_label));
  /* Fall-through on equality.  */
}
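/* Note on TESTVAL: with Zbb, orc.b writes 0xff into every byte of ORC1 whose
   source byte is non-zero, so ORC1 == -1 exactly when DATA1 contains no NUL
   byte; with XTheadBb, th.tstnbz marks only the zero bytes, so ORC1 == 0 in
   the NUL-free case.  The caller therefore passes TESTVAL = -1 or 0
   respectively (see the testval setup in emit_strcmp_scalar_load_and_compare
   below).  */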
/* Generate the sequence of compares for strcmp/strncmp using zbb instructions.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   DATA1 is a register where string1 will be stored.
   DATA2 is a register where string2 will be stored.
   ORC1 is a register where orc.b(data1) will be stored.
   END_LABEL is the location of the code that calculates the return value.
   NONUL_END_LABEL is the location of the code that calculates the return value
   in case the first string does not contain a NUL-byte.
   FINAL_LABEL is the location of the code that comes after the calculation
   of the return value.  */

static void
emit_strcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
				     unsigned HOST_WIDE_INT nbytes,
				     rtx data1, rtx data2, rtx orc1,
				     rtx end_label, rtx nonul_end_label,
				     rtx final_label)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
  unsigned HOST_WIDE_INT offset = 0;

  rtx testval = gen_reg_rtx (Xmode);
  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  while (nbytes > 0)
    {
      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
      machine_mode load_mode;
      if (cmp_bytes == 1)
	load_mode = QImode;
      else
	load_mode = Xmode;

      rtx addr1 = adjust_address (src1, load_mode, offset);
      do_load (load_mode, data1, addr1);
      rtx addr2 = adjust_address (src2, load_mode, offset);
      do_load (load_mode, data2, addr2);

      if (cmp_bytes == 1)
	{
	  emit_strcmp_scalar_compare_byte (result, data1, data2, final_label);
	  return;
	}
      else if (cmp_bytes < xlen)
	{
	  emit_strcmp_scalar_compare_subword (data1, data2, orc1,
					      cmp_bytes, end_label);
	  return;
	}
      else
	emit_strcmp_scalar_compare_word (data1, data2, orc1, testval,
					 end_label, nonul_end_label);

      offset += cmp_bytes;
      nbytes -= cmp_bytes;
    }
}
/* Fixup pointers and generate a call to strcmp.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are referenced by SRC1 and SRC2.
   The number of already compared bytes is defined by NBYTES.  */

static void
emit_strcmp_scalar_call_to_libc (rtx result, rtx src1, rtx src2,
				 unsigned HOST_WIDE_INT nbytes)
{
  /* Update pointers past what has been compared already.  */
  rtx src1_addr = force_reg (Pmode, XEXP (src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (src2, 0));
  rtx src1_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (nbytes)));
  rtx src2_new = force_reg (Pmode,
			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (nbytes)));

  /* Construct call to strcmp to compare the rest of the string.  */
  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
			   result, LCT_NORMAL, GET_MODE (result),
			   src1_new, Pmode, src2_new, Pmode);
}
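/* The net effect of the pointer fixup above is equivalent to the C call
   result = strcmp (src1 + nbytes, src2 + nbytes): the first NBYTES bytes
   have already been compared inline and found equal without a NUL byte.  */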
/* Fast strcmp-result calculation if no NUL-byte in string1.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The mismatching strings are stored in DATA1 and DATA2.  */

static void
emit_strcmp_scalar_result_calculation_nonul (rtx result, rtx data1, rtx data2)
{
  /* Words don't match, and no NUL byte in one word.
     Get bytes in big-endian order and compare as words.  */
  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);
  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
  rtx tmp = gen_reg_rtx (Xmode);
  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, tmp, data1, data2));
  do_neg2 (tmp, tmp);
  do_ior3 (result, tmp, const1_rtx);
}
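/* A short worked example of the branchless sequence above: sltu sets
   tmp = 1 if data1 < data2 (unsigned, on the byte-swapped words) and 0
   otherwise; negating gives -1 or 0; OR-ing in 1 then yields -1 or 1,
   which is a valid strcmp result because the words are known to differ.  */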
/* strcmp-result calculation.

   RESULT is the register where the return value of str(n)cmp will be stored.
   The strings are stored in DATA1 and DATA2.
   ORC1 contains orc.b(DATA1).  */

static void
emit_strcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2,
				       rtx orc1)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  /* Convert non-equal bytes into non-NUL bytes.  */
  rtx diff = gen_reg_rtx (Xmode);
  do_xor3 (diff, data1, data2);
  rtx shift = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_orcb2 (diff, diff);
      do_ior_not3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_ctz2 (shift, syndrome);
    }
  else
    {
      /* Convert non-equal or NUL-bytes into non-NUL bytes.  */
      rtx syndrome = gen_reg_rtx (Xmode);
      do_th_tstnbz2 (diff, diff);
      do_one_cmpl2 (diff, diff);
      do_ior3 (syndrome, orc1, diff);
      /* Count the number of equal bits from the beginning of the word.  */
      do_th_rev2 (syndrome, syndrome);
      do_clz2 (shift, syndrome);
    }

  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* The most-significant-non-zero bit of the syndrome marks either the
     first bit that is different, or the top bit of the first zero byte.
     Shifting left now will bring the critical information into the
     top bits.  */
  do_ashl3 (data1, data1, gen_lowpart (QImode, shift));
  do_ashl3 (data2, data2, gen_lowpart (QImode, shift));

  /* But we need to zero-extend (char is unsigned) the value and then
     perform a signed 32-bit subtraction.  */
  unsigned int shiftr = (xlen - 1) * BITS_PER_UNIT;
  do_lshr3 (data1, data1, GEN_INT (shiftr));
  do_lshr3 (data2, data2, GEN_INT (shiftr));
  do_sub3 (result, data1, data2);
}
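/* Worked example (Zbb, XLEN == 64, little endian): if the strings agree in
   bytes 0 and 1 and first differ at byte 2, the lowest set bit of the
   syndrome is bit 16, so shift == 16; after bswap, byte 2 sits 16 bits below
   the top byte, the left shift by 16 moves it into the top byte, and the
   logical right shift by (xlen - 1) * 8 == 56 extracts it, so the final
   subtraction compares exactly the first differing characters.  */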
/* Expand str(n)cmp using Zbb/TheadBb instructions.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The number of bytes to compare is defined by NBYTES.
   The alignment is defined by ALIGNMENT.
   If NCOMPARE is false then libc's strcmp() will be called if comparing
   NBYTES of both strings did not find differences or NUL-bytes.

   Return true if expansion was successful, or false otherwise.  */

static bool
riscv_expand_strcmp_scalar (rtx result, rtx src1, rtx src2,
			    unsigned HOST_WIDE_INT nbytes,
			    unsigned HOST_WIDE_INT alignment,
			    bool ncompare)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);
  gcc_assert (nbytes > 0);
  gcc_assert ((int)nbytes <= riscv_strcmp_inline_limit);
  gcc_assert (ncompare || (nbytes & (xlen - 1)) == 0);

  /* Limit to 12-bits (maximum load-offset).  */
  if (nbytes > IMM_REACH)
    return false;

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  /* We need xlen-aligned strings.  */
  if (alignment < xlen)
    return false;

  /* Overall structure of emitted code:
       Load-and-compare:
	 - Load data1 and data2
	 - Set orc1 := orc.b (data1) (or th.tstnbz)
	 - Compare strings and either:
	   - Fall-through on equality
	   - Jump to end_label if data1 contains a NUL-byte
	   - Jump to nonul_end_label if data1 != data2
	 - Calculate result value and jump to final_label
     // Fall-through:
     Call-to-libc or set result to 0 (depending on ncompare)
     nonul_end_label: // words don't match, and no NUL byte in first word
       Calculate result value with the use of data1, data2 and orc1
     end_label:
       Calculate result value with the use of data1, data2 and orc1
     final_label:
     // Nothing.  */

  rtx data1 = gen_reg_rtx (Xmode);
  rtx data2 = gen_reg_rtx (Xmode);
  rtx orc1 = gen_reg_rtx (Xmode);
  rtx nonul_end_label = gen_label_rtx ();
  rtx end_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  /* Generate a sequence of zbb instructions to compare out
     to the length specified.  */
  emit_strcmp_scalar_load_and_compare (result, src1, src2, nbytes,
				       data1, data2, orc1,
				       end_label, nonul_end_label, final_label);

  /* All compared and everything was equal.  */
  if (ncompare)
    {
      emit_insn (gen_rtx_SET (result, CONST0_RTX (GET_MODE (result))));
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }
  else
    {
      emit_strcmp_scalar_call_to_libc (result, src1, src2, nbytes);
      emit_jump_insn (gen_jump (final_label));
      emit_barrier (); /* No fall-through.  */
    }

  emit_label (nonul_end_label);
  emit_strcmp_scalar_result_calculation_nonul (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (end_label);
  emit_strcmp_scalar_result_calculation (result, data1, data2, orc1);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (final_label);
  return true;
}
/* Expand a string compare operation.

   The result will be stored in RESULT.
   The strings are referenced by SRC1 and SRC2.
   The argument BYTES_RTX either holds the number of characters to
   compare, or is NULL_RTX.  The argument ALIGN_RTX holds the alignment.

   Return true if expansion was successful, or false otherwise.  */

bool
riscv_expand_strcmp (rtx result, rtx src1, rtx src2,
		     rtx bytes_rtx, rtx align_rtx)
{
  unsigned HOST_WIDE_INT compare_max;
  unsigned HOST_WIDE_INT nbytes;
  unsigned HOST_WIDE_INT alignment;
  bool ncompare = bytes_rtx != NULL_RTX;
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  if (riscv_strcmp_inline_limit == 0)
    return false;

  /* Round down the comparison limit to a multiple of xlen.  */
  compare_max = riscv_strcmp_inline_limit & ~(xlen - 1);

  /* Decide how many bytes to compare inline.  */
  if (bytes_rtx == NULL_RTX)
    nbytes = compare_max;
  else
    {
      /* If we have a length, it must be constant.  */
      if (!CONST_INT_P (bytes_rtx))
	return false;
      nbytes = UINTVAL (bytes_rtx);

      /* If NBYTES is zero the result of strncmp will always be zero,
	 but that would require special casing in the caller.  So for
	 now just don't do an inline expansion.  This probably rarely
	 happens in practice, but it is tested by the testsuite.  */
      if (nbytes == 0)
	return false;

      /* We don't emit parts of a strncmp() call.  */
      if (nbytes > compare_max)
	return false;
    }

  /* Guarantees:
     - nbytes <= riscv_strcmp_inline_limit
     - nbytes is a multiple of xlen if !ncompare.  */

  if (!CONST_INT_P (align_rtx))
    return false;
  alignment = UINTVAL (align_rtx);

  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_strcmp (result, src1, src2,
					     bytes_rtx, alignment,
					     ncompare);
      if (ok)
	return true;
    }

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strcmp_scalar (result, src1, src2, nbytes, alignment,
				       ncompare);

  return false;
}
/* If the provided string is aligned, then read XLEN bytes
   in a loop and use orc.b to find NUL-bytes.  */

static bool
riscv_expand_strlen_scalar (rtx result, rtx src, rtx align)
{
  rtx testval, addr, addr_plus_regsz, word, zeros;
  rtx loop_label, cond;

  gcc_assert (TARGET_ZBB || TARGET_XTHEADBB);

  /* The alignment needs to be known and big enough.  */
  if (!CONST_INT_P (align) || UINTVAL (align) < GET_MODE_SIZE (Xmode))
    return false;

  testval = gen_reg_rtx (Xmode);
  addr = copy_addr_to_reg (XEXP (src, 0));
  addr_plus_regsz = gen_reg_rtx (Pmode);
  word = gen_reg_rtx (Xmode);
  zeros = gen_reg_rtx (Xmode);

  if (TARGET_ZBB)
    emit_insn (gen_rtx_SET (testval, constm1_rtx));
  else
    emit_insn (gen_rtx_SET (testval, const0_rtx));

  do_add3 (addr_plus_regsz, addr, GEN_INT (UNITS_PER_WORD));

  loop_label = gen_label_rtx ();
  emit_label (loop_label);

  /* Load a word and use orc.b/th.tstnbz to find a zero-byte.  */
  do_load_from_addr (Xmode, word, addr, src);
  do_add3 (addr, addr, GEN_INT (UNITS_PER_WORD));
  if (TARGET_ZBB)
    do_orcb2 (word, word);
  else
    do_th_tstnbz2 (word, word);
  cond = gen_rtx_EQ (VOIDmode, word, testval);
  emit_unlikely_jump_insn (gen_cbranch4 (Xmode, cond, word, testval, loop_label));

  /* Calculate the return value by counting zero-bits.  */
  if (TARGET_ZBB)
    {
      do_one_cmpl2 (word, word);
      if (TARGET_BIG_ENDIAN)
	do_clz2 (zeros, word);
      else
	do_ctz2 (zeros, word);
    }
  else
    {
      do_th_rev2 (word, word);
      do_clz2 (zeros, word);
    }

  do_lshr3 (zeros, zeros, GEN_INT (exact_log2 (BITS_PER_UNIT)));
  do_add3 (addr, addr, zeros);
  do_sub3 (result, addr, addr_plus_regsz);

  return true;
}
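/* For reference, the loop emitted above corresponds roughly to the following
   RV64/Zbb sequence (an illustrative sketch with a0 = src and the result in
   a0; not a literal dump of the generated code):

	li	a4, -1			# testval
	addi	a3, a0, 8		# addr_plus_regsz
   loop:
	ld	a5, 0(a0)		# load a word
	addi	a0, a0, 8		# bump the pointer
	orc.b	a5, a5			# non-NUL bytes -> 0xff
	beq	a5, a4, loop		# no NUL byte found yet?
	not	a5, a5
	ctz	a5, a5			# bits before the first NUL byte
	srli	a5, a5, 3		# ... converted to bytes
	add	a0, a0, a5
	sub	a0, a0, a3		# the strlen result  */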
/* Expand a strlen operation and return true if successful.
   Return false if we should let the compiler generate normal
   code, probably a strlen call.  */

bool
riscv_expand_strlen (rtx result, rtx src, rtx search_char, rtx align)
{
  if (TARGET_VECTOR && stringop_strategy & STRATEGY_VECTOR)
    {
      riscv_vector::expand_rawmemchr (E_QImode, result, src, search_char,
				      /* strlen */ true);
      return true;
    }

  gcc_assert (search_char == const0_rtx);

  if ((TARGET_ZBB || TARGET_XTHEADBB) && stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_strlen_scalar (result, src, align);

  return false;
}
/* Generate the sequence of load and compares for memcmp using Zbb.

   RESULT is the register where the return value of memcmp will be stored.
   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).
   DATA1 and DATA2 are registers where the data chunks will be stored.
   DIFF_LABEL is the location of the code that calculates the return value.
   FINAL_LABEL is the location of the code that comes after the calculation
   of the return value.  */

static void
emit_memcmp_scalar_load_and_compare (rtx result, rtx src1, rtx src2,
				     unsigned HOST_WIDE_INT nbytes,
				     rtx data1, rtx data2,
				     rtx diff_label, rtx final_label)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);
  unsigned HOST_WIDE_INT offset = 0;

  while (nbytes > 0)
    {
      unsigned HOST_WIDE_INT cmp_bytes = xlen < nbytes ? xlen : nbytes;
      machine_mode load_mode;

      /* Special cases to avoid masking of trailing bytes.  */
      if (cmp_bytes == 1)
	load_mode = QImode;
      else if (cmp_bytes == 2)
	load_mode = HImode;
      else if (cmp_bytes == 4)
	load_mode = SImode;
      else
	load_mode = Xmode;

      rtx addr1 = adjust_address (src1, load_mode, offset);
      do_load (load_mode, data1, addr1);
      rtx addr2 = adjust_address (src2, load_mode, offset);
      do_load (load_mode, data2, addr2);

      /* Fast-path for a single byte.  */
      if (cmp_bytes == 1)
	{
	  do_sub3 (result, data1, data2);
	  emit_jump_insn (gen_jump (final_label));
	  emit_barrier (); /* No fall-through.  */
	  return;
	}

      /* Shift off trailing bytes in words if needed.  */
      unsigned int load_bytes = GET_MODE_SIZE (load_mode).to_constant ();
      if (cmp_bytes < load_bytes)
	{
	  int shamt = (load_bytes - cmp_bytes) * BITS_PER_UNIT;
	  do_ashl3 (data1, data1, GEN_INT (shamt));
	  do_ashl3 (data2, data2, GEN_INT (shamt));
	}

      /* Break out if data1 != data2.  */
      rtx cond = gen_rtx_NE (VOIDmode, data1, data2);
      emit_unlikely_jump_insn (gen_cbranch4 (Pmode, cond, data1,
					     data2, diff_label));
      /* Fall-through on equality.  */

      offset += cmp_bytes;
      nbytes -= cmp_bytes;
    }
}
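/* For example, a memcmp of 6 bytes on RV64 loads full 8-byte words
   (6 matches none of the special cases above), then shifts both words left
   by (8 - 6) * 8 == 16 bits: the two trailing bytes that are not part of the
   comparison are discarded while the compared bytes keep their relative
   order for the branch and the result calculation.  */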
/* memcmp result calculation.

   RESULT is the register where the return value will be stored.
   The two data chunks are in DATA1 and DATA2.  */

static void
emit_memcmp_scalar_result_calculation (rtx result, rtx data1, rtx data2)
{
  /* Get bytes in big-endian order and compare as words.  */
  do_bswap2 (data1, data1);
  do_bswap2 (data2, data2);

  /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence.  */
  emit_insn (gen_slt_3 (LTU, Xmode, Xmode, result, data1, data2));
  do_neg2 (result, result);
  do_ior3 (result, result, const1_rtx);
}
/* Expand memcmp using scalar instructions (incl. Zbb).

   RESULT is the register where the return value will be stored.
   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).  */

static bool
riscv_expand_block_compare_scalar (rtx result, rtx src1, rtx src2, rtx nbytes)
{
  const unsigned HOST_WIDE_INT xlen = GET_MODE_SIZE (Xmode);

  if (optimize_function_for_size_p (cfun))
    return false;

  /* We don't support big endian.  */
  if (BYTES_BIG_ENDIAN)
    return false;

  if (!CONST_INT_P (nbytes))
    return false;

  /* We need the rev (bswap) instruction.  */
  if (!TARGET_ZBB)
    return false;

  unsigned HOST_WIDE_INT length = UINTVAL (nbytes);

  /* Limit to 12-bits (maximum load-offset).  */
  if (length > IMM_REACH)
    return false;

  /* We need xlen-aligned memory.  */
  unsigned HOST_WIDE_INT align = MIN (MEM_ALIGN (src1), MEM_ALIGN (src2));
  if (align < (xlen * BITS_PER_UNIT))
    return false;

  if (length > RISCV_MAX_MOVE_BYTES_STRAIGHT)
    return false;

  /* Overall structure of emitted code:
       Load-and-compare:
	 - Load data1 and data2
	 - Compare chunks and either:
	   - Fall-through on equality
	   - Jump to diff_label if data1 != data2
     // Fall-through:
     Set result to 0 and jump to final_label
     diff_label:
       Calculate result value with the use of data1 and data2
     final_label:
     // Nothing.  */

  rtx data1 = gen_reg_rtx (Xmode);
  rtx data2 = gen_reg_rtx (Xmode);
  rtx diff_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  /* Generate a sequence of zbb instructions to compare out
     to the length specified.  */
  emit_memcmp_scalar_load_and_compare (result, src1, src2, length,
				       data1, data2,
				       diff_label, final_label);

  emit_move_insn (result, CONST0_RTX (GET_MODE (result)));
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (diff_label);
  emit_memcmp_scalar_result_calculation (result, data1, data2);
  emit_jump_insn (gen_jump (final_label));
  emit_barrier (); /* No fall-through.  */

  emit_label (final_label);
  return true;
}
/* Expand memcmp operation.

   RESULT is the register where the return value will be stored.
   The source pointers are SRC1 and SRC2 (NBYTES bytes to compare).  */

bool
riscv_expand_block_compare (rtx result, rtx src1, rtx src2, rtx nbytes)
{
  if (stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_block_compare_scalar (result, src1, src2, nbytes);

  return false;
}
/* Emit straight-line code to move LENGTH bytes from SRC to DEST
   with accesses that are ALIGN bits aligned.
   Assume that the areas do not overlap.  */

static void
riscv_block_move_straight (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
			   unsigned HOST_WIDE_INT align)
{
  unsigned HOST_WIDE_INT offset, delta;
  unsigned HOST_WIDE_INT bits;
  int i;
  enum machine_mode mode;
  rtx *regs;

  bits = MAX (BITS_PER_UNIT, MIN (BITS_PER_WORD, align));

  mode = mode_for_size (bits, MODE_INT, 0).require ();
  delta = bits / BITS_PER_UNIT;

  /* Allocate a buffer for the temporary registers.  */
  regs = XALLOCAVEC (rtx, length / delta - 1);

  /* Load as many BITS-sized chunks as possible.  Use a normal load if
     the source has enough alignment, otherwise use left/right pairs.  */
  for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
    {
      regs[i] = gen_reg_rtx (mode);
      riscv_emit_move (regs[i], adjust_address (src, mode, offset));
    }

  /* Copy the chunks to the destination.  */
  for (offset = 0, i = 0; offset + 2 * delta <= length; offset += delta, i++)
    riscv_emit_move (adjust_address (dest, mode, offset), regs[i]);

  /* Mop up any left-over bytes.  */
  if (offset < length)
    {
      src = adjust_address (src, BLKmode, offset);
      dest = adjust_address (dest, BLKmode, offset);
      move_by_pieces (dest, src, length - offset, align, RETURN_BEGIN);
    }
}
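/* For example, moving 24 bytes with word (64-bit) alignment on RV64 gives
   delta == 8: the loops above emit register copies for offsets 0 and 8, and
   the remaining 8 bytes are left to move_by_pieces, which also handles any
   sub-word tail.  */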
/* Helper function for doing a loop-based block operation on memory
   reference MEM.

   Create a new base register for use within the loop and point it to
   the start of MEM.  Create a new memory reference that uses this
   register and has an alignment of ALIGN.  Store them in *LOOP_REG
   and *LOOP_MEM respectively.  */

static void
riscv_adjust_block_mem (rtx mem, unsigned HOST_WIDE_INT align,
			rtx *loop_reg, rtx *loop_mem)
{
  *loop_reg = copy_addr_to_reg (XEXP (mem, 0));

  /* Although the new mem does not refer to a known location,
     it does keep up to LENGTH bytes of alignment.  */
  *loop_mem = change_address (mem, BLKmode, *loop_reg);
  set_mem_align (*loop_mem, align);
}
/* Move LENGTH bytes from SRC to DEST using a loop that moves BYTES_PER_ITER
   bytes at a time.  LENGTH must be at least BYTES_PER_ITER.  The alignment
   of the access can be set by ALIGN.  Assume that the memory regions do not
   overlap.  */

static void
riscv_block_move_loop (rtx dest, rtx src, unsigned HOST_WIDE_INT length,
		       unsigned HOST_WIDE_INT align,
		       unsigned HOST_WIDE_INT bytes_per_iter)
{
  rtx label, src_reg, dest_reg, final_src, test;
  unsigned HOST_WIDE_INT leftover;

  leftover = length % bytes_per_iter;
  length -= leftover;

  /* Create registers and memory references for use within the loop.  */
  riscv_adjust_block_mem (src, align, &src_reg, &src);
  riscv_adjust_block_mem (dest, align, &dest_reg, &dest);

  /* Calculate the value that SRC_REG should have after the last iteration
     of the loop.  */
  final_src = expand_simple_binop (Pmode, PLUS, src_reg, GEN_INT (length),
				   0, 0, OPTAB_WIDEN);

  /* Emit the start of the loop.  */
  label = gen_label_rtx ();
  emit_label (label);

  /* Emit the loop body.  */
  riscv_block_move_straight (dest, src, bytes_per_iter, align);

  /* Move on to the next block.  */
  riscv_emit_move (src_reg, plus_constant (Pmode, src_reg, bytes_per_iter));
  riscv_emit_move (dest_reg, plus_constant (Pmode, dest_reg, bytes_per_iter));

  /* Emit the loop condition.  */
  test = gen_rtx_NE (VOIDmode, src_reg, final_src);
  emit_jump_insn (gen_cbranch4 (Pmode, test, src_reg, final_src, label));

  /* Mop up any left-over bytes.  */
  if (leftover)
    riscv_block_move_straight (dest, src, leftover, align);
  else
    emit_insn (gen_nop ());
}
/* Expand a cpymemsi instruction, which copies LENGTH bytes from
   memory reference SRC to memory reference DEST.  */

static bool
riscv_expand_block_move_scalar (rtx dest, rtx src, rtx length)
{
  if (!CONST_INT_P (length))
    return false;

  unsigned HOST_WIDE_INT hwi_length = UINTVAL (length);
  unsigned HOST_WIDE_INT factor, align;

  if (riscv_slow_unaligned_access_p)
    {
      align = MIN (MIN (MEM_ALIGN (src), MEM_ALIGN (dest)), BITS_PER_WORD);
      factor = BITS_PER_WORD / align;
    }
  else
    {
      /* Pretend word-alignment.  */
      align = BITS_PER_WORD;
      factor = 1;
    }

  if (optimize_function_for_size_p (cfun)
      && hwi_length * factor * UNITS_PER_WORD > MOVE_RATIO (false))
    return false;

  if (hwi_length <= (RISCV_MAX_MOVE_BYTES_STRAIGHT / factor))
    {
      riscv_block_move_straight (dest, src, hwi_length, align);
      return true;
    }
  else if (optimize && align >= BITS_PER_WORD)
    {
      unsigned min_iter_words
	= RISCV_MAX_MOVE_BYTES_PER_LOOP_ITER / UNITS_PER_WORD;
      unsigned iter_words = min_iter_words;
      unsigned HOST_WIDE_INT bytes = hwi_length;
      unsigned HOST_WIDE_INT words = bytes / UNITS_PER_WORD;

      /* Lengthen the loop body if it shortens the tail.  */
      for (unsigned i = min_iter_words; i < min_iter_words * 2 - 1; i++)
	{
	  unsigned cur_cost = iter_words + words % iter_words;
	  unsigned new_cost = i + words % i;
	  if (new_cost <= cur_cost)
	    iter_words = i;
	}

      riscv_block_move_loop (dest, src, bytes, align,
			     iter_words * UNITS_PER_WORD);
      return true;
    }

  return false;
}
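/* An illustration of the sizing heuristic above (assuming a minimum loop
   body of 4 words): to copy 10 words, the default split is 4-word iterations
   plus a 2-word tail (cost 4 + 2); a 5-word body gives two full iterations
   and no tail (cost 5 + 0) and is accepted, while a 6-word body (cost 6 + 4)
   is rejected, so the emitted loop moves 5 words per iteration.  */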
/* This function delegates block-move expansion to either the vector
   implementation or the scalar one.  Return TRUE if successful or FALSE
   otherwise.  Assume that the memory regions do not overlap.  */

bool
riscv_expand_block_move (rtx dest, rtx src, rtx length)
{
  if ((TARGET_VECTOR && !TARGET_XTHEADVECTOR)
      && stringop_strategy & STRATEGY_VECTOR)
    {
      bool ok = riscv_vector::expand_block_move (dest, src, length, false);
      if (ok)
	return true;
    }

  if (stringop_strategy & STRATEGY_SCALAR)
    return riscv_expand_block_move_scalar (dest, src, length);

  return false;
}
/* Expand a block-clear instruction via cbo.zero instructions.  */

static bool
riscv_expand_block_clear_zicboz_zic64b (rtx dest, rtx length)
{
  unsigned HOST_WIDE_INT hwi_length;
  unsigned HOST_WIDE_INT align;
  const unsigned HOST_WIDE_INT cbo_bytes = 64;

  gcc_assert (TARGET_ZICBOZ && TARGET_ZIC64B);

  if (!CONST_INT_P (length))
    return false;

  hwi_length = UINTVAL (length);
  if (hwi_length < cbo_bytes)
    return false;

  align = MEM_ALIGN (dest) / BITS_PER_UNIT;
  if (align < cbo_bytes)
    return false;

  /* We don't emit loops.  Instead apply move-bytes limitation.  */
  unsigned HOST_WIDE_INT max_bytes = RISCV_MAX_MOVE_BYTES_STRAIGHT /
				     UNITS_PER_WORD * cbo_bytes;
  if (hwi_length > max_bytes)
    return false;

  unsigned HOST_WIDE_INT offset = 0;
  while (offset + cbo_bytes <= hwi_length)
    {
      rtx mem = adjust_address (dest, BLKmode, offset);
      rtx addr = force_reg (Pmode, XEXP (mem, 0));
      if (TARGET_64BIT)
	emit_insn (gen_riscv_zero_di (addr));
      else
	emit_insn (gen_riscv_zero_si (addr));
      offset += cbo_bytes;
    }

  if (offset < hwi_length)
    {
      rtx mem = adjust_address (dest, BLKmode, offset);
      clear_by_pieces (mem, hwi_length - offset, align);
    }

  return true;
}
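/* Example: clearing 200 bytes of a 64-byte-aligned destination emits
   cbo.zero at offsets 0, 64 and 128 and leaves the remaining 8 bytes to
   clear_by_pieces.  */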
bool
riscv_expand_block_clear (rtx dest, rtx length)
{
  /* Only use setmem-zero expansion for Zicboz + Zic64b.  */
  if (!TARGET_ZICBOZ || !TARGET_ZIC64B)
    return false;

  if (optimize_function_for_size_p (cfun))
    return false;

  return riscv_expand_block_clear_zicboz_zic64b (dest, length);
}

/* --- Vector expanders --- */

namespace riscv_vector
{

struct stringop_info {
  rtx avl;
  bool need_loop;
  machine_mode vmode;
};
/* If a vectorized stringop should be used populate INFO and return TRUE.
   Otherwise return false and leave INFO unchanged.

   MAX_EW is the maximum element width that the caller wants to use and
   LENGTH_IN is the length of the stringop in bytes.

   This is currently used for cpymem and setmem.  If expand_vec_cmpmem switches
   to using it too then check_vectorise_memory_operation can be removed.  */

static bool
use_vector_stringop_p (struct stringop_info &info, HOST_WIDE_INT max_ew,
		       rtx length_in)
{
  bool need_loop = true;
  machine_mode vmode = VOIDmode;
  /* The number of elements in the stringop.  */
  rtx avl = length_in;
  HOST_WIDE_INT potential_ew = max_ew;

  if (!TARGET_VECTOR || !(stringop_strategy & STRATEGY_VECTOR))
    return false;

  if (CONST_INT_P (length_in))
    {
      HOST_WIDE_INT length = INTVAL (length_in);

      /* If the VLEN and preferred LMUL allow the entire block to be copied in
	 one go then no loop is needed.  */
      if (known_le (length, BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL))
	{
	  need_loop = false;

	  /* If a single scalar load / store pair can do the job, leave it
	     to the scalar code to do that.  */
	  /* ??? If fast unaligned access is supported, the scalar code could
	     use suitably sized scalars irrespective of alignment.  If that
	     gets fixed, we have to adjust the test here.  */
	  if (pow2p_hwi (length) && length <= potential_ew)
	    return false;
	}

      /* Find the vector mode to use.  Using the largest possible element
	 size is likely to give smaller constants, and thus potentially
	 reducing code size.  However, if we need a loop, we need to update
	 the pointers, and that is more complicated with a larger element
	 size, unless we use an immediate, which prevents us from dynamically
	 using the target's transfer size that the hart supports.  And then,
	 unless we know the *exact* vector size of the hart, we'd need
	 multiple vsetvli / branch statements, so it's not even a size win.
	 If, in the future, we find an RVV implementation that is slower
	 for small element widths, we might allow larger element widths for
	 loops too.  */
      if (need_loop)
	potential_ew = 1;
      for (; potential_ew; potential_ew >>= 1)
	{
	  scalar_int_mode elem_mode;
	  unsigned HOST_WIDE_INT bits = potential_ew * BITS_PER_UNIT;
	  poly_uint64 per_iter;
	  poly_int64 nunits;

	  if (need_loop)
	    per_iter = BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL;
	  else
	    per_iter = length;
	  /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL may not be divisible by
	     this potential_ew.  */
	  if (!multiple_p (per_iter, potential_ew, &nunits))
	    continue;

	  /* Unless we get an implementation that's slow for small element
	     size / non-word-aligned accesses, we assume that the hardware
	     handles this well, and we don't want to complicate the code
	     with shifting word contents around or handling extra bytes at
	     the start and/or end.  So we want the total transfer size and
	     alignment to fit with the element size.  */
	  if (length % potential_ew != 0
	      || !int_mode_for_size (bits, 0).exists (&elem_mode))
	    continue;

	  poly_uint64 mode_units;
	  /* Find the mode to use for the copy inside the loop - or the
	     sole copy, if there is no loop.  */
	  if (!need_loop)
	    {
	      /* Try if we have an exact mode for the copy.  */
	      if (riscv_vector::get_vector_mode (elem_mode,
						 nunits).exists (&vmode))
		break;

	      /* Since we don't have a mode that exactly matches the transfer
		 size, we'll need to use pred_store, which is not available
		 for all vector modes, but only iE_RVV_M* modes, hence trying
		 to find a vector mode for a merely rounded-up size is
		 pointless.

		 Still, by choosing a lower LMUL factor that still allows
		 an entire transfer, we can reduce register pressure.  */
	      for (unsigned lmul = 1; lmul < TARGET_MAX_LMUL; lmul <<= 1)
		if (known_le (length * BITS_PER_UNIT, TARGET_MIN_VLEN * lmul)
		    && multiple_p (BYTES_PER_RISCV_VECTOR * lmul, potential_ew,
				   &mode_units)
		    && (riscv_vector::get_vector_mode
			 (elem_mode, mode_units).exists (&vmode)))
		  break;
	    }

	  /* Stop searching if a suitable vmode has been found.  */
	  if (vmode != VOIDmode)
	    break;

	  /* BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL will at least be divisible
	     by potential_ew 1, so this should succeed eventually.  */
	  if (multiple_p (BYTES_PER_RISCV_VECTOR * TARGET_MAX_LMUL,
			  potential_ew, &mode_units)
	      && riscv_vector::get_vector_mode (elem_mode,
						mode_units).exists (&vmode))
	    break;

	  /* We may get here if we tried an element size that's larger than
	     the hardware supports, but we should at least find a suitable
	     byte vector mode.  */
	  gcc_assert (potential_ew > 1);
	}
      if (potential_ew > 1)
	avl = GEN_INT (length / potential_ew);
    }
  else
    {
      gcc_assert (get_lmul_mode (QImode, TARGET_MAX_LMUL).exists (&vmode));
    }

  /* A memcpy libcall in the worst case takes 3 instructions to prepare the
     arguments + 1 for the call.  The vector expansion with a loop takes
     around 7 instructions, so when we're optimizing for size a libcall may
     be preferable.  */
  if (optimize_function_for_size_p (cfun) && need_loop)
    return false;

  info.need_loop = need_loop;
  info.vmode = vmode;
  info.avl = avl;
  return true;
}
/* Used by cpymemsi in riscv.md.  */

bool
expand_block_move (rtx dst_in, rtx src_in, rtx length_in, bool movmem_p)
{
  /*
    memcpy:
	mv a3, a0			# Copy destination
    loop:
	vsetvli t0, a2, e8, m8, ta, ma	# Vectors of 8b
	vle8.v v0, (a1)			# Load bytes
	add a1, a1, t0			# Bump pointer
	sub a2, a2, t0			# Decrement count
	vse8.v v0, (a3)			# Store bytes
	add a3, a3, t0			# Bump pointer
	bnez a2, loop			# Any more?
	ret
  */
  struct stringop_info info;

  HOST_WIDE_INT potential_ew
    = (MIN (MIN (MEM_ALIGN (src_in), MEM_ALIGN (dst_in)), BITS_PER_WORD)
       / BITS_PER_UNIT);

  if (!use_vector_stringop_p (info, potential_ew, length_in))
    return false;

  /* Inlining general memmove is a pessimisation: we can't avoid having to
     decide which direction to go at runtime, which is costly in instruction
     count.  However, for situations where the entire move fits in one vector
     operation we can do all reads before doing any writes, so we don't have
     to worry about it; generate the inline vector code in such situations.  */
  if (info.need_loop && movmem_p)
    return false;

  /* avl holds the (remaining) length of the required copy.
     cnt holds the length we copy with the current load/store pair.  */
  rtx cnt = info.avl;
  rtx label = NULL_RTX;
  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
  rtx src_addr = copy_addr_to_reg (XEXP (src_in, 0));
  rtx vec, src, dst;

  if (info.need_loop)
    {
      info.avl = copy_to_mode_reg (Pmode, info.avl);
      cnt = gen_reg_rtx (Pmode);
      label = gen_label_rtx ();

      emit_label (label);
      emit_insn (riscv_vector::gen_no_side_effects_vsetvl_rtx (info.vmode, cnt,
							       info.avl));
    }

  vec = gen_reg_rtx (info.vmode);
  src = change_address (src_in, info.vmode, src_addr);
  dst = change_address (dst_in, info.vmode, dst_addr);

  /* If we don't need a loop and have a suitable mode to describe the size,
     just do a load / store pair and leave it up to the later lazy code
     motion pass to insert the appropriate vsetvli.  */
  if (!info.need_loop
      && known_eq (GET_MODE_SIZE (info.vmode), INTVAL (length_in)))
    {
      emit_move_insn (vec, src);
      emit_move_insn (dst, vec);
    }
  else
    {
      machine_mode mask_mode = riscv_vector::get_vector_mode
	(BImode, GET_MODE_NUNITS (info.vmode)).require ();
      rtx mask = CONSTM1_RTX (mask_mode);
      if (!satisfies_constraint_K (cnt))
	cnt = force_reg (Pmode, cnt);
      rtx m_ops[] = {vec, mask, src};
      emit_nonvlmax_insn (code_for_pred_mov (info.vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops, cnt);
      emit_insn (gen_pred_store (info.vmode, dst, mask, vec, cnt,
				 get_avl_type_rtx (riscv_vector::NONVLMAX)));
    }

  if (info.need_loop)
    {
      emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, cnt)));
      emit_insn (gen_rtx_SET (dst_addr, gen_rtx_PLUS (Pmode, dst_addr, cnt)));
      emit_insn (gen_rtx_SET (info.avl, gen_rtx_MINUS (Pmode, info.avl, cnt)));

      /* Emit the loop condition.  */
      rtx test = gen_rtx_NE (VOIDmode, info.avl, const0_rtx);
      emit_jump_insn (gen_cbranch4 (Pmode, test, info.avl, const0_rtx, label));
      emit_insn (gen_nop ());
    }

  return true;
}
/* Implement rawmemchr<mode> and strlen using vector instructions.
   It can be assumed that the needle is in the haystack, otherwise the
   behavior is undefined.  */

void
expand_rawmemchr (machine_mode mode, rtx dst, rtx haystack, rtx needle,
		  bool strlen)
{
  /*
    rawmemchr:
    loop:
	vsetvli a1, zero, e[8,16,32,64], m1, ta, ma
	vle[8,16,32,64]ff.v v8, (a0)	# Load.
	csrr a1, vl			# Get number of bytes read.
	vmseq.vx v0, v8, pat		# v0 = (v8 == {pat, pat, ...})
	vfirst.m a2, v0			# Find first hit.
	add a0, a0, a1			# Bump pointer.
	bltz a2, loop			# Not found?

	sub a0, a0, a1			# Go back by a1.
	shll a2, a2, [0,1,2,3]		# Shift to get byte offset.
	add a0, a0, a2			# Add the offset.

	ret
  */
  gcc_assert (TARGET_VECTOR);

  if (strlen)
    gcc_assert (mode == E_QImode);

  unsigned int isize = GET_MODE_SIZE (mode).to_constant ();
  int lmul = TARGET_MAX_LMUL;
  poly_int64 nunits = exact_div (BYTES_PER_RISCV_VECTOR * lmul, isize);

  machine_mode vmode;
  if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode),
				      nunits).exists (&vmode))
    gcc_unreachable ();

  machine_mode mask_mode = riscv_vector::get_mask_mode (vmode);

  rtx cnt = gen_reg_rtx (Pmode);
  emit_move_insn (cnt, CONST0_RTX (Pmode));

  rtx end = gen_reg_rtx (Pmode);
  rtx vec = gen_reg_rtx (vmode);
  rtx mask = gen_reg_rtx (mask_mode);

  /* After finding the first vector element matching the needle, we
     need to multiply by the vector element width (SEW) in order to
     return a pointer to the matching byte.  */
  unsigned int shift = exact_log2 (GET_MODE_SIZE (mode).to_constant ());

  rtx src_addr = copy_addr_to_reg (XEXP (haystack, 0));
  rtx start_addr = copy_addr_to_reg (XEXP (haystack, 0));

  rtx loop = gen_label_rtx ();
  emit_label (loop);

  rtx vsrc = change_address (haystack, vmode, src_addr);

  /* Bump the pointer.  */
  rtx step = gen_reg_rtx (Pmode);
  emit_insn (gen_rtx_SET (step, gen_rtx_ASHIFT (Pmode, cnt, GEN_INT (shift))));
  emit_insn (gen_rtx_SET (src_addr, gen_rtx_PLUS (Pmode, src_addr, step)));

  /* Emit a first-fault load.  */
  rtx vlops[] = {vec, vsrc};
  emit_vlmax_insn (code_for_pred_fault_load (vmode),
		   riscv_vector::UNARY_OP, vlops);

  /* Read how far we read.  */
  if (Pmode == SImode)
    emit_insn (gen_read_vlsi (cnt));
  else
    emit_insn (gen_read_vldi_zero_extend (cnt));

  /* Compare needle with haystack and store in a mask.  */
  rtx eq = gen_rtx_EQ (mask_mode, gen_const_vec_duplicate (vmode, needle), vec);
  rtx vmsops[] = {mask, eq, vec, needle};
  emit_nonvlmax_insn (code_for_pred_cmp_scalar (vmode),
		      riscv_vector::COMPARE_OP, vmsops, cnt);

  /* Find the first bit in the mask.  */
  rtx vfops[] = {end, mask};
  emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		      riscv_vector::CPOP_OP, vfops, cnt);

  /* Emit the loop condition.  */
  rtx test = gen_rtx_LT (VOIDmode, end, const0_rtx);
  emit_jump_insn (gen_cbranch4 (Pmode, test, end, const0_rtx, loop));

  if (strlen)
    {
      /* For strlen, return the length.  */
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
      emit_insn (gen_rtx_SET (dst, gen_rtx_MINUS (Pmode, dst, start_addr)));
    }
  else
    {
      /* For rawmemchr, return the position at SRC + END * [1,2,4,8].  */
      emit_insn (gen_rtx_SET (end, gen_rtx_ASHIFT (Pmode, end, GEN_INT (shift))));
      emit_insn (gen_rtx_SET (dst, gen_rtx_PLUS (Pmode, src_addr, end)));
    }
}
1410 NCOMPARE parameters are unused for now. */
1413 expand_strcmp (rtx result
, rtx src1
, rtx src2
, rtx nbytes
,
1414 unsigned HOST_WIDE_INT
, bool)
1416 gcc_assert (TARGET_VECTOR
);
1418 /* We don't support big endian. */
1419 if (BYTES_BIG_ENDIAN
)
1422 bool with_length
= nbytes
!= NULL_RTX
;
1425 && (!REG_P (nbytes
) && !SUBREG_P (nbytes
) && !CONST_INT_P (nbytes
)))
1428 if (with_length
&& CONST_INT_P (nbytes
))
1429 nbytes
= force_reg (Pmode
, nbytes
);
1431 machine_mode mode
= E_QImode
;
1432 unsigned int isize
= GET_MODE_SIZE (mode
).to_constant ();
1433 int lmul
= TARGET_MAX_LMUL
;
1434 poly_int64 nunits
= exact_div (BYTES_PER_RISCV_VECTOR
* lmul
, isize
);
1437 if (!riscv_vector::get_vector_mode (GET_MODE_INNER (mode
), nunits
)
1441 machine_mode mask_mode
= riscv_vector::get_mask_mode (vmode
);
1443 /* Prepare addresses. */
1444 rtx src_addr1
= copy_addr_to_reg (XEXP (src1
, 0));
1445 rtx vsrc1
= change_address (src1
, vmode
, src_addr1
);
1447 rtx src_addr2
= copy_addr_to_reg (XEXP (src2
, 0));
1448 rtx vsrc2
= change_address (src2
, vmode
, src_addr2
);
1450 /* Set initial pointer bump to 0. */
1451 rtx cnt
= gen_reg_rtx (Pmode
);
1452 emit_move_insn (cnt
, CONST0_RTX (Pmode
));
1454 rtx sub
= gen_reg_rtx (Pmode
);
1455 emit_move_insn (sub
, CONST0_RTX (Pmode
));
1457 /* Create source vectors. */
1458 rtx vec1
= gen_reg_rtx (vmode
);
1459 rtx vec2
= gen_reg_rtx (vmode
);
1461 rtx done
= gen_label_rtx ();
1462 rtx loop
= gen_label_rtx ();
1465 /* Bump the pointers. */
1466 emit_insn (gen_rtx_SET (src_addr1
, gen_rtx_PLUS (Pmode
, src_addr1
, cnt
)));
1467 emit_insn (gen_rtx_SET (src_addr2
, gen_rtx_PLUS (Pmode
, src_addr2
, cnt
)));
1469 rtx vlops1
[] = {vec1
, vsrc1
};
1470 rtx vlops2
[] = {vec2
, vsrc2
};
1474 emit_vlmax_insn (code_for_pred_fault_load (vmode
),
1475 riscv_vector::UNARY_OP
, vlops1
);
1477 emit_vlmax_insn (code_for_pred_fault_load (vmode
),
1478 riscv_vector::UNARY_OP
, vlops2
);
1482 nbytes
= gen_lowpart (Pmode
, nbytes
);
1483 emit_nonvlmax_insn (code_for_pred_fault_load (vmode
),
1484 riscv_vector::UNARY_OP
, vlops1
, nbytes
);
1486 emit_nonvlmax_insn (code_for_pred_fault_load (vmode
),
1487 riscv_vector::UNARY_OP
, vlops2
, nbytes
);
1490 /* Read the vl for the next pointer bump. */
1491 if (Pmode
== SImode
)
1492 emit_insn (gen_read_vlsi (cnt
));
1494 emit_insn (gen_read_vldi_zero_extend (cnt
));
1498 rtx test_done
= gen_rtx_EQ (VOIDmode
, cnt
, const0_rtx
);
1499 emit_jump_insn (gen_cbranch4 (Pmode
, test_done
, cnt
, const0_rtx
, done
));
1500 emit_insn (gen_rtx_SET (nbytes
, gen_rtx_MINUS (Pmode
, nbytes
, cnt
)));
1503 /* Look for a \0 in the first string. */
1504 rtx mask0
= gen_reg_rtx (mask_mode
);
1506 = gen_rtx_EQ (mask_mode
, gen_const_vec_duplicate (vmode
, CONST0_RTX (mode
)),
1508 rtx vmsops1
[] = {mask0
, eq0
, vec1
, CONST0_RTX (mode
)};
1509 emit_nonvlmax_insn (code_for_pred_cmp_scalar (vmode
),
1510 riscv_vector::COMPARE_OP
, vmsops1
, cnt
);
1512 /* Look for vec1 != vec2 (includes vec2[i] == 0). */
1513 rtx maskne
= gen_reg_rtx (mask_mode
);
1514 rtx ne
= gen_rtx_NE (mask_mode
, vec1
, vec2
);
1515 rtx vmsops
[] = {maskne
, ne
, vec1
, vec2
};
1516 emit_nonvlmax_insn (code_for_pred_cmp (vmode
), riscv_vector::COMPARE_OP
,
1519 /* Combine both masks into one. */
1520 rtx mask
= gen_reg_rtx (mask_mode
);
1521 rtx vmorops
[] = {mask
, mask0
, maskne
};
1522 emit_nonvlmax_insn (code_for_pred (IOR
, mask_mode
),
1523 riscv_vector::BINARY_MASK_OP
, vmorops
, cnt
);
1525 /* Find the first bit in the mask (the first unequal element). */
1526 rtx found_at
= gen_reg_rtx (Pmode
);
1527 rtx vfops
[] = {found_at
, mask
};
1528 emit_nonvlmax_insn (code_for_pred_ffs (mask_mode
, Pmode
),
1529 riscv_vector::CPOP_OP
, vfops
, cnt
);
1531 /* Emit the loop condition. */
1532 rtx test
= gen_rtx_LT (VOIDmode
, found_at
, const0_rtx
);
1533 emit_jump_insn (gen_cbranch4 (Pmode
, test
, found_at
, const0_rtx
, loop
));
1535 /* Walk up to the difference point. */
1537 gen_rtx_SET (src_addr1
, gen_rtx_PLUS (Pmode
, src_addr1
, found_at
)));
1539 gen_rtx_SET (src_addr2
, gen_rtx_PLUS (Pmode
, src_addr2
, found_at
)));
1541 /* Load the respective byte and compute the difference. */
1542 rtx c1
= gen_reg_rtx (Pmode
);
1543 rtx c2
= gen_reg_rtx (Pmode
);
1545 do_load_from_addr (mode
, c1
, src_addr1
, src1
);
1546 do_load_from_addr (mode
, c2
, src_addr2
, src2
);
1548 do_sub3 (sub
, c1
, c2
);
1553 emit_move_insn (result
, sub
);
/* Check we are permitted to vectorise a memory operation.
   If so, return true and populate lmul_out.
   Otherwise, return false and leave lmul_out unchanged.  */

static bool
check_vectorise_memory_operation (rtx length_in, HOST_WIDE_INT &lmul_out)
{
  /* If we either can't or have been asked not to vectorise, respect this.  */
  if (!TARGET_VECTOR)
    return false;
  if (!(stringop_strategy & STRATEGY_VECTOR))
    return false;

  /* If we can't reason about the length, don't vectorise.  */
  if (!CONST_INT_P (length_in))
    return false;

  HOST_WIDE_INT length = INTVAL (length_in);

  /* If it's tiny, default operation is likely better; maybe worth
     considering fractional lmul in the future as well.  */
  if (length < (TARGET_MIN_VLEN / 8))
    return false;

  /* If we've been asked to use a specific LMUL,
     check the operation fits and do that.  */
  if (rvv_max_lmul != RVV_DYNAMIC)
    {
      lmul_out = TARGET_MAX_LMUL;
      return (length <= ((TARGET_MAX_LMUL * TARGET_MIN_VLEN) / 8));
    }

  /* Find smallest lmul large enough for entire op.  */
  HOST_WIDE_INT lmul = 1;
  while ((lmul <= 8) && (length > ((lmul * TARGET_MIN_VLEN) / 8)))
    lmul <<= 1;

  if (lmul > 8)
    return false;

  lmul_out = lmul;
  return true;
}
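/* For example, with TARGET_MIN_VLEN == 128 a single vector register covers
   16 bytes, so a 100-byte operation steps LMUL through 1, 2 and 4 (16, 32
   and 64 bytes) and settles on LMUL = 8, which covers up to 128 bytes.  */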
/* Used by setmemdi in riscv.md.  */

bool
expand_vec_setmem (rtx dst_in, rtx length_in, rtx fill_value_in)
{
  stringop_info info;

  /* Check we are able and allowed to vectorise this operation;
     bail if not.  */
  if (!use_vector_stringop_p (info, 1, length_in) || info.need_loop)
    return false;

  rtx dst_addr = copy_addr_to_reg (XEXP (dst_in, 0));
  rtx dst = change_address (dst_in, info.vmode, dst_addr);

  rtx fill_value = gen_reg_rtx (info.vmode);
  rtx broadcast_ops[] = { fill_value, fill_value_in };

  /* If the length is exactly vlmax for the selected mode, do that.
     Otherwise, use a predicated store.  */
  if (known_eq (GET_MODE_SIZE (info.vmode), INTVAL (info.avl)))
    {
      emit_vlmax_insn (code_for_pred_broadcast (info.vmode), UNARY_OP,
		       broadcast_ops);
      emit_move_insn (dst, fill_value);
    }
  else
    {
      if (!satisfies_constraint_K (info.avl))
	info.avl = force_reg (Pmode, info.avl);
      emit_nonvlmax_insn (code_for_pred_broadcast (info.vmode),
			  riscv_vector::UNARY_OP, broadcast_ops, info.avl);
      machine_mode mask_mode
	= riscv_vector::get_vector_mode (BImode, GET_MODE_NUNITS (info.vmode))
	    .require ();
      rtx mask = CONSTM1_RTX (mask_mode);
      emit_insn (gen_pred_store (info.vmode, dst, mask, fill_value, info.avl,
				 get_avl_type_rtx (riscv_vector::NONVLMAX)));
    }

  return true;
}
/* Used by cmpmemsi in riscv.md.  */

bool
expand_vec_cmpmem (rtx result_out, rtx blk_a_in, rtx blk_b_in, rtx length_in)
{
  HOST_WIDE_INT lmul;
  /* Check we are able and allowed to vectorise this operation;
     bail if not.  */
  if (!check_vectorise_memory_operation (length_in, lmul))
    return false;

  /* Strategy:
     load entire blocks at a and b into vector regs
     generate mask of bytes that differ
     find first set bit in mask
     find offset of first set bit in mask, use 0 if none set
     result is ((char*)a[offset] - (char*)b[offset])  */

  machine_mode vmode
    = riscv_vector::get_vector_mode (QImode, BYTES_PER_RISCV_VECTOR * lmul)
	.require ();
  rtx blk_a_addr = copy_addr_to_reg (XEXP (blk_a_in, 0));
  rtx blk_a = change_address (blk_a_in, vmode, blk_a_addr);
  rtx blk_b_addr = copy_addr_to_reg (XEXP (blk_b_in, 0));
  rtx blk_b = change_address (blk_b_in, vmode, blk_b_addr);

  rtx vec_a = gen_reg_rtx (vmode);
  rtx vec_b = gen_reg_rtx (vmode);

  machine_mode mask_mode = get_mask_mode (vmode);
  rtx mask = gen_reg_rtx (mask_mode);
  rtx mismatch_ofs = gen_reg_rtx (Pmode);

  rtx ne = gen_rtx_NE (mask_mode, vec_a, vec_b);
  rtx vmsops[] = { mask, ne, vec_a, vec_b };
  rtx vfops[] = { mismatch_ofs, mask };

  /* If the length is exactly vlmax for the selected mode, do that.
     Otherwise, use predicated loads.  */
  if (known_eq (GET_MODE_SIZE (vmode), INTVAL (length_in)))
    {
      emit_move_insn (vec_a, blk_a);
      emit_move_insn (vec_b, blk_b);
      emit_vlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
		       vmsops);

      emit_vlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
		       riscv_vector::CPOP_OP, vfops);
    }
  else
    {
      if (!satisfies_constraint_K (length_in))
	length_in = force_reg (Pmode, length_in);

      rtx memmask = CONSTM1_RTX (mask_mode);

      rtx m_ops_a[] = { vec_a, memmask, blk_a };
      rtx m_ops_b[] = { vec_b, memmask, blk_b };

      emit_nonvlmax_insn (code_for_pred_mov (vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops_a, length_in);
      emit_nonvlmax_insn (code_for_pred_mov (vmode),
			  riscv_vector::UNARY_OP_TAMA, m_ops_b, length_in);

      emit_nonvlmax_insn (code_for_pred_cmp (vmode), riscv_vector::COMPARE_OP,
			  vmsops, length_in);

      emit_nonvlmax_insn (code_for_pred_ffs (mask_mode, Pmode),
			  riscv_vector::CPOP_OP, vfops, length_in);
    }

  /* Mismatch_ofs is -1 if blocks match, or the offset of
     the first mismatch otherwise.  */
  rtx ltz = gen_reg_rtx (Xmode);
  emit_insn (gen_slt_3 (LT, Xmode, Xmode, ltz, mismatch_ofs, const0_rtx));
  /* mismatch_ofs += (mismatch_ofs < 0) ? 1 : 0.  */
  emit_insn (
      gen_rtx_SET (mismatch_ofs, gen_rtx_PLUS (Pmode, mismatch_ofs, ltz)));

  /* Unconditionally load the bytes at mismatch_ofs and subtract them
     to get our result.  */
  emit_insn (gen_rtx_SET (blk_a_addr,
			  gen_rtx_PLUS (Pmode, mismatch_ofs, blk_a_addr)));
  emit_insn (gen_rtx_SET (blk_b_addr,
			  gen_rtx_PLUS (Pmode, mismatch_ofs, blk_b_addr)));

  blk_a = change_address (blk_a, QImode, blk_a_addr);
  blk_b = change_address (blk_b, QImode, blk_b_addr);

  rtx byte_a = gen_reg_rtx (SImode);
  rtx byte_b = gen_reg_rtx (SImode);
  do_zero_extendqi2 (byte_a, blk_a);
  do_zero_extendqi2 (byte_b, blk_b);

  emit_insn (gen_rtx_SET (result_out, gen_rtx_MINUS (SImode, byte_a, byte_b)));

  return true;
}