libcpp, c, middle-end: Optimize initializers using #embed in C
official-gcc.git: gcc/config/i386/i386-features.cc
blob 3434d006943958dd6bbb8c2a0fe9f3728589ada3
1 /* Copyright (C) 1988-2024 Free Software Foundation, Inc.
3 This file is part of GCC.
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
19 #define IN_TARGET_CODE 1
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "shrink-wrap.h"
70 #include "builtins.h"
71 #include "rtl-iter.h"
72 #include "tree-iterator.h"
73 #include "dbgcnt.h"
74 #include "case-cfn-macros.h"
75 #include "dojump.h"
76 #include "fold-const-call.h"
77 #include "tree-vrp.h"
78 #include "tree-ssanames.h"
79 #include "selftest.h"
80 #include "selftest-rtl.h"
81 #include "print-rtl.h"
82 #include "intl.h"
83 #include "ifcvt.h"
84 #include "symbol-summary.h"
85 #include "sreal.h"
86 #include "ipa-cp.h"
87 #include "ipa-prop.h"
88 #include "ipa-fnsummary.h"
89 #include "wide-int-bitmask.h"
90 #include "tree-vector-builder.h"
91 #include "debug.h"
92 #include "dwarf2out.h"
93 #include "i386-builtins.h"
94 #include "i386-features.h"
95 #include "i386-expand.h"
97 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
98 "savms64",
99 "resms64",
100 "resms64x",
101 "savms64f",
102 "resms64f",
103 "resms64fx"
106 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
107 /* The offset values below are where each register is stored, relative to the
108    incoming stack pointer.  The value of each m_regs[].offset will be relative
109    to the incoming base pointer (rax or rsi) used by the stub.
111      s_instances:   0            1              2                3
112      Offset:                                    realigned or     aligned + 8
113      Register       aligned      aligned + 8    aligned w/HFP    w/HFP   */
114 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
115 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
116 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
117 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
118 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
119 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
120 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
121 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
122 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
123 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
124 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
125 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
126 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
127 BP_REG, /* 0xc0 0xc8 N/A N/A */
128 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
129 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
130 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
131 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
134 /* Instantiate static const values. */
135 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
136 const unsigned xlogue_layout::MIN_REGS;
137 const unsigned xlogue_layout::MAX_REGS;
138 const unsigned xlogue_layout::MAX_EXTRA_REGS;
139 const unsigned xlogue_layout::VARIANT_COUNT;
140 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
142 /* Initialize xlogue_layout::s_stub_names to zero. */
143 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
144 [STUB_NAME_MAX_LEN];
146 /* Instantiates all xlogue_layout instances. */
147 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
148 xlogue_layout (0, false),
149 xlogue_layout (8, false),
150 xlogue_layout (0, true),
151 xlogue_layout (8, true)
154 /* Return an appropriate const instance of xlogue_layout based upon values
155 in cfun->machine and crtl. */
156 const class xlogue_layout &
157 xlogue_layout::get_instance ()
159 enum xlogue_stub_sets stub_set;
160 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
162 if (stack_realign_fp)
163 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
164 else if (frame_pointer_needed)
165 stub_set = aligned_plus_8
166 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
167 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
168 else
169 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
171 return s_instances[stub_set];
174 /* Determine how many clobbered registers can be saved by the stub.
175 Returns the count of registers the stub will save and restore. */
176 unsigned
177 xlogue_layout::count_stub_managed_regs ()
179 bool hfp = frame_pointer_needed || stack_realign_fp;
180 unsigned i, count;
181 unsigned regno;
183 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
185 regno = REG_ORDER[i];
186 if (regno == BP_REG && hfp)
187 continue;
188 if (!ix86_save_reg (regno, false, false))
189 break;
190 ++count;
192 return count;
195 /* Determine if register REGNO is a stub managed register given the
196 total COUNT of stub managed registers. */
197 bool
198 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
200 bool hfp = frame_pointer_needed || stack_realign_fp;
201 unsigned i;
203 for (i = 0; i < count; ++i)
205 gcc_assert (i < MAX_REGS);
206 if (REG_ORDER[i] == BP_REG && hfp)
207 ++count;
208 else if (REG_ORDER[i] == regno)
209 return true;
211 return false;
214 /* Constructor for xlogue_layout. */
215 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
216 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
217 m_stack_align_off_in (stack_align_off_in)
219 HOST_WIDE_INT offset = stack_align_off_in;
220 unsigned i, j;
222 for (i = j = 0; i < MAX_REGS; ++i)
224 unsigned regno = REG_ORDER[i];
226 if (regno == BP_REG && hfp)
227 continue;
228 if (SSE_REGNO_P (regno))
230 offset += 16;
231 /* Verify that SSE regs are always aligned. */
232 gcc_assert (!((stack_align_off_in + offset) & 15));
234 else
235 offset += 8;
237 m_regs[j].regno = regno;
238 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
240 gcc_assert (j == m_nregs);
243 const char *
244 xlogue_layout::get_stub_name (enum xlogue_stub stub,
245 unsigned n_extra_regs)
247 const int have_avx = TARGET_AVX;
248 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
250 /* Lazy init */
251 if (!*name)
253 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
254 (have_avx ? "avx" : "sse"),
255 STUB_BASE_NAMES[stub],
256 MIN_REGS + n_extra_regs);
257 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
260 return name;
263 /* Return rtx of a symbol ref for the entry point (based upon
264 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
265 rtx
266 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
268 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
269 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
270 gcc_assert (stub < XLOGUE_STUB_COUNT);
271 gcc_assert (crtl->stack_realign_finalized);
273 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
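/* What follows implements the Scalar-To-Vector (STV) transformation:
   scalar_chain collects chains of scalar SImode/DImode (general_scalar_chain)
   or TImode (timode_scalar_chain) instructions connected through their
   register defs and uses, estimates whether executing the chain in SSE
   registers is cheaper, and if so rewrites the instructions into the
   corresponding vector mode.  */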
276 unsigned scalar_chain::max_id = 0;
278 namespace {
280 /* Initialize new chain. */
282 scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
284 smode = smode_;
285 vmode = vmode_;
287 chain_id = ++max_id;
289 if (dump_file)
290 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
292 bitmap_obstack_initialize (NULL);
293 insns = BITMAP_ALLOC (NULL);
294 defs = BITMAP_ALLOC (NULL);
295 defs_conv = BITMAP_ALLOC (NULL);
296 insns_conv = BITMAP_ALLOC (NULL);
297 queue = NULL;
299 n_sse_to_integer = 0;
300 n_integer_to_sse = 0;
302 max_visits = x86_stv_max_visits;
305 /* Free chain's data. */
307 scalar_chain::~scalar_chain ()
309 BITMAP_FREE (insns);
310 BITMAP_FREE (defs);
311 BITMAP_FREE (defs_conv);
312 BITMAP_FREE (insns_conv);
313 bitmap_obstack_release (NULL);
316 /* Add instruction into the chain's queue. */
318 void
319 scalar_chain::add_to_queue (unsigned insn_uid)
321 if (!bitmap_set_bit (queue, insn_uid))
322 return;
324 if (dump_file)
325 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
326 insn_uid, chain_id);
329 /* For DImode conversion, mark register defined by DEF as requiring
330 conversion. */
332 void
333 scalar_chain::mark_dual_mode_def (df_ref def)
335 gcc_assert (DF_REF_REG_DEF_P (def));
337 /* Record the def/insn pair so we can later efficiently iterate over
338 the defs to convert on insns not in the chain. */
339 bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
340 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
342 if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
343 && !reg_new)
344 return;
345 n_integer_to_sse++;
347 else
349 if (!reg_new)
350 return;
351 n_sse_to_integer++;
354 if (dump_file)
355 fprintf (dump_file,
356 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
357 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
360 /* Check REF's chain to add new insns into a queue
361 and find registers requiring conversion. Return true if OK, false
362 if the analysis was aborted. */
364 bool
365 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
366 bitmap disallowed)
368 df_link *chain;
369 bool mark_def = false;
371 gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));
373 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
375 unsigned uid = DF_REF_INSN_UID (chain->ref);
377 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
378 continue;
380 if (--max_visits == 0)
381 return false;
383 if (!DF_REF_REG_MEM_P (chain->ref))
385 if (bitmap_bit_p (insns, uid))
386 continue;
388 if (bitmap_bit_p (candidates, uid))
390 add_to_queue (uid);
391 continue;
394 /* If we run into parts of an aborted chain discovery, abort as well. */
395 if (bitmap_bit_p (disallowed, uid))
396 return false;
399 if (DF_REF_REG_DEF_P (chain->ref))
401 if (dump_file)
402 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
403 DF_REF_REGNO (chain->ref), uid);
404 mark_dual_mode_def (chain->ref);
406 else
408 if (dump_file)
409 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
410 DF_REF_REGNO (chain->ref), uid);
411 mark_def = true;
415 if (mark_def)
416 mark_dual_mode_def (ref);
418 return true;
421 /* Add instruction into a chain. Return true if OK, false if the search
422 was aborted. */
424 bool
425 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
426 bitmap disallowed)
428 if (!bitmap_set_bit (insns, insn_uid))
429 return true;
431 if (dump_file)
432 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
434 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
435 rtx def_set = single_set (insn);
436 if (def_set && REG_P (SET_DEST (def_set))
437 && !HARD_REGISTER_P (SET_DEST (def_set)))
438 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
440 /* ??? The following is quadratic since analyze_register_chain
441 iterates over all refs to look for dual-mode regs. Instead this
442 should be done separately for all regs mentioned in the chain once. */
443 df_ref ref;
444 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
445 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
446 if (!analyze_register_chain (candidates, ref, disallowed))
447 return false;
449 /* The operand(s) of VEC_SELECT don't need to be converted/convertible. */
450 if (def_set && GET_CODE (SET_SRC (def_set)) == VEC_SELECT)
451 return true;
453 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
454 if (!DF_REF_REG_MEM_P (ref))
455 if (!analyze_register_chain (candidates, ref, disallowed))
456 return false;
458 return true;
461 /* Build new chain starting from insn INSN_UID recursively
462 adding all dependent uses and definitions. Return true if OK, false
463 if the chain discovery was aborted. */
465 bool
466 scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
468 queue = BITMAP_ALLOC (NULL);
469 bitmap_set_bit (queue, insn_uid);
471 if (dump_file)
472 fprintf (dump_file, "Building chain #%d...\n", chain_id);
474 while (!bitmap_empty_p (queue))
476 insn_uid = bitmap_first_set_bit (queue);
477 bitmap_clear_bit (queue, insn_uid);
478 bitmap_clear_bit (candidates, insn_uid);
479 if (!add_insn (candidates, insn_uid, disallowed))
481 /* If we aborted the search, put the insns found so far on the set of
482    disallowed insns so that further searches reaching them also
483    abort, and thus we abort the whole not yet discovered chain.  */
484 bitmap_ior_into (disallowed, insns);
485 if (dump_file)
486 fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
487 BITMAP_FREE (queue);
488 return false;
492 if (dump_file)
494 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
495 fprintf (dump_file, " insns: ");
496 dump_bitmap (dump_file, insns);
497 if (!bitmap_empty_p (defs_conv))
499 bitmap_iterator bi;
500 unsigned id;
501 const char *comma = "";
502 fprintf (dump_file, " defs to convert: ");
503 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
505 fprintf (dump_file, "%sr%d", comma, id);
506 comma = ", ";
508 fprintf (dump_file, "\n");
512 BITMAP_FREE (queue);
514 return true;
517 /* Return the cost of building a vector constant
518    instead of using a scalar one.  */
520 int
521 general_scalar_chain::vector_const_cost (rtx exp)
523 gcc_assert (CONST_INT_P (exp));
525 if (standard_sse_constant_p (exp, vmode))
526 return ix86_cost->sse_op;
527 /* We have separate costs for SImode and DImode, use SImode costs
528 for smaller modes. */
529 return ix86_cost->sse_load[smode == DImode ? 1 : 0];
532 /* Compute a gain for chain conversion. */
534 int
535 general_scalar_chain::compute_convert_gain ()
537 bitmap_iterator bi;
538 unsigned insn_uid;
539 int gain = 0;
540 int cost = 0;
542 if (dump_file)
543 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
545 /* SSE costs distinguish between SImode and DImode loads/stores, for
546 int costs factor in the number of GPRs involved. When supporting
547 smaller modes than SImode the int load/store costs need to be
548 adjusted as well. */
549 unsigned sse_cost_idx = smode == DImode ? 1 : 0;
550 unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
552 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
554 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
555 rtx def_set = single_set (insn);
556 rtx src = SET_SRC (def_set);
557 rtx dst = SET_DEST (def_set);
558 int igain = 0;
560 if (REG_P (src) && REG_P (dst))
561 igain += 2 * m - ix86_cost->xmm_move;
562 else if (REG_P (src) && MEM_P (dst))
563 igain
564 += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
565 else if (MEM_P (src) && REG_P (dst))
566 igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
567 else
569 /* For operations on memory operands, include the overhead
570 of explicit load and store instructions. */
571 if (MEM_P (dst))
572 igain += optimize_insn_for_size_p ()
573 ? -COSTS_N_BYTES (8)
574 : (m * (ix86_cost->int_load[2]
575 + ix86_cost->int_store[2])
576 - (ix86_cost->sse_load[sse_cost_idx] +
577 ix86_cost->sse_store[sse_cost_idx]));
579 switch (GET_CODE (src))
581 case ASHIFT:
582 case ASHIFTRT:
583 case LSHIFTRT:
584 if (m == 2)
586 if (INTVAL (XEXP (src, 1)) >= 32)
587 igain += ix86_cost->add;
588 /* Gain for extend highpart case. */
589 else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
590 igain += ix86_cost->shift_const - ix86_cost->sse_op;
591 else
592 igain += ix86_cost->shift_const;
595 igain += ix86_cost->shift_const - ix86_cost->sse_op;
597 if (CONST_INT_P (XEXP (src, 0)))
598 igain -= vector_const_cost (XEXP (src, 0));
599 break;
601 case ROTATE:
602 case ROTATERT:
603 igain += m * ix86_cost->shift_const;
604 if (TARGET_AVX512VL)
605 igain -= ix86_cost->sse_op;
606 else if (smode == DImode)
608 int bits = INTVAL (XEXP (src, 1));
609 if ((bits & 0x0f) == 0)
610 igain -= ix86_cost->sse_op;
611 else if ((bits & 0x07) == 0)
612 igain -= 2 * ix86_cost->sse_op;
613 else
614 igain -= 3 * ix86_cost->sse_op;
616 else if (INTVAL (XEXP (src, 1)) == 16)
617 igain -= ix86_cost->sse_op;
618 else
619 igain -= 2 * ix86_cost->sse_op;
620 break;
622 case AND:
623 case IOR:
624 case XOR:
625 case PLUS:
626 case MINUS:
627 igain += m * ix86_cost->add - ix86_cost->sse_op;
628 /* Additional gain for andnot for targets without BMI. */
629 if (GET_CODE (XEXP (src, 0)) == NOT
630 && !TARGET_BMI)
631 igain += m * ix86_cost->add;
633 if (CONST_INT_P (XEXP (src, 0)))
634 igain -= vector_const_cost (XEXP (src, 0));
635 if (CONST_INT_P (XEXP (src, 1)))
636 igain -= vector_const_cost (XEXP (src, 1));
637 if (MEM_P (XEXP (src, 1)))
639 if (optimize_insn_for_size_p ())
640 igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
641 else
642 igain += m * ix86_cost->int_load[2]
643 - ix86_cost->sse_load[sse_cost_idx];
645 break;
647 case NEG:
648 case NOT:
649 igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
651 if (GET_CODE (XEXP (src, 0)) != ABS)
653 igain += m * ix86_cost->add;
654 break;
656 /* FALLTHRU */
658 case ABS:
659 case SMAX:
660 case SMIN:
661 case UMAX:
662 case UMIN:
663 /* We do not have any conditional move cost; estimate it as a
664    reg-reg move.  Comparisons are costed as adds. */
665 igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
666 /* Integer SSE ops are all costed the same. */
667 igain -= ix86_cost->sse_op;
668 break;
670 case COMPARE:
671 if (XEXP (src, 1) != const0_rtx)
673 /* cmp vs. pxor;pshufd;ptest. */
674 igain += COSTS_N_INSNS (m - 3);
676 else if (GET_CODE (XEXP (src, 0)) != AND)
678 /* test vs. pshufd;ptest. */
679 igain += COSTS_N_INSNS (m - 2);
681 else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
683 /* and;test vs. pshufd;ptest. */
684 igain += COSTS_N_INSNS (2 * m - 2);
686 else if (TARGET_BMI)
688 /* andn;test vs. pandn;pshufd;ptest. */
689 igain += COSTS_N_INSNS (2 * m - 3);
691 else
693 /* not;and;test vs. pandn;pshufd;ptest. */
694 igain += COSTS_N_INSNS (3 * m - 3);
696 break;
698 case CONST_INT:
699 if (REG_P (dst))
701 if (optimize_insn_for_size_p ())
703 /* xor (2 bytes) vs. xorps (3 bytes). */
704 if (src == const0_rtx)
705 igain -= COSTS_N_BYTES (1);
706 /* movdi_internal vs. movv2di_internal. */
707 /* => mov (5 bytes) vs. movaps (7 bytes). */
708 else if (x86_64_immediate_operand (src, SImode))
709 igain -= COSTS_N_BYTES (2);
710 else
711 /* ??? Larger immediate constants are placed in the
712 constant pool, where the size benefit/impact of
713 STV conversion is affected by whether and how
714 often each constant pool entry is shared/reused.
715 The value below is empirically derived from the
716 CSiBE benchmark (and the optimal value may drift
717 over time). */
718 igain += COSTS_N_BYTES (0);
720 else
722 /* DImode can be immediate for TARGET_64BIT
723 and SImode always. */
724 igain += m * COSTS_N_INSNS (1);
725 igain -= vector_const_cost (src);
728 else if (MEM_P (dst))
730 igain += (m * ix86_cost->int_store[2]
731 - ix86_cost->sse_store[sse_cost_idx]);
732 igain -= vector_const_cost (src);
734 break;
736 case VEC_SELECT:
737 if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
739 // movd (4 bytes) replaced with movdqa (4 bytes).
740 if (!optimize_insn_for_size_p ())
741 igain += ix86_cost->sse_to_integer - ix86_cost->xmm_move;
743 else
745 // pshufd; movd replaced with pshufd.
746 if (optimize_insn_for_size_p ())
747 igain += COSTS_N_BYTES (4);
748 else
749 igain += ix86_cost->sse_to_integer;
751 break;
753 default:
754 gcc_unreachable ();
758 if (igain != 0 && dump_file)
760 fprintf (dump_file, " Instruction gain %d for ", igain);
761 dump_insn_slim (dump_file, insn);
763 gain += igain;
766 if (dump_file)
767 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
769 /* Cost the integer to sse and sse to integer moves. */
770 if (!optimize_function_for_size_p (cfun))
772 cost += n_sse_to_integer * ix86_cost->sse_to_integer;
773 /* ??? integer_to_sse but we only have that in the RA cost table.
774 Assume sse_to_integer/integer_to_sse are the same which they
775 are at the moment. */
776 cost += n_integer_to_sse * ix86_cost->sse_to_integer;
778 else if (TARGET_64BIT || smode == SImode)
780 cost += n_sse_to_integer * COSTS_N_BYTES (4);
781 cost += n_integer_to_sse * COSTS_N_BYTES (4);
783 else if (TARGET_SSE4_1)
785 /* vmovd (4 bytes) + vpextrd (6 bytes). */
786 cost += n_sse_to_integer * COSTS_N_BYTES (10);
787 /* vmovd (4 bytes) + vpinsrd (6 bytes). */
788 cost += n_integer_to_sse * COSTS_N_BYTES (10);
790 else
792 /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
793 cost += n_sse_to_integer * COSTS_N_BYTES (13);
794 /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
795 cost += n_integer_to_sse * COSTS_N_BYTES (12);
798 if (dump_file)
799 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
801 gain -= cost;
803 if (dump_file)
804 fprintf (dump_file, " Total gain: %d\n", gain);
806 return gain;
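/* In short: the gain computed above is the summed per-insn benefit of
   staying in SSE registers, minus the cost of the GPR<->XMM transfers
   needed at the chain boundaries; only a chain whose total comes out
   positive is worth converting.  */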
809 /* Insert generated conversion instruction sequence INSNS
810 after instruction AFTER. New BB may be required in case
811 instruction has EH region attached. */
813 void
814 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
816 if (!control_flow_insn_p (after))
818 emit_insn_after (insns, after);
819 return;
822 basic_block bb = BLOCK_FOR_INSN (after);
823 edge e = find_fallthru_edge (bb->succs);
824 gcc_assert (e);
826 basic_block new_bb = split_edge (e);
827 emit_insn_after (insns, BB_HEAD (new_bb));
830 } // anon namespace
832 /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
833 zeroing the upper parts. */
835 static rtx
836 gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
838 switch (GET_MODE_NUNITS (vmode))
840 case 1:
841 return gen_rtx_SUBREG (vmode, gpr, 0);
842 case 2:
843 return gen_rtx_VEC_CONCAT (vmode, gpr,
844 CONST0_RTX (GET_MODE_INNER (vmode)));
845 default:
846 return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
847 CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
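/* For example, for V2DImode the above yields
   (vec_concat:V2DI gpr (const_int 0)), and for V4SImode
   (vec_merge:V4SI (vec_duplicate:V4SI gpr) (const_vector:V4SI [0 ...])
   (const_int 1)); in both cases only element 0 comes from the GPR and
   the upper elements are zeroed.  */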
851 /* Make vector copies for all register REGNO definitions
852 and replace its uses in a chain. */
854 void
855 scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
857 rtx vreg = *defs_map.get (reg);
859 start_sequence ();
860 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
862 rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
863 if (smode == DImode && !TARGET_64BIT)
865 emit_move_insn (adjust_address (tmp, SImode, 0),
866 gen_rtx_SUBREG (SImode, reg, 0));
867 emit_move_insn (adjust_address (tmp, SImode, 4),
868 gen_rtx_SUBREG (SImode, reg, 4));
870 else
871 emit_move_insn (copy_rtx (tmp), reg);
872 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
873 gen_gpr_to_xmm_move_src (vmode, tmp)));
875 else if (!TARGET_64BIT && smode == DImode)
877 if (TARGET_SSE4_1)
879 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
880 CONST0_RTX (V4SImode),
881 gen_rtx_SUBREG (SImode, reg, 0)));
882 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
883 gen_rtx_SUBREG (V4SImode, vreg, 0),
884 gen_rtx_SUBREG (SImode, reg, 4),
885 GEN_INT (2)));
887 else
889 rtx tmp = gen_reg_rtx (DImode);
890 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
891 CONST0_RTX (V4SImode),
892 gen_rtx_SUBREG (SImode, reg, 0)));
893 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
894 CONST0_RTX (V4SImode),
895 gen_rtx_SUBREG (SImode, reg, 4)));
896 emit_insn (gen_vec_interleave_lowv4si
897 (gen_rtx_SUBREG (V4SImode, vreg, 0),
898 gen_rtx_SUBREG (V4SImode, vreg, 0),
899 gen_rtx_SUBREG (V4SImode, tmp, 0)));
902 else
903 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
904 gen_gpr_to_xmm_move_src (vmode, reg)));
905 rtx_insn *seq = get_insns ();
906 end_sequence ();
907 emit_conversion_insns (seq, insn);
909 if (dump_file)
910 fprintf (dump_file,
911 " Copied r%d to a vector register r%d for insn %d\n",
912 REGNO (reg), REGNO (vreg), INSN_UID (insn));
915 /* Copy the definition SRC of INSN inside the chain to DST for
916 scalar uses outside of the chain. */
918 void
919 scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
921 start_sequence ();
922 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
924 rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
925 emit_move_insn (tmp, src);
926 if (!TARGET_64BIT && smode == DImode)
928 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
929 adjust_address (tmp, SImode, 0));
930 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
931 adjust_address (tmp, SImode, 4));
933 else
934 emit_move_insn (dst, copy_rtx (tmp));
936 else if (!TARGET_64BIT && smode == DImode)
938 if (TARGET_SSE4_1)
940 rtx tmp = gen_rtx_PARALLEL (VOIDmode,
941 gen_rtvec (1, const0_rtx));
942 emit_insn
943 (gen_rtx_SET
944 (gen_rtx_SUBREG (SImode, dst, 0),
945 gen_rtx_VEC_SELECT (SImode,
946 gen_rtx_SUBREG (V4SImode, src, 0),
947 tmp)));
949 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
950 emit_insn
951 (gen_rtx_SET
952 (gen_rtx_SUBREG (SImode, dst, 4),
953 gen_rtx_VEC_SELECT (SImode,
954 gen_rtx_SUBREG (V4SImode, src, 0),
955 tmp)));
957 else
959 rtx vcopy = gen_reg_rtx (V2DImode);
960 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
961 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
962 gen_rtx_SUBREG (SImode, vcopy, 0));
963 emit_move_insn (vcopy,
964 gen_rtx_LSHIFTRT (V2DImode,
965 vcopy, GEN_INT (32)));
966 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
967 gen_rtx_SUBREG (SImode, vcopy, 0));
970 else
971 emit_move_insn (dst, src);
973 rtx_insn *seq = get_insns ();
974 end_sequence ();
975 emit_conversion_insns (seq, insn);
977 if (dump_file)
978 fprintf (dump_file,
979 " Copied r%d to a scalar register r%d for insn %d\n",
980 REGNO (src), REGNO (dst), INSN_UID (insn));
983 /* Helper function to convert immediate constant X to vmode. */
984 static rtx
985 smode_convert_cst (rtx x, enum machine_mode vmode)
987 /* Prefer all ones vector in case of -1. */
988 if (constm1_operand (x, GET_MODE (x)))
989 return CONSTM1_RTX (vmode);
991 unsigned n = GET_MODE_NUNITS (vmode);
992 rtx *v = XALLOCAVEC (rtx, n);
993 v[0] = x;
994 for (unsigned i = 1; i < n; ++i)
995 v[i] = const0_rtx;
996 return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
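/* For example, converting (const_int 5) for V2DImode yields
   (const_vector:V2DI [5 0]): the scalar constant is placed in element 0
   and the remaining elements are zeroed.  */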
999 /* Convert operand OP in INSN.  We should handle
1000    memory operands and uninitialized registers.
1001    All other register uses are converted during
1002    register conversion.  */
1004 void
1005 scalar_chain::convert_op (rtx *op, rtx_insn *insn)
1007 rtx tmp;
1009 if (GET_MODE (*op) == V1TImode)
1010 return;
1012 *op = copy_rtx_if_shared (*op);
1014 if (GET_CODE (*op) == NOT
1015 || GET_CODE (*op) == ASHIFT)
1017 convert_op (&XEXP (*op, 0), insn);
1018 PUT_MODE (*op, vmode);
1020 else if (MEM_P (*op))
1022 rtx_insn *movabs = NULL;
1024 /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */
1025 if (!memory_operand (*op, GET_MODE (*op)))
1027 tmp = gen_reg_rtx (GET_MODE (*op));
1028 movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);
1030 *op = tmp;
1033 tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);
1035 rtx_insn *eh_insn
1036 = emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
1037 gen_gpr_to_xmm_move_src (vmode, *op)),
1038 insn);
1040 if (cfun->can_throw_non_call_exceptions)
1042 /* Handle REG_EH_REGION note. */
1043 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1044 if (note)
1046 if (movabs)
1047 eh_insn = movabs;
1048 control_flow_insns.safe_push (eh_insn);
1049 add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
1053 *op = tmp;
1055 if (dump_file)
1056 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
1057 INSN_UID (insn), reg_or_subregno (tmp));
1059 else if (REG_P (*op))
1060 *op = gen_rtx_SUBREG (vmode, *op, 0);
1061 else if (CONST_SCALAR_INT_P (*op))
1063 rtx vec_cst = smode_convert_cst (*op, vmode);
1065 if (!standard_sse_constant_p (vec_cst, vmode))
1067 start_sequence ();
1068 vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
1069 rtx_insn *seq = get_insns ();
1070 end_sequence ();
1071 emit_insn_before (seq, insn);
1074 tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
1076 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
1077 *op = tmp;
1079 else
1081 gcc_assert (SUBREG_P (*op));
1082 gcc_assert (GET_MODE (*op) == vmode);
1086 /* Convert CCZmode COMPARE to vector mode. */
1088 rtx
1089 scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
1091 rtx src, tmp;
1093 /* Handle any REG_EQUAL notes. */
1094 tmp = find_reg_equal_equiv_note (insn);
1095 if (tmp)
1097 if (GET_CODE (XEXP (tmp, 0)) == COMPARE
1098 && GET_MODE (XEXP (tmp, 0)) == CCZmode
1099 && REG_P (XEXP (XEXP (tmp, 0), 0)))
1101 rtx *op = &XEXP (XEXP (tmp, 0), 1);
1102 if (CONST_SCALAR_INT_P (*op))
1104 if (constm1_operand (*op, GET_MODE (*op)))
1105 *op = CONSTM1_RTX (vmode);
1106 else
1108 unsigned n = GET_MODE_NUNITS (vmode);
1109 rtx *v = XALLOCAVEC (rtx, n);
1110 v[0] = *op;
1111 for (unsigned i = 1; i < n; ++i)
1112 v[i] = const0_rtx;
1113 *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1115 tmp = NULL_RTX;
1117 else if (REG_P (*op))
1118 tmp = NULL_RTX;
1121 if (tmp)
1122 remove_note (insn, tmp);
1125 /* Comparison against anything other than zero requires an XOR. */
1126 if (op2 != const0_rtx)
1128 convert_op (&op1, insn);
1129 convert_op (&op2, insn);
1130 /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1131 if (MEM_P (op1) && MEM_P (op2))
1133 tmp = gen_reg_rtx (vmode);
1134 emit_insn_before (gen_rtx_SET (tmp, op1), insn);
1135 src = tmp;
1137 else
1138 src = op1;
1139 src = gen_rtx_XOR (vmode, src, op2);
1141 else if (GET_CODE (op1) == AND
1142 && GET_CODE (XEXP (op1, 0)) == NOT)
1144 rtx op11 = XEXP (XEXP (op1, 0), 0);
1145 rtx op12 = XEXP (op1, 1);
1146 convert_op (&op11, insn);
1147 convert_op (&op12, insn);
1148 if (!REG_P (op11))
1150 tmp = gen_reg_rtx (vmode);
1151 emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1152 op11 = tmp;
1154 src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
1156 else if (GET_CODE (op1) == AND)
1158 rtx op11 = XEXP (op1, 0);
1159 rtx op12 = XEXP (op1, 1);
1160 convert_op (&op11, insn);
1161 convert_op (&op12, insn);
1162 if (!REG_P (op11))
1164 tmp = gen_reg_rtx (vmode);
1165 emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1166 op11 = tmp;
1168 return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
1169 UNSPEC_PTEST);
1171 else
1173 convert_op (&op1, insn);
1174 src = op1;
1177 if (!REG_P (src))
1179 tmp = gen_reg_rtx (vmode);
1180 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1181 src = tmp;
1184 if (vmode == V2DImode)
1186 tmp = gen_reg_rtx (vmode);
1187 emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
1188 src = tmp;
1190 else if (vmode == V4SImode)
1192 tmp = gen_reg_rtx (vmode);
1193 emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
1194 src = tmp;
1197 return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
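/* The (unspec:CCZ [src src] UNSPEC_PTEST) built above corresponds to the
   ptest instruction: it sets ZF exactly when the bitwise AND of its two
   operands is zero, so with both operands equal to SRC it tests whether
   the whole vector is zero.  */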
1200 /* Helper function for converting INSN to vector mode. */
1202 void
1203 scalar_chain::convert_insn_common (rtx_insn *insn)
1205 /* Generate copies for out-of-chain uses of defs and adjust debug uses. */
1206 for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1207 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1209 df_link *use;
1210 for (use = DF_REF_CHAIN (ref); use; use = use->next)
1211 if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
1212 && (DF_REF_REG_MEM_P (use->ref)
1213 || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
1214 break;
1215 if (use)
1216 convert_reg (insn, DF_REF_REG (ref),
1217 *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
1218 else if (MAY_HAVE_DEBUG_BIND_INSNS)
1220 /* If we generated a scalar copy we can leave debug-insns
1221    as-is; if not, we have to adjust them.  */
1222 auto_vec<rtx_insn *, 5> to_reset_debug_insns;
1223 for (use = DF_REF_CHAIN (ref); use; use = use->next)
1224 if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
1226 rtx_insn *debug_insn = DF_REF_INSN (use->ref);
1227 /* If there's a reaching definition outside of the
1228 chain we have to reset. */
1229 df_link *def;
1230 for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
1231 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
1232 break;
1233 if (def)
1234 to_reset_debug_insns.safe_push (debug_insn);
1235 else
1237 *DF_REF_REAL_LOC (use->ref)
1238 = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
1239 df_insn_rescan (debug_insn);
1242 /* Have to do the reset outside of the DF_CHAIN walk to not
1243 disrupt it. */
1244 while (!to_reset_debug_insns.is_empty ())
1246 rtx_insn *debug_insn = to_reset_debug_insns.pop ();
1247 INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
1248 df_insn_rescan_debug_internal (debug_insn);
1253 /* Replace uses in this insn with the defs we use in the chain. */
1254 for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1255 if (!DF_REF_REG_MEM_P (ref))
1256 if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
1258 /* Also update a corresponding REG_DEAD note. */
1259 rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
1260 if (note)
1261 XEXP (note, 0) = *vreg;
1262 *DF_REF_REAL_LOC (ref) = *vreg;
1266 /* Convert INSN which is an SImode or DImode rotation by a constant
1267 to vector mode. CODE is either ROTATE or ROTATERT with operands
1268 OP0 and OP1. Returns the SET_SRC of the last instruction in the
1269 resulting sequence, which is emitted before INSN. */
1271 rtx
1272 general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
1273 rtx_insn *insn)
1275 int bits = INTVAL (op1);
1276 rtx pat, result;
1278 convert_op (&op0, insn);
1279 if (bits == 0)
1280 return op0;
1282 if (smode == DImode)
1284 if (code == ROTATE)
1285 bits = 64 - bits;
1286 if (bits == 32)
1288 rtx tmp1 = gen_reg_rtx (V4SImode);
1289 pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1290 GEN_INT (225));
1291 emit_insn_before (pat, insn);
1292 result = gen_lowpart (V2DImode, tmp1);
1294 else if (TARGET_AVX512VL)
1295 result = simplify_gen_binary (code, V2DImode, op0, op1);
1296 else if (bits == 16 || bits == 48)
1298 rtx tmp1 = gen_reg_rtx (V8HImode);
1299 pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
1300 GEN_INT (bits == 16 ? 57 : 147));
1301 emit_insn_before (pat, insn);
1302 result = gen_lowpart (V2DImode, tmp1);
1304 else if ((bits & 0x07) == 0)
1306 rtx tmp1 = gen_reg_rtx (V4SImode);
1307 pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1308 GEN_INT (68));
1309 emit_insn_before (pat, insn);
1310 rtx tmp2 = gen_reg_rtx (V1TImode);
1311 pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
1312 GEN_INT (bits));
1313 emit_insn_before (pat, insn);
1314 result = gen_lowpart (V2DImode, tmp2);
1316 else
1318 rtx tmp1 = gen_reg_rtx (V4SImode);
1319 pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1320 GEN_INT (20));
1321 emit_insn_before (pat, insn);
1322 rtx tmp2 = gen_reg_rtx (V2DImode);
1323 pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1324 GEN_INT (bits & 31));
1325 emit_insn_before (pat, insn);
1326 rtx tmp3 = gen_reg_rtx (V4SImode);
1327 pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
1328 GEN_INT (bits > 32 ? 34 : 136));
1329 emit_insn_before (pat, insn);
1330 result = gen_lowpart (V2DImode, tmp3);
1333 else if (bits == 16)
1335 rtx tmp1 = gen_reg_rtx (V8HImode);
1336 pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
1337 emit_insn_before (pat, insn);
1338 result = gen_lowpart (V4SImode, tmp1);
1340 else if (TARGET_AVX512VL)
1341 result = simplify_gen_binary (code, V4SImode, op0, op1);
1342 else
1344 if (code == ROTATE)
1345 bits = 32 - bits;
1347 rtx tmp1 = gen_reg_rtx (V4SImode);
1348 emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
1349 rtx tmp2 = gen_reg_rtx (V2DImode);
1350 pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1351 GEN_INT (bits));
1352 emit_insn_before (pat, insn);
1353 result = gen_lowpart (V4SImode, tmp2);
1356 return result;
1359 /* Convert INSN to vector mode. */
1361 void
1362 general_scalar_chain::convert_insn (rtx_insn *insn)
1364 rtx def_set = single_set (insn);
1365 rtx src = SET_SRC (def_set);
1366 rtx dst = SET_DEST (def_set);
1367 rtx subreg;
1369 if (MEM_P (dst) && !REG_P (src))
1371 /* There are no scalar integer instructions and therefore
1372 temporary register usage is required. */
1373 rtx tmp = gen_reg_rtx (smode);
1374 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
1375 dst = gen_rtx_SUBREG (vmode, tmp, 0);
1377 else if (REG_P (dst) && GET_MODE (dst) == smode)
1379 /* Replace the definition with a SUBREG to the definition we
1380 use inside the chain. */
1381 rtx *vdef = defs_map.get (dst);
1382 if (vdef)
1383 dst = *vdef;
1384 dst = gen_rtx_SUBREG (vmode, dst, 0);
1385 /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1386 is a non-REG_P. So kill those off. */
1387 rtx note = find_reg_equal_equiv_note (insn);
1388 if (note)
1389 remove_note (insn, note);
1392 switch (GET_CODE (src))
1394 case PLUS:
1395 case MINUS:
1396 case IOR:
1397 case XOR:
1398 case AND:
1399 case SMAX:
1400 case SMIN:
1401 case UMAX:
1402 case UMIN:
1403 convert_op (&XEXP (src, 1), insn);
1404 /* FALLTHRU */
1406 case ABS:
1407 case ASHIFT:
1408 case ASHIFTRT:
1409 case LSHIFTRT:
1410 convert_op (&XEXP (src, 0), insn);
1411 PUT_MODE (src, vmode);
1412 break;
1414 case ROTATE:
1415 case ROTATERT:
1416 src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
1417 insn);
1418 break;
1420 case NEG:
1421 src = XEXP (src, 0);
1423 if (GET_CODE (src) == ABS)
1425 src = XEXP (src, 0);
1426 convert_op (&src, insn);
1427 subreg = gen_reg_rtx (vmode);
1428 emit_insn_before (gen_rtx_SET (subreg,
1429 gen_rtx_ABS (vmode, src)), insn);
1430 src = subreg;
1432 else
1433 convert_op (&src, insn);
1435 subreg = gen_reg_rtx (vmode);
1436 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
1437 src = gen_rtx_MINUS (vmode, subreg, src);
1438 break;
1440 case NOT:
1441 src = XEXP (src, 0);
1442 convert_op (&src, insn);
1443 subreg = gen_reg_rtx (vmode);
1444 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
1445 src = gen_rtx_XOR (vmode, src, subreg);
1446 break;
1448 case MEM:
1449 if (!REG_P (dst))
1450 convert_op (&src, insn);
1451 break;
1453 case REG:
1454 if (!MEM_P (dst))
1455 convert_op (&src, insn);
1456 break;
1458 case SUBREG:
1459 gcc_assert (GET_MODE (src) == vmode);
1460 break;
1462 case COMPARE:
1463 dst = gen_rtx_REG (CCZmode, FLAGS_REG);
1464 src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
1465 break;
1467 case CONST_INT:
1468 convert_op (&src, insn);
1469 break;
1471 case VEC_SELECT:
1472 if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
1473 src = XEXP (src, 0);
1474 else if (smode == DImode)
1476 rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
1477 dst = gen_lowpart (V1TImode, dst);
1478 src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
1480 else
1482 rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
1483 rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
1484 rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
1485 src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
1487 break;
1489 default:
1490 gcc_unreachable ();
1493 SET_SRC (def_set) = src;
1494 SET_DEST (def_set) = dst;
1496 /* Drop possible dead definitions. */
1497 PATTERN (insn) = def_set;
1499 INSN_CODE (insn) = -1;
1500 int patt = recog_memoized (insn);
1501 if (patt == -1)
1502 fatal_insn_not_found (insn);
1503 df_insn_rescan (insn);
1506 /* Helper function to compute gain for loading an immediate constant.
1507 Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
1508 with numerous special cases. */
1510 static int
1511 timode_immed_const_gain (rtx cst)
1513 /* movabsq vs. movabsq+vmovq+vpunpcklqdq. */
1514 if (CONST_WIDE_INT_P (cst)
1515 && CONST_WIDE_INT_NUNITS (cst) == 2
1516 && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
1517 return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9)
1518 : -COSTS_N_INSNS (2);
1519 /* 2x movabsq ~ vmovdqa. */
1520 return 0;
1523 /* Compute a gain for chain conversion. */
1525 int
1526 timode_scalar_chain::compute_convert_gain ()
1528 /* Assume that if we have to move TImode values between units,
1529 then transforming this chain isn't worth it. */
1530 if (n_sse_to_integer || n_integer_to_sse)
1531 return -1;
1533 bitmap_iterator bi;
1534 unsigned insn_uid;
1536 /* Split ties to prefer V1TImode when not optimizing for size. */
1537 int gain = optimize_size ? 0 : 1;
1539 if (dump_file)
1540 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1542 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1544 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1545 rtx def_set = single_set (insn);
1546 rtx src = SET_SRC (def_set);
1547 rtx dst = SET_DEST (def_set);
1548 HOST_WIDE_INT op1val;
1549 int scost, vcost;
1550 int igain = 0;
1552 switch (GET_CODE (src))
1554 case REG:
1555 if (optimize_insn_for_size_p ())
1556 igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
1557 else
1558 igain = COSTS_N_INSNS (1);
1559 break;
1561 case MEM:
1562 igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
1563 : COSTS_N_INSNS (1);
1564 break;
1566 case CONST_INT:
1567 if (MEM_P (dst)
1568 && standard_sse_constant_p (src, V1TImode))
1569 igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1;
1570 break;
1572 case CONST_WIDE_INT:
1573 /* 2 x mov vs. vmovdqa. */
1574 if (MEM_P (dst))
1575 igain = optimize_insn_for_size_p () ? COSTS_N_BYTES (3)
1576 : COSTS_N_INSNS (1);
1577 break;
1579 case NOT:
1580 if (MEM_P (dst))
1581 igain = -COSTS_N_INSNS (1);
1582 break;
1584 case AND:
1585 case XOR:
1586 case IOR:
1587 if (!MEM_P (dst))
1588 igain = COSTS_N_INSNS (1);
1589 if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1590 igain += timode_immed_const_gain (XEXP (src, 1));
1591 break;
1593 case ASHIFT:
1594 case LSHIFTRT:
1595 /* See ix86_expand_v1ti_shift. */
1596 op1val = INTVAL (XEXP (src, 1));
1597 if (optimize_insn_for_size_p ())
1599 if (op1val == 64 || op1val == 65)
1600 scost = COSTS_N_BYTES (5);
1601 else if (op1val >= 66)
1602 scost = COSTS_N_BYTES (6);
1603 else if (op1val == 1)
1604 scost = COSTS_N_BYTES (8);
1605 else
1606 scost = COSTS_N_BYTES (9);
1608 if ((op1val & 7) == 0)
1609 vcost = COSTS_N_BYTES (5);
1610 else if (op1val > 64)
1611 vcost = COSTS_N_BYTES (10);
1612 else
1613 vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
1615 else
1617 scost = COSTS_N_INSNS (2);
1618 if ((op1val & 7) == 0)
1619 vcost = COSTS_N_INSNS (1);
1620 else if (op1val > 64)
1621 vcost = COSTS_N_INSNS (2);
1622 else
1623 vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
1625 igain = scost - vcost;
1626 break;
1628 case ASHIFTRT:
1629 /* See ix86_expand_v1ti_ashiftrt. */
1630 op1val = INTVAL (XEXP (src, 1));
1631 if (optimize_insn_for_size_p ())
1633 if (op1val == 64 || op1val == 127)
1634 scost = COSTS_N_BYTES (7);
1635 else if (op1val == 1)
1636 scost = COSTS_N_BYTES (8);
1637 else if (op1val == 65)
1638 scost = COSTS_N_BYTES (10);
1639 else if (op1val >= 66)
1640 scost = COSTS_N_BYTES (11);
1641 else
1642 scost = COSTS_N_BYTES (9);
1644 if (op1val == 127)
1645 vcost = COSTS_N_BYTES (10);
1646 else if (op1val == 64)
1647 vcost = COSTS_N_BYTES (14);
1648 else if (op1val == 96)
1649 vcost = COSTS_N_BYTES (18);
1650 else if (op1val >= 111)
1651 vcost = COSTS_N_BYTES (15);
1652 else if (TARGET_AVX2 && op1val == 32)
1653 vcost = COSTS_N_BYTES (16);
1654 else if (TARGET_SSE4_1 && op1val == 32)
1655 vcost = COSTS_N_BYTES (20);
1656 else if (op1val >= 96)
1657 vcost = COSTS_N_BYTES (23);
1658 else if ((op1val & 7) == 0)
1659 vcost = COSTS_N_BYTES (28);
1660 else if (TARGET_AVX2 && op1val < 32)
1661 vcost = COSTS_N_BYTES (30);
1662 else if (op1val == 1 || op1val >= 64)
1663 vcost = COSTS_N_BYTES (42);
1664 else
1665 vcost = COSTS_N_BYTES (47);
1667 else
1669 if (op1val >= 65 && op1val <= 126)
1670 scost = COSTS_N_INSNS (3);
1671 else
1672 scost = COSTS_N_INSNS (2);
1674 if (op1val == 127)
1675 vcost = COSTS_N_INSNS (2);
1676 else if (op1val == 64)
1677 vcost = COSTS_N_INSNS (3);
1678 else if (op1val == 96)
1679 vcost = COSTS_N_INSNS (3);
1680 else if (op1val >= 111)
1681 vcost = COSTS_N_INSNS (3);
1682 else if (TARGET_SSE4_1 && op1val == 32)
1683 vcost = COSTS_N_INSNS (3);
1684 else if (TARGET_SSE4_1
1685 && (op1val == 8 || op1val == 16 || op1val == 24))
1686 vcost = COSTS_N_INSNS (3);
1687 else if (op1val >= 96)
1688 vcost = COSTS_N_INSNS (4);
1689 else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
1690 vcost = COSTS_N_INSNS (4);
1691 else if ((op1val & 7) == 0)
1692 vcost = COSTS_N_INSNS (5);
1693 else if (TARGET_AVX2 && op1val < 32)
1694 vcost = COSTS_N_INSNS (6);
1695 else if (TARGET_SSE4_1 && op1val < 15)
1696 vcost = COSTS_N_INSNS (6);
1697 else if (op1val == 1 || op1val >= 64)
1698 vcost = COSTS_N_INSNS (8);
1699 else
1700 vcost = COSTS_N_INSNS (9);
1702 igain = scost - vcost;
1703 break;
1705 case ROTATE:
1706 case ROTATERT:
1707 /* See ix86_expand_v1ti_rotate. */
1708 op1val = INTVAL (XEXP (src, 1));
1709 if (optimize_insn_for_size_p ())
1711 scost = COSTS_N_BYTES (13);
1712 if ((op1val & 31) == 0)
1713 vcost = COSTS_N_BYTES (5);
1714 else if ((op1val & 7) == 0)
1715 vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
1716 else if (op1val > 32 && op1val < 96)
1717 vcost = COSTS_N_BYTES (24);
1718 else
1719 vcost = COSTS_N_BYTES (19);
1721 else
1723 scost = COSTS_N_INSNS (3);
1724 if ((op1val & 31) == 0)
1725 vcost = COSTS_N_INSNS (1);
1726 else if ((op1val & 7) == 0)
1727 vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
1728 else if (op1val > 32 && op1val < 96)
1729 vcost = COSTS_N_INSNS (5);
1730 else
1731 vcost = COSTS_N_INSNS (1);
1733 igain = scost - vcost;
1734 break;
1736 case COMPARE:
1737 if (XEXP (src, 1) == const0_rtx)
1739 if (GET_CODE (XEXP (src, 0)) == AND)
1740 /* and;and;or (9 bytes) vs. ptest (5 bytes). */
1741 igain = optimize_insn_for_size_p() ? COSTS_N_BYTES (4)
1742 : COSTS_N_INSNS (2);
1743 /* or (3 bytes) vs. ptest (5 bytes). */
1744 else if (optimize_insn_for_size_p ())
1745 igain = -COSTS_N_BYTES (2);
1747 else if (XEXP (src, 1) == const1_rtx)
1748 /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
1749 igain = optimize_insn_for_size_p() ? -COSTS_N_BYTES (6)
1750 : -COSTS_N_INSNS (1);
1751 break;
1753 default:
1754 break;
1757 if (igain != 0 && dump_file)
1759 fprintf (dump_file, " Instruction gain %d for ", igain);
1760 dump_insn_slim (dump_file, insn);
1762 gain += igain;
1765 if (dump_file)
1766 fprintf (dump_file, " Total gain: %d\n", gain);
1768 return gain;
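/* Unlike the general chain above, no separate cost is added here for
   GPR<->XMM transfers: chains that would need such moves were already
   rejected by the n_sse_to_integer/n_integer_to_sse check at the top of
   this function.  */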
1771 /* Fix uses of converted REG in debug insns. */
1773 void
1774 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1776 if (!flag_var_tracking)
1777 return;
1779 df_ref ref, next;
1780 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1782 rtx_insn *insn = DF_REF_INSN (ref);
1783 /* Make sure the next ref is for a different instruction,
1784 so that we're not affected by the rescan. */
1785 next = DF_REF_NEXT_REG (ref);
1786 while (next && DF_REF_INSN (next) == insn)
1787 next = DF_REF_NEXT_REG (next);
1789 if (DEBUG_INSN_P (insn))
1791 /* It may be a debug insn with a TImode variable in
1792 register. */
1793 bool changed = false;
1794 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1796 rtx *loc = DF_REF_LOC (ref);
1797 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1799 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1800 changed = true;
1803 if (changed)
1804 df_insn_rescan (insn);
1809 /* Convert INSN from TImode to V1TImode. */
1811 void
1812 timode_scalar_chain::convert_insn (rtx_insn *insn)
1814 rtx def_set = single_set (insn);
1815 rtx src = SET_SRC (def_set);
1816 rtx dst = SET_DEST (def_set);
1817 rtx tmp;
1819 switch (GET_CODE (dst))
1821 case REG:
1822 if (GET_MODE (dst) == TImode)
1824 PUT_MODE (dst, V1TImode);
1825 fix_debug_reg_uses (dst);
1827 if (GET_MODE (dst) == V1TImode)
1829 /* It might potentially be helpful to convert REG_EQUAL notes,
1830 but for now we just remove them. */
1831 rtx note = find_reg_equal_equiv_note (insn);
1832 if (note)
1833 remove_note (insn, note);
1835 break;
1836 case MEM:
1837 PUT_MODE (dst, V1TImode);
1838 break;
1840 default:
1841 gcc_unreachable ();
1844 switch (GET_CODE (src))
1846 case REG:
1847 if (GET_MODE (src) == TImode)
1849 PUT_MODE (src, V1TImode);
1850 fix_debug_reg_uses (src);
1852 break;
1854 case MEM:
1855 PUT_MODE (src, V1TImode);
1856 break;
1858 case CONST_WIDE_INT:
1859 if (NONDEBUG_INSN_P (insn))
1861 /* Since there are no instructions to store a 128-bit constant,
1862    temporary register usage is required.  */
1863 bool use_move;
1864 start_sequence ();
1865 tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
1866 if (tmp)
1868 src = lowpart_subreg (V1TImode, tmp, TImode);
1869 use_move = true;
1871 else
1873 src = smode_convert_cst (src, V1TImode);
1874 src = validize_mem (force_const_mem (V1TImode, src));
1875 use_move = MEM_P (dst);
1877 rtx_insn *seq = get_insns ();
1878 end_sequence ();
1879 if (seq)
1880 emit_insn_before (seq, insn);
1881 if (use_move)
1883 tmp = gen_reg_rtx (V1TImode);
1884 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1885 src = tmp;
1888 break;
1890 case CONST_INT:
1891 switch (standard_sse_constant_p (src, TImode))
1893 case 1:
1894 src = CONST0_RTX (GET_MODE (dst));
1895 break;
1896 case 2:
1897 src = CONSTM1_RTX (GET_MODE (dst));
1898 break;
1899 default:
1900 gcc_unreachable ();
1902 if (MEM_P (dst))
1904 tmp = gen_reg_rtx (V1TImode);
1905 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1906 src = tmp;
1908 break;
1910 case AND:
1911 if (GET_CODE (XEXP (src, 0)) == NOT)
1913 convert_op (&XEXP (XEXP (src, 0), 0), insn);
1914 convert_op (&XEXP (src, 1), insn);
1915 PUT_MODE (XEXP (src, 0), V1TImode);
1916 PUT_MODE (src, V1TImode);
1917 break;
1919 /* FALLTHRU */
1921 case XOR:
1922 case IOR:
1923 convert_op (&XEXP (src, 0), insn);
1924 convert_op (&XEXP (src, 1), insn);
1925 PUT_MODE (src, V1TImode);
1926 if (MEM_P (dst))
1928 tmp = gen_reg_rtx (V1TImode);
1929 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1930 src = tmp;
1932 break;
1934 case NOT:
1935 src = XEXP (src, 0);
1936 convert_op (&src, insn);
1937 tmp = gen_reg_rtx (V1TImode);
1938 emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
1939 src = gen_rtx_XOR (V1TImode, src, tmp);
1940 if (MEM_P (dst))
1942 tmp = gen_reg_rtx (V1TImode);
1943 emit_insn_before (gen_rtx_SET (tmp, src), insn);
1944 src = tmp;
1946 break;
1948 case COMPARE:
1949 dst = gen_rtx_REG (CCZmode, FLAGS_REG);
1950 src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
1951 break;
1953 case ASHIFT:
1954 case LSHIFTRT:
1955 case ASHIFTRT:
1956 case ROTATERT:
1957 case ROTATE:
1958 convert_op (&XEXP (src, 0), insn);
1959 PUT_MODE (src, V1TImode);
1960 break;
1962 default:
1963 gcc_unreachable ();
1966 SET_SRC (def_set) = src;
1967 SET_DEST (def_set) = dst;
1969 /* Drop possible dead definitions. */
1970 PATTERN (insn) = def_set;
1972 INSN_CODE (insn) = -1;
1973 recog_memoized (insn);
1974 df_insn_rescan (insn);
1977 /* Generate copies from defs used by the chain but not defined therein.
1978 Also populates defs_map which is used later by convert_insn. */
1980 void
1981 scalar_chain::convert_registers ()
1983 bitmap_iterator bi;
1984 unsigned id;
1985 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1987 rtx chain_reg = gen_reg_rtx (smode);
1988 defs_map.put (regno_reg_rtx[id], chain_reg);
1990 EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
1991 for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
1992 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1993 make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
1996 /* Convert the whole chain, creating the required register
1997    conversions and copies.  */
1999 int
2000 scalar_chain::convert ()
2002 bitmap_iterator bi;
2003 unsigned id;
2004 int converted_insns = 0;
2006 if (!dbg_cnt (stv_conversion))
2007 return 0;
2009 if (dump_file)
2010 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2012 convert_registers ();
2014 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2016 rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2017 convert_insn_common (insn);
2018 convert_insn (insn);
2019 converted_insns++;
2022 return converted_insns;
2025 /* Return the SET expression if INSN doesn't reference a hard register.
2026    Return NULL if INSN uses or defines a hard register, excluding
2027    pseudo register pushes, hard register uses in a memory address,
2028    clobbers and flags definitions.  */
2030 static rtx
2031 pseudo_reg_set (rtx_insn *insn)
2033 rtx set = single_set (insn);
2034 if (!set)
2035 return NULL;
2037 /* Check pseudo register push first. */
2038 machine_mode mode = TARGET_64BIT ? TImode : DImode;
2039 if (REG_P (SET_SRC (set))
2040 && !HARD_REGISTER_P (SET_SRC (set))
2041 && push_operand (SET_DEST (set), mode))
2042 return set;
2044 df_ref ref;
2045 FOR_EACH_INSN_DEF (ref, insn)
2046 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2047 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2048 && DF_REF_REGNO (ref) != FLAGS_REG)
2049 return NULL;
2051 FOR_EACH_INSN_USE (ref, insn)
2052 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2053 return NULL;
2055 return set;
2058 /* Return true if the register REG is defined in a single DEF chain.
2059    If it is defined in more than one DEF chain, we may not be able
2060    to convert it in all chains.  */
2062 static bool
2063 single_def_chain_p (rtx reg)
2065 df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2066 if (!ref)
2067 return false;
2068 return DF_REF_NEXT_REG (ref) == nullptr;
2071 /* Check if comparison INSN may be transformed into a vector comparison.
2072    Currently we transform equality/inequality checks which look like:
2073    (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y)))  */
2075 static bool
2076 convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
2078 if (mode != (TARGET_64BIT ? TImode : DImode))
2079 return false;
2081 if (!TARGET_SSE4_1)
2082 return false;
2084 rtx def_set = single_set (insn);
2086 gcc_assert (def_set);
2088 rtx src = SET_SRC (def_set);
2089 rtx dst = SET_DEST (def_set);
2091 gcc_assert (GET_CODE (src) == COMPARE);
2093 if (GET_CODE (dst) != REG
2094 || REGNO (dst) != FLAGS_REG
2095 || GET_MODE (dst) != CCZmode)
2096 return false;
2098 rtx op1 = XEXP (src, 0);
2099 rtx op2 = XEXP (src, 1);
2101 /* *cmp<dwi>_doubleword. */
2102 if ((CONST_SCALAR_INT_P (op1)
2103 || ((REG_P (op1) || MEM_P (op1))
2104 && GET_MODE (op1) == mode))
2105 && (CONST_SCALAR_INT_P (op2)
2106 || ((REG_P (op2) || MEM_P (op2))
2107 && GET_MODE (op2) == mode)))
2108 return true;
2110 /* *testti_doubleword. */
2111 if (op2 == const0_rtx
2112 && GET_CODE (op1) == AND
2113 && REG_P (XEXP (op1, 0)))
2115 rtx op12 = XEXP (op1, 1);
2116 return GET_MODE (XEXP (op1, 0)) == TImode
2117 && (CONST_SCALAR_INT_P (op12)
2118 || ((REG_P (op12) || MEM_P (op12))
2119 && GET_MODE (op12) == TImode));
2122 /* *test<dwi>_not_doubleword. */
2123 if (op2 == const0_rtx
2124 && GET_CODE (op1) == AND
2125 && GET_CODE (XEXP (op1, 0)) == NOT)
2127 rtx op11 = XEXP (XEXP (op1, 0), 0);
2128 rtx op12 = XEXP (op1, 1);
2129 return (REG_P (op11) || MEM_P (op11))
2130 && (REG_P (op12) || MEM_P (op12))
2131 && GET_MODE (op11) == mode
2132 && GET_MODE (op12) == mode;
2135 return false;
2138 /* The general version of scalar_to_vector_candidate_p. */
2140 static bool
2141 general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
2143 rtx def_set = pseudo_reg_set (insn);
2145 if (!def_set)
2146 return false;
2148 rtx src = SET_SRC (def_set);
2149 rtx dst = SET_DEST (def_set);
2151 if (GET_CODE (src) == COMPARE)
2152 return convertible_comparison_p (insn, mode);
2154 /* We are interested in "mode" only. */
2155 if ((GET_MODE (src) != mode
2156 && !CONST_INT_P (src))
2157 || GET_MODE (dst) != mode)
2158 return false;
2160 if (!REG_P (dst) && !MEM_P (dst))
2161 return false;
2163 switch (GET_CODE (src))
2165 case ASHIFT:
2166 case LSHIFTRT:
2167 case ASHIFTRT:
2168 case ROTATE:
2169 case ROTATERT:
2170 if (!CONST_INT_P (XEXP (src, 1))
2171 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
2172 return false;
2174 /* Check for extend highpart case. */
2175 if (mode != DImode
2176 || GET_CODE (src) != ASHIFTRT
2177 || GET_CODE (XEXP (src, 0)) != ASHIFT)
2178 break;
2180 src = XEXP (src, 0);
2181 break;
2183 case SMAX:
2184 case SMIN:
2185 case UMAX:
2186 case UMIN:
2187 if ((mode == DImode && !TARGET_AVX512VL)
2188 || (mode == SImode && !TARGET_SSE4_1))
2189 return false;
2190 /* Fallthru. */
2192 case AND:
2193 case IOR:
2194 case XOR:
2195 case PLUS:
2196 case MINUS:
2197 if (!REG_P (XEXP (src, 1))
2198 && !MEM_P (XEXP (src, 1))
2199 && !CONST_INT_P (XEXP (src, 1)))
2200 return false;
2202 if (GET_MODE (XEXP (src, 1)) != mode
2203 && !CONST_INT_P (XEXP (src, 1)))
2204 return false;
2206 /* Check for andnot case. */
2207 if (GET_CODE (src) != AND
2208 || GET_CODE (XEXP (src, 0)) != NOT)
2209 break;
2211 src = XEXP (src, 0);
2212 /* FALLTHRU */
2214 case NOT:
2215 break;
2217 case NEG:
2218 /* Check for nabs case. */
2219 if (GET_CODE (XEXP (src, 0)) != ABS)
2220 break;
2222 src = XEXP (src, 0);
2223 /* FALLTHRU */
2225 case ABS:
2226 if ((mode == DImode && !TARGET_AVX512VL)
2227 || (mode == SImode && !TARGET_SSSE3))
2228 return false;
2229 break;
2231 case REG:
2232 return true;
2234 case MEM:
2235 case CONST_INT:
2236 return REG_P (dst);
2238 case VEC_SELECT:
2239 	/* Excluding MEM_P (dst) avoids interfering with vpextr[dq].  */
2240 return REG_P (dst)
2241 && REG_P (XEXP (src, 0))
2242 && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
2243 : V4SImode)
2244 && GET_CODE (XEXP (src, 1)) == PARALLEL
2245 && XVECLEN (XEXP (src, 1), 0) == 1
2246 && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
2248 default:
2249 return false;
2252 if (!REG_P (XEXP (src, 0))
2253 && !MEM_P (XEXP (src, 0))
2254 && !CONST_INT_P (XEXP (src, 0)))
2255 return false;
2257 if (GET_MODE (XEXP (src, 0)) != mode
2258 && !CONST_INT_P (XEXP (src, 0)))
2259 return false;
2261 return true;
2264 /* Check for a suitable TImode memory operand. */
2266 static bool
2267 timode_mem_p (rtx x)
2269 return MEM_P (x)
2270 && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2271 || !misaligned_operand (x, TImode));
2274 /* The TImode version of scalar_to_vector_candidate_p. */
2276 static bool
2277 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2279 rtx def_set = pseudo_reg_set (insn);
2281 if (!def_set)
2282 return false;
2284 rtx src = SET_SRC (def_set);
2285 rtx dst = SET_DEST (def_set);
2287 if (GET_CODE (src) == COMPARE)
2288 return convertible_comparison_p (insn, TImode);
2290 if (GET_MODE (dst) != TImode
2291 || (GET_MODE (src) != TImode
2292 && !CONST_SCALAR_INT_P (src)))
2293 return false;
2295 if (!REG_P (dst) && !MEM_P (dst))
2296 return false;
2298 if (MEM_P (dst)
2299 && misaligned_operand (dst, TImode)
2300 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2301 return false;
2303 if (REG_P (dst) && !single_def_chain_p (dst))
2304 return false;
2306 switch (GET_CODE (src))
2308 case REG:
2309 return single_def_chain_p (src);
2311 case CONST_WIDE_INT:
2312 return true;
2314 case CONST_INT:
2315 /* ??? Verify performance impact before enabling CONST_INT for
2316 __int128 store. */
2317 return standard_sse_constant_p (src, TImode);
2319 case MEM:
2320 /* Memory must be aligned or unaligned load is optimal. */
2321 return (REG_P (dst)
2322 && (!misaligned_operand (src, TImode)
2323 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2325 case AND:
2326 if (!MEM_P (dst)
2327 && GET_CODE (XEXP (src, 0)) == NOT
2328 && REG_P (XEXP (XEXP (src, 0), 0))
2329 && (REG_P (XEXP (src, 1))
2330 || CONST_SCALAR_INT_P (XEXP (src, 1))
2331 || timode_mem_p (XEXP (src, 1))))
2332 return true;
2333 return (REG_P (XEXP (src, 0))
2334 || timode_mem_p (XEXP (src, 0)))
2335 && (REG_P (XEXP (src, 1))
2336 || CONST_SCALAR_INT_P (XEXP (src, 1))
2337 || timode_mem_p (XEXP (src, 1)));
2339 case IOR:
2340 case XOR:
2341 return (REG_P (XEXP (src, 0))
2342 || timode_mem_p (XEXP (src, 0)))
2343 && (REG_P (XEXP (src, 1))
2344 || CONST_SCALAR_INT_P (XEXP (src, 1))
2345 || timode_mem_p (XEXP (src, 1)));
2347 case NOT:
2348 return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
2350 case ASHIFT:
2351 case LSHIFTRT:
2352 case ASHIFTRT:
2353 case ROTATERT:
2354 case ROTATE:
2355 /* Handle shifts/rotates by integer constants between 0 and 127. */
2356 return REG_P (XEXP (src, 0))
2357 && CONST_INT_P (XEXP (src, 1))
2358 && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
2360 default:
2361 return false;
2365 /* For a register REGNO, scan instructions for its defs and uses.
2366 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2368 static void
2369 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2370 unsigned int regno)
2372 /* Do nothing if REGNO is already in REGS or is a hard reg. */
2373 if (bitmap_bit_p (regs, regno)
2374 || HARD_REGISTER_NUM_P (regno))
2375 return;
2377 for (df_ref def = DF_REG_DEF_CHAIN (regno);
2378 def;
2379 def = DF_REF_NEXT_REG (def))
2381 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2383 if (dump_file)
2384 fprintf (dump_file,
2385 "r%d has non convertible def in insn %d\n",
2386 regno, DF_REF_INSN_UID (def));
2388 bitmap_set_bit (regs, regno);
2389 break;
2393 for (df_ref ref = DF_REG_USE_CHAIN (regno);
2394 ref;
2395 ref = DF_REF_NEXT_REG (ref))
2397 /* Debug instructions are skipped. */
2398 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2399 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2401 if (dump_file)
2402 fprintf (dump_file,
2403 "r%d has non convertible use in insn %d\n",
2404 regno, DF_REF_INSN_UID (ref));
2406 bitmap_set_bit (regs, regno);
2407 break;
2412 /* For a given bitmap of insn UIDs, scan all instructions and
2413    remove an insn from CANDIDATES if it has both convertible
2414    and non-convertible definitions.
2416    All insns in the bitmap are conversion candidates according to
2417    scalar_to_vector_candidate_p.  Currently this implies that all
2418    insns are single_set.  */
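/* For illustration (a sketch), suppose

     (set (reg:TI 100) (mem:TI ...))   ;; candidate load
     (set (mem:TI ...) (reg:TI 100))   ;; not a candidate, e.g. a misaligned
                                       ;; store when unaligned SSE stores
                                       ;; are not optimal

   Since r100 has a use outside CANDIDATES, it cannot live in a vector
   register, so the candidate load is removed as well; the loop below
   iterates until no further removals are needed.  */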
2420 static void
2421 timode_remove_non_convertible_regs (bitmap candidates)
2423 bitmap_iterator bi;
2424 unsigned id;
2425 bitmap regs = BITMAP_ALLOC (NULL);
2426 bool changed;
2428 do {
2429 changed = false;
2430 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2432 rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2433 df_ref ref;
2435 FOR_EACH_INSN_DEF (ref, insn)
2436 if (!DF_REF_REG_MEM_P (ref)
2437 && GET_MODE (DF_REF_REG (ref)) == TImode)
2438 timode_check_non_convertible_regs (candidates, regs,
2439 DF_REF_REGNO (ref));
2441 FOR_EACH_INSN_USE (ref, insn)
2442 if (!DF_REF_REG_MEM_P (ref)
2443 && GET_MODE (DF_REF_REG (ref)) == TImode)
2444 timode_check_non_convertible_regs (candidates, regs,
2445 DF_REF_REGNO (ref));
2448 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2450 for (df_ref def = DF_REG_DEF_CHAIN (id);
2451 def;
2452 def = DF_REF_NEXT_REG (def))
2453 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2455 if (dump_file)
2456 fprintf (dump_file, "Removing insn %d from candidates list\n",
2457 DF_REF_INSN_UID (def));
2459 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2460 changed = true;
2463 for (df_ref ref = DF_REG_USE_CHAIN (id);
2464 ref;
2465 ref = DF_REF_NEXT_REG (ref))
2466 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2468 if (dump_file)
2469 fprintf (dump_file, "Removing insn %d from candidates list\n",
2470 DF_REF_INSN_UID (ref));
2472 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
2473 changed = true;
2476 } while (changed);
2478 BITMAP_FREE (regs);
2481 /* Main STV pass function. Find and convert scalar
2482 instructions into vector mode when profitable. */
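/* For illustration (assuming -m32 -msse2), a 64-bit bitwise operation such
   as

     long long f (long long a, long long b) { return a & b; }

   would normally need two 32-bit AND instructions and the matching register
   pairs; if its whole def/use chain can be converted, the pass rewrites it
   as a single V2DImode AND in an SSE register, provided
   compute_convert_gain considers the chain profitable.  */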
2484 static unsigned int
2485 convert_scalars_to_vector (bool timode_p)
2487 basic_block bb;
2488 int converted_insns = 0;
2489 auto_vec<rtx_insn *> control_flow_insns;
2491 bitmap_obstack_initialize (NULL);
2492 const machine_mode cand_mode[3] = { SImode, DImode, TImode };
2493 const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
2494 bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
2495 for (unsigned i = 0; i < 3; ++i)
2496 bitmap_initialize (&candidates[i], &bitmap_default_obstack);
2498 calculate_dominance_info (CDI_DOMINATORS);
2499 df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
2500 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2501 df_analyze ();
2503 /* Find all instructions we want to convert into vector mode. */
2504 if (dump_file)
2505 fprintf (dump_file, "Searching for mode conversion candidates...\n");
2507 FOR_EACH_BB_FN (bb, cfun)
2509 rtx_insn *insn;
2510 FOR_BB_INSNS (bb, insn)
2511 if (timode_p
2512 && timode_scalar_to_vector_candidate_p (insn))
2514 if (dump_file)
2515 fprintf (dump_file, " insn %d is marked as a TImode candidate\n",
2516 INSN_UID (insn));
2518 bitmap_set_bit (&candidates[2], INSN_UID (insn));
2520 else if (!timode_p)
2522 /* Check {SI,DI}mode. */
2523 for (unsigned i = 0; i <= 1; ++i)
2524 if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
2526 if (dump_file)
2527 fprintf (dump_file, " insn %d is marked as a %s candidate\n",
2528 INSN_UID (insn), i == 0 ? "SImode" : "DImode");
2530 bitmap_set_bit (&candidates[i], INSN_UID (insn));
2531 break;
2536 if (timode_p)
2537 timode_remove_non_convertible_regs (&candidates[2]);
2539 for (unsigned i = 0; i <= 2; ++i)
2540 if (!bitmap_empty_p (&candidates[i]))
2541 break;
2542 else if (i == 2 && dump_file)
2543 fprintf (dump_file, "There are no candidates for optimization.\n");
2545 for (unsigned i = 0; i <= 2; ++i)
2547 auto_bitmap disallowed;
2548 bitmap_tree_view (&candidates[i]);
2549 while (!bitmap_empty_p (&candidates[i]))
2551 unsigned uid = bitmap_first_set_bit (&candidates[i]);
2552 scalar_chain *chain;
2554 if (cand_mode[i] == TImode)
2555 chain = new timode_scalar_chain;
2556 else
2557 chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
2559 	  /* Find the chain of instructions we want to convert to vector mode.
2560 Check all uses and definitions to estimate all required
2561 conversions. */
2562 if (chain->build (&candidates[i], uid, disallowed))
2564 if (chain->compute_convert_gain () > 0)
2565 converted_insns += chain->convert ();
2566 else if (dump_file)
2567 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2568 chain->chain_id);
2571 rtx_insn* iter_insn;
2572 unsigned int ii;
2573 FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
2574 control_flow_insns.safe_push (iter_insn);
2576 delete chain;
2580 if (dump_file)
2581 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2583 for (unsigned i = 0; i <= 2; ++i)
2584 bitmap_release (&candidates[i]);
2585 bitmap_obstack_release (NULL);
2586 df_process_deferred_rescans ();
2588 /* Conversion means we may have 128bit register spills/fills
2589 which require aligned stack. */
2590 if (converted_insns)
2592 if (crtl->stack_alignment_needed < 128)
2593 crtl->stack_alignment_needed = 128;
2594 if (crtl->stack_alignment_estimated < 128)
2595 crtl->stack_alignment_estimated = 128;
2597 crtl->stack_realign_needed
2598 = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
2599 crtl->stack_realign_tried = crtl->stack_realign_needed;
2601 crtl->stack_realign_processed = true;
2603 if (!crtl->drap_reg)
2605 rtx drap_rtx = targetm.calls.get_drap_rtx ();
2607 /* stack_realign_drap and drap_rtx must match. */
2608 gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
2610 /* Do nothing if NULL is returned,
2611 which means DRAP is not needed. */
2612 if (drap_rtx != NULL)
2614 crtl->args.internal_arg_pointer = drap_rtx;
2616 /* Call fixup_tail_calls to clean up
2617 REG_EQUIV note if DRAP is needed. */
2618 fixup_tail_calls ();
2622 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2623 if (TARGET_64BIT)
2624 for (tree parm = DECL_ARGUMENTS (current_function_decl);
2625 parm; parm = DECL_CHAIN (parm))
2627 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2628 continue;
2629 if (DECL_RTL_SET_P (parm)
2630 && GET_MODE (DECL_RTL (parm)) == V1TImode)
2632 rtx r = DECL_RTL (parm);
2633 if (REG_P (r))
2634 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2636 if (DECL_INCOMING_RTL (parm)
2637 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2639 rtx r = DECL_INCOMING_RTL (parm);
2640 if (REG_P (r))
2641 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2645 if (!control_flow_insns.is_empty ())
2647 free_dominance_info (CDI_DOMINATORS);
2649 unsigned int i;
2650 rtx_insn* insn;
2651 FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2652 if (control_flow_insn_p (insn))
2654 /* Split the block after insn. There will be a fallthru
2655 edge, which is OK so we keep it. We have to create
2656 the exception edges ourselves. */
2657 bb = BLOCK_FOR_INSN (insn);
2658 split_block (bb, insn);
2659 rtl_make_eh_edge (NULL, bb, BB_END (bb));
2664 return 0;
2667 static unsigned int
2668 rest_of_handle_insert_vzeroupper (void)
2670 /* vzeroupper instructions are inserted immediately after reload and
2671 postreload_cse to clean up after it a little bit to account for possible
2672 spills from 256bit or 512bit registers. The pass reuses mode switching
2673 infrastructure by re-running mode insertion pass, so disable entities
2674 that have already been processed. */
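/* For illustration, a function that has been using 256-bit ymm registers
   and reaches a call while their upper halves may still be dirty would,
   roughly, end up as

       vaddps  %ymm1, %ymm0, %ymm0
       ...
       vzeroupper              # inserted by this pass
       call    bar             # BAR is an arbitrary callee

   which avoids the AVX/SSE transition penalty if BAR executes legacy SSE
   code.  */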
2675 for (int i = 0; i < MAX_386_ENTITIES; i++)
2676 ix86_optimize_mode_switching[i] = 0;
2678 ix86_optimize_mode_switching[AVX_U128] = 1;
2680 /* Call optimize_mode_switching. */
2681 g->get_passes ()->execute_pass_mode_switching ();
2683 /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2684 reappear in the IL only at the start of pass_rtl_dse2, which does
2685 df_note_add_problem (); df_analyze ();
2686 The vzeroupper is scheduled after postreload_cse pass and mode
2687 switching computes the notes as well, the problem is that e.g.
2688 pass_gcse2 doesn't maintain the notes, see PR113059 and
2689 PR112760. Remove the notes now to restore status quo ante
2690 until we figure out how to maintain the notes or what else
2691 to do. */
2692 basic_block bb;
2693 rtx_insn *insn;
2694 FOR_EACH_BB_FN (bb, cfun)
2695 FOR_BB_INSNS (bb, insn)
2696 if (NONDEBUG_INSN_P (insn))
2698 rtx *pnote = &REG_NOTES (insn);
2699 while (*pnote != 0)
2701 if (REG_NOTE_KIND (*pnote) == REG_DEAD
2702 || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2703 *pnote = XEXP (*pnote, 1);
2704 else
2705 pnote = &XEXP (*pnote, 1);
2709 df_remove_problem (df_note);
2710 df_analyze ();
2711 return 0;
2714 namespace {
2716 const pass_data pass_data_insert_vzeroupper =
2718 RTL_PASS, /* type */
2719 "vzeroupper", /* name */
2720 OPTGROUP_NONE, /* optinfo_flags */
2721 TV_MACH_DEP, /* tv_id */
2722 0, /* properties_required */
2723 0, /* properties_provided */
2724 0, /* properties_destroyed */
2725 0, /* todo_flags_start */
2726 TODO_df_finish, /* todo_flags_finish */
2729 class pass_insert_vzeroupper : public rtl_opt_pass
2731 public:
2732 pass_insert_vzeroupper(gcc::context *ctxt)
2733 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2736 /* opt_pass methods: */
2737 bool gate (function *) final override
2739 return TARGET_AVX && TARGET_VZEROUPPER;
2742 unsigned int execute (function *) final override
2744 return rest_of_handle_insert_vzeroupper ();
2747 }; // class pass_insert_vzeroupper
2749 const pass_data pass_data_stv =
2751 RTL_PASS, /* type */
2752 "stv", /* name */
2753 OPTGROUP_NONE, /* optinfo_flags */
2754 TV_MACH_DEP, /* tv_id */
2755 0, /* properties_required */
2756 0, /* properties_provided */
2757 0, /* properties_destroyed */
2758 0, /* todo_flags_start */
2759 TODO_df_finish, /* todo_flags_finish */
2762 class pass_stv : public rtl_opt_pass
2764 public:
2765 pass_stv (gcc::context *ctxt)
2766 : rtl_opt_pass (pass_data_stv, ctxt),
2767 timode_p (false)
2770 /* opt_pass methods: */
2771 bool gate (function *) final override
2773 return ((!timode_p || TARGET_64BIT)
2774 && TARGET_STV && TARGET_SSE2 && optimize > 1);
2777 unsigned int execute (function *) final override
2779 return convert_scalars_to_vector (timode_p);
2782 opt_pass *clone () final override
2784 return new pass_stv (m_ctxt);
2787 void set_pass_param (unsigned int n, bool param) final override
2789 gcc_assert (n == 0);
2790 timode_p = param;
2793 private:
2794 bool timode_p;
2795 }; // class pass_stv
2797 } // anon namespace
2799 rtl_opt_pass *
2800 make_pass_insert_vzeroupper (gcc::context *ctxt)
2802 return new pass_insert_vzeroupper (ctxt);
2805 rtl_opt_pass *
2806 make_pass_stv (gcc::context *ctxt)
2808 return new pass_stv (ctxt);
2811 /* Inserting ENDBR and pseudo patchable-area instructions. */
2813 static void
2814 rest_of_insert_endbr_and_patchable_area (bool need_endbr,
2815 unsigned int patchable_area_size)
2817 rtx endbr;
2818 rtx_insn *insn;
2819 rtx_insn *endbr_insn = NULL;
2820 basic_block bb;
2822 if (need_endbr)
2824       /* Currently emit an ENDBR if this is a tracking function, i.e. the
2825 	 'nocf_check' attribute is absent from the function attributes.
2826 	 Later an optimization will be introduced to analyze whether the
2827 	 address of a static function is taken; a static function whose
2828 	 address is not taken will get a nocf_check attribute, which will
2829 	 allow the number of ENDBRs to be reduced.  */
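/* For illustration, with -fcf-protection=branch the entry of such a
   (hypothetical) function starts with

       foo:
	       endbr64

   (endbr32 for 32-bit code), so that an indirect call landing on FOO
   satisfies the indirect-branch-tracking check.  */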
2830 if (!lookup_attribute ("nocf_check",
2831 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2832 && (!flag_manual_endbr
2833 || lookup_attribute ("cf_check",
2834 DECL_ATTRIBUTES (cfun->decl)))
2835 && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
2836 || ix86_cmodel == CM_LARGE
2837 || ix86_cmodel == CM_LARGE_PIC
2838 || flag_force_indirect_call
2839 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
2840 && DECL_DLLIMPORT_P (cfun->decl))))
2842 if (crtl->profile && flag_fentry)
2844 /* Queue ENDBR insertion to x86_function_profiler.
2845 NB: Any patchable-area insn will be inserted after
2846 ENDBR. */
2847 cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
2849 else
2851 endbr = gen_nop_endbr ();
2852 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2853 rtx_insn *insn = BB_HEAD (bb);
2854 endbr_insn = emit_insn_before (endbr, insn);
2859 if (patchable_area_size)
2861 if (crtl->profile && flag_fentry)
2863 /* Queue patchable-area insertion to x86_function_profiler.
2864 NB: If there is a queued ENDBR, x86_function_profiler
2865 will also handle patchable-area. */
2866 if (!cfun->machine->insn_queued_at_entrance)
2867 cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
2869 else
2871 rtx patchable_area
2872 = gen_patchable_area (GEN_INT (patchable_area_size),
2873 GEN_INT (crtl->patch_area_entry == 0));
2874 if (endbr_insn)
2875 emit_insn_after (patchable_area, endbr_insn);
2876 else
2878 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2879 insn = BB_HEAD (bb);
2880 emit_insn_before (patchable_area, insn);
2885 if (!need_endbr)
2886 return;
2888 bb = 0;
2889 FOR_EACH_BB_FN (bb, cfun)
2891 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2892 insn = NEXT_INSN (insn))
2894 if (CALL_P (insn))
2896 need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
2897 if (!need_endbr && !SIBLING_CALL_P (insn))
2899 rtx call = get_call_rtx_from (insn);
2900 rtx fnaddr = XEXP (call, 0);
2901 tree fndecl = NULL_TREE;
2903 /* Also generate ENDBRANCH for non-tail call which
2904 may return via indirect branch. */
2905 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
2906 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
2907 if (fndecl == NULL_TREE)
2908 fndecl = MEM_EXPR (fnaddr);
2909 if (fndecl
2910 && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
2911 && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
2912 fndecl = NULL_TREE;
2913 if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
2915 tree fntype = TREE_TYPE (fndecl);
2916 if (lookup_attribute ("indirect_return",
2917 TYPE_ATTRIBUTES (fntype)))
2918 need_endbr = true;
2921 if (!need_endbr)
2922 continue;
2923 	      /* Generate ENDBRANCH after a CALL that can return more than
2924 		 once, i.e. setjmp-like functions.  */
2926 endbr = gen_nop_endbr ();
2927 emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
2928 continue;
2931 if (JUMP_P (insn) && flag_cet_switch)
2933 rtx target = JUMP_LABEL (insn);
2934 if (target == NULL_RTX || ANY_RETURN_P (target))
2935 continue;
2938 	      /* Check that the jump targets a switch table.  */
2938 rtx_insn *label = as_a<rtx_insn *> (target);
2939 rtx_insn *table = next_insn (label);
2940 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2941 continue;
2943 	      /* For the indirect jump, find all the places it jumps to and
2944 		 insert an ENDBRANCH there.  This is done under a special flag
2945 		 to control ENDBRANCH generation for switch statements.  */
2946 edge_iterator ei;
2947 edge e;
2948 basic_block dest_blk;
2950 FOR_EACH_EDGE (e, ei, bb->succs)
2952 rtx_insn *insn;
2954 dest_blk = e->dest;
2955 insn = BB_HEAD (dest_blk);
2956 gcc_assert (LABEL_P (insn));
2957 endbr = gen_nop_endbr ();
2958 emit_insn_after (endbr, insn);
2960 continue;
2963 if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2965 endbr = gen_nop_endbr ();
2966 emit_insn_after (endbr, insn);
2967 continue;
2972 return;
2975 namespace {
2977 const pass_data pass_data_insert_endbr_and_patchable_area =
2979 RTL_PASS, /* type. */
2980 "endbr_and_patchable_area", /* name. */
2981 OPTGROUP_NONE, /* optinfo_flags. */
2982 TV_MACH_DEP, /* tv_id. */
2983 0, /* properties_required. */
2984 0, /* properties_provided. */
2985 0, /* properties_destroyed. */
2986 0, /* todo_flags_start. */
2987 0, /* todo_flags_finish. */
2990 class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
2992 public:
2993 pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2994 : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
2997 /* opt_pass methods: */
2998 bool gate (function *) final override
3000 need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
3001 patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
3002 return need_endbr || patchable_area_size;
3005 unsigned int execute (function *) final override
3007 timevar_push (TV_MACH_DEP);
3008 rest_of_insert_endbr_and_patchable_area (need_endbr,
3009 patchable_area_size);
3010 timevar_pop (TV_MACH_DEP);
3011 return 0;
3014 private:
3015 bool need_endbr;
3016 unsigned int patchable_area_size;
3017 }; // class pass_insert_endbr_and_patchable_area
3019 } // anon namespace
3021 rtl_opt_pass *
3022 make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3024 return new pass_insert_endbr_and_patchable_area (ctxt);
3027 bool
3028 ix86_rpad_gate ()
3030 return (TARGET_AVX
3031 && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3032 && TARGET_SSE_MATH
3033 && optimize
3034 && optimize_function_for_speed_p (cfun));
3037 /* At entry of the nearest common dominator for basic blocks with
3038 conversions/rcp/sqrt/rsqrt/round, generate a single
3039 vxorps %xmmN, %xmmN, %xmmN
3040 for all
3041 vcvtss2sd op, %xmmN, %xmmX
3042 vcvtsd2ss op, %xmmN, %xmmX
3043 vcvtsi2ss op, %xmmN, %xmmX
3044 vcvtsi2sd op, %xmmN, %xmmX
3046 NB: We want to generate only a single vxorps to cover the whole
3047 function. The LCM algorithm isn't appropriate here since it may
3048 place a vxorps inside the loop. */
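/* For illustration, a (hypothetical) scalar conversion such as

     float f (int i) { return i; }

   compiled with -mavx is, roughly, emitted as

       vxorps     %xmm0, %xmm0, %xmm0   # single zeroing insn shared by all
                                        # conversions in the function
       vcvtsi2ss  %edi, %xmm0, %xmm0

   so the conversion no longer has a false dependence on whatever was last
   written to %xmm0.  */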
3050 static unsigned int
3051 remove_partial_avx_dependency (void)
3053 timevar_push (TV_MACH_DEP);
3055 bitmap_obstack_initialize (NULL);
3056 bitmap convert_bbs = BITMAP_ALLOC (NULL);
3058 basic_block bb;
3059 rtx_insn *insn, *set_insn;
3060 rtx set;
3061 rtx v4sf_const0 = NULL_RTX;
3063 auto_vec<rtx_insn *> control_flow_insns;
3065 /* We create invalid RTL initially so defer rescans. */
3066 df_set_flags (DF_DEFER_INSN_RESCAN);
3068 FOR_EACH_BB_FN (bb, cfun)
3070 FOR_BB_INSNS (bb, insn)
3072 if (!NONDEBUG_INSN_P (insn))
3073 continue;
3075 set = single_set (insn);
3076 if (!set)
3077 continue;
3079 if (get_attr_avx_partial_xmm_update (insn)
3080 != AVX_PARTIAL_XMM_UPDATE_TRUE)
3081 continue;
3083 /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
3084 SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
3085 round, to vec_dup and vec_merge with subreg. */
3086 rtx src = SET_SRC (set);
3087 rtx dest = SET_DEST (set);
3088 machine_mode dest_mode = GET_MODE (dest);
3089 bool convert_p = false;
3090 switch (GET_CODE (src))
3092 case FLOAT:
3093 case FLOAT_EXTEND:
3094 case FLOAT_TRUNCATE:
3095 case UNSIGNED_FLOAT:
3096 convert_p = true;
3097 break;
3098 default:
3099 break;
3102 	  /* Only handle conversions here.  */
3103 machine_mode src_mode
3104 = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
3105 switch (src_mode)
3107 case E_SFmode:
3108 case E_DFmode:
3109 if (TARGET_USE_VECTOR_FP_CONVERTS
3110 || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
3111 continue;
3112 break;
3113 case E_SImode:
3114 case E_DImode:
3115 if (TARGET_USE_VECTOR_CONVERTS
3116 || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
3117 continue;
3118 break;
3119 case E_VOIDmode:
3120 gcc_assert (!convert_p);
3121 break;
3122 default:
3123 gcc_unreachable ();
3126 if (!v4sf_const0)
3127 v4sf_const0 = gen_reg_rtx (V4SFmode);
3129 rtx zero;
3130 machine_mode dest_vecmode;
3131 switch (dest_mode)
3133 case E_HFmode:
3134 dest_vecmode = V8HFmode;
3135 zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
3136 break;
3137 case E_SFmode:
3138 dest_vecmode = V4SFmode;
3139 zero = v4sf_const0;
3140 break;
3141 case E_DFmode:
3142 dest_vecmode = V2DFmode;
3143 zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
3144 break;
3145 default:
3146 gcc_unreachable ();
3149 /* Change source to vector mode. */
3150 src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
3151 src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
3152 GEN_INT (HOST_WIDE_INT_1U));
3153 /* Change destination to vector mode. */
3154 rtx vec = gen_reg_rtx (dest_vecmode);
3155 /* Generate an XMM vector SET. */
3156 set = gen_rtx_SET (vec, src);
3157 set_insn = emit_insn_before (set, insn);
3158 df_insn_rescan (set_insn);
3160 if (cfun->can_throw_non_call_exceptions)
3162 /* Handle REG_EH_REGION note. */
3163 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
3164 if (note)
3166 control_flow_insns.safe_push (set_insn);
3167 add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
3171 src = gen_rtx_SUBREG (dest_mode, vec, 0);
3172 set = gen_rtx_SET (dest, src);
3174 /* Drop possible dead definitions. */
3175 PATTERN (insn) = set;
3177 INSN_CODE (insn) = -1;
3178 recog_memoized (insn);
3179 df_insn_rescan (insn);
3180 bitmap_set_bit (convert_bbs, bb->index);
3184 if (v4sf_const0)
3186 /* (Re-)discover loops so that bb->loop_father can be used in the
3187 analysis below. */
3188 calculate_dominance_info (CDI_DOMINATORS);
3189 loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
3191 /* Generate a vxorps at entry of the nearest dominator for basic
3192 blocks with conversions, which is in the fake loop that
3193 contains the whole function, so that there is only a single
3194 vxorps in the whole function. */
3195 bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
3196 convert_bbs);
3197 while (bb->loop_father->latch
3198 != EXIT_BLOCK_PTR_FOR_FN (cfun))
3199 bb = get_immediate_dominator (CDI_DOMINATORS,
3200 bb->loop_father->header);
3202 set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
3204 insn = BB_HEAD (bb);
3205 while (insn && !NONDEBUG_INSN_P (insn))
3207 if (insn == BB_END (bb))
3209 insn = NULL;
3210 break;
3212 insn = NEXT_INSN (insn);
3214 if (insn == BB_HEAD (bb))
3215 set_insn = emit_insn_before (set, insn);
3216 else
3217 set_insn = emit_insn_after (set,
3218 insn ? PREV_INSN (insn) : BB_END (bb));
3219 df_insn_rescan (set_insn);
3220 loop_optimizer_finalize ();
3222 if (!control_flow_insns.is_empty ())
3224 free_dominance_info (CDI_DOMINATORS);
3226 unsigned int i;
3227 FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
3228 if (control_flow_insn_p (insn))
3230 /* Split the block after insn. There will be a fallthru
3231 edge, which is OK so we keep it. We have to create
3232 the exception edges ourselves. */
3233 bb = BLOCK_FOR_INSN (insn);
3234 split_block (bb, insn);
3235 rtl_make_eh_edge (NULL, bb, BB_END (bb));
3240 df_process_deferred_rescans ();
3241 df_clear_flags (DF_DEFER_INSN_RESCAN);
3242 bitmap_obstack_release (NULL);
3243 BITMAP_FREE (convert_bbs);
3245 timevar_pop (TV_MACH_DEP);
3246 return 0;
3249 namespace {
3251 const pass_data pass_data_remove_partial_avx_dependency =
3253 RTL_PASS, /* type */
3254 "rpad", /* name */
3255 OPTGROUP_NONE, /* optinfo_flags */
3256 TV_MACH_DEP, /* tv_id */
3257 0, /* properties_required */
3258 0, /* properties_provided */
3259 0, /* properties_destroyed */
3260 0, /* todo_flags_start */
3261 0, /* todo_flags_finish */
3264 class pass_remove_partial_avx_dependency : public rtl_opt_pass
3266 public:
3267 pass_remove_partial_avx_dependency (gcc::context *ctxt)
3268 : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
3271 /* opt_pass methods: */
3272 bool gate (function *) final override
3274 return ix86_rpad_gate ();
3277 unsigned int execute (function *) final override
3279 return remove_partial_avx_dependency ();
3281 }; // class pass_remove_partial_avx_dependency
3283 } // anon namespace
3285 rtl_opt_pass *
3286 make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
3288 return new pass_remove_partial_avx_dependency (ctxt);
3291 /* Convert legacy instructions that clobber EFLAGS to APX_NF
3292    instructions when no flags are set between a flag
3293    producer and user.  */
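/* For illustration (a sketch), in a sequence like

       setl    %al                  # cstorecc, consumes the flags
       add     %rdx, %rcx           # flag result never read

   the ADD's EFLAGS output is unused, so it can be replaced by its APX
   no-flags form (the {nf}-prefixed encoding), which does not write EFLAGS
   at all.  The rewrite below simply drops the (clobber (reg FLAGS_REG))
   from the insn's PARALLEL so that the nf pattern is matched instead.  */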
3295 static unsigned int
3296 ix86_apx_nf_convert (void)
3298 timevar_push (TV_MACH_DEP);
3300 basic_block bb;
3301 rtx_insn *insn;
3302 hash_map <rtx_insn *, rtx> converting_map;
3303 auto_vec <rtx_insn *> current_convert_list;
3305 bool converting_seq = false;
3306 rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);
3308 FOR_EACH_BB_FN (bb, cfun)
3310 /* Reset conversion for each bb. */
3311 converting_seq = false;
3312 FOR_BB_INSNS (bb, insn)
3314 if (!NONDEBUG_INSN_P (insn))
3315 continue;
3317 if (recog_memoized (insn) < 0)
3318 continue;
3320 	  /* Convert candidate insns after a cstore; a candidate should
3321 	     satisfy two conditions:
3322 	     1. It is not a flag user or producer and only clobbers
3323 	     FLAGS_REG.
3324 	     2. It has a corresponding nf pattern.  */
3326 rtx pat = PATTERN (insn);
3328 	  /* Start the conversion at the first cstorecc.  */
3329 rtx set = NULL_RTX;
3330 if (!converting_seq
3331 && (set = single_set (insn))
3332 && ix86_comparison_operator (SET_SRC (set), VOIDmode)
3333 && reg_overlap_mentioned_p (cc, SET_SRC (set))
3334 && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
3336 converting_seq = true;
3337 current_convert_list.truncate (0);
3339 /* Terminate at the next explicit flag set. */
3340 else if (reg_set_p (cc, pat)
3341 && GET_CODE (set_of (cc, pat)) != CLOBBER)
3342 converting_seq = false;
3344 if (!converting_seq)
3345 continue;
3347 if (get_attr_has_nf (insn)
3348 && GET_CODE (pat) == PARALLEL)
3350 /* Record the insn to candidate map. */
3351 current_convert_list.safe_push (insn);
3352 converting_map.put (insn, pat);
3354 /* If the insn clobbers flags but has no nf_attr,
3355 revoke all previous candidates. */
3356 else if (!get_attr_has_nf (insn)
3357 && reg_set_p (cc, pat)
3358 && GET_CODE (set_of (cc, pat)) == CLOBBER)
3360 for (auto item : current_convert_list)
3361 converting_map.remove (item);
3362 converting_seq = false;
3367 if (!converting_map.is_empty ())
3369 for (auto iter = converting_map.begin ();
3370 iter != converting_map.end (); ++iter)
3372 rtx_insn *replace = (*iter).first;
3373 rtx pat = (*iter).second;
3374 int i, n = 0, len = XVECLEN (pat, 0);
3375 rtx *new_elems = XALLOCAVEC (rtx, len);
3376 rtx new_pat;
3377 for (i = 0; i < len; i++)
3379 rtx temp = XVECEXP (pat, 0, i);
3380 if (! (GET_CODE (temp) == CLOBBER
3381 && reg_overlap_mentioned_p (cc,
3382 XEXP (temp, 0))))
3384 new_elems[n] = temp;
3385 n++;
3389 if (n == 1)
3390 new_pat = new_elems[0];
3391 else
3392 new_pat =
3393 gen_rtx_PARALLEL (VOIDmode,
3394 gen_rtvec_v (n,
3395 new_elems));
3397 PATTERN (replace) = new_pat;
3398 INSN_CODE (replace) = -1;
3399 recog_memoized (replace);
3400 df_insn_rescan (replace);
3404 timevar_pop (TV_MACH_DEP);
3405 return 0;
3409 namespace {
3411 const pass_data pass_data_apx_nf_convert =
3413 RTL_PASS, /* type */
3414 "apx_nfcvt", /* name */
3415 OPTGROUP_NONE, /* optinfo_flags */
3416 TV_MACH_DEP, /* tv_id */
3417 0, /* properties_required */
3418 0, /* properties_provided */
3419 0, /* properties_destroyed */
3420 0, /* todo_flags_start */
3421 0, /* todo_flags_finish */
3424 class pass_apx_nf_convert : public rtl_opt_pass
3426 public:
3427 pass_apx_nf_convert (gcc::context *ctxt)
3428 : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
3431 /* opt_pass methods: */
3432 bool gate (function *) final override
3434 return (TARGET_APX_NF
3435 && optimize
3436 && optimize_function_for_speed_p (cfun));
3439 unsigned int execute (function *) final override
3441 return ix86_apx_nf_convert ();
3443 }; // class pass_apx_nf_convert
3445 } // anon namespace
3447 rtl_opt_pass *
3448 make_pass_apx_nf_convert (gcc::context *ctxt)
3450 return new pass_apx_nf_convert (ctxt);
3453 /* When a hot loop can fit into one cacheline,
3454    force-align the loop without considering the max skip.  */
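/* For illustration, if a hot inner loop encodes to 40 bytes and the cache
   line (ix86_cost->prefetch_block) is 64 bytes, the loop label gets an
   alignment of 1 << ceil_log2 (40) == 64 with a max skip of 0, so the
   whole body lands in a single cache line.  */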
3455 static void
3456 ix86_align_loops ()
3458 basic_block bb;
3460 /* Don't do this when we don't know cache line size. */
3461 if (ix86_cost->prefetch_block == 0)
3462 return;
3464 loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
3465 profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
3466 FOR_EACH_BB_FN (bb, cfun)
3468 rtx_insn *label = BB_HEAD (bb);
3469 bool has_fallthru = 0;
3470 edge e;
3471 edge_iterator ei;
3473 if (!LABEL_P (label))
3474 continue;
3476 profile_count fallthru_count = profile_count::zero ();
3477 profile_count branch_count = profile_count::zero ();
3479 FOR_EACH_EDGE (e, ei, bb->preds)
3481 if (e->flags & EDGE_FALLTHRU)
3482 has_fallthru = 1, fallthru_count += e->count ();
3483 else
3484 branch_count += e->count ();
3487 if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
3488 continue;
3490 if (bb->loop_father
3491 && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
3492 && (has_fallthru
3493 ? (!(single_succ_p (bb)
3494 && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
3495 && optimize_bb_for_speed_p (bb)
3496 && branch_count + fallthru_count > count_threshold
3497 && (branch_count > fallthru_count * param_align_loop_iterations))
3498 	       /* In case there is no fallthru for the loop,
3499 		  the nops inserted won't be executed.  */
3500 : (branch_count > count_threshold
3501 || (bb->count > bb->prev_bb->count * 10
3502 && (bb->prev_bb->count
3503 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
3505 rtx_insn* insn, *end_insn;
3506 HOST_WIDE_INT size = 0;
3507 bool padding_p = true;
3508 basic_block tbb = bb;
3509 unsigned cond_branch_num = 0;
3510 bool detect_tight_loop_p = false;
3512 for (unsigned int i = 0; i != bb->loop_father->num_nodes;
3513 i++, tbb = tbb->next_bb)
3515 	    /* Only handle a contiguous cfg layout.  */
3516 if (bb->loop_father != tbb->loop_father)
3518 padding_p = false;
3519 break;
3522 FOR_BB_INSNS (tbb, insn)
3524 if (!NONDEBUG_INSN_P (insn))
3525 continue;
3526 size += ix86_min_insn_size (insn);
3528 		/* We don't know the size of inline asm.
3529 		   Don't align the loop if it contains a call.  */
3530 if (asm_noperands (PATTERN (insn)) >= 0
3531 || CALL_P (insn))
3533 size = -1;
3534 break;
3538 if (size == -1 || size > ix86_cost->prefetch_block)
3540 padding_p = false;
3541 break;
3544 FOR_EACH_EDGE (e, ei, tbb->succs)
3546 /* It could be part of the loop. */
3547 if (e->dest == bb)
3549 detect_tight_loop_p = true;
3550 break;
3554 if (detect_tight_loop_p)
3555 break;
3557 end_insn = BB_END (tbb);
3558 if (JUMP_P (end_insn))
3560 /* For decoded icache:
3561 1. Up to two branches are allowed per Way.
3562 2. A non-conditional branch is the last micro-op in a Way.
3564 if (onlyjump_p (end_insn)
3565 && (any_uncondjump_p (end_insn)
3566 || single_succ_p (tbb)))
3568 padding_p = false;
3569 break;
3571 else if (++cond_branch_num >= 2)
3573 padding_p = false;
3574 break;
3580 if (padding_p && detect_tight_loop_p)
3582 emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
3583 GEN_INT (0)), label);
3584 /* End of function. */
3585 if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
3586 break;
3587 /* Skip bb which already fits into one cacheline. */
3588 bb = tbb;
3593 loop_optimizer_finalize ();
3594 free_dominance_info (CDI_DOMINATORS);
3597 namespace {
3599 const pass_data pass_data_align_tight_loops =
3601 RTL_PASS, /* type */
3602 "align_tight_loops", /* name */
3603 OPTGROUP_NONE, /* optinfo_flags */
3604 TV_MACH_DEP, /* tv_id */
3605 0, /* properties_required */
3606 0, /* properties_provided */
3607 0, /* properties_destroyed */
3608 0, /* todo_flags_start */
3609 0, /* todo_flags_finish */
3612 class pass_align_tight_loops : public rtl_opt_pass
3614 public:
3615 pass_align_tight_loops (gcc::context *ctxt)
3616 : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
3619 /* opt_pass methods: */
3620 bool gate (function *) final override
3622 return optimize && optimize_function_for_speed_p (cfun);
3625 unsigned int execute (function *) final override
3627 timevar_push (TV_MACH_DEP);
3628 #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
3629 ix86_align_loops ();
3630 #endif
3631 timevar_pop (TV_MACH_DEP);
3632 return 0;
3634 }; // class pass_align_tight_loops
3636 } // anon namespace
3638 rtl_opt_pass *
3639 make_pass_align_tight_loops (gcc::context *ctxt)
3641 return new pass_align_tight_loops (ctxt);
3644 /* This compares the priority of target features in function DECL1
3645 and DECL2. It returns positive value if DECL1 is higher priority,
3646 negative value if DECL2 is higher priority and 0 if they are the
3647 same. */
3649 int
3650 ix86_compare_version_priority (tree decl1, tree decl2)
3652 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
3653 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
3655 return (int)priority1 - (int)priority2;
3658 /* V1 and V2 point to function versions with different priorities
3659 based on the target ISA. This function compares their priorities. */
3661 static int
3662 feature_compare (const void *v1, const void *v2)
3664 typedef struct _function_version_info
3666 tree version_decl;
3667 tree predicate_chain;
3668 unsigned int dispatch_priority;
3669 } function_version_info;
3671 const function_version_info c1 = *(const function_version_info *)v1;
3672 const function_version_info c2 = *(const function_version_info *)v2;
3673 return (c2.dispatch_priority - c1.dispatch_priority);
3676 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
3677 to return a pointer to VERSION_DECL if the outcome of the expression
3678 formed by PREDICATE_CHAIN is true. This function will be called during
3679 version dispatch to decide which function version to execute. It returns
3680 the basic block at the end, to which more conditions can be added. */
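/* For illustration, the code appended for one version looks roughly like
   the following pseudo-C:

     cond1 = predicate1 (arg1);
     cond2 = predicate2 (arg2);
     tmp = MIN (cond1, cond2);          // MIN_EXPR acts as logical AND
     if (tmp > 0)
       return (void *) version_decl;
     // otherwise fall through to the test for the next version

   where each predicate is typically a __builtin_cpu_is /
   __builtin_cpu_supports style check taken from PREDICATE_CHAIN.  */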
3682 static basic_block
3683 add_condition_to_bb (tree function_decl, tree version_decl,
3684 tree predicate_chain, basic_block new_bb)
3686 gimple *return_stmt;
3687 tree convert_expr, result_var;
3688 gimple *convert_stmt;
3689 gimple *call_cond_stmt;
3690 gimple *if_else_stmt;
3692 basic_block bb1, bb2, bb3;
3693 edge e12, e23;
3695 tree cond_var, and_expr_var = NULL_TREE;
3696 gimple_seq gseq;
3698 tree predicate_decl, predicate_arg;
3700 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
3702 gcc_assert (new_bb != NULL);
3703 gseq = bb_seq (new_bb);
3706 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
3707 build_fold_addr_expr (version_decl));
3708 result_var = create_tmp_var (ptr_type_node);
3709 convert_stmt = gimple_build_assign (result_var, convert_expr);
3710 return_stmt = gimple_build_return (result_var);
3712 if (predicate_chain == NULL_TREE)
3714 gimple_seq_add_stmt (&gseq, convert_stmt);
3715 gimple_seq_add_stmt (&gseq, return_stmt);
3716 set_bb_seq (new_bb, gseq);
3717 gimple_set_bb (convert_stmt, new_bb);
3718 gimple_set_bb (return_stmt, new_bb);
3719 pop_cfun ();
3720 return new_bb;
3723 while (predicate_chain != NULL)
3725 cond_var = create_tmp_var (integer_type_node);
3726 predicate_decl = TREE_PURPOSE (predicate_chain);
3727 predicate_arg = TREE_VALUE (predicate_chain);
3728 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
3729 gimple_call_set_lhs (call_cond_stmt, cond_var);
3731 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
3732 gimple_set_bb (call_cond_stmt, new_bb);
3733 gimple_seq_add_stmt (&gseq, call_cond_stmt);
3735 predicate_chain = TREE_CHAIN (predicate_chain);
3737 if (and_expr_var == NULL)
3738 and_expr_var = cond_var;
3739 else
3741 gimple *assign_stmt;
3742 	  /* Use MIN_EXPR to check whether any integer is zero:
3743 	     and_expr_var = min_expr <cond_var, and_expr_var>  */
3744 assign_stmt = gimple_build_assign (and_expr_var,
3745 build2 (MIN_EXPR, integer_type_node,
3746 cond_var, and_expr_var));
3748 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
3749 gimple_set_bb (assign_stmt, new_bb);
3750 gimple_seq_add_stmt (&gseq, assign_stmt);
3754 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
3755 integer_zero_node,
3756 NULL_TREE, NULL_TREE);
3757 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
3758 gimple_set_bb (if_else_stmt, new_bb);
3759 gimple_seq_add_stmt (&gseq, if_else_stmt);
3761 gimple_seq_add_stmt (&gseq, convert_stmt);
3762 gimple_seq_add_stmt (&gseq, return_stmt);
3763 set_bb_seq (new_bb, gseq);
3765 bb1 = new_bb;
3766 e12 = split_block (bb1, if_else_stmt);
3767 bb2 = e12->dest;
3768 e12->flags &= ~EDGE_FALLTHRU;
3769 e12->flags |= EDGE_TRUE_VALUE;
3771 e23 = split_block (bb2, return_stmt);
3773 gimple_set_bb (convert_stmt, bb2);
3774 gimple_set_bb (return_stmt, bb2);
3776 bb3 = e23->dest;
3777 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
3779 remove_edge (e23);
3780 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
3782 pop_cfun ();
3784 return bb3;
3787 /* This function generates the dispatch function for
3788 multi-versioned functions. DISPATCH_DECL is the function which will
3789 contain the dispatch logic. FNDECLS are the function choices for
3790 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
3791 in DISPATCH_DECL in which the dispatch code is generated. */
3793 static int
3794 dispatch_function_versions (tree dispatch_decl,
3795 void *fndecls_p,
3796 basic_block *empty_bb)
3798 tree default_decl;
3799 gimple *ifunc_cpu_init_stmt;
3800 gimple_seq gseq;
3801 int ix;
3802 tree ele;
3803 vec<tree> *fndecls;
3804 unsigned int num_versions = 0;
3805 unsigned int actual_versions = 0;
3806 unsigned int i;
3808 struct _function_version_info
3810 tree version_decl;
3811 tree predicate_chain;
3812 unsigned int dispatch_priority;
3813 }*function_version_info;
3815 gcc_assert (dispatch_decl != NULL
3816 && fndecls_p != NULL
3817 && empty_bb != NULL);
3820   /* fndecls_p is actually a vector.  */
3820 fndecls = static_cast<vec<tree> *> (fndecls_p);
3822 /* At least one more version other than the default. */
3823 num_versions = fndecls->length ();
3824 gcc_assert (num_versions >= 2);
3826 function_version_info = (struct _function_version_info *)
3827 XNEWVEC (struct _function_version_info, (num_versions - 1));
3829 /* The first version in the vector is the default decl. */
3830 default_decl = (*fndecls)[0];
3832 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
3834 gseq = bb_seq (*empty_bb);
3835 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
3836      constructors, so explicitly call __builtin_cpu_init here.  */
3837 ifunc_cpu_init_stmt
3838 = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
3839 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
3840 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
3841 set_bb_seq (*empty_bb, gseq);
3843 pop_cfun ();
3846 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
3848 tree version_decl = ele;
3849 tree predicate_chain = NULL_TREE;
3850 unsigned int priority;
3851 /* Get attribute string, parse it and find the right predicate decl.
3852 The predicate function could be a lengthy combination of many
3853 features, like arch-type and various isa-variants. */
3854 priority = get_builtin_code_for_version (version_decl,
3855 &predicate_chain);
3857 if (predicate_chain == NULL_TREE)
3858 continue;
3860 function_version_info [actual_versions].version_decl = version_decl;
3861 function_version_info [actual_versions].predicate_chain
3862 = predicate_chain;
3863 function_version_info [actual_versions].dispatch_priority = priority;
3864 actual_versions++;
3867 /* Sort the versions according to descending order of dispatch priority. The
3868 priority is based on the ISA. This is not a perfect solution. There
3869 could still be ambiguity. If more than one function version is suitable
3870 to execute, which one should be dispatched? In future, allow the user
3871 to specify a dispatch priority next to the version. */
3872 qsort (function_version_info, actual_versions,
3873 sizeof (struct _function_version_info), feature_compare);
3875 for (i = 0; i < actual_versions; ++i)
3876 *empty_bb = add_condition_to_bb (dispatch_decl,
3877 function_version_info[i].version_decl,
3878 function_version_info[i].predicate_chain,
3879 *empty_bb);
3881   /* Dispatch the default version at the end.  */
3882 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
3883 NULL, *empty_bb);
3885 free (function_version_info);
3886 return 0;
3889 /* This function changes the assembler name for functions that are
3890 versions. If DECL is a function version and has a "target"
3891 attribute, it appends the attribute string to its assembler name. */
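/* For illustration, a version declared as

     __attribute__((target ("avx2"))) int foo (void);

   gets the assembler name "foo.avx2" (the sorted attribute string appended
   after a '.'), while the "default" version keeps its original name.  */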
3893 static tree
3894 ix86_mangle_function_version_assembler_name (tree decl, tree id)
3896 tree version_attr;
3897 const char *orig_name, *version_string;
3898 char *attr_str, *assembler_name;
3900 if (DECL_DECLARED_INLINE_P (decl)
3901 && lookup_attribute ("gnu_inline",
3902 DECL_ATTRIBUTES (decl)))
3903 error_at (DECL_SOURCE_LOCATION (decl),
3904 "function versions cannot be marked as %<gnu_inline%>,"
3905 " bodies have to be generated");
3907 if (DECL_VIRTUAL_P (decl)
3908 || DECL_VINDEX (decl))
3909 sorry ("virtual function multiversioning not supported");
3911 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
3913 /* target attribute string cannot be NULL. */
3914 gcc_assert (version_attr != NULL_TREE);
3916 orig_name = IDENTIFIER_POINTER (id);
3917 version_string
3918 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
3920 if (strcmp (version_string, "default") == 0)
3921 return id;
3923 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
3924 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
3926 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
3928 /* Allow assembler name to be modified if already set. */
3929 if (DECL_ASSEMBLER_NAME_SET_P (decl))
3930 SET_DECL_RTL (decl, NULL);
3932 tree ret = get_identifier (assembler_name);
3933 XDELETEVEC (attr_str);
3934 XDELETEVEC (assembler_name);
3935 return ret;
3938 tree
3939 ix86_mangle_decl_assembler_name (tree decl, tree id)
3941 /* For function version, add the target suffix to the assembler name. */
3942 if (TREE_CODE (decl) == FUNCTION_DECL
3943 && DECL_FUNCTION_VERSIONED (decl))
3944 id = ix86_mangle_function_version_assembler_name (decl, id);
3945 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
3946 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
3947 #endif
3949 return id;
3952 /* Make a dispatcher declaration for the multi-versioned function DECL.
3953 Calls to DECL function will be replaced with calls to the dispatcher
3954 by the front-end. Returns the decl of the dispatcher function. */
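/* For illustration, with

     __attribute__((target ("default"))) int foo (void) { ... }
     __attribute__((target ("avx2")))    int foo (void) { ... }

   every call to foo is redirected to the dispatcher returned here, which is
   emitted as an IFUNC whose resolver (built via make_resolver_func) picks
   one of the versions at load time; if the target has no IFUNC support, the
   error below is emitted instead.  */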
3956 tree
3957 ix86_get_function_versions_dispatcher (void *decl)
3959 tree fn = (tree) decl;
3960 struct cgraph_node *node = NULL;
3961 struct cgraph_node *default_node = NULL;
3962 struct cgraph_function_version_info *node_v = NULL;
3963 struct cgraph_function_version_info *first_v = NULL;
3965 tree dispatch_decl = NULL;
3967 struct cgraph_function_version_info *default_version_info = NULL;
3969 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
3971 node = cgraph_node::get (fn);
3972 gcc_assert (node != NULL);
3974 node_v = node->function_version ();
3975 gcc_assert (node_v != NULL);
3977 if (node_v->dispatcher_resolver != NULL)
3978 return node_v->dispatcher_resolver;
3980 /* Find the default version and make it the first node. */
3981 first_v = node_v;
3982 /* Go to the beginning of the chain. */
3983 while (first_v->prev != NULL)
3984 first_v = first_v->prev;
3985 default_version_info = first_v;
3986 while (default_version_info != NULL)
3988 if (is_function_default_version
3989 (default_version_info->this_node->decl))
3990 break;
3991 default_version_info = default_version_info->next;
3994 /* If there is no default node, just return NULL. */
3995 if (default_version_info == NULL)
3996 return NULL;
3998 /* Make default info the first node. */
3999 if (first_v != default_version_info)
4001 default_version_info->prev->next = default_version_info->next;
4002 if (default_version_info->next)
4003 default_version_info->next->prev = default_version_info->prev;
4004 first_v->prev = default_version_info;
4005 default_version_info->next = first_v;
4006 default_version_info->prev = NULL;
4009 default_node = default_version_info->this_node;
4011 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
4012 if (targetm.has_ifunc_p ())
4014 struct cgraph_function_version_info *it_v = NULL;
4015 struct cgraph_node *dispatcher_node = NULL;
4016 struct cgraph_function_version_info *dispatcher_version_info = NULL;
4018 /* Right now, the dispatching is done via ifunc. */
4019 dispatch_decl = make_dispatcher_decl (default_node->decl);
4020 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
4022 dispatcher_node = cgraph_node::get_create (dispatch_decl);
4023 gcc_assert (dispatcher_node != NULL);
4024 dispatcher_node->dispatcher_function = 1;
4025 dispatcher_version_info
4026 = dispatcher_node->insert_new_function_version ();
4027 dispatcher_version_info->next = default_version_info;
4028 dispatcher_node->definition = 1;
4030 /* Set the dispatcher for all the versions. */
4031 it_v = default_version_info;
4032 while (it_v != NULL)
4034 it_v->dispatcher_resolver = dispatch_decl;
4035 it_v = it_v->next;
4038 else
4039 #endif
4041 error_at (DECL_SOURCE_LOCATION (default_node->decl),
4042 "multiversioning needs %<ifunc%> which is not supported "
4043 "on this target");
4046 return dispatch_decl;
4049 /* Make the resolver function decl to dispatch the versions of
4050 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
4051 ifunc alias that will point to the created resolver. Create an
4052 empty basic block in the resolver and store the pointer in
4053 EMPTY_BB. Return the decl of the resolver function. */
4055 static tree
4056 make_resolver_func (const tree default_decl,
4057 const tree ifunc_alias_decl,
4058 basic_block *empty_bb)
4060 tree decl, type, t;
4062 /* Create resolver function name based on default_decl. */
4063 tree decl_name = clone_function_name (default_decl, "resolver");
4064 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
4066 /* The resolver function should return a (void *). */
4067 type = build_function_type_list (ptr_type_node, NULL_TREE);
4069 decl = build_fn_decl (resolver_name, type);
4070 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
4072 DECL_NAME (decl) = decl_name;
4073 TREE_USED (decl) = 1;
4074 DECL_ARTIFICIAL (decl) = 1;
4075 DECL_IGNORED_P (decl) = 1;
4076 TREE_PUBLIC (decl) = 0;
4077 DECL_UNINLINABLE (decl) = 1;
4079 /* Resolver is not external, body is generated. */
4080 DECL_EXTERNAL (decl) = 0;
4081 DECL_EXTERNAL (ifunc_alias_decl) = 0;
4083 DECL_CONTEXT (decl) = NULL_TREE;
4084 DECL_INITIAL (decl) = make_node (BLOCK);
4085 DECL_STATIC_CONSTRUCTOR (decl) = 0;
4087 if (DECL_COMDAT_GROUP (default_decl)
4088 || TREE_PUBLIC (default_decl))
4090 /* In this case, each translation unit with a call to this
4091 versioned function will put out a resolver. Ensure it
4092 is comdat to keep just one copy. */
4093 DECL_COMDAT (decl) = 1;
4094 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
4096 else
4097 TREE_PUBLIC (ifunc_alias_decl) = 0;
4099 /* Build result decl and add to function_decl. */
4100 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
4101 DECL_CONTEXT (t) = decl;
4102 DECL_ARTIFICIAL (t) = 1;
4103 DECL_IGNORED_P (t) = 1;
4104 DECL_RESULT (decl) = t;
4106 gimplify_function_tree (decl);
4107 push_cfun (DECL_STRUCT_FUNCTION (decl));
4108 *empty_bb = init_lowered_empty_function (decl, false,
4109 profile_count::uninitialized ());
4111 cgraph_node::add_new_function (decl, true);
4112 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
4114 pop_cfun ();
4116 gcc_assert (ifunc_alias_decl != NULL);
4117 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
4118 DECL_ATTRIBUTES (ifunc_alias_decl)
4119 = make_attribute ("ifunc", resolver_name,
4120 DECL_ATTRIBUTES (ifunc_alias_decl));
4122 /* Create the alias for dispatch to resolver here. */
4123 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
4124 return decl;
4127 /* Generate the dispatching code body to dispatch multi-versioned function
4128 DECL. The target hook is called to process the "target" attributes and
4129 provide the code to dispatch the right function at run-time. NODE points
4130 to the dispatcher decl whose body will be created. */
4132 tree
4133 ix86_generate_version_dispatcher_body (void *node_p)
4135 tree resolver_decl;
4136 basic_block empty_bb;
4137 tree default_ver_decl;
4138 struct cgraph_node *versn;
4139 struct cgraph_node *node;
4141 struct cgraph_function_version_info *node_version_info = NULL;
4142 struct cgraph_function_version_info *versn_info = NULL;
4144 node = (cgraph_node *)node_p;
4146 node_version_info = node->function_version ();
4147 gcc_assert (node->dispatcher_function
4148 && node_version_info != NULL);
4150 if (node_version_info->dispatcher_resolver)
4151 return node_version_info->dispatcher_resolver;
4153 /* The first version in the chain corresponds to the default version. */
4154 default_ver_decl = node_version_info->next->this_node->decl;
4156 /* node is going to be an alias, so remove the finalized bit. */
4157 node->definition = false;
4159 resolver_decl = make_resolver_func (default_ver_decl,
4160 node->decl, &empty_bb);
4162 node_version_info->dispatcher_resolver = resolver_decl;
4164 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
4166 auto_vec<tree, 2> fn_ver_vec;
4168 for (versn_info = node_version_info->next; versn_info;
4169 versn_info = versn_info->next)
4171 versn = versn_info->this_node;
4172 /* Check for virtual functions here again, as by this time it should
4173 have been determined if this function needs a vtable index or
4174 not. This happens for methods in derived classes that override
4175 virtual methods in base classes but are not explicitly marked as
4176 virtual. */
4177 if (DECL_VINDEX (versn->decl))
4178 sorry ("virtual function multiversioning not supported");
4180 fn_ver_vec.safe_push (versn->decl);
4183 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
4184 cgraph_edge::rebuild_edges ();
4185 pop_cfun ();
4186 return resolver_decl;