/* Copyright (C) 1988-2024 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define IN_TARGET_CODE 1

#include "coretypes.h"
#include "stringpool.h"
#include "diagnostic.h"
#include "fold-const.h"
#include "stor-layout.h"
#include "insn-attr.h"
#include "common/common-target.h"
#include "langhooks.h"
#include "tm-constrs.h"
#include "sched-int.h"
#include "tree-pass.h"
#include "pass_manager.h"
#include "target-globals.h"
#include "gimple-iterator.h"
#include "shrink-wrap.h"
#include "tree-iterator.h"
#include "case-cfn-macros.h"
#include "fold-const-call.h"
#include "tree-ssanames.h"
#include "selftest-rtl.h"
#include "print-rtl.h"
#include "symbol-summary.h"
#include "ipa-fnsummary.h"
#include "wide-int-bitmask.h"
#include "tree-vector-builder.h"
#include "dwarf2out.h"
#include "i386-builtins.h"
#include "i386-features.h"
#include "i386-expand.h"
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    Offset:                                    realigned or    aligned + 8
    Register       aligned      aligned + 8    aligned w/HFP   w/HFP   */
    XMM15_REG,  /* 0x10         0x18           0x10            0x18  */
    XMM14_REG,  /* 0x20         0x28           0x20            0x28  */
    XMM13_REG,  /* 0x30         0x38           0x30            0x38  */
    XMM12_REG,  /* 0x40         0x48           0x40            0x48  */
    XMM11_REG,  /* 0x50         0x58           0x50            0x58  */
    XMM10_REG,  /* 0x60         0x68           0x60            0x68  */
    XMM9_REG,   /* 0x70         0x78           0x70            0x78  */
    XMM8_REG,   /* 0x80         0x88           0x80            0x88  */
    XMM7_REG,   /* 0x90         0x98           0x90            0x98  */
    XMM6_REG,   /* 0xa0         0xa8           0xa0            0xa8  */
    SI_REG,     /* 0xa8         0xb0           0xa8            0xb0  */
    DI_REG,     /* 0xb0         0xb8           0xb0            0xb8  */
    BX_REG,     /* 0xb8         0xc0           0xb8            0xc0  */
    BP_REG,     /* 0xc0         0xc8           N/A             N/A   */
    R12_REG,    /* 0xc8         0xd0           0xc0            0xc8  */
    R13_REG,    /* 0xd0         0xd8           0xc8            0xd0  */
    R14_REG,    /* 0xd8         0xe0           0xd0            0xd8  */
    R15_REG,    /* 0xe0         0xe8           0xd8            0xe0  */
};
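/* Note the spacing in the offsets above: the ten SSE registers are laid
   out 16 bytes apart, the GP registers that follow are 8 bytes apart, and
   BP_REG has no slot in the hard-frame-pointer layouts.  */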
/* Instantiate static const values.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
/* Initialize xlogue_layout::s_stub_names to zero.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];
/* Instantiates all xlogue_layout instances.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};
/* Return an appropriate const instance of xlogue_layout based upon values
   in cfun->machine and crtl.  */
const class xlogue_layout &
xlogue_layout::get_instance ()
{
  enum xlogue_stub_sets stub_set;
  bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;

  if (stack_realign_fp)
    stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else if (frame_pointer_needed)
    stub_set = aligned_plus_8
	       ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
	       : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
  else
    stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;

  return s_instances[stub_set];
}
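/* For example, a frame that must be realigned (stack_realign_fp) always
   selects the HFP/realign layout, while an aligned frame with 8 bytes of
   incoming padding (call_ms2sysv_pad_in) picks one of the "+ 8" sets.  */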
/* Determine how many clobbered registers can be saved by the stub.
   Returns the count of registers the stub will save and restore.  */
unsigned
xlogue_layout::count_stub_managed_regs ()
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i, count;
  unsigned regno;

  for (count = i = MIN_REGS; i < MAX_REGS; ++i)
    {
      regno = REG_ORDER[i];
      if (regno == BP_REG && hfp)
	continue;
      if (!ix86_save_reg (regno, false, false))
	break;
      ++count;
    }
  return count;
}
/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}
/* Constructor for xlogue_layout.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      m_regs[j].regno    = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}
/* Return the name of the stub STUB for N_EXTRA_REGS extra registers,
   constructing it lazily in s_stub_names.  */
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Build the name the first time it is requested.  */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      gcc_checking_assert (res < (int) STUB_NAME_MAX_LEN);
    }

  return name;
}
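/* The names built above have the form "__sse_<stub>_<nregs>" or
   "__avx_<stub>_<nregs>", where <stub> is taken from STUB_BASE_NAMES and
   <nregs> counts the registers the stub manages; they are assumed to match
   the out-of-line ms2sysv save/restore stubs provided by the runtime.  */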
/* Return rtx of a symbol ref for the entry point (based upon
   cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
  const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
  gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
  gcc_assert (stub < XLOGUE_STUB_COUNT);
  gcc_assert (crtl->stack_realign_finalized);

  return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}
unsigned scalar_chain::max_id = 0;
/* Initialize new chain.  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);
  defs = BITMAP_ALLOC (NULL);
  defs_conv = BITMAP_ALLOC (NULL);
  insns_conv = BITMAP_ALLOC (NULL);
  queue = NULL;

  n_sse_to_integer = 0;
  n_integer_to_sse = 0;

  max_visits = x86_stv_max_visits;
}
/* Free chain's data.  */

scalar_chain::~scalar_chain ()
{
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  BITMAP_FREE (insns_conv);
  bitmap_obstack_release (NULL);
}
/* Add instruction into chain's queue.  */

void
scalar_chain::add_to_queue (unsigned insn_uid)
{
  if (!bitmap_set_bit (queue, insn_uid))
    return;

  if (dump_file)
    fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
	     insn_uid, chain_id);
}
/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  */

void
scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;
      n_integer_to_sse++;
    }
  else
    {
      if (!reg_new)
	return;
      n_sse_to_integer++;
    }

  if (dump_file)
    fprintf (dump_file,
	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  Return true if OK, false
   if the analysis was aborted.  */
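/* The walk below is budgeted by max_visits (seeded from x86_stv_max_visits
   in the chain constructor); once the budget is exhausted, discovery of
   this chain is abandoned rather than risk excessive compile time.  */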
365 scalar_chain::analyze_register_chain (bitmap candidates
, df_ref ref
,
369 bool mark_def
= false;
371 gcc_checking_assert (bitmap_bit_p (insns
, DF_REF_INSN_UID (ref
)));
373 for (chain
= DF_REF_CHAIN (ref
); chain
; chain
= chain
->next
)
375 unsigned uid
= DF_REF_INSN_UID (chain
->ref
);
377 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain
->ref
)))
380 if (--max_visits
== 0)
383 if (!DF_REF_REG_MEM_P (chain
->ref
))
385 if (bitmap_bit_p (insns
, uid
))
388 if (bitmap_bit_p (candidates
, uid
))
      /* If we run into parts of an aborted chain discovery, abort.  */
395 if (bitmap_bit_p (disallowed
, uid
))
399 if (DF_REF_REG_DEF_P (chain
->ref
))
402 fprintf (dump_file
, " r%d def in insn %d isn't convertible\n",
403 DF_REF_REGNO (chain
->ref
), uid
);
404 mark_dual_mode_def (chain
->ref
);
409 fprintf (dump_file
, " r%d use in insn %d isn't convertible\n",
410 DF_REF_REGNO (chain
->ref
), uid
);
416 mark_dual_mode_def (ref
);
421 /* Add instruction into a chain. Return true if OK, false if the search
425 scalar_chain::add_insn (bitmap candidates
, unsigned int insn_uid
,
428 if (!bitmap_set_bit (insns
, insn_uid
))
432 fprintf (dump_file
, " Adding insn %d to chain #%d\n", insn_uid
, chain_id
);
434 rtx_insn
*insn
= DF_INSN_UID_GET (insn_uid
)->insn
;
435 rtx def_set
= single_set (insn
);
436 if (def_set
&& REG_P (SET_DEST (def_set
))
437 && !HARD_REGISTER_P (SET_DEST (def_set
)))
438 bitmap_set_bit (defs
, REGNO (SET_DEST (def_set
)));
440 /* ??? The following is quadratic since analyze_register_chain
441 iterates over all refs to look for dual-mode regs. Instead this
442 should be done separately for all regs mentioned in the chain once. */
444 for (ref
= DF_INSN_UID_DEFS (insn_uid
); ref
; ref
= DF_REF_NEXT_LOC (ref
))
445 if (!HARD_REGISTER_P (DF_REF_REG (ref
)))
446 if (!analyze_register_chain (candidates
, ref
, disallowed
))
449 /* The operand(s) of VEC_SELECT don't need to be converted/convertible. */
450 if (def_set
&& GET_CODE (SET_SRC (def_set
)) == VEC_SELECT
)
453 for (ref
= DF_INSN_UID_USES (insn_uid
); ref
; ref
= DF_REF_NEXT_LOC (ref
))
454 if (!DF_REF_REG_MEM_P (ref
))
455 if (!analyze_register_chain (candidates
, ref
, disallowed
))
461 /* Build new chain starting from insn INSN_UID recursively
462 adding all dependent uses and definitions. Return true if OK, false
463 if the chain discovery was aborted. */
466 scalar_chain::build (bitmap candidates
, unsigned insn_uid
, bitmap disallowed
)
468 queue
= BITMAP_ALLOC (NULL
);
469 bitmap_set_bit (queue
, insn_uid
);
472 fprintf (dump_file
, "Building chain #%d...\n", chain_id
);
474 while (!bitmap_empty_p (queue
))
476 insn_uid
= bitmap_first_set_bit (queue
);
477 bitmap_clear_bit (queue
, insn_uid
);
478 bitmap_clear_bit (candidates
, insn_uid
);
479 if (!add_insn (candidates
, insn_uid
, disallowed
))
	  /* If we aborted the search, put the insns found so far onto the
	     set of disallowed insns so that further searches reaching them
	     also abort, and thus we abort the whole as-yet-undiscovered
	     chain.  */
484 bitmap_ior_into (disallowed
, insns
);
486 fprintf (dump_file
, "Aborted chain #%d discovery\n", chain_id
);
494 fprintf (dump_file
, "Collected chain #%d...\n", chain_id
);
495 fprintf (dump_file
, " insns: ");
496 dump_bitmap (dump_file
, insns
);
497 if (!bitmap_empty_p (defs_conv
))
501 const char *comma
= "";
502 fprintf (dump_file
, " defs to convert: ");
503 EXECUTE_IF_SET_IN_BITMAP (defs_conv
, 0, id
, bi
)
505 fprintf (dump_file
, "%sr%d", comma
, id
);
508 fprintf (dump_file
, "\n");
/* Return a cost of building a vector constant
   instead of using a scalar one.  */

int
general_scalar_chain::vector_const_cost (rtx exp)
{
  gcc_assert (CONST_INT_P (exp));

  if (standard_sse_constant_p (exp, vmode))
    return ix86_cost->sse_op;

  /* We have separate costs for SImode and DImode, use SImode costs
     for smaller modes.  */
  return ix86_cost->sse_load[smode == DImode ? 1 : 0];
}
/* Compute a gain for chain conversion.  */

int
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  int cost = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* SSE costs distinguish between SImode and DImode loads/stores, for
     int costs factor in the number of GPRs involved.  When supporting
     smaller modes than SImode the int load/store costs need to be
     adjusted as well.  */
  unsigned sse_cost_idx = smode == DImode ? 1 : 0;
  unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
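  /* Note: m is the number of general-purpose instructions needed per
     scalar operation, i.e. 2 for DImode on 32-bit targets where each
     operation works on a double-word pair, and 1 otherwise.  */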
552 EXECUTE_IF_SET_IN_BITMAP (insns
, 0, insn_uid
, bi
)
554 rtx_insn
*insn
= DF_INSN_UID_GET (insn_uid
)->insn
;
555 rtx def_set
= single_set (insn
);
556 rtx src
= SET_SRC (def_set
);
557 rtx dst
= SET_DEST (def_set
);
560 if (REG_P (src
) && REG_P (dst
))
561 igain
+= 2 * m
- ix86_cost
->xmm_move
;
562 else if (REG_P (src
) && MEM_P (dst
))
564 += m
* ix86_cost
->int_store
[2] - ix86_cost
->sse_store
[sse_cost_idx
];
565 else if (MEM_P (src
) && REG_P (dst
))
566 igain
+= m
* ix86_cost
->int_load
[2] - ix86_cost
->sse_load
[sse_cost_idx
];
569 /* For operations on memory operands, include the overhead
570 of explicit load and store instructions. */
572 igain
+= optimize_insn_for_size_p ()
574 : (m
* (ix86_cost
->int_load
[2]
575 + ix86_cost
->int_store
[2])
576 - (ix86_cost
->sse_load
[sse_cost_idx
] +
577 ix86_cost
->sse_store
[sse_cost_idx
]));
579 switch (GET_CODE (src
))
586 if (INTVAL (XEXP (src
, 1)) >= 32)
587 igain
+= ix86_cost
->add
;
588 /* Gain for extend highpart case. */
589 else if (GET_CODE (XEXP (src
, 0)) == ASHIFT
)
590 igain
+= ix86_cost
->shift_const
- ix86_cost
->sse_op
;
592 igain
+= ix86_cost
->shift_const
;
595 igain
+= ix86_cost
->shift_const
- ix86_cost
->sse_op
;
597 if (CONST_INT_P (XEXP (src
, 0)))
598 igain
-= vector_const_cost (XEXP (src
, 0));
603 igain
+= m
* ix86_cost
->shift_const
;
605 igain
-= ix86_cost
->sse_op
;
606 else if (smode
== DImode
)
608 int bits
= INTVAL (XEXP (src
, 1));
609 if ((bits
& 0x0f) == 0)
610 igain
-= ix86_cost
->sse_op
;
611 else if ((bits
& 0x07) == 0)
612 igain
-= 2 * ix86_cost
->sse_op
;
614 igain
-= 3 * ix86_cost
->sse_op
;
616 else if (INTVAL (XEXP (src
, 1)) == 16)
617 igain
-= ix86_cost
->sse_op
;
619 igain
-= 2 * ix86_cost
->sse_op
;
627 igain
+= m
* ix86_cost
->add
- ix86_cost
->sse_op
;
628 /* Additional gain for andnot for targets without BMI. */
629 if (GET_CODE (XEXP (src
, 0)) == NOT
631 igain
+= m
* ix86_cost
->add
;
633 if (CONST_INT_P (XEXP (src
, 0)))
634 igain
-= vector_const_cost (XEXP (src
, 0));
635 if (CONST_INT_P (XEXP (src
, 1)))
636 igain
-= vector_const_cost (XEXP (src
, 1));
637 if (MEM_P (XEXP (src
, 1)))
639 if (optimize_insn_for_size_p ())
640 igain
-= COSTS_N_BYTES (m
== 2 ? 3 : 5);
642 igain
+= m
* ix86_cost
->int_load
[2]
643 - ix86_cost
->sse_load
[sse_cost_idx
];
649 igain
-= ix86_cost
->sse_op
+ COSTS_N_INSNS (1);
651 if (GET_CODE (XEXP (src
, 0)) != ABS
)
653 igain
+= m
* ix86_cost
->add
;
663 /* We do not have any conditional move cost, estimate it as a
664 reg-reg move. Comparisons are costed as adds. */
665 igain
+= m
* (COSTS_N_INSNS (2) + ix86_cost
->add
);
666 /* Integer SSE ops are all costed the same. */
667 igain
-= ix86_cost
->sse_op
;
671 if (XEXP (src
, 1) != const0_rtx
)
673 /* cmp vs. pxor;pshufd;ptest. */
674 igain
+= COSTS_N_INSNS (m
- 3);
676 else if (GET_CODE (XEXP (src
, 0)) != AND
)
678 /* test vs. pshufd;ptest. */
679 igain
+= COSTS_N_INSNS (m
- 2);
681 else if (GET_CODE (XEXP (XEXP (src
, 0), 0)) != NOT
)
683 /* and;test vs. pshufd;ptest. */
684 igain
+= COSTS_N_INSNS (2 * m
- 2);
688 /* andn;test vs. pandn;pshufd;ptest. */
689 igain
+= COSTS_N_INSNS (2 * m
- 3);
693 /* not;and;test vs. pandn;pshufd;ptest. */
694 igain
+= COSTS_N_INSNS (3 * m
- 3);
701 if (optimize_insn_for_size_p ())
703 /* xor (2 bytes) vs. xorps (3 bytes). */
704 if (src
== const0_rtx
)
705 igain
-= COSTS_N_BYTES (1);
706 /* movdi_internal vs. movv2di_internal. */
707 /* => mov (5 bytes) vs. movaps (7 bytes). */
708 else if (x86_64_immediate_operand (src
, SImode
))
709 igain
-= COSTS_N_BYTES (2);
711 /* ??? Larger immediate constants are placed in the
712 constant pool, where the size benefit/impact of
713 STV conversion is affected by whether and how
714 often each constant pool entry is shared/reused.
715 The value below is empirically derived from the
716 CSiBE benchmark (and the optimal value may drift
718 igain
+= COSTS_N_BYTES (0);
722 /* DImode can be immediate for TARGET_64BIT
723 and SImode always. */
724 igain
+= m
* COSTS_N_INSNS (1);
725 igain
-= vector_const_cost (src
);
728 else if (MEM_P (dst
))
730 igain
+= (m
* ix86_cost
->int_store
[2]
731 - ix86_cost
->sse_store
[sse_cost_idx
]);
732 igain
-= vector_const_cost (src
);
737 if (XVECEXP (XEXP (src
, 1), 0, 0) == const0_rtx
)
739 // movd (4 bytes) replaced with movdqa (4 bytes).
740 if (!optimize_insn_for_size_p ())
741 igain
+= ix86_cost
->sse_to_integer
- ix86_cost
->xmm_move
;
745 // pshufd; movd replaced with pshufd.
746 if (optimize_insn_for_size_p ())
747 igain
+= COSTS_N_BYTES (4);
749 igain
+= ix86_cost
->sse_to_integer
;
758 if (igain
!= 0 && dump_file
)
760 fprintf (dump_file
, " Instruction gain %d for ", igain
);
761 dump_insn_slim (dump_file
, insn
);
767 fprintf (dump_file
, " Instruction conversion gain: %d\n", gain
);
769 /* Cost the integer to sse and sse to integer moves. */
770 if (!optimize_function_for_size_p (cfun
))
772 cost
+= n_sse_to_integer
* ix86_cost
->sse_to_integer
;
773 /* ??? integer_to_sse but we only have that in the RA cost table.
774 Assume sse_to_integer/integer_to_sse are the same which they
775 are at the moment. */
776 cost
+= n_integer_to_sse
* ix86_cost
->sse_to_integer
;
778 else if (TARGET_64BIT
|| smode
== SImode
)
780 cost
+= n_sse_to_integer
* COSTS_N_BYTES (4);
781 cost
+= n_integer_to_sse
* COSTS_N_BYTES (4);
783 else if (TARGET_SSE4_1
)
785 /* vmovd (4 bytes) + vpextrd (6 bytes). */
786 cost
+= n_sse_to_integer
* COSTS_N_BYTES (10);
787 /* vmovd (4 bytes) + vpinsrd (6 bytes). */
788 cost
+= n_integer_to_sse
* COSTS_N_BYTES (10);
792 /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
793 cost
+= n_sse_to_integer
* COSTS_N_BYTES (13);
794 /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
795 cost
+= n_integer_to_sse
* COSTS_N_BYTES (12);
799 fprintf (dump_file
, " Registers conversion cost: %d\n", cost
);
804 fprintf (dump_file
, " Total gain: %d\n", gain
);
809 /* Insert generated conversion instruction sequence INSNS
810 after instruction AFTER. New BB may be required in case
811 instruction has EH region attached. */
814 scalar_chain::emit_conversion_insns (rtx insns
, rtx_insn
*after
)
816 if (!control_flow_insn_p (after
))
818 emit_insn_after (insns
, after
);
822 basic_block bb
= BLOCK_FOR_INSN (after
);
823 edge e
= find_fallthru_edge (bb
->succs
);
826 basic_block new_bb
= split_edge (e
);
827 emit_insn_after (insns
, BB_HEAD (new_bb
));
/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
   zeroing the upper parts.  */

static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
  switch (GET_MODE_NUNITS (vmode))
    {
    case 1:
      return gen_rtx_SUBREG (vmode, gpr, 0);
    case 2:
      return gen_rtx_VEC_CONCAT (vmode, gpr,
				 CONST0_RTX (GET_MODE_INNER (vmode)));
    default:
      return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
				CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
    }
}
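/* For example, for V2DImode this produces
     (vec_concat:V2DI (reg:DI gpr) (const_int 0))
   i.e. the GPR value lands in element 0 and the upper half is zeroed.  */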
851 /* Make vector copies for all register REGNO definitions
852 and replace its uses in a chain. */
855 scalar_chain::make_vector_copies (rtx_insn
*insn
, rtx reg
)
857 rtx vreg
= *defs_map
.get (reg
);
860 if (!TARGET_INTER_UNIT_MOVES_TO_VEC
)
862 rtx tmp
= assign_386_stack_local (smode
, SLOT_STV_TEMP
);
863 if (smode
== DImode
&& !TARGET_64BIT
)
865 emit_move_insn (adjust_address (tmp
, SImode
, 0),
866 gen_rtx_SUBREG (SImode
, reg
, 0));
867 emit_move_insn (adjust_address (tmp
, SImode
, 4),
868 gen_rtx_SUBREG (SImode
, reg
, 4));
871 emit_move_insn (copy_rtx (tmp
), reg
);
872 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode
, vreg
, 0),
873 gen_gpr_to_xmm_move_src (vmode
, tmp
)));
875 else if (!TARGET_64BIT
&& smode
== DImode
)
879 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode
, vreg
, 0),
880 CONST0_RTX (V4SImode
),
881 gen_rtx_SUBREG (SImode
, reg
, 0)));
882 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode
, vreg
, 0),
883 gen_rtx_SUBREG (V4SImode
, vreg
, 0),
884 gen_rtx_SUBREG (SImode
, reg
, 4),
889 rtx tmp
= gen_reg_rtx (DImode
);
890 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode
, vreg
, 0),
891 CONST0_RTX (V4SImode
),
892 gen_rtx_SUBREG (SImode
, reg
, 0)));
893 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode
, tmp
, 0),
894 CONST0_RTX (V4SImode
),
895 gen_rtx_SUBREG (SImode
, reg
, 4)));
896 emit_insn (gen_vec_interleave_lowv4si
897 (gen_rtx_SUBREG (V4SImode
, vreg
, 0),
898 gen_rtx_SUBREG (V4SImode
, vreg
, 0),
899 gen_rtx_SUBREG (V4SImode
, tmp
, 0)));
903 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode
, vreg
, 0),
904 gen_gpr_to_xmm_move_src (vmode
, reg
)));
905 rtx_insn
*seq
= get_insns ();
907 emit_conversion_insns (seq
, insn
);
911 " Copied r%d to a vector register r%d for insn %d\n",
912 REGNO (reg
), REGNO (vreg
), INSN_UID (insn
));
915 /* Copy the definition SRC of INSN inside the chain to DST for
916 scalar uses outside of the chain. */
919 scalar_chain::convert_reg (rtx_insn
*insn
, rtx dst
, rtx src
)
922 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC
)
924 rtx tmp
= assign_386_stack_local (smode
, SLOT_STV_TEMP
);
925 emit_move_insn (tmp
, src
);
926 if (!TARGET_64BIT
&& smode
== DImode
)
928 emit_move_insn (gen_rtx_SUBREG (SImode
, dst
, 0),
929 adjust_address (tmp
, SImode
, 0));
930 emit_move_insn (gen_rtx_SUBREG (SImode
, dst
, 4),
931 adjust_address (tmp
, SImode
, 4));
934 emit_move_insn (dst
, copy_rtx (tmp
));
936 else if (!TARGET_64BIT
&& smode
== DImode
)
940 rtx tmp
= gen_rtx_PARALLEL (VOIDmode
,
941 gen_rtvec (1, const0_rtx
));
944 (gen_rtx_SUBREG (SImode
, dst
, 0),
945 gen_rtx_VEC_SELECT (SImode
,
946 gen_rtx_SUBREG (V4SImode
, src
, 0),
949 tmp
= gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (1, const1_rtx
));
952 (gen_rtx_SUBREG (SImode
, dst
, 4),
953 gen_rtx_VEC_SELECT (SImode
,
954 gen_rtx_SUBREG (V4SImode
, src
, 0),
959 rtx vcopy
= gen_reg_rtx (V2DImode
);
960 emit_move_insn (vcopy
, gen_rtx_SUBREG (V2DImode
, src
, 0));
961 emit_move_insn (gen_rtx_SUBREG (SImode
, dst
, 0),
962 gen_rtx_SUBREG (SImode
, vcopy
, 0));
963 emit_move_insn (vcopy
,
964 gen_rtx_LSHIFTRT (V2DImode
,
965 vcopy
, GEN_INT (32)));
966 emit_move_insn (gen_rtx_SUBREG (SImode
, dst
, 4),
967 gen_rtx_SUBREG (SImode
, vcopy
, 0));
971 emit_move_insn (dst
, src
);
973 rtx_insn
*seq
= get_insns ();
975 emit_conversion_insns (seq
, insn
);
979 " Copied r%d to a scalar register r%d for insn %d\n",
980 REGNO (src
), REGNO (dst
), INSN_UID (insn
));
/* Helper function to convert immediate constant X to vmode.  */
static rtx
smode_convert_cst (rtx x, enum machine_mode vmode)
{
  /* Prefer all ones vector in case of -1.  */
  if (constm1_operand (x, GET_MODE (x)))
    return CONSTM1_RTX (vmode);

  unsigned n = GET_MODE_NUNITS (vmode);
  rtx *v = XALLOCAVEC (rtx, n);
  v[0] = x;
  for (unsigned i = 1; i < n; ++i)
    v[i] = const0_rtx;
  return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
}
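/* Assuming the zero-fill above, converting e.g. (const_int 5) for V2DImode
   yields the vector constant {5, 0}, while -1 is special-cased to the
   all-ones vector so that it remains a standard SSE constant.  */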
999 /* Convert operand OP in INSN. We should handle
1000 memory operands and uninitialized registers.
1001 All other register uses are converted during
1002 registers conversion. */
1005 scalar_chain::convert_op (rtx
*op
, rtx_insn
*insn
)
1009 if (GET_MODE (*op
) == V1TImode
)
1012 *op
= copy_rtx_if_shared (*op
);
1014 if (GET_CODE (*op
) == NOT
1015 || GET_CODE (*op
) == ASHIFT
)
1017 convert_op (&XEXP (*op
, 0), insn
);
1018 PUT_MODE (*op
, vmode
);
1020 else if (MEM_P (*op
))
1022 rtx_insn
*movabs
= NULL
;
1024 /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */
1025 if (!memory_operand (*op
, GET_MODE (*op
)))
1027 tmp
= gen_reg_rtx (GET_MODE (*op
));
1028 movabs
= emit_insn_before (gen_rtx_SET (tmp
, *op
), insn
);
1033 tmp
= gen_rtx_SUBREG (vmode
, gen_reg_rtx (GET_MODE (*op
)), 0);
1036 = emit_insn_before (gen_rtx_SET (copy_rtx (tmp
),
1037 gen_gpr_to_xmm_move_src (vmode
, *op
)),
1040 if (cfun
->can_throw_non_call_exceptions
)
1042 /* Handle REG_EH_REGION note. */
1043 rtx note
= find_reg_note (insn
, REG_EH_REGION
, NULL_RTX
);
1048 control_flow_insns
.safe_push (eh_insn
);
1049 add_reg_note (eh_insn
, REG_EH_REGION
, XEXP (note
, 0));
1056 fprintf (dump_file
, " Preloading operand for insn %d into r%d\n",
1057 INSN_UID (insn
), reg_or_subregno (tmp
));
1059 else if (REG_P (*op
))
1060 *op
= gen_rtx_SUBREG (vmode
, *op
, 0);
1061 else if (CONST_SCALAR_INT_P (*op
))
1063 rtx vec_cst
= smode_convert_cst (*op
, vmode
);
1065 if (!standard_sse_constant_p (vec_cst
, vmode
))
1068 vec_cst
= validize_mem (force_const_mem (vmode
, vec_cst
));
1069 rtx_insn
*seq
= get_insns ();
1071 emit_insn_before (seq
, insn
);
1074 tmp
= gen_rtx_SUBREG (vmode
, gen_reg_rtx (smode
), 0);
1076 emit_insn_before (gen_move_insn (copy_rtx (tmp
), vec_cst
), insn
);
1081 gcc_assert (SUBREG_P (*op
));
1082 gcc_assert (GET_MODE (*op
) == vmode
);
1086 /* Convert CCZmode COMPARE to vector mode. */
1089 scalar_chain::convert_compare (rtx op1
, rtx op2
, rtx_insn
*insn
)
1093 /* Handle any REG_EQUAL notes. */
1094 tmp
= find_reg_equal_equiv_note (insn
);
1097 if (GET_CODE (XEXP (tmp
, 0)) == COMPARE
1098 && GET_MODE (XEXP (tmp
, 0)) == CCZmode
1099 && REG_P (XEXP (XEXP (tmp
, 0), 0)))
1101 rtx
*op
= &XEXP (XEXP (tmp
, 0), 1);
1102 if (CONST_SCALAR_INT_P (*op
))
1104 if (constm1_operand (*op
, GET_MODE (*op
)))
1105 *op
= CONSTM1_RTX (vmode
);
1108 unsigned n
= GET_MODE_NUNITS (vmode
);
1109 rtx
*v
= XALLOCAVEC (rtx
, n
);
1111 for (unsigned i
= 1; i
< n
; ++i
)
1113 *op
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (n
, v
));
1117 else if (REG_P (*op
))
1122 remove_note (insn
, tmp
);
1125 /* Comparison against anything other than zero, requires an XOR. */
1126 if (op2
!= const0_rtx
)
1128 convert_op (&op1
, insn
);
1129 convert_op (&op2
, insn
);
1130 /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1131 if (MEM_P (op1
) && MEM_P (op2
))
1133 tmp
= gen_reg_rtx (vmode
);
1134 emit_insn_before (gen_rtx_SET (tmp
, op1
), insn
);
1139 src
= gen_rtx_XOR (vmode
, src
, op2
);
1141 else if (GET_CODE (op1
) == AND
1142 && GET_CODE (XEXP (op1
, 0)) == NOT
)
1144 rtx op11
= XEXP (XEXP (op1
, 0), 0);
1145 rtx op12
= XEXP (op1
, 1);
1146 convert_op (&op11
, insn
);
1147 convert_op (&op12
, insn
);
1150 tmp
= gen_reg_rtx (vmode
);
1151 emit_insn_before (gen_rtx_SET (tmp
, op11
), insn
);
1154 src
= gen_rtx_AND (vmode
, gen_rtx_NOT (vmode
, op11
), op12
);
1156 else if (GET_CODE (op1
) == AND
)
1158 rtx op11
= XEXP (op1
, 0);
1159 rtx op12
= XEXP (op1
, 1);
1160 convert_op (&op11
, insn
);
1161 convert_op (&op12
, insn
);
1164 tmp
= gen_reg_rtx (vmode
);
1165 emit_insn_before (gen_rtx_SET (tmp
, op11
), insn
);
1168 return gen_rtx_UNSPEC (CCZmode
, gen_rtvec (2, op11
, op12
),
1173 convert_op (&op1
, insn
);
1179 tmp
= gen_reg_rtx (vmode
);
1180 emit_insn_before (gen_rtx_SET (tmp
, src
), insn
);
1184 if (vmode
== V2DImode
)
1186 tmp
= gen_reg_rtx (vmode
);
1187 emit_insn_before (gen_vec_interleave_lowv2di (tmp
, src
, src
), insn
);
1190 else if (vmode
== V4SImode
)
1192 tmp
= gen_reg_rtx (vmode
);
1193 emit_insn_before (gen_sse2_pshufd (tmp
, src
, const0_rtx
), insn
);
1197 return gen_rtx_UNSPEC (CCZmode
, gen_rtvec (2, src
, src
), UNSPEC_PTEST
);
1200 /* Helper function for converting INSN to vector mode. */
1203 scalar_chain::convert_insn_common (rtx_insn
*insn
)
1205 /* Generate copies for out-of-chain uses of defs and adjust debug uses. */
1206 for (df_ref ref
= DF_INSN_DEFS (insn
); ref
; ref
= DF_REF_NEXT_LOC (ref
))
1207 if (bitmap_bit_p (defs_conv
, DF_REF_REGNO (ref
)))
1210 for (use
= DF_REF_CHAIN (ref
); use
; use
= use
->next
)
1211 if (NONDEBUG_INSN_P (DF_REF_INSN (use
->ref
))
1212 && (DF_REF_REG_MEM_P (use
->ref
)
1213 || !bitmap_bit_p (insns
, DF_REF_INSN_UID (use
->ref
))))
1216 convert_reg (insn
, DF_REF_REG (ref
),
1217 *defs_map
.get (regno_reg_rtx
[DF_REF_REGNO (ref
)]));
1218 else if (MAY_HAVE_DEBUG_BIND_INSNS
)
1220 /* If we generated a scalar copy we can leave debug-insns
1221 as-is, if not, we have to adjust them. */
1222 auto_vec
<rtx_insn
*, 5> to_reset_debug_insns
;
1223 for (use
= DF_REF_CHAIN (ref
); use
; use
= use
->next
)
1224 if (DEBUG_INSN_P (DF_REF_INSN (use
->ref
)))
1226 rtx_insn
*debug_insn
= DF_REF_INSN (use
->ref
);
1227 /* If there's a reaching definition outside of the
1228 chain we have to reset. */
1230 for (def
= DF_REF_CHAIN (use
->ref
); def
; def
= def
->next
)
1231 if (!bitmap_bit_p (insns
, DF_REF_INSN_UID (def
->ref
)))
1234 to_reset_debug_insns
.safe_push (debug_insn
);
1237 *DF_REF_REAL_LOC (use
->ref
)
1238 = *defs_map
.get (regno_reg_rtx
[DF_REF_REGNO (ref
)]);
1239 df_insn_rescan (debug_insn
);
1242 /* Have to do the reset outside of the DF_CHAIN walk to not
1244 while (!to_reset_debug_insns
.is_empty ())
1246 rtx_insn
*debug_insn
= to_reset_debug_insns
.pop ();
1247 INSN_VAR_LOCATION_LOC (debug_insn
) = gen_rtx_UNKNOWN_VAR_LOC ();
1248 df_insn_rescan_debug_internal (debug_insn
);
1253 /* Replace uses in this insn with the defs we use in the chain. */
1254 for (df_ref ref
= DF_INSN_USES (insn
); ref
; ref
= DF_REF_NEXT_LOC (ref
))
1255 if (!DF_REF_REG_MEM_P (ref
))
1256 if (rtx
*vreg
= defs_map
.get (regno_reg_rtx
[DF_REF_REGNO (ref
)]))
1258 /* Also update a corresponding REG_DEAD note. */
1259 rtx note
= find_reg_note (insn
, REG_DEAD
, DF_REF_REG (ref
));
1261 XEXP (note
, 0) = *vreg
;
1262 *DF_REF_REAL_LOC (ref
) = *vreg
;
/* Convert INSN which is an SImode or DImode rotation by a constant
   to vector mode.  CODE is either ROTATE or ROTATERT with operands
   OP0 and OP1.  Returns the SET_SRC of the last instruction in the
   resulting sequence, which is emitted before INSN.  */
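/* The expansion below uses pshufd/pshuflw word permutations whenever the
   rotate count is a multiple of 8 or 16 bits, and otherwise falls back to
   shift-and-shuffle sequences over the 128-bit register.  */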
1272 general_scalar_chain::convert_rotate (enum rtx_code code
, rtx op0
, rtx op1
,
1275 int bits
= INTVAL (op1
);
1278 convert_op (&op0
, insn
);
1282 if (smode
== DImode
)
1288 rtx tmp1
= gen_reg_rtx (V4SImode
);
1289 pat
= gen_sse2_pshufd (tmp1
, gen_lowpart (V4SImode
, op0
),
1291 emit_insn_before (pat
, insn
);
1292 result
= gen_lowpart (V2DImode
, tmp1
);
1294 else if (TARGET_AVX512VL
)
1295 result
= simplify_gen_binary (code
, V2DImode
, op0
, op1
);
1296 else if (bits
== 16 || bits
== 48)
1298 rtx tmp1
= gen_reg_rtx (V8HImode
);
1299 pat
= gen_sse2_pshuflw (tmp1
, gen_lowpart (V8HImode
, op0
),
1300 GEN_INT (bits
== 16 ? 57 : 147));
1301 emit_insn_before (pat
, insn
);
1302 result
= gen_lowpart (V2DImode
, tmp1
);
1304 else if ((bits
& 0x07) == 0)
1306 rtx tmp1
= gen_reg_rtx (V4SImode
);
1307 pat
= gen_sse2_pshufd (tmp1
, gen_lowpart (V4SImode
, op0
),
1309 emit_insn_before (pat
, insn
);
1310 rtx tmp2
= gen_reg_rtx (V1TImode
);
1311 pat
= gen_sse2_lshrv1ti3 (tmp2
, gen_lowpart (V1TImode
, tmp1
),
1313 emit_insn_before (pat
, insn
);
1314 result
= gen_lowpart (V2DImode
, tmp2
);
1318 rtx tmp1
= gen_reg_rtx (V4SImode
);
1319 pat
= gen_sse2_pshufd (tmp1
, gen_lowpart (V4SImode
, op0
),
1321 emit_insn_before (pat
, insn
);
1322 rtx tmp2
= gen_reg_rtx (V2DImode
);
1323 pat
= gen_lshrv2di3 (tmp2
, gen_lowpart (V2DImode
, tmp1
),
1324 GEN_INT (bits
& 31));
1325 emit_insn_before (pat
, insn
);
1326 rtx tmp3
= gen_reg_rtx (V4SImode
);
1327 pat
= gen_sse2_pshufd (tmp3
, gen_lowpart (V4SImode
, tmp2
),
1328 GEN_INT (bits
> 32 ? 34 : 136));
1329 emit_insn_before (pat
, insn
);
1330 result
= gen_lowpart (V2DImode
, tmp3
);
1333 else if (bits
== 16)
1335 rtx tmp1
= gen_reg_rtx (V8HImode
);
1336 pat
= gen_sse2_pshuflw (tmp1
, gen_lowpart (V8HImode
, op0
), GEN_INT (225));
1337 emit_insn_before (pat
, insn
);
1338 result
= gen_lowpart (V4SImode
, tmp1
);
1340 else if (TARGET_AVX512VL
)
1341 result
= simplify_gen_binary (code
, V4SImode
, op0
, op1
);
1347 rtx tmp1
= gen_reg_rtx (V4SImode
);
1348 emit_insn_before (gen_sse2_pshufd (tmp1
, op0
, GEN_INT (224)), insn
);
1349 rtx tmp2
= gen_reg_rtx (V2DImode
);
1350 pat
= gen_lshrv2di3 (tmp2
, gen_lowpart (V2DImode
, tmp1
),
1352 emit_insn_before (pat
, insn
);
1353 result
= gen_lowpart (V4SImode
, tmp2
);
1359 /* Convert INSN to vector mode. */
1362 general_scalar_chain::convert_insn (rtx_insn
*insn
)
1364 rtx def_set
= single_set (insn
);
1365 rtx src
= SET_SRC (def_set
);
1366 rtx dst
= SET_DEST (def_set
);
1369 if (MEM_P (dst
) && !REG_P (src
))
1371 /* There are no scalar integer instructions and therefore
1372 temporary register usage is required. */
1373 rtx tmp
= gen_reg_rtx (smode
);
1374 emit_conversion_insns (gen_move_insn (dst
, tmp
), insn
);
1375 dst
= gen_rtx_SUBREG (vmode
, tmp
, 0);
1377 else if (REG_P (dst
) && GET_MODE (dst
) == smode
)
1379 /* Replace the definition with a SUBREG to the definition we
1380 use inside the chain. */
1381 rtx
*vdef
= defs_map
.get (dst
);
1384 dst
= gen_rtx_SUBREG (vmode
, dst
, 0);
1385 /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1386 is a non-REG_P. So kill those off. */
1387 rtx note
= find_reg_equal_equiv_note (insn
);
1389 remove_note (insn
, note
);
1392 switch (GET_CODE (src
))
1403 convert_op (&XEXP (src
, 1), insn
);
1410 convert_op (&XEXP (src
, 0), insn
);
1411 PUT_MODE (src
, vmode
);
1416 src
= convert_rotate (GET_CODE (src
), XEXP (src
, 0), XEXP (src
, 1),
1421 src
= XEXP (src
, 0);
1423 if (GET_CODE (src
) == ABS
)
1425 src
= XEXP (src
, 0);
1426 convert_op (&src
, insn
);
1427 subreg
= gen_reg_rtx (vmode
);
1428 emit_insn_before (gen_rtx_SET (subreg
,
1429 gen_rtx_ABS (vmode
, src
)), insn
);
1433 convert_op (&src
, insn
);
1435 subreg
= gen_reg_rtx (vmode
);
1436 emit_insn_before (gen_move_insn (subreg
, CONST0_RTX (vmode
)), insn
);
1437 src
= gen_rtx_MINUS (vmode
, subreg
, src
);
1441 src
= XEXP (src
, 0);
1442 convert_op (&src
, insn
);
1443 subreg
= gen_reg_rtx (vmode
);
1444 emit_insn_before (gen_move_insn (subreg
, CONSTM1_RTX (vmode
)), insn
);
1445 src
= gen_rtx_XOR (vmode
, src
, subreg
);
1450 convert_op (&src
, insn
);
1455 convert_op (&src
, insn
);
1459 gcc_assert (GET_MODE (src
) == vmode
);
1463 dst
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
1464 src
= convert_compare (XEXP (src
, 0), XEXP (src
, 1), insn
);
1468 convert_op (&src
, insn
);
1472 if (XVECEXP (XEXP (src
, 1), 0, 0) == const0_rtx
)
1473 src
= XEXP (src
, 0);
1474 else if (smode
== DImode
)
1476 rtx tmp
= gen_lowpart (V1TImode
, XEXP (src
, 0));
1477 dst
= gen_lowpart (V1TImode
, dst
);
1478 src
= gen_rtx_LSHIFTRT (V1TImode
, tmp
, GEN_INT (64));
1482 rtx tmp
= XVECEXP (XEXP (src
, 1), 0, 0);
1483 rtvec vec
= gen_rtvec (4, tmp
, tmp
, tmp
, tmp
);
1484 rtx par
= gen_rtx_PARALLEL (VOIDmode
, vec
);
1485 src
= gen_rtx_VEC_SELECT (vmode
, XEXP (src
, 0), par
);
1493 SET_SRC (def_set
) = src
;
1494 SET_DEST (def_set
) = dst
;
1496 /* Drop possible dead definitions. */
1497 PATTERN (insn
) = def_set
;
1499 INSN_CODE (insn
) = -1;
1500 int patt
= recog_memoized (insn
);
1502 fatal_insn_not_found (insn
);
1503 df_insn_rescan (insn
);
/* Helper function to compute gain for loading an immediate constant.
   Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
   with numerous special cases.  */

static int
timode_immed_const_gain (rtx cst)
{
  /* movabsq vs. movabsq+vmovq+vunpacklqdq.  */
  if (CONST_WIDE_INT_P (cst)
      && CONST_WIDE_INT_NUNITS (cst) == 2
      && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
    return optimize_insn_for_size_p () ? -COSTS_N_BYTES (9)
				       : -COSTS_N_INSNS (2);
  /* 2x movabsq ~ vmovdqa.  */
  return 0;
}
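/* A negative value thus discourages conversion: a TImode immediate with
   two identical 64-bit halves is cheap to build with scalar code (one
   movabsq reused for both halves) but costs extra vector instructions.  */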
1523 /* Compute a gain for chain conversion. */
1526 timode_scalar_chain::compute_convert_gain ()
1528 /* Assume that if we have to move TImode values between units,
1529 then transforming this chain isn't worth it. */
1530 if (n_sse_to_integer
|| n_integer_to_sse
)
1536 /* Split ties to prefer V1TImode when not optimizing for size. */
1537 int gain
= optimize_size
? 0 : 1;
1540 fprintf (dump_file
, "Computing gain for chain #%d...\n", chain_id
);
1542 EXECUTE_IF_SET_IN_BITMAP (insns
, 0, insn_uid
, bi
)
1544 rtx_insn
*insn
= DF_INSN_UID_GET (insn_uid
)->insn
;
1545 rtx def_set
= single_set (insn
);
1546 rtx src
= SET_SRC (def_set
);
1547 rtx dst
= SET_DEST (def_set
);
1548 HOST_WIDE_INT op1val
;
1552 switch (GET_CODE (src
))
1555 if (optimize_insn_for_size_p ())
1556 igain
= MEM_P (dst
) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
1558 igain
= COSTS_N_INSNS (1);
1562 igain
= optimize_insn_for_size_p () ? COSTS_N_BYTES (7)
1563 : COSTS_N_INSNS (1);
1568 && standard_sse_constant_p (src
, V1TImode
))
1569 igain
= optimize_insn_for_size_p () ? COSTS_N_BYTES (11) : 1;
1572 case CONST_WIDE_INT
:
1573 /* 2 x mov vs. vmovdqa. */
1575 igain
= optimize_insn_for_size_p () ? COSTS_N_BYTES (3)
1576 : COSTS_N_INSNS (1);
1581 igain
= -COSTS_N_INSNS (1);
1588 igain
= COSTS_N_INSNS (1);
1589 if (CONST_SCALAR_INT_P (XEXP (src
, 1)))
1590 igain
+= timode_immed_const_gain (XEXP (src
, 1));
1595 /* See ix86_expand_v1ti_shift. */
1596 op1val
= INTVAL (XEXP (src
, 1));
1597 if (optimize_insn_for_size_p ())
1599 if (op1val
== 64 || op1val
== 65)
1600 scost
= COSTS_N_BYTES (5);
1601 else if (op1val
>= 66)
1602 scost
= COSTS_N_BYTES (6);
1603 else if (op1val
== 1)
1604 scost
= COSTS_N_BYTES (8);
1606 scost
= COSTS_N_BYTES (9);
1608 if ((op1val
& 7) == 0)
1609 vcost
= COSTS_N_BYTES (5);
1610 else if (op1val
> 64)
1611 vcost
= COSTS_N_BYTES (10);
1613 vcost
= TARGET_AVX
? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
1617 scost
= COSTS_N_INSNS (2);
1618 if ((op1val
& 7) == 0)
1619 vcost
= COSTS_N_INSNS (1);
1620 else if (op1val
> 64)
1621 vcost
= COSTS_N_INSNS (2);
1623 vcost
= TARGET_AVX
? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
1625 igain
= scost
- vcost
;
1629 /* See ix86_expand_v1ti_ashiftrt. */
1630 op1val
= INTVAL (XEXP (src
, 1));
1631 if (optimize_insn_for_size_p ())
1633 if (op1val
== 64 || op1val
== 127)
1634 scost
= COSTS_N_BYTES (7);
1635 else if (op1val
== 1)
1636 scost
= COSTS_N_BYTES (8);
1637 else if (op1val
== 65)
1638 scost
= COSTS_N_BYTES (10);
1639 else if (op1val
>= 66)
1640 scost
= COSTS_N_BYTES (11);
1642 scost
= COSTS_N_BYTES (9);
1645 vcost
= COSTS_N_BYTES (10);
1646 else if (op1val
== 64)
1647 vcost
= COSTS_N_BYTES (14);
1648 else if (op1val
== 96)
1649 vcost
= COSTS_N_BYTES (18);
1650 else if (op1val
>= 111)
1651 vcost
= COSTS_N_BYTES (15);
1652 else if (TARGET_AVX2
&& op1val
== 32)
1653 vcost
= COSTS_N_BYTES (16);
1654 else if (TARGET_SSE4_1
&& op1val
== 32)
1655 vcost
= COSTS_N_BYTES (20);
1656 else if (op1val
>= 96)
1657 vcost
= COSTS_N_BYTES (23);
1658 else if ((op1val
& 7) == 0)
1659 vcost
= COSTS_N_BYTES (28);
1660 else if (TARGET_AVX2
&& op1val
< 32)
1661 vcost
= COSTS_N_BYTES (30);
1662 else if (op1val
== 1 || op1val
>= 64)
1663 vcost
= COSTS_N_BYTES (42);
1665 vcost
= COSTS_N_BYTES (47);
1669 if (op1val
>= 65 && op1val
<= 126)
1670 scost
= COSTS_N_INSNS (3);
1672 scost
= COSTS_N_INSNS (2);
1675 vcost
= COSTS_N_INSNS (2);
1676 else if (op1val
== 64)
1677 vcost
= COSTS_N_INSNS (3);
1678 else if (op1val
== 96)
1679 vcost
= COSTS_N_INSNS (3);
1680 else if (op1val
>= 111)
1681 vcost
= COSTS_N_INSNS (3);
1682 else if (TARGET_SSE4_1
&& op1val
== 32)
1683 vcost
= COSTS_N_INSNS (3);
1684 else if (TARGET_SSE4_1
1685 && (op1val
== 8 || op1val
== 16 || op1val
== 24))
1686 vcost
= COSTS_N_INSNS (3);
1687 else if (op1val
>= 96)
1688 vcost
= COSTS_N_INSNS (4);
1689 else if (TARGET_SSE4_1
&& (op1val
== 28 || op1val
== 80))
1690 vcost
= COSTS_N_INSNS (4);
1691 else if ((op1val
& 7) == 0)
1692 vcost
= COSTS_N_INSNS (5);
1693 else if (TARGET_AVX2
&& op1val
< 32)
1694 vcost
= COSTS_N_INSNS (6);
1695 else if (TARGET_SSE4_1
&& op1val
< 15)
1696 vcost
= COSTS_N_INSNS (6);
1697 else if (op1val
== 1 || op1val
>= 64)
1698 vcost
= COSTS_N_INSNS (8);
1700 vcost
= COSTS_N_INSNS (9);
1702 igain
= scost
- vcost
;
1707 /* See ix86_expand_v1ti_rotate. */
1708 op1val
= INTVAL (XEXP (src
, 1));
1709 if (optimize_insn_for_size_p ())
1711 scost
= COSTS_N_BYTES (13);
1712 if ((op1val
& 31) == 0)
1713 vcost
= COSTS_N_BYTES (5);
1714 else if ((op1val
& 7) == 0)
1715 vcost
= TARGET_AVX
? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
1716 else if (op1val
> 32 && op1val
< 96)
1717 vcost
= COSTS_N_BYTES (24);
1719 vcost
= COSTS_N_BYTES (19);
1723 scost
= COSTS_N_INSNS (3);
1724 if ((op1val
& 31) == 0)
1725 vcost
= COSTS_N_INSNS (1);
1726 else if ((op1val
& 7) == 0)
1727 vcost
= TARGET_AVX
? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
1728 else if (op1val
> 32 && op1val
< 96)
1729 vcost
= COSTS_N_INSNS (5);
1731 vcost
= COSTS_N_INSNS (1);
1733 igain
= scost
- vcost
;
1737 if (XEXP (src
, 1) == const0_rtx
)
1739 if (GET_CODE (XEXP (src
, 0)) == AND
)
1740 /* and;and;or (9 bytes) vs. ptest (5 bytes). */
1741 igain
= optimize_insn_for_size_p() ? COSTS_N_BYTES (4)
1742 : COSTS_N_INSNS (2);
1743 /* or (3 bytes) vs. ptest (5 bytes). */
1744 else if (optimize_insn_for_size_p ())
1745 igain
= -COSTS_N_BYTES (2);
1747 else if (XEXP (src
, 1) == const1_rtx
)
1748 /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
1749 igain
= optimize_insn_for_size_p() ? -COSTS_N_BYTES (6)
1750 : -COSTS_N_INSNS (1);
1757 if (igain
!= 0 && dump_file
)
1759 fprintf (dump_file
, " Instruction gain %d for ", igain
);
1760 dump_insn_slim (dump_file
, insn
);
1766 fprintf (dump_file
, " Total gain: %d\n", gain
);
1771 /* Fix uses of converted REG in debug insns. */
1774 timode_scalar_chain::fix_debug_reg_uses (rtx reg
)
1776 if (!flag_var_tracking
)
1780 for (ref
= DF_REG_USE_CHAIN (REGNO (reg
)); ref
; ref
= next
)
1782 rtx_insn
*insn
= DF_REF_INSN (ref
);
1783 /* Make sure the next ref is for a different instruction,
1784 so that we're not affected by the rescan. */
1785 next
= DF_REF_NEXT_REG (ref
);
1786 while (next
&& DF_REF_INSN (next
) == insn
)
1787 next
= DF_REF_NEXT_REG (next
);
1789 if (DEBUG_INSN_P (insn
))
1791 /* It may be a debug insn with a TImode variable in
1793 bool changed
= false;
1794 for (; ref
!= next
; ref
= DF_REF_NEXT_REG (ref
))
1796 rtx
*loc
= DF_REF_LOC (ref
);
1797 if (REG_P (*loc
) && GET_MODE (*loc
) == V1TImode
)
1799 *loc
= gen_rtx_SUBREG (TImode
, *loc
, 0);
1804 df_insn_rescan (insn
);
/* Convert INSN from TImode to V1TImode.  */
1812 timode_scalar_chain::convert_insn (rtx_insn
*insn
)
1814 rtx def_set
= single_set (insn
);
1815 rtx src
= SET_SRC (def_set
);
1816 rtx dst
= SET_DEST (def_set
);
1819 switch (GET_CODE (dst
))
1822 if (GET_MODE (dst
) == TImode
)
1824 PUT_MODE (dst
, V1TImode
);
1825 fix_debug_reg_uses (dst
);
1827 if (GET_MODE (dst
) == V1TImode
)
1829 /* It might potentially be helpful to convert REG_EQUAL notes,
1830 but for now we just remove them. */
1831 rtx note
= find_reg_equal_equiv_note (insn
);
1833 remove_note (insn
, note
);
1837 PUT_MODE (dst
, V1TImode
);
1844 switch (GET_CODE (src
))
1847 if (GET_MODE (src
) == TImode
)
1849 PUT_MODE (src
, V1TImode
);
1850 fix_debug_reg_uses (src
);
1855 PUT_MODE (src
, V1TImode
);
1858 case CONST_WIDE_INT
:
1859 if (NONDEBUG_INSN_P (insn
))
1861 /* Since there are no instructions to store 128-bit constant,
1862 temporary register usage is required. */
1865 tmp
= ix86_convert_const_wide_int_to_broadcast (TImode
, src
);
1868 src
= lowpart_subreg (V1TImode
, tmp
, TImode
);
1873 src
= smode_convert_cst (src
, V1TImode
);
1874 src
= validize_mem (force_const_mem (V1TImode
, src
));
1875 use_move
= MEM_P (dst
);
1877 rtx_insn
*seq
= get_insns ();
1880 emit_insn_before (seq
, insn
);
1883 tmp
= gen_reg_rtx (V1TImode
);
1884 emit_insn_before (gen_rtx_SET (tmp
, src
), insn
);
1891 switch (standard_sse_constant_p (src
, TImode
))
1894 src
= CONST0_RTX (GET_MODE (dst
));
1897 src
= CONSTM1_RTX (GET_MODE (dst
));
1904 tmp
= gen_reg_rtx (V1TImode
);
1905 emit_insn_before (gen_rtx_SET (tmp
, src
), insn
);
1911 if (GET_CODE (XEXP (src
, 0)) == NOT
)
1913 convert_op (&XEXP (XEXP (src
, 0), 0), insn
);
1914 convert_op (&XEXP (src
, 1), insn
);
1915 PUT_MODE (XEXP (src
, 0), V1TImode
);
1916 PUT_MODE (src
, V1TImode
);
1923 convert_op (&XEXP (src
, 0), insn
);
1924 convert_op (&XEXP (src
, 1), insn
);
1925 PUT_MODE (src
, V1TImode
);
1928 tmp
= gen_reg_rtx (V1TImode
);
1929 emit_insn_before (gen_rtx_SET (tmp
, src
), insn
);
1935 src
= XEXP (src
, 0);
1936 convert_op (&src
, insn
);
1937 tmp
= gen_reg_rtx (V1TImode
);
1938 emit_insn_before (gen_move_insn (tmp
, CONSTM1_RTX (V1TImode
)), insn
);
1939 src
= gen_rtx_XOR (V1TImode
, src
, tmp
);
1942 tmp
= gen_reg_rtx (V1TImode
);
1943 emit_insn_before (gen_rtx_SET (tmp
, src
), insn
);
1949 dst
= gen_rtx_REG (CCZmode
, FLAGS_REG
);
1950 src
= convert_compare (XEXP (src
, 0), XEXP (src
, 1), insn
);
1958 convert_op (&XEXP (src
, 0), insn
);
1959 PUT_MODE (src
, V1TImode
);
1966 SET_SRC (def_set
) = src
;
1967 SET_DEST (def_set
) = dst
;
1969 /* Drop possible dead definitions. */
1970 PATTERN (insn
) = def_set
;
1972 INSN_CODE (insn
) = -1;
1973 recog_memoized (insn
);
1974 df_insn_rescan (insn
);
1977 /* Generate copies from defs used by the chain but not defined therein.
1978 Also populates defs_map which is used later by convert_insn. */
1981 scalar_chain::convert_registers ()
1985 EXECUTE_IF_SET_IN_BITMAP (defs_conv
, 0, id
, bi
)
1987 rtx chain_reg
= gen_reg_rtx (smode
);
1988 defs_map
.put (regno_reg_rtx
[id
], chain_reg
);
1990 EXECUTE_IF_SET_IN_BITMAP (insns_conv
, 0, id
, bi
)
1991 for (df_ref ref
= DF_INSN_UID_DEFS (id
); ref
; ref
= DF_REF_NEXT_LOC (ref
))
1992 if (bitmap_bit_p (defs_conv
, DF_REF_REGNO (ref
)))
1993 make_vector_copies (DF_REF_INSN (ref
), DF_REF_REAL_REG (ref
));
1996 /* Convert whole chain creating required register
1997 conversions and copies. */
2000 scalar_chain::convert ()
2004 int converted_insns
= 0;
2006 if (!dbg_cnt (stv_conversion
))
2010 fprintf (dump_file
, "Converting chain #%d...\n", chain_id
);
2012 convert_registers ();
2014 EXECUTE_IF_SET_IN_BITMAP (insns
, 0, id
, bi
)
2016 rtx_insn
*insn
= DF_INSN_UID_GET (id
)->insn
;
2017 convert_insn_common (insn
);
2018 convert_insn (insn
);
2022 return converted_insns
;
/* Return the SET expression if INSN doesn't reference hard register.
   Return NULL if INSN uses or defines a hard register, excluding
   pseudo register pushes, hard register uses in a memory address,
   clobbers and flags definitions.  */

static rtx
pseudo_reg_set (rtx_insn *insn)
{
  rtx set = single_set (insn);
  if (!set)
    return NULL;

  /* Check pseudo register push first.  */
  machine_mode mode = TARGET_64BIT ? TImode : DImode;
  if (REG_P (SET_SRC (set))
      && !HARD_REGISTER_P (SET_SRC (set))
      && push_operand (SET_DEST (set), mode))
    return set;

  df_ref ref;
  FOR_EACH_INSN_DEF (ref, insn)
    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
	&& DF_REF_REGNO (ref) != FLAGS_REG)
      return NULL;

  FOR_EACH_INSN_USE (ref, insn)
    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
      return NULL;

  return set;
}
/* Return true if the register REG is defined in a single DEF chain.
   If it is defined in more than one DEF chains, we may not be able
   to convert it in all chains.  */

static bool
single_def_chain_p (rtx reg)
{
  df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
  if (!ref)
    return false;
  return DF_REF_NEXT_REG (ref) == nullptr;
}
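/* This relies on the DF def chains computed for the pass: a pseudo with
   exactly one definition can be converted without having to prove that
   every other definition is convertible as well.  */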
/* Check if comparison INSN may be transformed into vector comparison.
   Currently we transform equality/inequality checks which look like:
   (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y)))  */
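/* Besides the plain register/register compare above, the code below also
   accepts the *cmp<dwi>_doubleword, *testti_doubleword and
   *test<dwi>_not_doubleword shapes, i.e. compares against constants or
   memory and AND/AND-NOT tests against zero.  */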
2076 convertible_comparison_p (rtx_insn
*insn
, enum machine_mode mode
)
2078 if (mode
!= (TARGET_64BIT
? TImode
: DImode
))
2084 rtx def_set
= single_set (insn
);
2086 gcc_assert (def_set
);
2088 rtx src
= SET_SRC (def_set
);
2089 rtx dst
= SET_DEST (def_set
);
2091 gcc_assert (GET_CODE (src
) == COMPARE
);
2093 if (GET_CODE (dst
) != REG
2094 || REGNO (dst
) != FLAGS_REG
2095 || GET_MODE (dst
) != CCZmode
)
2098 rtx op1
= XEXP (src
, 0);
2099 rtx op2
= XEXP (src
, 1);
2101 /* *cmp<dwi>_doubleword. */
2102 if ((CONST_SCALAR_INT_P (op1
)
2103 || ((REG_P (op1
) || MEM_P (op1
))
2104 && GET_MODE (op1
) == mode
))
2105 && (CONST_SCALAR_INT_P (op2
)
2106 || ((REG_P (op2
) || MEM_P (op2
))
2107 && GET_MODE (op2
) == mode
)))
2110 /* *testti_doubleword. */
2111 if (op2
== const0_rtx
2112 && GET_CODE (op1
) == AND
2113 && REG_P (XEXP (op1
, 0)))
2115 rtx op12
= XEXP (op1
, 1);
2116 return GET_MODE (XEXP (op1
, 0)) == TImode
2117 && (CONST_SCALAR_INT_P (op12
)
2118 || ((REG_P (op12
) || MEM_P (op12
))
2119 && GET_MODE (op12
) == TImode
));
2122 /* *test<dwi>_not_doubleword. */
2123 if (op2
== const0_rtx
2124 && GET_CODE (op1
) == AND
2125 && GET_CODE (XEXP (op1
, 0)) == NOT
)
2127 rtx op11
= XEXP (XEXP (op1
, 0), 0);
2128 rtx op12
= XEXP (op1
, 1);
2129 return (REG_P (op11
) || MEM_P (op11
))
2130 && (REG_P (op12
) || MEM_P (op12
))
2131 && GET_MODE (op11
) == mode
2132 && GET_MODE (op12
) == mode
;
2138 /* The general version of scalar_to_vector_candidate_p. */
2141 general_scalar_to_vector_candidate_p (rtx_insn
*insn
, enum machine_mode mode
)
2143 rtx def_set
= pseudo_reg_set (insn
);
2148 rtx src
= SET_SRC (def_set
);
2149 rtx dst
= SET_DEST (def_set
);
2151 if (GET_CODE (src
) == COMPARE
)
2152 return convertible_comparison_p (insn
, mode
);
2154 /* We are interested in "mode" only. */
2155 if ((GET_MODE (src
) != mode
2156 && !CONST_INT_P (src
))
2157 || GET_MODE (dst
) != mode
)
2160 if (!REG_P (dst
) && !MEM_P (dst
))
2163 switch (GET_CODE (src
))
2170 if (!CONST_INT_P (XEXP (src
, 1))
2171 || !IN_RANGE (INTVAL (XEXP (src
, 1)), 0, GET_MODE_BITSIZE (mode
)-1))
2174 /* Check for extend highpart case. */
2176 || GET_CODE (src
) != ASHIFTRT
2177 || GET_CODE (XEXP (src
, 0)) != ASHIFT
)
2180 src
= XEXP (src
, 0);
2187 if ((mode
== DImode
&& !TARGET_AVX512VL
)
2188 || (mode
== SImode
&& !TARGET_SSE4_1
))
2197 if (!REG_P (XEXP (src
, 1))
2198 && !MEM_P (XEXP (src
, 1))
2199 && !CONST_INT_P (XEXP (src
, 1)))
2202 if (GET_MODE (XEXP (src
, 1)) != mode
2203 && !CONST_INT_P (XEXP (src
, 1)))
2206 /* Check for andnot case. */
2207 if (GET_CODE (src
) != AND
2208 || GET_CODE (XEXP (src
, 0)) != NOT
)
2211 src
= XEXP (src
, 0);
2218 /* Check for nabs case. */
2219 if (GET_CODE (XEXP (src
, 0)) != ABS
)
2222 src
= XEXP (src
, 0);
2226 if ((mode
== DImode
&& !TARGET_AVX512VL
)
2227 || (mode
== SImode
&& !TARGET_SSSE3
))
      /* Excluding MEM_P (dst) avoids interfering with vpextr[dq].  */
2241 && REG_P (XEXP (src
, 0))
2242 && GET_MODE (XEXP (src
, 0)) == (mode
== DImode
? V2DImode
2244 && GET_CODE (XEXP (src
, 1)) == PARALLEL
2245 && XVECLEN (XEXP (src
, 1), 0) == 1
2246 && CONST_INT_P (XVECEXP (XEXP (src
, 1), 0, 0));
2252 if (!REG_P (XEXP (src
, 0))
2253 && !MEM_P (XEXP (src
, 0))
2254 && !CONST_INT_P (XEXP (src
, 0)))
2257 if (GET_MODE (XEXP (src
, 0)) != mode
2258 && !CONST_INT_P (XEXP (src
, 0)))
/* Check for a suitable TImode memory operand.  */

static bool
timode_mem_p (rtx x)
{
  return MEM_P (x)
	 && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
	     || !misaligned_operand (x, TImode));
}
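/* Misaligned TImode memory is only considered suitable when the target
   reports that unaligned 128-bit SSE loads are as cheap as aligned ones
   (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL).  */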
2274 /* The TImode version of scalar_to_vector_candidate_p. */
2277 timode_scalar_to_vector_candidate_p (rtx_insn
*insn
)
2279 rtx def_set
= pseudo_reg_set (insn
);
2284 rtx src
= SET_SRC (def_set
);
2285 rtx dst
= SET_DEST (def_set
);
2287 if (GET_CODE (src
) == COMPARE
)
2288 return convertible_comparison_p (insn
, TImode
);
2290 if (GET_MODE (dst
) != TImode
2291 || (GET_MODE (src
) != TImode
2292 && !CONST_SCALAR_INT_P (src
)))
2295 if (!REG_P (dst
) && !MEM_P (dst
))
2299 && misaligned_operand (dst
, TImode
)
2300 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL
)
2303 if (REG_P (dst
) && !single_def_chain_p (dst
))
2306 switch (GET_CODE (src
))
2309 return single_def_chain_p (src
);
2311 case CONST_WIDE_INT
:
2315 /* ??? Verify performance impact before enabling CONST_INT for
2317 return standard_sse_constant_p (src
, TImode
);
2320 /* Memory must be aligned or unaligned load is optimal. */
2322 && (!misaligned_operand (src
, TImode
)
2323 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
));
2327 && GET_CODE (XEXP (src
, 0)) == NOT
2328 && REG_P (XEXP (XEXP (src
, 0), 0))
2329 && (REG_P (XEXP (src
, 1))
2330 || CONST_SCALAR_INT_P (XEXP (src
, 1))
2331 || timode_mem_p (XEXP (src
, 1))))
2333 return (REG_P (XEXP (src
, 0))
2334 || timode_mem_p (XEXP (src
, 0)))
2335 && (REG_P (XEXP (src
, 1))
2336 || CONST_SCALAR_INT_P (XEXP (src
, 1))
2337 || timode_mem_p (XEXP (src
, 1)));
2341 return (REG_P (XEXP (src
, 0))
2342 || timode_mem_p (XEXP (src
, 0)))
2343 && (REG_P (XEXP (src
, 1))
2344 || CONST_SCALAR_INT_P (XEXP (src
, 1))
2345 || timode_mem_p (XEXP (src
, 1)));
2348 return REG_P (XEXP (src
, 0)) || timode_mem_p (XEXP (src
, 0));
2355 /* Handle shifts/rotates by integer constants between 0 and 127. */
2356 return REG_P (XEXP (src
, 0))
2357 && CONST_INT_P (XEXP (src
, 1))
2358 && (INTVAL (XEXP (src
, 1)) & ~0x7f) == 0;
2365 /* For a register REGNO, scan instructions for its defs and uses.
2366 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2369 timode_check_non_convertible_regs (bitmap candidates
, bitmap regs
,
2372 /* Do nothing if REGNO is already in REGS or is a hard reg. */
2373 if (bitmap_bit_p (regs
, regno
)
2374 || HARD_REGISTER_NUM_P (regno
))
2377 for (df_ref def
= DF_REG_DEF_CHAIN (regno
);
2379 def
= DF_REF_NEXT_REG (def
))
2381 if (!bitmap_bit_p (candidates
, DF_REF_INSN_UID (def
)))
2385 "r%d has non convertible def in insn %d\n",
2386 regno
, DF_REF_INSN_UID (def
));
2388 bitmap_set_bit (regs
, regno
);
2393 for (df_ref ref
= DF_REG_USE_CHAIN (regno
);
2395 ref
= DF_REF_NEXT_REG (ref
))
2397 /* Debug instructions are skipped. */
2398 if (NONDEBUG_INSN_P (DF_REF_INSN (ref
))
2399 && !bitmap_bit_p (candidates
, DF_REF_INSN_UID (ref
)))
2403 "r%d has non convertible use in insn %d\n",
2404 regno
, DF_REF_INSN_UID (ref
));
2406 bitmap_set_bit (regs
, regno
);
2412 /* For a given bitmap of insn UIDs scans all instructions and
2413 remove insn from CANDIDATES in case it has both convertible
2414 and not convertible definitions.
2416 All insns in a bitmap are conversion candidates according to
2417 scalar_to_vector_candidate_p. Currently it implies all insns
2421 timode_remove_non_convertible_regs (bitmap candidates
)
2425 bitmap regs
= BITMAP_ALLOC (NULL
);
2430 EXECUTE_IF_SET_IN_BITMAP (candidates
, 0, id
, bi
)
2432 rtx_insn
*insn
= DF_INSN_UID_GET (id
)->insn
;
2435 FOR_EACH_INSN_DEF (ref
, insn
)
2436 if (!DF_REF_REG_MEM_P (ref
)
2437 && GET_MODE (DF_REF_REG (ref
)) == TImode
)
2438 timode_check_non_convertible_regs (candidates
, regs
,
2439 DF_REF_REGNO (ref
));
2441 FOR_EACH_INSN_USE (ref
, insn
)
2442 if (!DF_REF_REG_MEM_P (ref
)
2443 && GET_MODE (DF_REF_REG (ref
)) == TImode
)
2444 timode_check_non_convertible_regs (candidates
, regs
,
2445 DF_REF_REGNO (ref
));
2448 EXECUTE_IF_SET_IN_BITMAP (regs
, 0, id
, bi
)
2450 for (df_ref def
= DF_REG_DEF_CHAIN (id
);
2452 def
= DF_REF_NEXT_REG (def
))
2453 if (bitmap_bit_p (candidates
, DF_REF_INSN_UID (def
)))
2456 fprintf (dump_file
, "Removing insn %d from candidates list\n",
2457 DF_REF_INSN_UID (def
));
2459 bitmap_clear_bit (candidates
, DF_REF_INSN_UID (def
));
2463 for (df_ref ref
= DF_REG_USE_CHAIN (id
);
2465 ref
= DF_REF_NEXT_REG (ref
))
2466 if (bitmap_bit_p (candidates
, DF_REF_INSN_UID (ref
)))
2469 fprintf (dump_file
, "Removing insn %d from candidates list\n",
2470 DF_REF_INSN_UID (ref
));
2472 bitmap_clear_bit (candidates
, DF_REF_INSN_UID (ref
));
/* Main STV pass function.  Find and convert scalar
   instructions into vector mode when profitable.  */
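/* Roughly: collect per-mode candidate insns (SImode, DImode and TImode),
   prune TImode candidates whose registers also appear in non-convertible
   insns, then greedily build chains, convert each chain whose estimated
   gain is positive, and finally patch up stack alignment, DRAP, argument
   DECL_RTLs and any EH edges required by newly inserted instructions.  */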
static unsigned int
convert_scalars_to_vector (bool timode_p)
{
  basic_block bb;
  int converted_insns = 0;
  auto_vec<rtx_insn *> control_flow_insns;

  bitmap_obstack_initialize (NULL);
  const machine_mode cand_mode[3] = { SImode, DImode, TImode };
  const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
  bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
  for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);

  calculate_dominance_info (CDI_DOMINATORS);
  df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze ();

  /* Find all instructions we want to convert into vector mode.  */
  if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");

  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *insn;
      FOR_BB_INSNS (bb, insn)
	if (timode_p
	    && timode_scalar_to_vector_candidate_p (insn))
	  {
	    if (dump_file)
	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
		       INSN_UID (insn));

	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
	  }
	else
	  {
	    /* Check {SI,DI}mode.  */
	    for (unsigned i = 0; i <= 1; ++i)
	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
		{
		  if (dump_file)
		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");

		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
		  break;
		}
	  }
    }

  if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);

  for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");

  for (unsigned i = 0; i <= 2; ++i)
    {
      auto_bitmap disallowed;
      bitmap_tree_view (&candidates[i]);
      while (!bitmap_empty_p (&candidates[i]))
	{
	  unsigned uid = bitmap_first_set_bit (&candidates[i]);
	  scalar_chain *chain;

	  if (cand_mode[i] == TImode)
	    chain = new timode_scalar_chain;
	  else
	    chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);

	  /* Find instructions chain we want to convert to vector mode.
	     Check all uses and definitions to estimate all required
	     conversions.  */
	  if (chain->build (&candidates[i], uid, disallowed))
	    {
	      if (chain->compute_convert_gain () > 0)
		converted_insns += chain->convert ();
	      else if (dump_file)
		fprintf (dump_file, "Chain #%d conversion is not profitable\n",
			 chain->chain_id);
	    }

	  rtx_insn *iter_insn;
	  unsigned int ii;
	  FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
	    control_flow_insns.safe_push (iter_insn);

	  delete chain;
	}
    }

  if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

  for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);
  bitmap_obstack_release (NULL);
  df_process_deferred_rescans ();

  /* Conversion means we may have 128bit register spills/fills
     which require aligned stack.  */
  if (converted_insns)
    {
      if (crtl->stack_alignment_needed < 128)
	crtl->stack_alignment_needed = 128;
      if (crtl->stack_alignment_estimated < 128)
	crtl->stack_alignment_estimated = 128;

      crtl->stack_realign_needed
	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
      crtl->stack_realign_tried = crtl->stack_realign_needed;

      crtl->stack_realign_processed = true;

      if (!crtl->drap_reg)
	{
	  rtx drap_rtx = targetm.calls.get_drap_rtx ();

	  /* stack_realign_drap and drap_rtx must match.  */
	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));

	  /* Do nothing if NULL is returned,
	     which means DRAP is not needed.  */
	  if (drap_rtx != NULL)
	    {
	      crtl->args.internal_arg_pointer = drap_rtx;

	      /* Call fixup_tail_calls to clean up
		 REG_EQUIV note if DRAP is needed.  */
	      fixup_tail_calls ();
	    }
	}

      /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
      if (timode_p)
	for (tree parm = DECL_ARGUMENTS (current_function_decl);
	     parm; parm = DECL_CHAIN (parm))
	  {
	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
	      continue;
	    if (DECL_RTL_SET_P (parm)
		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_RTL (parm);
		if (REG_P (r))
		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
	      }
	    if (DECL_INCOMING_RTL (parm)
		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_INCOMING_RTL (parm);
		if (REG_P (r))
		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
	      }
	  }

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  rtx_insn *insn;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  return 0;
}
static unsigned int
rest_of_handle_insert_vzeroupper (void)
{
  /* vzeroupper instructions are inserted immediately after reload and
     postreload_cse to clean up after it a little bit to account for possible
     spills from 256bit or 512bit registers.  The pass reuses mode switching
     infrastructure by re-running mode insertion pass, so disable entities
     that have already been processed.  */
  for (int i = 0; i < MAX_386_ENTITIES; i++)
    ix86_optimize_mode_switching[i] = 0;

  ix86_optimize_mode_switching[AVX_U128] = 1;

  /* Call optimize_mode_switching.  */
  g->get_passes ()->execute_pass_mode_switching ();

  /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
     reappear in the IL only at the start of pass_rtl_dse2, which does
     df_note_add_problem (); df_analyze ();
     The vzeroupper is scheduled after postreload_cse pass and mode
     switching computes the notes as well, the problem is that e.g.
     pass_gcse2 doesn't maintain the notes, see PR113059 and
     PR112760.  Remove the notes now to restore status quo ante
     until we figure out how to maintain the notes or what else
     to do.  */
  basic_block bb;
  rtx_insn *insn;
  FOR_EACH_BB_FN (bb, cfun)
    FOR_BB_INSNS (bb, insn)
      if (NONDEBUG_INSN_P (insn))
	{
	  rtx *pnote = &REG_NOTES (insn);
	  while (*pnote)
	    {
	      if (REG_NOTE_KIND (*pnote) == REG_DEAD
		  || REG_NOTE_KIND (*pnote) == REG_UNUSED)
		*pnote = XEXP (*pnote, 1);
	      else
		pnote = &XEXP (*pnote, 1);
	    }
	}

  df_remove_problem (df_note);
  df_analyze ();
  return 0;
}
const pass_data pass_data_insert_vzeroupper =
{
  RTL_PASS, /* type */
  "vzeroupper", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
  pass_insert_vzeroupper(gcc::context *ctxt)
    : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return TARGET_AVX && TARGET_VZEROUPPER;
  }

  unsigned int execute (function *) final override
  {
    return rest_of_handle_insert_vzeroupper ();
  }

}; // class pass_insert_vzeroupper
const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};

class pass_stv : public rtl_opt_pass
{
public:
  pass_stv (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_stv, ctxt),
      timode_p (false)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return ((!timode_p || TARGET_64BIT)
	    && TARGET_STV && TARGET_SSE2 && optimize > 1);
  }

  unsigned int execute (function *) final override
  {
    return convert_scalars_to_vector (timode_p);
  }

  opt_pass *clone () final override
  {
    return new pass_stv (m_ctxt);
  }

  void set_pass_param (unsigned int n, bool param) final override
  {
    gcc_assert (n == 0);
    timode_p = param;
  }

private:
  bool timode_p;
}; // class pass_stv

rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
  return new pass_insert_vzeroupper (ctxt);
}

rtl_opt_pass *
make_pass_stv (gcc::context *ctxt)
{
  return new pass_stv (ctxt);
}
/* Inserting ENDBR and pseudo patchable-area instructions.  */
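/* Illustrative sketch only: with -fcf-protection=branch the entry of any
   function whose address may escape gets an ENDBR marker, e.g.

     foo:
       endbr64
       ...

   so that an indirect call or jump landing anywhere else faults under CET
   indirect-branch tracking.  The precise conditions under which the marker
   is emitted, and how the patchable area interacts with it, are spelled out
   in rest_of_insert_endbr_and_patchable_area below.  */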
static unsigned int
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
					 unsigned int patchable_area_size)
{
  rtx endbr;
  rtx_insn *insn;
  rtx_insn *endbr_insn = NULL;
  basic_block bb;

  if (need_endbr)
    {
      /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
	 is absent among function attributes.  Later an optimization will
	 be introduced to make analysis if an address of a static function
	 is taken.  A static function whose address is not taken will get
	 a nocf_check attribute.  This will allow us to reduce the number
	 of EBs.  */
      if (!lookup_attribute ("nocf_check",
			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
	  && (!flag_manual_endbr
	      || lookup_attribute ("cf_check",
				   DECL_ATTRIBUTES (cfun->decl)))
	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	      || ix86_cmodel == CM_LARGE
	      || ix86_cmodel == CM_LARGE_PIC
	      || flag_force_indirect_call
	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
		  && DECL_DLLIMPORT_P (cfun->decl))))
	{
	  if (crtl->profile && flag_fentry)
	    {
	      /* Queue ENDBR insertion to x86_function_profiler.
		 NB: Any patchable-area insn will be inserted after
		 ENDBR.  */
	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
	    }
	  else
	    {
	      endbr = gen_nop_endbr ();
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      rtx_insn *insn = BB_HEAD (bb);
	      endbr_insn = emit_insn_before (endbr, insn);
	    }
	}
    }

  if (patchable_area_size)
    {
      if (crtl->profile && flag_fentry)
	{
	  /* Queue patchable-area insertion to x86_function_profiler.
	     NB: If there is a queued ENDBR, x86_function_profiler
	     will also handle patchable-area.  */
	  if (!cfun->machine->insn_queued_at_entrance)
	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
	}
      else
	{
	  rtx patchable_area
	    = gen_patchable_area (GEN_INT (patchable_area_size),
				  GEN_INT (crtl->patch_area_entry == 0));
	  if (endbr_insn)
	    emit_insn_after (patchable_area, endbr_insn);
	  else
	    {
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      insn = BB_HEAD (bb);
	      emit_insn_before (patchable_area, insn);
	    }
	}
    }

  if (!need_endbr)
    return 0;

  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for non-tail call which
		     may return via indirect branch.  */
		  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after CALL, which can return more than
		 twice, setjmp-like functions.  */

	      endbr = gen_nop_endbr ();
	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check the jump is a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump find out all places it jumps and insert
		 ENDBRANCH there.  It should be done under a special flag to
		 control ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  rtx_insn *insn;

		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  endbr = gen_nop_endbr ();
		  emit_insn_after (endbr, insn);
		}
	      continue;
	    }

	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      endbr = gen_nop_endbr ();
	      emit_insn_after (endbr, insn);
	      continue;
	    }
	}
    }

  return 0;
}
const pass_data pass_data_insert_endbr_and_patchable_area =
{
  RTL_PASS, /* type.  */
  "endbr_and_patchable_area", /* name.  */
  OPTGROUP_NONE, /* optinfo_flags.  */
  TV_MACH_DEP, /* tv_id.  */
  0, /* properties_required.  */
  0, /* properties_provided.  */
  0, /* properties_destroyed.  */
  0, /* todo_flags_start.  */
  0, /* todo_flags_finish.  */
};

class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
{
public:
  pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
    patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
    return need_endbr || patchable_area_size;
  }

  unsigned int execute (function *) final override
  {
    timevar_push (TV_MACH_DEP);
    rest_of_insert_endbr_and_patchable_area (need_endbr,
					     patchable_area_size);
    timevar_pop (TV_MACH_DEP);
    return 0;
  }

private:
  bool need_endbr;
  unsigned int patchable_area_size;
}; // class pass_insert_endbr_and_patchable_area

rtl_opt_pass *
make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
{
  return new pass_insert_endbr_and_patchable_area (ctxt);
}
static bool
ix86_rpad_gate ()
{
  return (TARGET_AVX
	  && TARGET_SSE_PARTIAL_REG_DEPENDENCY
	  && TARGET_SSE_MATH
	  && optimize
	  && optimize_function_for_speed_p (cfun));
}
/* At entry of the nearest common dominator for basic blocks with
   conversions/rcp/sqrt/rsqrt/round, generate a single
	vxorps %xmmN, %xmmN, %xmmN
   for all
	vcvtss2sd  op, %xmmN, %xmmX
	vcvtsd2ss  op, %xmmN, %xmmX
	vcvtsi2ss  op, %xmmN, %xmmX
	vcvtsi2sd  op, %xmmN, %xmmX

   NB: We want to generate only a single vxorps to cover the whole
   function.  The LCM algorithm isn't appropriate here since it may
   place a vxorps inside the loop.  */
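/* Illustrative sketch only: a scalar conversion such as

     vcvtsi2sd  %edi, %xmm0, %xmm0

   writes just the low element of %xmm0 and therefore carries a false
   dependency on whatever last wrote the register.  Zeroing the register
   once, at a point that dominates every such conversion,

     vxorps  %xmm0, %xmm0, %xmm0
     ...
     vcvtsi2sd  %edi, %xmm0, %xmm0

   breaks that dependency.  The pass below implements this by rewriting each
   conversion into a vec_merge with a shared all-zero pseudo rather than by
   editing assembly directly, so the snippet is only the intended effect.  */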
static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  rtx v4sf_const0 = NULL_RTX;

  auto_vec<rtx_insn *> control_flow_insns;

  /* We create invalid RTL initially so defer rescans.  */
  df_set_flags (DF_DEFER_INSN_RESCAN);

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
	     round, to vec_dup and vec_merge with subreg.  */
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);
	  bool convert_p = false;
	  switch (GET_CODE (src))
	    {
	    case FLOAT:
	    case FLOAT_EXTEND:
	    case FLOAT_TRUNCATE:
	    case UNSIGNED_FLOAT:
	      convert_p = true;
	      break;
	    default:
	      break;
	    }

	  /* Only handle conversion here.  */
	  machine_mode src_mode
	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;

	  switch (src_mode)
	    {
	    case E_SFmode:
	    case E_DFmode:
	      if (TARGET_USE_VECTOR_FP_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_SImode:
	    case E_DImode:
	      if (TARGET_USE_VECTOR_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_VOIDmode:
	      gcc_assert (!convert_p);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  if (!v4sf_const0)
	    v4sf_const0 = gen_reg_rtx (V4SFmode);

	  rtx zero;
	  machine_mode dest_vecmode;
	  switch (dest_mode)
	    {
	    case E_HFmode:
	      dest_vecmode = V8HFmode;
	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
	      break;
	    case E_SFmode:
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	      break;
	    case E_DFmode:
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  /* Change source to vector mode.  */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode.  */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET.  */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);
	  df_insn_rescan (set_insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note.  */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions.  */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      /* Generate a vxorps at entry of the nearest dominator for basic
	 blocks with conversions, which is in the fake loop that
	 contains the whole function, so that there is only a single
	 vxorps in the whole function.  */
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
					     convert_bbs);
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);

      set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));

      insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}
      if (insn == BB_HEAD (bb))
	set_insn = emit_insn_before (set, insn);
      else
	set_insn = emit_insn_after (set,
				    insn ? PREV_INSN (insn) : BB_END (bb));
      df_insn_rescan (set_insn);
      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  df_process_deferred_rescans ();
  df_clear_flags (DF_DEFER_INSN_RESCAN);
  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_remove_partial_avx_dependency : public rtl_opt_pass
{
public:
  pass_remove_partial_avx_dependency (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return ix86_rpad_gate ();
  }

  unsigned int execute (function *) final override
  {
    return remove_partial_avx_dependency ();
  }
}; // class pass_remove_partial_avx_dependency

rtl_opt_pass *
make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
{
  return new pass_remove_partial_avx_dependency (ctxt);
}
/* Convert legacy instructions that clobber EFLAGS to APX_NF
   instructions when there is no flag set between a flag
   producer and user.  */
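/* Illustrative sketch only: APX gives most ALU instructions an {nf}
   ("no flags") form that leaves EFLAGS untouched.  In a sequence such as

     cmp   %rsi, %rdi
     setl  %al
     add   $1, %rbx        # clobbers EFLAGS, but nothing reads them here
     ...next real flag producer...

   the add can become "{nf} add $1, %rbx", so the flags produced by the cmp
   stay live across it.  The pass below approximates this by tracking a
   window that opens at a cstore of the flags and closes at the next
   explicit flag set.  */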
static unsigned int
ix86_apx_nf_convert (void)
{
  timevar_push (TV_MACH_DEP);

  basic_block bb;
  rtx_insn *insn;
  hash_map<rtx_insn *, rtx> converting_map;
  auto_vec<rtx_insn *> current_convert_list;

  bool converting_seq = false;
  rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);

  FOR_EACH_BB_FN (bb, cfun)
    {
      /* Reset conversion for each bb.  */
      converting_seq = false;
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  if (recog_memoized (insn) < 0)
	    continue;

	  /* Convert candidate insns after cstore, which should
	     satisfy the two conditions:
	     1. Is not flag user or producer, only clobbers
	     FLAGS_REG.
	     2. Have corresponding nf pattern.  */

	  rtx pat = PATTERN (insn);
	  rtx set;

	  /* Starting conversion at first cstorecc.  */
	  if (!converting_seq
	      && (set = single_set (insn))
	      && ix86_comparison_operator (SET_SRC (set), VOIDmode)
	      && reg_overlap_mentioned_p (cc, SET_SRC (set))
	      && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
	    {
	      converting_seq = true;
	      current_convert_list.truncate (0);
	    }
	  /* Terminate at the next explicit flag set.  */
	  else if (reg_set_p (cc, pat)
		   && GET_CODE (set_of (cc, pat)) != CLOBBER)
	    converting_seq = false;

	  if (!converting_seq)
	    continue;

	  if (get_attr_has_nf (insn)
	      && GET_CODE (pat) == PARALLEL)
	    {
	      /* Record the insn to candidate map.  */
	      current_convert_list.safe_push (insn);
	      converting_map.put (insn, pat);
	    }
	  /* If the insn clobbers flags but has no nf_attr,
	     revoke all previous candidates.  */
	  else if (!get_attr_has_nf (insn)
		   && reg_set_p (cc, pat)
		   && GET_CODE (set_of (cc, pat)) == CLOBBER)
	    {
	      for (auto item : current_convert_list)
		converting_map.remove (item);
	      converting_seq = false;
	    }
	}
    }

  if (!converting_map.is_empty ())
    {
      for (auto iter = converting_map.begin ();
	   iter != converting_map.end (); ++iter)
	{
	  rtx_insn *replace = (*iter).first;
	  rtx pat = (*iter).second;
	  int i, n = 0, len = XVECLEN (pat, 0);
	  rtx *new_elems = XALLOCAVEC (rtx, len);
	  rtx new_pat;
	  for (i = 0; i < len; i++)
	    {
	      rtx temp = XVECEXP (pat, 0, i);
	      if (! (GET_CODE (temp) == CLOBBER
		     && reg_overlap_mentioned_p (cc,
						 XEXP (temp, 0))))
		{
		  new_elems[n] = temp;
		  n++;
		}
	    }

	  if (n == 1)
	    new_pat = new_elems[0];
	  else
	    new_pat
	      = gen_rtx_PARALLEL (VOIDmode,
				  gen_rtvec_v (n, new_elems));

	  PATTERN (replace) = new_pat;
	  INSN_CODE (replace) = -1;
	  recog_memoized (replace);
	  df_insn_rescan (replace);
	}
    }

  timevar_pop (TV_MACH_DEP);
  return 0;
}
const pass_data pass_data_apx_nf_convert =
{
  RTL_PASS, /* type */
  "apx_nfcvt", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_apx_nf_convert : public rtl_opt_pass
{
public:
  pass_apx_nf_convert (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return (TARGET_APX_NF
	    && optimize
	    && optimize_function_for_speed_p (cfun));
  }

  unsigned int execute (function *) final override
  {
    return ix86_apx_nf_convert ();
  }
}; // class pass_apx_nf_convert

rtl_opt_pass *
make_pass_apx_nf_convert (gcc::context *ctxt)
{
  return new pass_apx_nf_convert (ctxt);
}
/* When a hot loop can fit into one cache line,
   force-align the loop without considering the max skip.  */
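/* Illustrative sketch only: for a small hot loop such as

     .L3:
       addl  (%rdi,%rax,4), %edx
       incq  %rax
       cmpq  %rcx, %rax
       jne   .L3

   whose encoded size is below the cache-line (prefetch block) size, the
   pass below emits an alignment directive with a max skip of zero before
   the loop label (roughly a ".p2align" to the next power of two of the
   loop size), so the whole body lands in a single line of the instruction
   cache / decoded icache.  The exact directive depends on the assembler
   and on ASM_OUTPUT_MAX_SKIP_ALIGN.  */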
static void
ix86_align_loops ()
{
  basic_block bb;

  /* Don't do this when we don't know cache line size.  */
  if (ix86_cost->prefetch_block == 0)
    return;

  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *label = BB_HEAD (bb);
      bool has_fallthru = 0;
      edge e;
      edge_iterator ei;

      if (!LABEL_P (label))
	continue;

      profile_count fallthru_count = profile_count::zero ();
      profile_count branch_count = profile_count::zero ();

      FOR_EACH_EDGE (e, ei, bb->preds)
	{
	  if (e->flags & EDGE_FALLTHRU)
	    has_fallthru = 1, fallthru_count += e->count ();
	  else
	    branch_count += e->count ();
	}

      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
	continue;

      if (bb->loop_father
	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
	  && (has_fallthru
	      ? (!(single_succ_p (bb)
		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
		 && optimize_bb_for_speed_p (bb)
		 && branch_count + fallthru_count > count_threshold
		 && (branch_count > fallthru_count * param_align_loop_iterations))
	      /* In case there's no fallthru for the loop.
		 Nops inserted won't be executed.  */
	      : (branch_count > count_threshold
		 || (bb->count > bb->prev_bb->count * 10
		     && (bb->prev_bb->count
			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
	{
	  rtx_insn *insn, *end_insn;
	  HOST_WIDE_INT size = 0;
	  bool padding_p = true;
	  basic_block tbb = bb;
	  unsigned cond_branch_num = 0;
	  bool detect_tight_loop_p = false;

	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
	       i++, tbb = tbb->next_bb)
	    {
	      /* Only handle continuous cfg layout.  */
	      if (bb->loop_father != tbb->loop_father)
		{
		  padding_p = false;
		  break;
		}

	      FOR_BB_INSNS (tbb, insn)
		{
		  if (!NONDEBUG_INSN_P (insn))
		    continue;
		  size += ix86_min_insn_size (insn);

		  /* We don't know size of inline asm.
		     Don't align loop for call.  */
		  if (asm_noperands (PATTERN (insn)) >= 0
		      || CALL_P (insn))
		    {
		      size = -1;
		      break;
		    }
		}

	      if (size == -1 || size > ix86_cost->prefetch_block)
		{
		  padding_p = false;
		  break;
		}

	      FOR_EACH_EDGE (e, ei, tbb->succs)
		{
		  /* It could be part of the loop.  */
		  if (e->dest == bb)
		    {
		      detect_tight_loop_p = true;
		      break;
		    }
		}

	      if (detect_tight_loop_p)
		break;

	      end_insn = BB_END (tbb);
	      if (JUMP_P (end_insn))
		{
		  /* For decoded icache:
		     1. Up to two branches are allowed per Way.
		     2. A non-conditional branch is the last micro-op in a Way.
		  */
		  if (onlyjump_p (end_insn)
		      && (any_uncondjump_p (end_insn)
			  || single_succ_p (tbb)))
		    {
		      padding_p = false;
		      break;
		    }
		  else if (++cond_branch_num >= 2)
		    {
		      padding_p = false;
		      break;
		    }
		}
	    }

	  if (padding_p && detect_tight_loop_p)
	    {
	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
						    GEN_INT (0)), label);
	      /* End of function.  */
	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
		break;
	      /* Skip bb which already fits into one cacheline.  */
	      bb = tbb;
	    }
	}
    }

  loop_optimizer_finalize ();
  free_dominance_info (CDI_DOMINATORS);
}
const pass_data pass_data_align_tight_loops =
{
  RTL_PASS, /* type */
  "align_tight_loops", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_align_tight_loops : public rtl_opt_pass
{
public:
  pass_align_tight_loops (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return optimize && optimize_function_for_speed_p (cfun);
  }

  unsigned int execute (function *) final override
  {
    timevar_push (TV_MACH_DEP);
#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
    ix86_align_loops ();
#endif
    timevar_pop (TV_MACH_DEP);
    return 0;
  }
}; // class pass_align_tight_loops

rtl_opt_pass *
make_pass_align_tight_loops (gcc::context *ctxt)
{
  return new pass_align_tight_loops (ctxt);
}
/* This compares the priority of target features in function DECL1
   and DECL2.  It returns positive value if DECL1 is higher priority,
   negative value if DECL2 is higher priority and 0 if they are the
   same.  */

int
ix86_compare_version_priority (tree decl1, tree decl2)
{
  unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
  unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);

  return (int)priority1 - (int)priority2;
}
/* V1 and V2 point to function versions with different priorities
   based on the target ISA.  This function compares their priorities.  */

static int
feature_compare (const void *v1, const void *v2)
{
  typedef struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    } function_version_info;

  const function_version_info c1 = *(const function_version_info *)v1;
  const function_version_info c2 = *(const function_version_info *)v2;
  return (c2.dispatch_priority - c1.dispatch_priority);
}
/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
   to return a pointer to VERSION_DECL if the outcome of the expression
   formed by PREDICATE_CHAIN is true.  This function will be called during
   version dispatch to decide which function version to execute.  It returns
   the basic block at the end, to which more conditions can be added.  */
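/* Illustrative sketch only: for a version guarded by, say,
   __builtin_cpu_supports ("avx2"), the block built below behaves like

     cond = __builtin_cpu_supports ("avx2");
     if (cond > 0)
       return (void *) &foo.avx2;
     // otherwise fall through to the next version's condition

   where "foo.avx2" is a hypothetical versioned symbol.  When
   PREDICATE_CHAIN holds several predicates they are combined with MIN_EXPR,
   so the pointer is returned only if every predicate is nonzero.  */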
static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     tree predicate_chain, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *call_cond_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  tree cond_var, and_expr_var = NULL_TREE;
  gimple_seq gseq;

  tree predicate_decl, predicate_arg;

  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);

  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
			 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  if (predicate_chain == NULL_TREE)
    {
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  while (predicate_chain != NULL)
    {
      cond_var = create_tmp_var (integer_type_node);
      predicate_decl = TREE_PURPOSE (predicate_chain);
      predicate_arg = TREE_VALUE (predicate_chain);
      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
      gimple_set_bb (call_cond_stmt, new_bb);
      gimple_seq_add_stmt (&gseq, call_cond_stmt);

      predicate_chain = TREE_CHAIN (predicate_chain);

      if (and_expr_var == NULL)
	and_expr_var = cond_var;
      else
	{
	  gimple *assign_stmt;
	  /* Use MIN_EXPR to check if any integer is zero.
	     and_expr_var = min_expr <cond_var, and_expr_var>  */
	  assign_stmt = gimple_build_assign (and_expr_var,
					     build2 (MIN_EXPR, integer_type_node,
						     cond_var, and_expr_var));

	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
	  gimple_set_bb (assign_stmt, new_bb);
	  gimple_seq_add_stmt (&gseq, assign_stmt);
	}
    }

  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
				    integer_zero_node,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  bb3 = e23->dest;
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}
/* This function generates the dispatch function for
   multi-versioned functions.  DISPATCH_DECL is the function which will
   contain the dispatch logic.  FNDECLS are the function choices for
   dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
   in DISPATCH_DECL in which the dispatch code is generated.  */
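/* Illustrative sketch only: for hypothetical versions foo (default),
   foo.sse4_2 and foo.avx2, the resolver body produced here is roughly

     __builtin_cpu_init ();
     if (__builtin_cpu_supports ("avx2"))    return &foo.avx2;
     if (__builtin_cpu_supports ("sse4.2"))  return &foo.sse4_2;
     return &foo;   // default version last

   with the non-default versions tried in decreasing dispatch priority as
   established by feature_compare above.  */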
static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  tree default_decl;
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  int ix;
  tree ele;
  vec<tree> *fndecls;
  unsigned int num_versions = 0;
  unsigned int actual_versions = 0;
  unsigned int i;

  struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    }*function_version_info;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  /* fndecls_p is actually a vector.  */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default.  */
  num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  function_version_info = (struct _function_version_info *)
    XNEWVEC (struct _function_version_info, (num_versions - 1));

  /* The first version in the vector is the default decl.  */
  default_decl = (*fndecls)[0];

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();

  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
    {
      tree version_decl = ele;
      tree predicate_chain = NULL_TREE;
      unsigned int priority;
      /* Get attribute string, parse it and find the right predicate decl.
	 The predicate function could be a lengthy combination of many
	 features, like arch-type and various isa-variants.  */
      priority = get_builtin_code_for_version (version_decl,
					       &predicate_chain);

      if (predicate_chain == NULL_TREE)
	continue;

      function_version_info [actual_versions].version_decl = version_decl;
      function_version_info [actual_versions].predicate_chain
	= predicate_chain;
      function_version_info [actual_versions].dispatch_priority = priority;
      actual_versions++;
    }

  /* Sort the versions according to descending order of dispatch priority.  The
     priority is based on the ISA.  This is not a perfect solution.  There
     could still be ambiguity.  If more than one function version is suitable
     to execute, which one should be dispatched?  In future, allow the user
     to specify a dispatch priority next to the version.  */
  qsort (function_version_info, actual_versions,
	 sizeof (struct _function_version_info), feature_compare);

  for (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* dispatch default version at the end.  */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);

  free (function_version_info);
  return 0;
}
/* This function changes the assembler name for functions that are
   versions.  If DECL is a function version and has a "target"
   attribute, it appends the attribute string to its assembler name.  */
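/* Illustrative sketch only: a version declared as

     __attribute__ ((target ("arch=skylake")))
     int foo (void);

   keeps "foo" as the assembler name of the default version, while this
   specialized version gets a suffixed name along the lines of
   "foo.arch_skylake"; the exact suffix is whatever sorted_attr_string
   produces from the attribute string.  */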
static tree
ix86_mangle_function_version_assembler_name (tree decl, tree id)
{
  tree version_attr;
  const char *orig_name, *version_string;
  char *attr_str, *assembler_name;

  if (DECL_DECLARED_INLINE_P (decl)
      && lookup_attribute ("gnu_inline",
			   DECL_ATTRIBUTES (decl)))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "function versions cannot be marked as %<gnu_inline%>,"
	      " bodies have to be generated");

  if (DECL_VIRTUAL_P (decl)
      || DECL_VINDEX (decl))
    sorry ("virtual function multiversioning not supported");

  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));

  /* target attribute string cannot be NULL.  */
  gcc_assert (version_attr != NULL_TREE);

  orig_name = IDENTIFIER_POINTER (id);
  version_string
    = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));

  if (strcmp (version_string, "default") == 0)
    return id;

  attr_str = sorted_attr_string (TREE_VALUE (version_attr));
  assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);

  sprintf (assembler_name, "%s.%s", orig_name, attr_str);

  /* Allow assembler name to be modified if already set.  */
  if (DECL_ASSEMBLER_NAME_SET_P (decl))
    SET_DECL_RTL (decl, NULL);

  tree ret = get_identifier (assembler_name);
  XDELETEVEC (attr_str);
  XDELETEVEC (assembler_name);
  return ret;
}

tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
  /* For function version, add the target suffix to the assembler name.  */
  if (TREE_CODE (decl) == FUNCTION_DECL
      && DECL_FUNCTION_VERSIONED (decl))
    id = ix86_mangle_function_version_assembler_name (decl, id);
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif

  return id;
}
/* Make a dispatcher declaration for the multi-versioned function DECL.
   Calls to DECL function will be replaced with calls to the dispatcher
   by the front-end.  Returns the decl of the dispatcher function.  */
tree
ix86_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;
  struct cgraph_function_version_info *first_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* Find the default version and make it the first node.  */
  first_v = node_v;
  /* Go to the beginning of the chain.  */
  while (first_v->prev != NULL)
    first_v = first_v->prev;
  default_version_info = first_v;
  while (default_version_info != NULL)
    {
      if (is_function_default_version
	    (default_version_info->this_node->decl))
	break;
      default_version_info = default_version_info->next;
    }

  /* If there is no default node, just return NULL.  */
  if (default_version_info == NULL)
    return NULL;

  /* Make default info the first node.  */
  if (first_v != default_version_info)
    {
      default_version_info->prev->next = default_version_info->next;
      if (default_version_info->next)
	default_version_info->next->prev = default_version_info->prev;
      first_v->prev = default_version_info;
      default_version_info->next = first_v;
      default_version_info->prev = NULL;
    }

  default_node = default_version_info->this_node;

#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;
      struct cgraph_node *dispatcher_node = NULL;
      struct cgraph_function_version_info *dispatcher_version_info = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);
      TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);

      dispatcher_node = cgraph_node::get_create (dispatch_decl);
      gcc_assert (dispatcher_node != NULL);
      dispatcher_node->dispatcher_function = 1;
      dispatcher_version_info
	= dispatcher_node->insert_new_function_version ();
      dispatcher_version_info->next = default_version_info;
      dispatcher_node->definition = 1;

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
#endif
    {
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  return dispatch_decl;
}
/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */
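/* Illustrative sketch only: for a versioned function "foo", the symbols
   typically end up arranged as

     foo.resolver:   # built here; body filled in by dispatch_function_versions
       ...
       ret           # returns the address of the chosen version
     foo:            # IFUNC alias (.type foo, @gnu_indirect_function)
                     # resolved at load time by calling foo.resolver

   The ".resolver" suffix comes from clone_function_name below; the exact
   assembly depends on the target, assembler and dynamic linker.  */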
static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* Create resolver function name based on default_decl.  */
  tree decl_name = clone_function_name (default_decl, "resolver");
  const char *resolver_name = IDENTIFIER_POINTER (decl_name);

  /* The resolver function should return a (void *).  */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  decl = build_fn_decl (resolver_name, type);
  SET_DECL_ASSEMBLER_NAME (decl, decl_name);

  DECL_NAME (decl) = decl_name;
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl.  */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", resolver_name,
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
/* Generate the dispatching code body to dispatch multi-versioned function
   DECL.  The target hook is called to process the "target" attributes and
   provide the code to dispatch the right function at run-time.  NODE points
   to the dispatcher decl whose body will be created.  */
void *
ix86_generate_version_dispatcher_body (void *node_p)
{
  tree resolver_decl;
  basic_block empty_bb;
  tree default_ver_decl;
  struct cgraph_node *versn;
  struct cgraph_node *node;

  struct cgraph_function_version_info *node_version_info = NULL;
  struct cgraph_function_version_info *versn_info = NULL;

  node = (cgraph_node *)node_p;

  node_version_info = node->function_version ();
  gcc_assert (node->dispatcher_function
	      && node_version_info != NULL);

  if (node_version_info->dispatcher_resolver)
    return node_version_info->dispatcher_resolver;

  /* The first version in the chain corresponds to the default version.  */
  default_ver_decl = node_version_info->next->this_node->decl;

  /* node is going to be an alias, so remove the finalized bit.  */
  node->definition = false;

  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);

  node_version_info->dispatcher_resolver = resolver_decl;

  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));

  auto_vec<tree, 2> fn_ver_vec;

  for (versn_info = node_version_info->next; versn_info;
       versn_info = versn_info->next)
    {
      versn = versn_info->this_node;
      /* Check for virtual functions here again, as by this time it should
	 have been determined if this function needs a vtable index or
	 not.  This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
      if (DECL_VINDEX (versn->decl))
	sorry ("virtual function multiversioning not supported");

      fn_ver_vec.safe_push (versn->decl);
    }

  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
  cgraph_edge::rebuild_edges ();
  pop_cfun ();
  return resolver_decl;
}