1 // z_Linux_asm.S: - microtasking routines specifically
2 // written for Intel platforms running Linux* OS
5 ////===----------------------------------------------------------------------===//
7 //// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 //// See https://llvm.org/LICENSE.txt for license information.
9 //// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11 ////===----------------------------------------------------------------------===//
14 // -----------------------------------------------------------------------
16 // -----------------------------------------------------------------------
18 #include "kmp_config.h"
20 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
23 // the 'delay r16/r32/r64' should be used instead of the 'pause'.
24 // The delay operation has the effect of removing the current thread from
25 // the round-robin HT mechanism, and therefore speeds up the issue rate of
26 // the other threads on the same core.
28 // A value of 0 works fine for <= 2 threads per core, but causes the EPCC
29 // barrier time to increase greatly for 3 or more threads per core.
31 // A value of 100 works pretty well for up to 4 threads per core, but isn't
32 // quite as fast as 0 for 2 threads per core.
34 // We need to check what happens for oversubscription / > 4 threads per core.
35 // It is possible that we need to pass the delay value in as a parameter
36 // that the caller determines based on the total # threads / # cores.
43 # define pause_op .byte 0xf3,0x90
47 # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
48 # define KMP_LABEL(x) L_##x // form the name of label
49 .macro KMP_CFI_DEF_OFFSET
53 .macro KMP_CFI_REGISTER
61 /* Not sure what .size does in icc, not sure if we need to do something
67 .globl KMP_PREFIX_UNDERSCORE($0)
68 KMP_PREFIX_UNDERSCORE($0):
70 # else // !KMP_OS_DARWIN
71 # define KMP_PREFIX_UNDERSCORE(x) x //no extra underscore for Linux* OS symbols
72 // Format labels so that they don't override function names in gdb's backtraces
73 // MIC assembler doesn't accept .L syntax, the L works fine there (as well as
76 # define KMP_LABEL(x) L_##x // local label
78 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
83 .macro DEBUG_INFO proc
85 // Not sure why we need .type and .size for the functions
92 .globl KMP_PREFIX_UNDERSCORE(\proc)
93 KMP_PREFIX_UNDERSCORE(\proc):
96 .macro KMP_CFI_DEF_OFFSET sz
97 .cfi_def_cfa_offset \sz
99 .macro KMP_CFI_OFFSET reg, sz
102 .macro KMP_CFI_REGISTER reg
103 .cfi_def_cfa_register \reg
105 .macro KMP_CFI_DEF reg, sz
106 .cfi_def_cfa \reg,\sz
108 # endif // KMP_OS_DARWIN
109 #endif // KMP_ARCH_X86 || KMP_ARCH_X86_64
111 #if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
114 # define KMP_PREFIX_UNDERSCORE(x) _##x // extra underscore for OS X* symbols
115 # define KMP_LABEL(x) L_##x // form the name of label
122 /* Not sure what .size does in icc, not sure if we need to do something
129 .globl KMP_PREFIX_UNDERSCORE($0)
130 KMP_PREFIX_UNDERSCORE($0):
132 # elif KMP_OS_WINDOWS
133 # define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Windows/ARM64 symbols
134 // Format labels so that they don't override function names in gdb's backtraces
135 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
141 .macro DEBUG_INFO proc
147 .globl KMP_PREFIX_UNDERSCORE(\proc)
148 KMP_PREFIX_UNDERSCORE(\proc):
150 # else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
151 # define KMP_PREFIX_UNDERSCORE(x) x // no extra underscore for Linux* OS symbols
152 // Format labels so that they don't override function names in gdb's backtraces
153 # define KMP_LABEL(x) .L_##x // local label hidden from backtraces
159 .macro DEBUG_INFO proc
161 // Not sure why we need .type and .size for the functions
164 .type \proc,%function
166 .type \proc,@function
173 .globl KMP_PREFIX_UNDERSCORE(\proc)
174 KMP_PREFIX_UNDERSCORE(\proc):
177 # endif // KMP_OS_DARWIN
179 #endif // (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && (KMP_ARCH_AARCH64 || KMP_ARCH_ARM)
181 .macro COMMON name, size, align_power
185 .comm \name, \size, \align_power
186 #else // !KMP_OS_DARWIN && !KMP_OS_WINDOWS
187 .comm \name, \size, (1<<(\align_power))
191 // -----------------------------------------------------------------------
193 // -----------------------------------------------------------------------
195 #ifdef KMP_GOMP_COMPAT
197 // Support for unnamed common blocks.
199 // Because the symbol ".gomp_critical_user_" contains a ".", we have to
200 // put this stuff in assembly.
205 .comm .gomp_critical_user_,32
207 .globl ___kmp_unnamed_critical_addr
208 ___kmp_unnamed_critical_addr:
209 .long .gomp_critical_user_
210 # else /* Linux* OS */
212 .comm .gomp_critical_user_,32,8
215 .global __kmp_unnamed_critical_addr
216 __kmp_unnamed_critical_addr:
217 .4byte .gomp_critical_user_
218 .type __kmp_unnamed_critical_addr,@object
219 .size __kmp_unnamed_critical_addr,4
220 # endif /* KMP_OS_DARWIN */
221 # endif /* KMP_ARCH_X86 */
226 .comm .gomp_critical_user_,32
228 .globl ___kmp_unnamed_critical_addr
229 ___kmp_unnamed_critical_addr:
230 .quad .gomp_critical_user_
231 # else /* Linux* OS */
233 .comm .gomp_critical_user_,32,8
236 .global __kmp_unnamed_critical_addr
237 __kmp_unnamed_critical_addr:
238 .8byte .gomp_critical_user_
239 .type __kmp_unnamed_critical_addr,@object
240 .size __kmp_unnamed_critical_addr,8
241 # endif /* KMP_OS_DARWIN */
242 # endif /* KMP_ARCH_X86_64 */
244 #endif /* KMP_GOMP_COMPAT */
247 #if KMP_ARCH_X86 && !KMP_ARCH_PPC64
249 // -----------------------------------------------------------------------
250 // microtasking routines specifically written for IA-32 architecture
252 // -----------------------------------------------------------------------
254 .ident "Intel Corporation"
258 // __kmp_x86_pause( void );
266 DEBUG_INFO __kmp_x86_pause
268 # if !KMP_ASM_INTRINS
270 //------------------------------------------------------------------------
272 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
274 PROC __kmp_test_then_add32
282 DEBUG_INFO __kmp_test_then_add32
284 //------------------------------------------------------------------------
285 // FUNCTION __kmp_xchg_fixed8
288 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
295 PROC __kmp_xchg_fixed8
297 movl 4(%esp), %ecx // "p"
298 movb 8(%esp), %al // "d"
304 DEBUG_INFO __kmp_xchg_fixed8
307 //------------------------------------------------------------------------
308 // FUNCTION __kmp_xchg_fixed16
311 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
317 PROC __kmp_xchg_fixed16
319 movl 4(%esp), %ecx // "p"
320 movw 8(%esp), %ax // "d"
326 DEBUG_INFO __kmp_xchg_fixed16
329 //------------------------------------------------------------------------
330 // FUNCTION __kmp_xchg_fixed32
333 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
340 PROC __kmp_xchg_fixed32
342 movl 4(%esp), %ecx // "p"
343 movl 8(%esp), %eax // "d"
349 DEBUG_INFO __kmp_xchg_fixed32
353 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
354 PROC __kmp_compare_and_store8
361 sete %al // if %al == (%ecx) set %al = 1 else set %al = 0
362 and $1, %eax // zero extend the sete result in %al to %eax
365 DEBUG_INFO __kmp_compare_and_store8
368 // __kmp_compare_and_store16(volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv);
369 PROC __kmp_compare_and_store16
376 sete %al // if %ax == (%ecx) set %al = 1 else set %al = 0
377 and $1, %eax // zero extend the sete result in %al to %eax
380 DEBUG_INFO __kmp_compare_and_store16
383 // __kmp_compare_and_store32(volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv);
384 PROC __kmp_compare_and_store32
391 sete %al // if %eax == (%ecx) set %al = 1 else set %al = 0
392 and $1, %eax // zero extend the sete result in %al to %eax
395 DEBUG_INFO __kmp_compare_and_store32
398 // __kmp_compare_and_store64(volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 s );
399 PROC __kmp_compare_and_store64
406 movl 12(%ebp), %eax // "cv" low order word
407 movl 16(%ebp), %edx // "cv" high order word
408 movl 20(%ebp), %ebx // "sv" low order word
409 movl 24(%ebp), %ecx // "sv" high order word
412 sete %al // if %edx:eax == (%edi) set %al = 1 else set %al = 0
413 and $1, %eax // zero extend the sete result in %al to %eax
420 DEBUG_INFO __kmp_compare_and_store64
423 // __kmp_compare_and_store_ret8(volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv);
424 PROC __kmp_compare_and_store_ret8
433 DEBUG_INFO __kmp_compare_and_store_ret8
436 // __kmp_compare_and_store_ret16(volatile kmp_int16 *p, kmp_int16 cv,
438 PROC __kmp_compare_and_store_ret16
447 DEBUG_INFO __kmp_compare_and_store_ret16
450 // __kmp_compare_and_store_ret32(volatile kmp_int32 *p, kmp_int32 cv,
452 PROC __kmp_compare_and_store_ret32
461 DEBUG_INFO __kmp_compare_and_store_ret32
464 // __kmp_compare_and_store_ret64(volatile kmp_int64 *p, kmp_int64 cv,
466 PROC __kmp_compare_and_store_ret64
473 movl 12(%ebp), %eax // "cv" low order word
474 movl 16(%ebp), %edx // "cv" high order word
475 movl 20(%ebp), %ebx // "sv" low order word
476 movl 24(%ebp), %ecx // "sv" high order word
485 DEBUG_INFO __kmp_compare_and_store_ret64
488 //------------------------------------------------------------------------
489 // FUNCTION __kmp_xchg_real32
492 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
499 PROC __kmp_xchg_real32
525 DEBUG_INFO __kmp_xchg_real32
527 # endif /* !KMP_ASM_INTRINS */
529 //------------------------------------------------------------------------
531 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
532 // int gtid, int tid,
533 // int argc, void *p_argv[]
536 // void **exit_frame_ptr
540 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
543 // (*pkfn)( & gtid, & tid, argv[0], ... );
547 // -- Begin __kmp_invoke_microtask
549 PROC __kmp_invoke_microtask
553 KMP_CFI_OFFSET ebp,-8
554 movl %esp,%ebp // establish the base pointer for this routine.
556 subl $8,%esp // allocate space for two local variables.
557 // These variables are:
561 pushl %ebx // save %ebx to use during this routine
564 movl 28(%ebp),%ebx // get exit_frame address
565 movl %ebp,(%ebx) // save exit_frame
568 movl 20(%ebp),%ebx // Stack alignment - # args
569 addl $2,%ebx // #args +2 Always pass at least 2 args (gtid and tid)
570 shll $2,%ebx // Number of bytes used on stack: (#args+2)*4
572 subl %ebx,%eax // %esp-((#args+2)*4) -> %eax -- without mods, stack ptr would be this
573 movl %eax,%ebx // Save to %ebx
574 andl $0xFFFFFF80,%eax // mask off 7 bits
575 subl %eax,%ebx // Amount to subtract from %esp
576 subl %ebx,%esp // Prepare the stack ptr --
577 // now it will be aligned on 128-byte boundary at the call
579 movl 24(%ebp),%eax // copy from p_argv[]
580 movl %eax,-4(%ebp) // into the local variable *argv.
582 movl 20(%ebp),%ebx // argc is 20(%ebp)
587 jg KMP_LABEL(invoke_4)
588 jmp KMP_LABEL(invoke_3)
592 subl $4,%ebx // decrement argc.
593 addl %ebx,%eax // index into argv.
597 jmp KMP_LABEL(invoke_2)
600 leal 16(%ebp),%eax // push & tid
603 leal 12(%ebp),%eax // push & gtid
607 call *%ebx // call (*pkfn)();
609 movl $1,%eax // return 1;
611 movl -12(%ebp),%ebx // restore %ebx
616 DEBUG_INFO __kmp_invoke_microtask
617 // -- End __kmp_invoke_microtask
621 // __kmp_hardware_timestamp(void)
622 PROC __kmp_hardware_timestamp
626 DEBUG_INFO __kmp_hardware_timestamp
627 // -- End __kmp_hardware_timestamp
629 #endif /* KMP_ARCH_X86 */
634 // -----------------------------------------------------------------------
635 // microtasking routines specifically written for IA-32 architecture and
636 // Intel(R) 64 running Linux* OS
637 // -----------------------------------------------------------------------
640 // mark_description "Intel Corporation";
641 .ident "Intel Corporation"
642 // -- .file "z_Linux_asm.S"
646 // To prevent our code from ending up in the .data section, .text is added to
647 // every routine definition for x86_64.
648 //------------------------------------------------------------------------
649 # if !KMP_ASM_INTRINS
651 //------------------------------------------------------------------------
652 // FUNCTION __kmp_test_then_add32
655 // __kmp_test_then_add32( volatile kmp_int32 *p, kmp_int32 d );
663 PROC __kmp_test_then_add32
665 movl %esi, %eax // "d"
670 DEBUG_INFO __kmp_test_then_add32
673 //------------------------------------------------------------------------
674 // FUNCTION __kmp_test_then_add64
677 // __kmp_test_then_add64( volatile kmp_int64 *p, kmp_int64 d );
684 PROC __kmp_test_then_add64
686 movq %rsi, %rax // "d"
691 DEBUG_INFO __kmp_test_then_add64
694 //------------------------------------------------------------------------
695 // FUNCTION __kmp_xchg_fixed8
698 // __kmp_xchg_fixed8( volatile kmp_int8 *p, kmp_int8 d );
706 PROC __kmp_xchg_fixed8
708 movb %sil, %al // "d"
714 DEBUG_INFO __kmp_xchg_fixed8
717 //------------------------------------------------------------------------
718 // FUNCTION __kmp_xchg_fixed16
721 // __kmp_xchg_fixed16( volatile kmp_int16 *p, kmp_int16 d );
728 PROC __kmp_xchg_fixed16
736 DEBUG_INFO __kmp_xchg_fixed16
739 //------------------------------------------------------------------------
740 // FUNCTION __kmp_xchg_fixed32
743 // __kmp_xchg_fixed32( volatile kmp_int32 *p, kmp_int32 d );
751 PROC __kmp_xchg_fixed32
753 movl %esi, %eax // "d"
759 DEBUG_INFO __kmp_xchg_fixed32
762 //------------------------------------------------------------------------
763 // FUNCTION __kmp_xchg_fixed64
766 // __kmp_xchg_fixed64( volatile kmp_int64 *p, kmp_int64 d );
773 PROC __kmp_xchg_fixed64
775 movq %rsi, %rax // "d"
781 DEBUG_INFO __kmp_xchg_fixed64
784 //------------------------------------------------------------------------
785 // FUNCTION __kmp_compare_and_store8
788 // __kmp_compare_and_store8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
797 PROC __kmp_compare_and_store8
799 movb %sil, %al // "cv"
802 sete %al // if %al == (%rdi) set %al = 1 else set %al = 0
803 andq $1, %rax // zero extend the sete result in %al to %rax for return value
806 DEBUG_INFO __kmp_compare_and_store8
809 //------------------------------------------------------------------------
810 // FUNCTION __kmp_compare_and_store16
813 // __kmp_compare_and_store16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
822 PROC __kmp_compare_and_store16
824 movw %si, %ax // "cv"
827 sete %al // if %ax == (%rdi) set %al = 1 else set %al = 0
828 andq $1, %rax // zero extend the sete result in %al to %rax for return value
831 DEBUG_INFO __kmp_compare_and_store16
834 //------------------------------------------------------------------------
835 // FUNCTION __kmp_compare_and_store32
838 // __kmp_compare_and_store32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
847 PROC __kmp_compare_and_store32
849 movl %esi, %eax // "cv"
852 sete %al // if %eax == (%rdi) set %al = 1 else set %al = 0
853 andq $1, %rax // zero extend the sete result in %al to %rax for return value
856 DEBUG_INFO __kmp_compare_and_store32
859 //------------------------------------------------------------------------
860 // FUNCTION __kmp_compare_and_store64
863 // __kmp_compare_and_store64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
871 PROC __kmp_compare_and_store64
873 movq %rsi, %rax // "cv"
876 sete %al // if %rax == (%rdi) set %al = 1 else set %al = 0
877 andq $1, %rax // zero extend the sete result in %al to %rax for return value
880 DEBUG_INFO __kmp_compare_and_store64
882 //------------------------------------------------------------------------
883 // FUNCTION __kmp_compare_and_store_ret8
886 // __kmp_compare_and_store_ret8( volatile kmp_int8 *p, kmp_int8 cv, kmp_int8 sv );
895 PROC __kmp_compare_and_store_ret8
897 movb %sil, %al // "cv"
902 DEBUG_INFO __kmp_compare_and_store_ret8
905 //------------------------------------------------------------------------
906 // FUNCTION __kmp_compare_and_store_ret16
909 // __kmp_compare_and_store_ret16( volatile kmp_int16 *p, kmp_int16 cv, kmp_int16 sv );
918 PROC __kmp_compare_and_store_ret16
920 movw %si, %ax // "cv"
925 DEBUG_INFO __kmp_compare_and_store_ret16
928 //------------------------------------------------------------------------
929 // FUNCTION __kmp_compare_and_store_ret32
932 // __kmp_compare_and_store_ret32( volatile kmp_int32 *p, kmp_int32 cv, kmp_int32 sv );
941 PROC __kmp_compare_and_store_ret32
943 movl %esi, %eax // "cv"
948 DEBUG_INFO __kmp_compare_and_store_ret32
951 //------------------------------------------------------------------------
952 // FUNCTION __kmp_compare_and_store_ret64
955 // __kmp_compare_and_store_ret64( volatile kmp_int64 *p, kmp_int64 cv, kmp_int64 sv );
963 PROC __kmp_compare_and_store_ret64
965 movq %rsi, %rax // "cv"
970 DEBUG_INFO __kmp_compare_and_store_ret64
972 # endif /* !KMP_ASM_INTRINS */
977 # if !KMP_ASM_INTRINS
979 //------------------------------------------------------------------------
980 // FUNCTION __kmp_xchg_real32
983 // __kmp_xchg_real32( volatile kmp_real32 *addr, kmp_real32 data );
987 // data: %xmm0 (lower 4 bytes)
989 // return: %xmm0 (lower 4 bytes)
991 PROC __kmp_xchg_real32
993 movd %xmm0, %eax // load "data" to eax
998 movd %eax, %xmm0 // load old value into return register
1002 DEBUG_INFO __kmp_xchg_real32
1005 //------------------------------------------------------------------------
1006 // FUNCTION __kmp_xchg_real64
1009 // __kmp_xchg_real64( volatile kmp_real64 *addr, kmp_real64 data );
1013 // data: %xmm0 (lower 8 bytes)
1014 // return: %xmm0 (lower 8 bytes)
1016 PROC __kmp_xchg_real64
1018 movd %xmm0, %rax // load "data" to rax
1023 movd %rax, %xmm0 // load old value into return register
1026 DEBUG_INFO __kmp_xchg_real64
1029 # endif /* !KMP_MIC */
1031 # endif /* !KMP_ASM_INTRINS */
1033 //------------------------------------------------------------------------
1035 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1036 // int gtid, int tid,
1037 // int argc, void *p_argv[]
1040 // void **exit_frame_ptr
1044 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1047 // (*pkfn)( & gtid, & tid, argv[0], ... );
1051 // note: at call to pkfn must have %rsp 128-byte aligned for compiler
1062 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1063 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1066 // %rax: used all over the place
1067 // %rdx: used in stack pointer alignment calculation
1068 // %r11: used to traverse p_argv array
1069 // %rsi: used as temporary for stack parameters
1070 // used as temporary for number of pkfn parms to push
1071 // %rbx: used to hold pkfn address, and zero constant, callee-save
1073 // return: %eax (always 1/TRUE)
1077 // -- Begin __kmp_invoke_microtask
1080 PROC __kmp_invoke_microtask
1082 pushq %rbp // save base pointer
1083 KMP_CFI_DEF_OFFSET 16
1084 KMP_CFI_OFFSET rbp,-16
1085 movq %rsp,%rbp // establish the base pointer for this routine.
1086 KMP_CFI_REGISTER rbp
1089 movq %rbp, (%r9) // save exit_frame
1092 pushq %rbx // %rbx is callee-saved register
1093 pushq %rsi // Put gtid on stack so can pass &gtid to pkfn
1094 pushq %rdx // Put tid on stack so can pass &tid to pkfn
1096 movq %rcx, %rax // Stack alignment calculation begins; argc -> %rax
1097 movq $0, %rbx // constant for cmovs later
1098 subq $4, %rax // subtract four args passed in registers to pkfn
1100 js KMP_LABEL(kmp_0) // jump to movq
1101 jmp KMP_LABEL(kmp_0_exit) // jump ahead
1103 movq %rbx, %rax // zero negative value in %rax <- max(0, argc-4)
1104 KMP_LABEL(kmp_0_exit):
1106 cmovsq %rbx, %rax // zero negative value in %rax <- max(0, argc-4)
1109 movq %rax, %rsi // save max(0, argc-4) -> %rsi for later
1110 shlq $3, %rax // Number of bytes used on stack: max(0, argc-4)*8
1113 subq %rax, %rdx // %rsp-(max(0,argc-4)*8) -> %rdx --
1114 // without align, stack ptr would be this
1115 movq %rdx, %rax // Save to %rax
1117 andq $0xFFFFFFFFFFFFFF80, %rax // mask off lower 7 bits (128 bytes align)
1118 subq %rax, %rdx // Amount to subtract from %rsp
1119 subq %rdx, %rsp // Prepare the stack ptr --
1120 // now %rsp will align to 128-byte boundary at call site
1122 // setup pkfn parameter reg and stack
1123 movq %rcx, %rax // argc -> %rax
1125 je KMP_LABEL(kmp_invoke_pass_parms) // jump ahead if no parms to push
1126 shlq $3, %rcx // argc*8 -> %rcx
1127 movq %r8, %rdx // p_argv -> %rdx
1128 addq %rcx, %rdx // &p_argv[argc] -> %rdx
1130 movq %rsi, %rcx // max (0, argc-4) -> %rcx
1132 KMP_LABEL(kmp_invoke_push_parms):
1133 // push nth - 7th parms to pkfn on stack
1134 subq $8, %rdx // decrement p_argv pointer to previous parm
1135 movq (%rdx), %rsi // p_argv[%rcx-1] -> %rsi
1136 pushq %rsi // push p_argv[%rcx-1] onto stack (reverse order)
1139 // C69570: "X86_64_RELOC_BRANCH not supported" error at linking on mac_32e
1140 // if the name of the label that is an operand of this jecxz starts with a dot (".");
1141 // Apple's linker does not support 1-byte length relocation;
1142 // Resolution: replace all .labelX entries with L_labelX.
1144 jecxz KMP_LABEL(kmp_invoke_pass_parms) // stop when four p_argv[] parms left
1145 jmp KMP_LABEL(kmp_invoke_push_parms)
1147 KMP_LABEL(kmp_invoke_pass_parms): // put 1st - 6th parms to pkfn in registers.
1148 // order here is important to avoid trashing
1149 // registers used for both input and output parms!
1150 movq %rdi, %rbx // pkfn -> %rbx
1151 leaq __gtid(%rbp), %rdi // &gtid -> %rdi (store 1st parm to pkfn)
1152 leaq __tid(%rbp), %rsi // &tid -> %rsi (store 2nd parm to pkfn)
1154 movq %r8, %r11 // p_argv -> %r11
1157 cmpq $4, %rax // argc >= 4?
1158 jns KMP_LABEL(kmp_4) // jump to movq
1159 jmp KMP_LABEL(kmp_4_exit) // jump ahead
1161 movq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)
1162 KMP_LABEL(kmp_4_exit):
1164 cmpq $3, %rax // argc >= 3?
1165 jns KMP_LABEL(kmp_3) // jump to movq
1166 jmp KMP_LABEL(kmp_3_exit) // jump ahead
1168 movq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)
1169 KMP_LABEL(kmp_3_exit):
1171 cmpq $2, %rax // argc >= 2?
1172 jns KMP_LABEL(kmp_2) // jump to movq
1173 jmp KMP_LABEL(kmp_2_exit) // jump ahead
1175 movq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)
1176 KMP_LABEL(kmp_2_exit):
1178 cmpq $1, %rax // argc >= 1?
1179 jns KMP_LABEL(kmp_1) // jump to movq
1180 jmp KMP_LABEL(kmp_1_exit) // jump ahead
1182 movq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
1183 KMP_LABEL(kmp_1_exit):
1185 cmpq $4, %rax // argc >= 4?
1186 cmovnsq 24(%r11), %r9 // p_argv[3] -> %r9 (store 6th parm to pkfn)
1188 cmpq $3, %rax // argc >= 3?
1189 cmovnsq 16(%r11), %r8 // p_argv[2] -> %r8 (store 5th parm to pkfn)
1191 cmpq $2, %rax // argc >= 2?
1192 cmovnsq 8(%r11), %rcx // p_argv[1] -> %rcx (store 4th parm to pkfn)
1194 cmpq $1, %rax // argc >= 1?
1195 cmovnsq (%r11), %rdx // p_argv[0] -> %rdx (store 3rd parm to pkfn)
1198 call *%rbx // call (*pkfn)();
1199 movq $1, %rax // move 1 into return register;
1201 movq -8(%rbp), %rbx // restore %rbx using %rbp since %rsp was modified
1202 movq %rbp, %rsp // restore stack pointer
1203 popq %rbp // restore frame pointer
1207 DEBUG_INFO __kmp_invoke_microtask
1208 // -- End __kmp_invoke_microtask
1211 // __kmp_hardware_timestamp(void)
1213 PROC __kmp_hardware_timestamp
1219 DEBUG_INFO __kmp_hardware_timestamp
1220 // -- End __kmp_hardware_timestamp
1222 //------------------------------------------------------------------------
1223 // FUNCTION __kmp_bsr32
1226 // __kmp_bsr32( int );
1233 DEBUG_INFO __kmp_bsr32
1235 // -----------------------------------------------------------------------
1236 #endif /* KMP_ARCH_X86_64 */
1239 #if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64
1241 //------------------------------------------------------------------------
1243 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1244 // int gtid, int tid,
1245 // int argc, void *p_argv[]
1248 // void **exit_frame_ptr
1252 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1255 // (*pkfn)( & gtid, & tid, argv[0], ... );
1257 // // FIXME: This is done at call-site and can be removed here.
1259 // *exit_frame_ptr = 0;
1274 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1275 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1278 // x8: used to hold pkfn address
1279 // w9: used as temporary for number of pkfn parms
1280 // x10: used to traverse p_argv array
1281 // x11: used as temporary for stack placement calculation
1282 // x12: used as temporary for stack parameters
1283 // x19: used to preserve exit_frame_ptr, callee-save
1285 // return: w0 (always 1/TRUE)
1291 // -- Begin __kmp_invoke_microtask
1294 PROC __kmp_invoke_microtask
1296 stp x29, x30, [sp, #-16]!
1298 stp x19, x20, [sp, #-16]!
1303 add w9, w9, w3, lsr #1
1304 sub sp, sp, w9, uxtw #4
1308 str w1, [x29, #-__gtid]
1309 str w2, [x29, #-__tid]
1317 sub x0, x29, #__gtid
1320 cbz w9, KMP_LABEL(kmp_1)
1324 cbz w9, KMP_LABEL(kmp_1)
1328 cbz w9, KMP_LABEL(kmp_1)
1332 cbz w9, KMP_LABEL(kmp_1)
1336 cbz w9, KMP_LABEL(kmp_1)
1340 cbz w9, KMP_LABEL(kmp_1)
1345 cbz w9, KMP_LABEL(kmp_1)
1355 ldp x19, x20, [sp], #16
1357 ldp x29, x30, [sp], #16
1360 DEBUG_INFO __kmp_invoke_microtask
1361 // -- End __kmp_invoke_microtask
1363 #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_AARCH64 */
1365 #if (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM
1367 //------------------------------------------------------------------------
1369 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1370 // int gtid, int tid,
1371 // int argc, void *p_argv[]
1374 // void **exit_frame_ptr
1378 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1381 // (*pkfn)( & gtid, & tid, argv[0], ... );
1383 // // FIXME: This is done at call-site and can be removed here.
1385 // *exit_frame_ptr = 0;
1396 // r4(stack): p_argv
1397 // r5(stack): &exit_frame
1400 // __gtid: gtid parm pushed on stack so can pass &gtid to pkfn
1401 // __tid: tid parm pushed on stack so can pass &tid to pkfn
1404 // r4: used to hold pkfn address
1405 // r5: used as temporary for number of pkfn parms
1406 // r6: used to traverse p_argv array
1407 // r7: frame pointer (in some configurations)
1408 // r8: used as temporary for stack placement calculation
1409 // and as pointer to base of callee saved area
1410 // r9: used as temporary for stack parameters
1411 // r10: used to preserve exit_frame_ptr, callee-save
1412 // r11: frame pointer (in some configurations)
1414 // return: r0 (always 1/TRUE)
1420 // -- Begin __kmp_invoke_microtask
1423 PROC __kmp_invoke_microtask
1425 // Pushing one extra register (r3) to keep the stack aligned
1426 // for when we call pkfn below
1428 // Load p_argv and &exit_frame
1434 # if KMP_OS_DARWIN || (defined(__thumb__) && !KMP_OS_WINDOWS)
1448 // Calculate how much stack to allocate, in increments of 8 bytes.
1449 // We strictly need 4*(argc-2) bytes (2 arguments are passed in
1450 // registers) but allocate 4*argc for simplicity (to avoid needing
1451 // to handle the argc<2 cases). We align the number of bytes
1452 // allocated to 8 bytes, to keep the stack aligned. (Since we
1453 // already allocate more than enough, it's ok to round down
1454 // instead of up for the alignment.) We allocate another extra
1455 // 8 bytes for gtid and tid.
1457 add r5, r5, r3, lsr #1
1458 sub sp, sp, r5, lsl #3
1460 str r1, [r8, #-__gtid]
1461 str r2, [r8, #-__tid]
1466 // Prepare the first 2 parameters to pkfn - pointers to gtid and tid
1467 // in our stack frame.
1473 // Load p_argv[0] and p_argv[1] into r2 and r3, if argc >= 1/2
1475 beq KMP_LABEL(kmp_1)
1479 beq KMP_LABEL(kmp_1)
1482 // Loop, loading the rest of p_argv and writing the elements on the
1486 beq KMP_LABEL(kmp_1)
1505 DEBUG_INFO __kmp_invoke_microtask
1506 // -- End __kmp_invoke_microtask
1508 #endif /* (KMP_OS_LINUX || KMP_OS_DARWIN || KMP_OS_WINDOWS) && KMP_ARCH_ARM */
1512 //------------------------------------------------------------------------
1514 // __kmp_invoke_microtask( void (*pkfn) (int gtid, int tid, ...),
1515 // int gtid, int tid,
1516 // int argc, void *p_argv[]
1519 // void **exit_frame_ptr
1523 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1526 // (*pkfn)( & gtid, & tid, argv[0], ... );
1528 // // FIXME: This is done at call-site and can be removed here.
1530 // *exit_frame_ptr = 0;
1544 // return: r3 (always 1/TRUE)
1547 # if KMP_ARCH_PPC64_ELFv2
1550 .globl __kmp_invoke_microtask
1552 # if KMP_ARCH_PPC64_ELFv2
1558 .type __kmp_invoke_microtask,@function
1560 # if KMP_ARCH_PPC64_ELFv2
1561 __kmp_invoke_microtask:
1564 addis 2, 12, .TOC.-.Lfunc_gep0@ha
1565 addi 2, 2, .TOC.-.Lfunc_gep0@l
1567 .localentry __kmp_invoke_microtask, .Lfunc_lep0-.Lfunc_gep0
1569 .section .opd,"aw",@progbits
1570 __kmp_invoke_microtask:
1579 // -- Begin __kmp_invoke_microtask
1582 // We need to allocate a stack frame large enough to hold all of the parameters
1583 // on the stack for the microtask plus what this function needs. That's 48
1584 // bytes under the ELFv1 ABI (32 bytes under ELFv2), plus 8*(2 + argc) for the
1585 // parameters to the microtask, plus 8 bytes to store the values of r4 and r5,
1586 // and 8 bytes to store r31. With OMP-T support, we need an additional 8 bytes
1587 // to save r30 to hold a copy of r8.
1594 // This is unusual because normally we'd set r31 equal to r1 after the stack
1595 // frame is established. In this case, however, we need to dynamically compute
1596 // the stack frame size, and so we keep a direct copy of r1 to access our
1597 // register save areas and restore the r1 value before returning.
1599 .cfi_def_cfa_register r31
1603 // Compute the size necessary for the local stack frame.
1604 # if KMP_ARCH_PPC64_ELFv2
1613 // We need to make sure that the stack frame stays aligned (to 16 bytes).
1617 // Establish the local stack frame.
1621 .cfi_offset r30, -16
1627 // Store gtid and tid to the stack because they're passed by reference to the microtask.
1667 // There are more than 6 microtask parameters, so we need to store the
1668 // remainder to the stack.
1672 // These are set to 8 bytes before the first desired store address (we're using
1673 // pre-increment loads and stores in the loop below). The parameter save area
1674 // for the microtask begins 48 + 8*8 == 112 bytes above r1 for ELFv1 and
1675 // 32 + 8*8 == 96 bytes above r1 for ELFv2.
1677 # if KMP_ARCH_PPC64_ELFv2
1689 # if KMP_ARCH_PPC64_ELFv2
1694 // For ELFv1, we need to load the actual function address from the function descriptor.
1705 # if KMP_ARCH_PPC64_ELFv2
1731 .size __kmp_invoke_microtask, .Lfunc_end0-.Lfunc_begin0
1734 // -- End __kmp_invoke_microtask
1736 #endif /* KMP_ARCH_PPC64 */
1738 #if KMP_ARCH_RISCV64
1740 //------------------------------------------------------------------------
1742 // typedef void (*microtask_t)(int *gtid, int *tid, ...);
1744 // int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1748 // void **exit_frame_ptr
1752 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1755 // (*pkfn)(&gtid, &tid, argv[0], ...);
1766 // a5: exit_frame_ptr
1769 // __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1770 // __tid: tid param pushed on stack so can pass &tid to pkfn
1774 // t0: used to calculate the dynamic stack size / used to hold pkfn address
1775 // t1: used as temporary for stack placement calculation
1776 // t2: used as temporary for stack arguments
1777 // t3: used as temporary for number of remaining pkfn parms
1778 // t4: used to traverse p_argv array
1780 // return: a0 (always 1/TRUE)
1786 // -- Begin __kmp_invoke_microtask
1789 .globl __kmp_invoke_microtask
1791 .type __kmp_invoke_microtask,@function
1792 __kmp_invoke_microtask:
1795 // First, save ra and fp
1804 // Compute the dynamic stack size:
1806 // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1808 // - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
1809 // function by register. Given that we have 8 of such registers (a[0-7])
1810 // and two + 'argc' arguments (consider &gtid and &tid), we need to
1811 // reserve max(0, argc - 6)*8 extra bytes
1813 // The total number of bytes is then max(0, argc - 6)*8 + 8
1815 // Compute max(0, argc - 6) using the following bithack:
1816 // max(0, x) = x - (x & (x >> 31)), where x := argc - 6
1817 // Source: http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
1828 // Align the stack to 16 bytes
1836 // Save frame pointer into exit_frame
1840 // Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
1871 // Prepare any additional argument passed through the stack
1885 // Call pkfn function
1888 // Restore stack and return
1898 .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
1901 // -- End __kmp_invoke_microtask
1903 #endif /* KMP_ARCH_RISCV64 */
1905 #if KMP_ARCH_LOONGARCH64
// LoongArch64 variant of __kmp_invoke_microtask. Per the pseudo-code below it
// calls pkfn with the addresses of gtid and tid followed by the argv[]
// entries, and returns 1 in a0.
1907 //------------------------------------------------------------------------
1909 // typedef void (*microtask_t)(int *gtid, int *tid, ...);
1911 // int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
1915 // void **exit_frame_ptr
1919 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
1922 // (*pkfn)(&gtid, &tid, argv[0], ...);
1933 // a5: exit_frame_ptr
1936 // __gtid: gtid param pushed on stack so can pass &gtid to pkfn
1937 // __tid: tid param pushed on stack so can pass &tid to pkfn
1941 // t0: used to calculate the dynamic stack size / used to hold pkfn address
1942 // t1: used as temporary for stack placement calculation
1943 // t2: used as temporary for stack arguments
1944 // t3: used as temporary for number of remaining pkfn parms
1945 // t4: used to traverse p_argv array
1947 // return: a0 (always 1/TRUE)
1950 // -- Begin __kmp_invoke_microtask
1953 .globl __kmp_invoke_microtask
1955 .type __kmp_invoke_microtask,@function
1956 __kmp_invoke_microtask:
1959 // First, save ra and fp
// Reserve 16 bytes on the stack (two 8-byte slots, for ra and fp).
1960 addi.d $sp, $sp, -16
1968 // Compute the dynamic stack size:
1970 // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them by
1972 // - We need 8 bytes for each argument that cannot be passed to the 'pkfn'
1973 // function by register. Given that we have 8 of such registers (a[0-7])
1974 // and two + 'argc' arguments (consider &gtid and &tid), we need to
1975 // reserve max(0, argc - 6)*8 extra bytes
1977 // The total number of bytes is then max(0, argc - 6)*8 + 8
// masknez: t0 = (t1 != 0) ? 0 : t0. Presumably t1 flags a negative
// 'argc - 6', so this yields max(0, argc - 6) -- TODO confirm against the
// instructions that set t0/t1.
1981 masknez $t0, $t0, $t1
1986 // Align the stack to 16 bytes
// Overwrites bits [3:0] of sp with zero bits, i.e. rounds sp down to a
// multiple of 16.
1987 bstrins.d $sp, $zero, 3, 0
1994 // Save frame pointer into exit_frame
1998 // Prepare arguments for the pkfn function (first 8 using a0-a7 registers)
// a0 = fp - 20 and a1 = fp - 24: the first two pkfn arguments are addresses
// of stack slots (presumably the __gtid and __tid slots -- confirm against
// the stores that fill them).
2003 addi.d $a0, $fp, -20
2004 addi.d $a1, $fp, -24
2029 // Prepare any additional argument passed through the stack
2043 // Call pkfn function
2046 // Restore stack and return
// Return value: a0 = 1 (TRUE).
2048 addi.d $a0, $zero, 1
// sp = fp - 16: drops the dynamically sized area. Presumably this lands on
// the 16-byte ra/fp save area reserved at entry -- the fp setup is not
// visible here, TODO confirm.
2050 addi.d $sp, $fp, -16
// Symbol size = distance from the entry label to the .Lfunc_end0 label.
2056 .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
2059 // -- End __kmp_invoke_microtask
2061 #endif /* KMP_ARCH_LOONGARCH64 */
// VE variant of __kmp_invoke_microtask. Per the pseudo-code below it calls
// pkfn with the addresses of gtid and tid followed by the argv[] entries,
// and returns 1 in s0.
2065 //------------------------------------------------------------------------
2067 // typedef void (*microtask_t)(int *gtid, int *tid, ...);
2069 // int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
2073 // void **exit_frame_ptr
2077 // *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
2080 // (*pkfn)(&gtid, &tid, argv[0], ...);
2091 // s5: exit_frame_ptr
2094 // __gtid: gtid param pushed on stack so can pass &gtid to pkfn
2095 // __tid: tid param pushed on stack so can pass &tid to pkfn
2099 // s34: used to calculate the dynamic stack size
2100 // s35: used as temporary for stack placement calculation
2101 // s36: used as temporary for stack arguments
2102 // s37: used as temporary for number of remaining pkfn parms
2103 // s38: used to traverse p_argv array
2105 // return: s0 (always 1/TRUE)
2111 // -- Begin __kmp_invoke_microtask
2114 .globl __kmp_invoke_microtask
2115 // Functions require 8-byte alignment.
2117 .type __kmp_invoke_microtask,@function
2118 __kmp_invoke_microtask:
2121 // First, save fp and lr. VE stores them in the caller's stack frame.
2129 // Compute the dynamic stack size:
2131 // - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them
2133 // - We need 8 bytes for every argument. We have two + 'argc'
2134 // arguments (consider &gtid and &tid). We need to reserve
2135 // (argc + 2) * 8 bytes.
2136 // - We need 176 bytes for RSA and others
2138 // The total number of bytes is then (argc + 2) * 8 + 8 + 176.
2140 // |------------------------------|
2141 // | return address of callee | 8(%fp)
2142 // |------------------------------|
2143 // | frame pointer of callee | 0(%fp)
2144 // |------------------------------| <------------------ %fp
2145 // | __tid / __gtid | -8(%fp) / -4(%fp)
2146 // |------------------------------|
2147 // | argc+2 for arguments | 176(%sp)
2148 // |------------------------------|
2150 // |------------------------------|
2151 // | return address |
2152 // |------------------------------|
2153 // | frame pointer |
2154 // |------------------------------| <------------------ %sp
// s34 = argc + 2 (number of pkfn arguments, counting &gtid and &tid).
2156 adds.w.sx %s34, 2, %s3
// s34 += 184, i.e. the "+ 8 + 176" terms of the total above (the scaling by
// 8 is not visible in this excerpt).
2158 lea %s34, 184(, %s34)
// Grow the stack downwards by the computed size.
2159 subs.l %sp, %sp, %s34
2161 // Align the stack to 16 bytes.
2167 // Call host to allocate stack if it is necessary.
// Skip the allocation request when %sp >= %sl (presumably %sl is the stack
// limit register -- TODO confirm against the VE ABI).
2168 brge.l %sp, %sl, .L_kmp_pass
// s35 = %sp + 176: start of the outgoing argument area (see diagram above).
2177 lea %s35, 176(, %sp)
// s37 = argc: number of pkfn parameters still to be placed.
2178 adds.w.sx %s37, 0, %s3
2182 // Save frame pointer into exit_frame.
2186 // Prepare arguments for the pkfn function (first 8 using s0-s7
2187 // registers, but need to store stack also because of varargs).
// Spill gtid (incoming s1) to its stack slot so its address can be passed.
2189 stl %s1, __gtid(%fp)
// s0 = %fp + __gtid = &gtid (first pkfn argument).
2192 adds.l %s0, __gtid, %fp
// s1 = %fp + __tid = &tid (second pkfn argument).
2194 adds.l %s1, __tid, %fp
// Each branch below jumps to the call once the remaining-argument count in
// s37 equals the number of argv entries handled so far.
2197 breq.l 0, %s37, .L_kmp_call
2201 breq.l 1, %s37, .L_kmp_call
2205 breq.l 2, %s37, .L_kmp_call
2209 breq.l 3, %s37, .L_kmp_call
2213 breq.l 4, %s37, .L_kmp_call
2217 breq.l 5, %s37, .L_kmp_call
2221 breq.l 6, %s37, .L_kmp_call
2223 // Prepare any additional argument passed through the stack.
// Six argv entries were consumed above: drop them from the count, advance
// the p_argv cursor by 6*8 = 48 bytes and the stack cursor by 8*8 = 64 bytes
// (six argv entries plus &gtid and &tid).
2224 adds.l %s37, -6, %s37
2225 lea %s38, 48(, %s38)
2226 lea %s35, 64(, %s35)
// Loop step: one argument handled, advance both cursors by 8 bytes and
// repeat while the remaining count is non-zero.
2230 adds.l %s37, -1, %s37
2231 adds.l %s38, 8, %s38
2232 adds.l %s35, 8, %s35
2233 brne.l 0, %s37, .L_kmp_loop
2236 // Call pkfn function.
2242 // Restore stack and return.
// Symbol size = distance from the entry label to the .Lfunc_end0 label.
2248 .size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
2251 // -- End __kmp_invoke_microtask
2253 #endif /* KMP_ARCH_VE */
2255 #if KMP_ARCH_ARM || KMP_ARCH_MIPS
// Defines the global __kmp_unnamed_critical_addr as a 4-byte word holding the
// address of .gomp_critical_user_.
// COMMON is a macro defined outside this section; its arguments here look
// like (symbol, size 32, alignment 3) -- TODO confirm against its definition.
2257 COMMON .gomp_critical_user_, 32, 3
2260 .global __kmp_unnamed_critical_addr
2261 __kmp_unnamed_critical_addr:
2262 .4byte .gomp_critical_user_
2264 .size __kmp_unnamed_critical_addr,4
2266 #endif /* KMP_ARCH_ARM || KMP_ARCH_MIPS */
2268 #if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \
2269 KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
// Fall back to an identity KMP_PREFIX_UNDERSCORE when no earlier section
// defined it, so the symbol name below is emitted unchanged.
2270 #ifndef KMP_PREFIX_UNDERSCORE
2271 # define KMP_PREFIX_UNDERSCORE(x) x
// Defines the global __kmp_unnamed_critical_addr as an 8-byte word holding
// the address of .gomp_critical_user_.
// COMMON is a macro defined outside this section; its arguments here look
// like (symbol, size 32, alignment 3) -- TODO confirm against its definition.
2274 COMMON .gomp_critical_user_, 32, 3
2277 .global KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr)
2278 KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
2279 .8byte .gomp_critical_user_
2281 .size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
2283 #endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
2284 KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE */
// Emit an empty .note.GNU-stack section (no flags, so no executable-stack
// request). The two spellings differ only in the section-type prefix: '%'
// for ARM/AArch64, '@' otherwise. GNU as documents '%' as the substitute on
// targets where '@' starts a comment (e.g. ARM).
// NOTE(review): the matching "# else" / "# endif" lines are not visible in
// this excerpt.
2287 # if KMP_ARCH_ARM || KMP_ARCH_AARCH64
2288 .section .note.GNU-stack,"",%progbits
2290 .section .note.GNU-stack,"",@progbits