4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 #include <sys/modctl.h>
28 #include <sys/dtrace.h>
32 #include <sys/sunddi.h>
35 #define FBT_PUSHL_EBP 0x55
36 #define FBT_MOVL_ESP_EBP0_V0 0x8b
37 #define FBT_MOVL_ESP_EBP1_V0 0xec
38 #define FBT_MOVL_ESP_EBP0_V1 0x89
39 #define FBT_MOVL_ESP_EBP1_V1 0xe5
40 #define FBT_REX_RSP_RBP 0x48
42 #define FBT_POPL_EBP 0x5d
44 #define FBT_RET_IMM16 0xc2
45 #define FBT_LEAVE 0xc9
48 #define FBT_PATCHVAL 0xcc
50 #define FBT_PATCHVAL 0xf0
53 #define FBT_ENTRY "entry"
54 #define FBT_RETURN "return"
55 #define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
56 #define FBT_PROBETAB_SIZE 0x8000 /* 32k entries -- 128K total */
58 typedef struct fbt_probe
{
59 struct fbt_probe
*fbtp_hashnext
;
60 uint8_t *fbtp_patchpoint
;
62 uint8_t fbtp_patchval
;
63 uint8_t fbtp_savedval
;
64 uintptr_t fbtp_roffset
;
67 struct modctl
*fbtp_ctl
;
71 struct fbt_probe
*fbtp_next
;
74 static dev_info_t
*fbt_devi
;
75 static dtrace_provider_id_t fbt_id
;
76 static fbt_probe_t
**fbt_probetab
;
77 static int fbt_probetab_size
;
78 static int fbt_probetab_mask
;
79 static int fbt_verbose
= 0;
82 fbt_invop(uintptr_t addr
, uintptr_t *stack
, uintptr_t rval
)
84 uintptr_t stack0
, stack1
, stack2
, stack3
, stack4
;
85 fbt_probe_t
*fbt
= fbt_probetab
[FBT_ADDR2NDX(addr
)];
87 for (; fbt
!= NULL
; fbt
= fbt
->fbtp_hashnext
) {
88 if ((uintptr_t)fbt
->fbtp_patchpoint
== addr
) {
89 if (fbt
->fbtp_roffset
== 0) {
92 * When accessing the arguments on the stack,
93 * we must protect against accessing beyond
94 * the stack. We can safely set NOFAULT here
95 * -- we know that interrupts are already
98 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT
);
99 CPU
->cpu_dtrace_caller
= stack
[i
++];
102 * On amd64, stack[0] contains the dereferenced
103 * stack pointer, stack[1] contains savfp,
104 * stack[2] contains savpc. We want to step
105 * over these entries.
114 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT
|
117 dtrace_probe(fbt
->fbtp_id
, stack0
, stack1
,
118 stack2
, stack3
, stack4
);
120 CPU
->cpu_dtrace_caller
= (uintptr_t)NULL
;
124 * On amd64, we instrument the ret, not the
125 * leave. We therefore need to set the caller
126 * to assure that the top frame of a stack()
129 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT
);
130 CPU
->cpu_dtrace_caller
= stack
[0];
131 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT
|
135 dtrace_probe(fbt
->fbtp_id
, fbt
->fbtp_roffset
,
137 CPU
->cpu_dtrace_caller
= (uintptr_t)NULL
;
140 return (fbt
->fbtp_rval
);
149 fbt_provide_module(void *arg
, struct modctl
*ctl
)
151 struct module
*mp
= ctl
->mod_mp
;
152 char *str
= mp
->strings
;
153 int nsyms
= mp
->nsyms
;
154 Shdr
*symhdr
= mp
->symhdr
;
155 char *modname
= ctl
->mod_modname
;
157 fbt_probe_t
*fbt
, *retfbt
;
162 * Employees of dtrace and their families are ineligible. Void
165 if (strcmp(modname
, "dtrace") == 0)
168 if (ctl
->mod_requisites
!= NULL
) {
169 struct modctl_list
*list
;
171 list
= (struct modctl_list
*)ctl
->mod_requisites
;
173 for (; list
!= NULL
; list
= list
->modl_next
) {
174 if (strcmp(list
->modl_modp
->mod_modname
, "dtrace") == 0)
180 * KMDB is ineligible for instrumentation -- it may execute in
181 * any context, including probe context.
183 if (strcmp(modname
, "kmdbmod") == 0)
186 if (str
== NULL
|| symhdr
== NULL
||
187 symhdr
->sh_addr
== (uintptr_t)NULL
) {
189 * If this module doesn't (yet) have its string or symbol
190 * table allocated, clear out.
195 symsize
= symhdr
->sh_entsize
;
197 if (mp
->fbt_nentries
) {
199 * This module has some FBT entries allocated; we're afraid
205 for (i
= 1; i
< nsyms
; i
++) {
206 uint8_t *instr
, *limit
;
207 Sym
*sym
= (Sym
*)(symhdr
->sh_addr
+ i
* symsize
);
210 if (ELF_ST_TYPE(sym
->st_info
) != STT_FUNC
)
214 * Weak symbols are not candidates. This could be made to
215 * work (where weak functions and their underlying function
216 * appear as two disjoint probes), but it's not simple.
218 if (ELF_ST_BIND(sym
->st_info
) == STB_WEAK
)
221 name
= str
+ sym
->st_name
;
223 if (strstr(name
, "dtrace_") == name
&&
224 strstr(name
, "dtrace_safe_") != name
) {
226 * Anything beginning with "dtrace_" may be called
227 * from probe context unless it explicitly indicates
228 * that it won't be called from probe context by
229 * using the prefix "dtrace_safe_".
234 if (strstr(name
, "kdi_") == name
||
235 strstr(name
, "_kdi_") != NULL
) {
237 * Any function name beginning with "kdi_" or
238 * containing the string "_kdi_" is a part of the
239 * kernel debugger interface and may be called in
240 * arbitrary context -- including probe context.
246 * Due to 4524008, _init and _fini may have a bloated st_size.
247 * While this bug was fixed quite some time ago, old drivers
248 * may be lurking. We need to develop a better solution to
249 * this problem, such that correct _init and _fini functions
250 * (the vast majority) may be correctly traced. One solution
251 * may be to scan through the entire symbol table to see if
252 * any symbol overlaps with _init. If none does, set a bit in
253 * the module structure that this module has correct _init and
254 * _fini sizes. This will cause some pain the first time a
255 * module is scanned, but at least it would be O(N) instead of
258 if (strcmp(name
, "_init") == 0)
261 if (strcmp(name
, "_fini") == 0)
265 * In order to be eligible, the function must begin with the
266 * following sequence:
271 * Note that there are two variants of encodings that generate
272 * the movl; we must check for both. For 64-bit, we would
273 * normally insist that a function begin with the following
279 * However, the compiler for 64-bit often splits these two
280 * instructions -- and the first instruction in the function
281 * is often not the pushq. As a result, on 64-bit we look
282 * for any "pushq %rbp" in the function and we instrument
283 * this with a breakpoint instruction.
285 instr
= (uint8_t *)sym
->st_value
;
286 limit
= (uint8_t *)(sym
->st_value
+ sym
->st_size
);
289 while (instr
< limit
) {
290 if (*instr
== FBT_PUSHL_EBP
)
293 if ((size
= dtrace_instr_size(instr
)) <= 0)
299 if (instr
>= limit
|| *instr
!= FBT_PUSHL_EBP
) {
301 * We either don't save the frame pointer in this
302 * function, or we ran into some disassembly
303 * screw-up. Either way, we bail.
308 if (instr
[0] != FBT_PUSHL_EBP
)
311 if (!(instr
[1] == FBT_MOVL_ESP_EBP0_V0
&&
312 instr
[2] == FBT_MOVL_ESP_EBP1_V0
) &&
313 !(instr
[1] == FBT_MOVL_ESP_EBP0_V1
&&
314 instr
[2] == FBT_MOVL_ESP_EBP1_V1
))
318 fbt
= kmem_zalloc(sizeof (fbt_probe_t
), KM_SLEEP
);
319 fbt
->fbtp_name
= name
;
320 fbt
->fbtp_id
= dtrace_probe_create(fbt_id
, modname
,
321 name
, FBT_ENTRY
, 3, fbt
);
322 fbt
->fbtp_patchpoint
= instr
;
324 fbt
->fbtp_loadcnt
= ctl
->mod_loadcnt
;
325 fbt
->fbtp_rval
= DTRACE_INVOP_PUSHL_EBP
;
326 fbt
->fbtp_savedval
= *instr
;
327 fbt
->fbtp_patchval
= FBT_PATCHVAL
;
329 fbt
->fbtp_hashnext
= fbt_probetab
[FBT_ADDR2NDX(instr
)];
330 fbt
->fbtp_symndx
= i
;
331 fbt_probetab
[FBT_ADDR2NDX(instr
)] = fbt
;
341 * If this disassembly fails, then we've likely walked off into
342 * a jump table or some other unsuitable area. Bail out of the
345 if ((size
= dtrace_instr_size(instr
)) <= 0)
350 * We only instrument "ret" on amd64 -- we don't yet instrument
351 * ret imm16, largely because the compiler doesn't seem to
352 * (yet) emit them in the kernel...
354 if (*instr
!= FBT_RET
) {
360 (*instr
== FBT_POPL_EBP
|| *instr
== FBT_LEAVE
) &&
361 (*(instr
+ 1) == FBT_RET
||
362 *(instr
+ 1) == FBT_RET_IMM16
))) {
369 * We (desperately) want to avoid erroneously instrumenting a
370 * jump table, especially given that our markers are pretty
371 * short: two bytes on x86, and just one byte on amd64. To
372 * determine if we're looking at a true instruction sequence
373 * or an inline jump table that happens to contain the same
374 * byte sequences, we resort to some heuristic sleaze: we
375 * treat this instruction as being contained within a pointer,
376 * and see if that pointer points to within the body of the
377 * function. If it does, we refuse to instrument it.
379 for (j
= 0; j
< sizeof (uintptr_t); j
++) {
380 uintptr_t check
= (uintptr_t)instr
- j
;
383 if (check
< sym
->st_value
)
386 if (check
+ sizeof (uintptr_t) > (uintptr_t)limit
)
389 ptr
= *(uint8_t **)check
;
391 if (ptr
>= (uint8_t *)sym
->st_value
&& ptr
< limit
) {
400 fbt
= kmem_zalloc(sizeof (fbt_probe_t
), KM_SLEEP
);
401 fbt
->fbtp_name
= name
;
403 if (retfbt
== NULL
) {
404 fbt
->fbtp_id
= dtrace_probe_create(fbt_id
, modname
,
405 name
, FBT_RETURN
, 3, fbt
);
407 retfbt
->fbtp_next
= fbt
;
408 fbt
->fbtp_id
= retfbt
->fbtp_id
;
412 fbt
->fbtp_patchpoint
= instr
;
414 fbt
->fbtp_loadcnt
= ctl
->mod_loadcnt
;
417 if (*instr
== FBT_POPL_EBP
) {
418 fbt
->fbtp_rval
= DTRACE_INVOP_POPL_EBP
;
420 ASSERT(*instr
== FBT_LEAVE
);
421 fbt
->fbtp_rval
= DTRACE_INVOP_LEAVE
;
424 (uintptr_t)(instr
- (uint8_t *)sym
->st_value
) + 1;
427 ASSERT(*instr
== FBT_RET
);
428 fbt
->fbtp_rval
= DTRACE_INVOP_RET
;
430 (uintptr_t)(instr
- (uint8_t *)sym
->st_value
);
433 fbt
->fbtp_savedval
= *instr
;
434 fbt
->fbtp_patchval
= FBT_PATCHVAL
;
435 fbt
->fbtp_hashnext
= fbt_probetab
[FBT_ADDR2NDX(instr
)];
436 fbt
->fbtp_symndx
= i
;
437 fbt_probetab
[FBT_ADDR2NDX(instr
)] = fbt
;
448 fbt_destroy(void *arg
, dtrace_id_t id
, void *parg
)
450 fbt_probe_t
*fbt
= parg
, *next
, *hash
, *last
;
451 struct modctl
*ctl
= fbt
->fbtp_ctl
;
455 if (ctl
!= NULL
&& ctl
->mod_loadcnt
== fbt
->fbtp_loadcnt
) {
456 if ((ctl
->mod_loadcnt
== fbt
->fbtp_loadcnt
&&
459 (ctl
->mod_mp
))->fbt_nentries
--;
464 * Now we need to remove this probe from the fbt_probetab.
466 ndx
= FBT_ADDR2NDX(fbt
->fbtp_patchpoint
);
468 hash
= fbt_probetab
[ndx
];
470 while (hash
!= fbt
) {
471 ASSERT(hash
!= NULL
);
473 hash
= hash
->fbtp_hashnext
;
477 last
->fbtp_hashnext
= fbt
->fbtp_hashnext
;
479 fbt_probetab
[ndx
] = fbt
->fbtp_hashnext
;
482 next
= fbt
->fbtp_next
;
483 kmem_free(fbt
, sizeof (fbt_probe_t
));
486 } while (fbt
!= NULL
);
491 fbt_enable(void *arg
, dtrace_id_t id
, void *parg
)
493 fbt_probe_t
*fbt
= parg
;
494 struct modctl
*ctl
= fbt
->fbtp_ctl
;
498 if (!ctl
->mod_loaded
) {
500 cmn_err(CE_NOTE
, "fbt is failing for probe %s "
501 "(module %s unloaded)",
502 fbt
->fbtp_name
, ctl
->mod_modname
);
509 * Now check that our modctl has the expected load count. If it
510 * doesn't, this module must have been unloaded and reloaded -- and
511 * we're not going to touch it.
513 if (ctl
->mod_loadcnt
!= fbt
->fbtp_loadcnt
) {
515 cmn_err(CE_NOTE
, "fbt is failing for probe %s "
516 "(module %s reloaded)",
517 fbt
->fbtp_name
, ctl
->mod_modname
);
523 for (; fbt
!= NULL
; fbt
= fbt
->fbtp_next
)
524 *fbt
->fbtp_patchpoint
= fbt
->fbtp_patchval
;
531 fbt_disable(void *arg
, dtrace_id_t id
, void *parg
)
533 fbt_probe_t
*fbt
= parg
;
534 struct modctl
*ctl
= fbt
->fbtp_ctl
;
536 ASSERT(ctl
->mod_nenabled
> 0);
539 if (!ctl
->mod_loaded
|| (ctl
->mod_loadcnt
!= fbt
->fbtp_loadcnt
))
542 for (; fbt
!= NULL
; fbt
= fbt
->fbtp_next
)
543 *fbt
->fbtp_patchpoint
= fbt
->fbtp_savedval
;
548 fbt_suspend(void *arg
, dtrace_id_t id
, void *parg
)
550 fbt_probe_t
*fbt
= parg
;
551 struct modctl
*ctl
= fbt
->fbtp_ctl
;
553 ASSERT(ctl
->mod_nenabled
> 0);
555 if (!ctl
->mod_loaded
|| (ctl
->mod_loadcnt
!= fbt
->fbtp_loadcnt
))
558 for (; fbt
!= NULL
; fbt
= fbt
->fbtp_next
)
559 *fbt
->fbtp_patchpoint
= fbt
->fbtp_savedval
;
564 fbt_resume(void *arg
, dtrace_id_t id
, void *parg
)
566 fbt_probe_t
*fbt
= parg
;
567 struct modctl
*ctl
= fbt
->fbtp_ctl
;
569 ASSERT(ctl
->mod_nenabled
> 0);
571 if (!ctl
->mod_loaded
|| (ctl
->mod_loadcnt
!= fbt
->fbtp_loadcnt
))
574 for (; fbt
!= NULL
; fbt
= fbt
->fbtp_next
)
575 *fbt
->fbtp_patchpoint
= fbt
->fbtp_patchval
;
580 fbt_getargdesc(void *arg
, dtrace_id_t id
, void *parg
, dtrace_argdesc_t
*desc
)
582 fbt_probe_t
*fbt
= parg
;
583 struct modctl
*ctl
= fbt
->fbtp_ctl
;
584 struct module
*mp
= ctl
->mod_mp
;
585 ctf_file_t
*fp
= NULL
, *pfp
;
588 ctf_id_t argv
[32], type
;
589 int argc
= sizeof (argv
) / sizeof (ctf_id_t
);
592 if (!ctl
->mod_loaded
|| (ctl
->mod_loadcnt
!= fbt
->fbtp_loadcnt
))
595 if (fbt
->fbtp_roffset
!= 0 && desc
->dtargd_ndx
== 0) {
596 (void) strcpy(desc
->dtargd_native
, "int");
600 if ((fp
= ctf_modopen(mp
, &error
)) == NULL
) {
602 * We have no CTF information for this module -- and therefore
603 * no args[] information.
609 * If we have a parent container, we must manually import it.
611 if ((parent
= ctf_parent_name(fp
)) != NULL
) {
612 struct modctl
*mp
= &modules
;
613 struct modctl
*mod
= NULL
;
616 * We must iterate over all modules to find the module that
620 if (strcmp(mp
->mod_modname
, parent
) == 0) {
624 } while ((mp
= mp
->mod_next
) != &modules
);
629 if ((pfp
= ctf_modopen(mod
->mod_mp
, &error
)) == NULL
) {
634 * If the parent module does not have the label we expect,
635 * ignore it and fail to avoid presenting nonsensical data.
637 if (ctf_label_info(pfp
, ctf_parent_label(fp
),
643 if (ctf_import(fp
, pfp
) != 0) {
651 if (ctf_func_info(fp
, fbt
->fbtp_symndx
, &f
) == CTF_ERR
)
654 if (fbt
->fbtp_roffset
!= 0) {
655 if (desc
->dtargd_ndx
> 1)
658 ASSERT(desc
->dtargd_ndx
== 1);
661 if (desc
->dtargd_ndx
+ 1 > f
.ctc_argc
)
664 if (ctf_func_args(fp
, fbt
->fbtp_symndx
, argc
, argv
) == CTF_ERR
)
667 type
= argv
[desc
->dtargd_ndx
];
670 if (ctf_type_name(fp
, type
, desc
->dtargd_native
,
671 DTRACE_ARGTYPELEN
) != NULL
) {
679 desc
->dtargd_ndx
= DTRACE_ARGNONE
;
682 static dtrace_pattr_t fbt_attr
= {
683 { DTRACE_STABILITY_EVOLVING
, DTRACE_STABILITY_EVOLVING
, DTRACE_CLASS_ISA
},
684 { DTRACE_STABILITY_PRIVATE
, DTRACE_STABILITY_PRIVATE
, DTRACE_CLASS_UNKNOWN
},
685 { DTRACE_STABILITY_PRIVATE
, DTRACE_STABILITY_PRIVATE
, DTRACE_CLASS_UNKNOWN
},
686 { DTRACE_STABILITY_EVOLVING
, DTRACE_STABILITY_EVOLVING
, DTRACE_CLASS_ISA
},
687 { DTRACE_STABILITY_PRIVATE
, DTRACE_STABILITY_PRIVATE
, DTRACE_CLASS_ISA
},
690 static dtrace_pops_t fbt_pops
= {
704 fbt_cleanup(dev_info_t
*devi
)
706 dtrace_invop_remove(fbt_invop
);
707 ddi_remove_minor_node(devi
, NULL
);
708 kmem_free(fbt_probetab
, fbt_probetab_size
* sizeof (fbt_probe_t
*));
710 fbt_probetab_mask
= 0;
714 fbt_attach(dev_info_t
*devi
, ddi_attach_cmd_t cmd
)
720 return (DDI_SUCCESS
);
722 return (DDI_FAILURE
);
725 if (fbt_probetab_size
== 0)
726 fbt_probetab_size
= FBT_PROBETAB_SIZE
;
728 fbt_probetab_mask
= fbt_probetab_size
- 1;
730 kmem_zalloc(fbt_probetab_size
* sizeof (fbt_probe_t
*), KM_SLEEP
);
732 dtrace_invop_add(fbt_invop
);
734 if (ddi_create_minor_node(devi
, "fbt", S_IFCHR
, 0,
735 DDI_PSEUDO
, 0) == DDI_FAILURE
||
736 dtrace_register("fbt", &fbt_attr
, DTRACE_PRIV_KERNEL
, NULL
,
737 &fbt_pops
, NULL
, &fbt_id
) != 0) {
739 return (DDI_FAILURE
);
742 ddi_report_dev(devi
);
745 return (DDI_SUCCESS
);
749 fbt_detach(dev_info_t
*devi
, ddi_detach_cmd_t cmd
)
755 return (DDI_SUCCESS
);
757 return (DDI_FAILURE
);
760 if (dtrace_unregister(fbt_id
) != 0)
761 return (DDI_FAILURE
);
765 return (DDI_SUCCESS
);
770 fbt_info(dev_info_t
*dip
, ddi_info_cmd_t infocmd
, void *arg
, void **result
)
775 case DDI_INFO_DEVT2DEVINFO
:
776 *result
= (void *)fbt_devi
;
779 case DDI_INFO_DEVT2INSTANCE
:
791 fbt_open(dev_t
*devp
, int flag
, int otyp
, cred_t
*cred_p
)
796 static struct cb_ops fbt_cb_ops
= {
799 nulldev
, /* strategy */
809 ddi_prop_op
, /* cb_prop_op */
811 D_NEW
| D_MP
/* Driver compatibility flag */
814 static struct dev_ops fbt_ops
= {
815 DEVO_REV
, /* devo_rev */
817 fbt_info
, /* get_dev_info */
818 nulldev
, /* identify */
820 fbt_attach
, /* attach */
821 fbt_detach
, /* detach */
823 &fbt_cb_ops
, /* driver operations */
824 NULL
, /* bus operations */
825 nodev
, /* dev power */
826 ddi_quiesce_not_needed
, /* quiesce */
830 * Module linkage information for the kernel.
832 static struct modldrv modldrv
= {
833 &mod_driverops
, /* module type (this is a pseudo driver) */
834 "Function Boundary Tracing", /* name of module */
835 &fbt_ops
, /* driver ops */
838 static struct modlinkage modlinkage
= {
847 return (mod_install(&modlinkage
));
851 _info(struct modinfo
*modinfop
)
853 return (mod_info(&modlinkage
, modinfop
));
859 return (mod_remove(&modlinkage
));