4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 /* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */
31 #define _KERNEL /* Solaris vs. Darwin */
35 #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
36 #include <kern/thread.h>
37 #include <mach/thread_status.h>
38 #include <mach/vm_param.h>
39 #include <mach-o/loader.h>
40 #include <mach-o/nlist.h>
41 #include <libkern/kernel_mach_header.h>
42 #include <libkern/OSAtomic.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/errno.h>
48 #include <sys/ioctl.h>
50 #include <sys/fcntl.h>
51 #include <miscfs/devfs/devfs.h>
53 #include <sys/dtrace.h>
54 #include <sys/dtrace_impl.h>
57 #include <sys/dtrace_glue.h>
/*
 * Emulation codes and skip counts.
 * DTRACE_INVOP_MOV_RSP_RBP and DTRACE_INVOP_POP_RBP are stored in a probe
 * as fbtp_rval by fbt_provide_probe and handed back by fbt_invop.
 * The NOP, MOV_RSP_RBP and LEAVE skip values are byte counts that
 * fbt_perfCallback adds to the saved rip after emulating the instruction.
 */
59 #define DTRACE_INVOP_NOP_SKIP 1
60 #define DTRACE_INVOP_MOVL_ESP_EBP 10
61 #define DTRACE_INVOP_MOVL_ESP_EBP_SKIP 2
62 #define DTRACE_INVOP_MOV_RSP_RBP 11
63 #define DTRACE_INVOP_MOV_RSP_RBP_SKIP 3
64 #define DTRACE_INVOP_POP_RBP 12
65 #define DTRACE_INVOP_POP_RBP_SKIP 1
66 #define DTRACE_INVOP_LEAVE_SKIP 1
/*
 * Instruction bytes that fbt_provide_probe compares against while scanning
 * a function body. The two V0/V1 pairs are alternative two-byte encodings
 * checked for movl %esp,%ebp.
 */
68 #define FBT_PUSHL_EBP 0x55
69 #define FBT_MOVL_ESP_EBP0_V0 0x8b
70 #define FBT_MOVL_ESP_EBP1_V0 0xec
71 #define FBT_MOVL_ESP_EBP0_V1 0x89
72 #define FBT_MOVL_ESP_EBP1_V1 0xe5
/* 64-bit forms: push %rbp, the three bytes of mov %rsp,%rbp, and pop %rbp. */
74 #define FBT_PUSH_RBP 0x55
75 #define FBT_REX_RSP_RBP 0x48
76 #define FBT_MOV_RSP_RBP0 0x89
77 #define FBT_MOV_RSP_RBP1 0xe5
78 #define FBT_POP_RBP 0x5d
80 #define FBT_POPL_EBP 0x5d
/*
 * Function-exit opcodes.
 * NOTE(review): FBT_RET and FBT_RET_LEN are used by fbt_provide_probe but no
 * definition is visible in this listing - confirm they are defined.
 */
82 #define FBT_RET_IMM16 0xc2
83 #define FBT_LEAVE 0xc9
84 #define FBT_JMP_SHORT_REL 0xeb /* Jump short, relative, displacement relative to next instr. */
85 #define FBT_JMP_NEAR_REL 0xe9 /* Jump near, relative, displacement relative to next instr. */
86 #define FBT_JMP_FAR_ABS 0xea /* Jump far, absolute, address given in operand */
/* Instruction lengths in bytes, compared with the result of dtrace_instr_size. */
88 #define FBT_RET_IMM16_LEN 3
89 #define FBT_JMP_SHORT_REL_LEN 2
90 #define FBT_JMP_NEAR_REL_LEN 5
91 #define FBT_JMP_FAR_ABS_LEN 5
/*
 * FBT_PATCHVAL is recorded in every probe as fbtp_patchval.
 * The AFRAMES values are passed to dtrace_probe_create for entry and
 * return probes respectively.
 */
93 #define FBT_PATCHVAL 0xf0
94 #define FBT_AFRAMES_ENTRY 7
95 #define FBT_AFRAMES_RETURN 6
/* Probe names used with dtrace_probe_lookup and dtrace_probe_create. */
97 #define FBT_ENTRY "entry"
98 #define FBT_RETURN "return"
/* Hash an address into an fbt_probetab index: drop the low 4 bits, then mask. */
99 #define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
/* Provider id, probe hash table and its index mask; only declared here. */
101 extern dtrace_provider_id_t fbt_id
;
102 extern fbt_probe_t
**fbt_probetab
;
103 extern int fbt_probetab_mask
;
/* Prototype of the trap callback whose body appears later in this file. */
105 kern_return_t
fbt_perfCallback(int, x86_saved_state_t
*, uintptr_t *, __unused
int);
/*
 * fbt_invop: find the FBT probe whose patch point equals addr and fire it.
 *
 * addr  - address compared against each probe's fbtp_patchpoint.
 * state - saved register state; read as an x86_saved_state64_t when the
 *         probe has fbtp_roffset == 0.
 * rval  - passed as the second probe argument when fbtp_roffset != 0.
 *
 * Returns the matching probe's fbtp_rval.
 * NOTE(review): the return type, the braces and the no-match return path
 * are not present in this listing.
 */
108 fbt_invop(uintptr_t addr
, uintptr_t *state
, uintptr_t rval
)
/* Start at the head of the hash bucket selected by addr. */
110 fbt_probe_t
*fbt
= fbt_probetab
[FBT_ADDR2NDX(addr
)];
/* Walk the bucket chain through fbtp_hashnext until the patch point matches. */
112 for (; fbt
!= NULL
; fbt
= fbt
->fbtp_hashnext
) {
113 if ((uintptr_t)fbt
->fbtp_patchpoint
== addr
) {
/*
 * fbtp_roffset is only assigned for return probes in fbt_provide_probe
 * (probes are kmem_zalloc-ed), so zero here selects the entry-probe path.
 */
115 if (fbt
->fbtp_roffset
== 0) {
116 x86_saved_state64_t
*regs
= (x86_saved_state64_t
*)state
;
/* Record the word stored 8 bytes above the saved rsp as the caller. */
118 CPU
->cpu_dtrace_caller
= *(uintptr_t *)(((uintptr_t)(regs
->isf
.rsp
))+sizeof(uint64_t)); // 8(%rsp)
119 /* 64-bit ABI, arguments passed in registers. */
120 dtrace_probe(fbt
->fbtp_id
, regs
->rdi
, regs
->rsi
, regs
->rdx
, regs
->rcx
, regs
->r8
);
121 CPU
->cpu_dtrace_caller
= 0;
/*
 * Presumably the else branch (its opening line is not visible): fire the
 * probe with the patch-point offset and rval as the first two arguments.
 */
124 dtrace_probe(fbt
->fbtp_id
, fbt
->fbtp_roffset
, rval
, 0, 0, 0);
125 CPU
->cpu_dtrace_caller
= 0;
/* Hand the emulation code for the displaced instruction back to the caller. */
128 return (fbt
->fbtp_rval
);
/*
 * Trap helpers for the callback below.
 * IS_USER_TRAP is true when regs is non-null and the low two bits of the
 * saved cs are non-zero. FBT_EXCEPTION_CODE is the only trap number the
 * callback acts on. T_PREEMPT is written into the saved trapno on the way out.
 */
135 #define IS_USER_TRAP(regs) (regs && (((regs)->isf.cs & 3) != 0))
136 #define T_INVALID_OPCODE 6
137 #define FBT_EXCEPTION_CODE T_INVALID_OPCODE
138 #define T_PREEMPT 255
/*
 * Trap callback body. The header line of the definition is not present in
 * this listing; the visible parameters match the fbt_perfCallback prototype
 * (trap number, tagged saved state, pointer to a stack pointer, unused int).
 *
 * When the trap number equals FBT_EXCEPTION_CODE and the saved state is not
 * a user trap, it disables interrupts, calls dtrace_invop with the saved
 * rip, saved state and saved rax, then emulates the displaced instruction
 * selected by the returned code. retval starts as KERN_FAILURE and becomes
 * KERN_SUCCESS in each emulated case.
 * NOTE(review): the switch line, the break statements, several local
 * declarations and the final return are not visible here.
 */
143 x86_saved_state_t
*tagged_regs
,
145 __unused
int unused2
)
147 kern_return_t retval
= KERN_FAILURE
;
148 x86_saved_state64_t
*saved_state
= saved_state64(tagged_regs
);
/* Only handle the FBT trap, and only when it was not taken from user mode. */
150 if (FBT_EXCEPTION_CODE
== trapno
&& !IS_USER_TRAP(saved_state
)) {
152 uint64_t rsp_probe
, fp
, delta
= 0;
/* Interrupts stay disabled until the previous level is restored at the end. */
158 oldlevel
= ml_set_interrupts_enabled(FALSE
);
160 /* Calculate where the stack pointer was when the probe instruction "fired." */
161 rsp_probe
= saved_state
->isf
.rsp
; /* Easy, x86_64 establishes this value in idt64.s */
/*
 * The assembler text before and after the dtrace_invop call defines a local
 * label and exports its address under a private extern symbol. Only part of
 * each asm statement is visible in this listing.
 */
164 "Ldtrace_invop_callsite_pre_label:\n"
166 ".private_extern _dtrace_invop_callsite_pre\n"
167 "_dtrace_invop_callsite_pre:\n"
168 " .quad Ldtrace_invop_callsite_pre_label\n"
172 emul
= dtrace_invop( saved_state
->isf
.rip
, (uintptr_t *)saved_state
, saved_state
->rax
);
175 "Ldtrace_invop_callsite_post_label:\n"
177 ".private_extern _dtrace_invop_callsite_post\n"
178 "_dtrace_invop_callsite_post:\n"
179 " .quad Ldtrace_invop_callsite_post_label\n"
/* Cases keyed on the code returned by dtrace_invop (switch line not visible). */
184 case DTRACE_INVOP_NOP
:
185 saved_state
->isf
.rip
+= DTRACE_INVOP_NOP_SKIP
; /* Skip over the patched NOP (planted by sdt). */
186 retval
= KERN_SUCCESS
;
189 case DTRACE_INVOP_MOV_RSP_RBP
:
190 saved_state
->rbp
= rsp_probe
; /* Emulate patched mov %rsp,%rbp */
191 saved_state
->isf
.rip
+= DTRACE_INVOP_MOV_RSP_RBP_SKIP
; /* Skip over the bytes of the patched mov %rsp,%rbp */
192 retval
= KERN_SUCCESS
;
/* Both exit forms share one emulation path. */
195 case DTRACE_INVOP_POP_RBP
:
196 case DTRACE_INVOP_LEAVE
:
198 * Emulate first micro-op of patched leave: mov %rbp,%rsp
199 * fp points just below the return address slot for target's ret
200 * and at the slot holding the frame pointer saved by the target's prologue.
202 fp
= saved_state
->rbp
;
203 /* Emulate second micro-op of patched leave: patched pop %rbp
204 * savearea rbp is set for the frame of the caller to target
205 * The *live* %rsp will be adjusted below for pop increment(s)
207 saved_state
->rbp
= *(uint64_t *)fp
;
208 /* Skip over the patched leave */
209 saved_state
->isf
.rip
+= DTRACE_INVOP_LEAVE_SKIP
;
211 * Lift the stack to account for the emulated leave
212 * Account for words local in this frame
213 * (in "case DTRACE_INVOP_POPL_EBP:" this is zero.)
215 delta
= ((uint32_t *)fp
) - ((uint32_t *)rsp_probe
); /* delta is a *word* increment */
/*
 * delta counts 32-bit words, so the shifts by 2 below convert it to bytes.
 * NOTE(review): the statement that the next comment introduces is missing
 * from this listing; presumably it adjusts delta for the popped rbp.
 */
216 /* Account for popping off the rbp (just accomplished by the emulation
220 saved_state
->isf
.rsp
+= (delta
<< 2);
221 /* Obtain the stack pointer recorded by the trampolines */
/*
 * NOTE(review): the assignment of old_sp and the loop step are not visible;
 * old_sp is presumably read through lo_spp, which is written back below.
 * The loop copies each word from delta words lower, from fp down to old_sp.
 */
223 /* Shift contents of stack */
224 for (pDst
= (uint32_t *)fp
;
225 pDst
> (((uint32_t *)old_sp
));
227 *pDst
= pDst
[-delta
];
229 /* Track the stack lift in "saved_state". */
230 saved_state
= (x86_saved_state64_t
*) (((uintptr_t)saved_state
) + (delta
<< 2));
231 /* Adjust the stack pointer utilized by the trampolines */
232 *lo_spp
= old_sp
+ (delta
<< 2);
234 retval
= KERN_SUCCESS
;
/* Presumably the default case: any other code is reported as a failure. */
238 retval
= KERN_FAILURE
;
242 /* Trick trap_from_kernel into not attempting to handle pending AST_URGENT */
243 saved_state
->isf
.trapno
= T_PREEMPT
;
/* Restore the interrupt level saved before the probe was processed. */
245 ml_set_interrupts_enabled(oldlevel
);
/*
 * fbt_provide_probe: scan one symbol's machine code and register FBT probes.
 *
 * ctl         - module control block; stored in each probe as fbtp_ctl and
 *               read for mod_loadcnt.
 * instrLow    - lower bound checked against the scanned instruction address.
 * instrHigh   - upper bound of the scan; also becomes the scan limit.
 * modname     - module name passed to dtrace_probe_lookup and
 *               dtrace_probe_create.
 * symbolName  - function name for the probe; copied into fbtp_name.
 * symbolStart - address of the symbol's first instruction.
 *
 * An entry probe is recorded at a prologue frame-pointer move. Return
 * probes are recorded at each one-byte pop %rbp or leave that is followed
 * by a ret or jmp. New probes are linked into fbt_probetab by patch point.
 * NOTE(review): the return type, braces, labels, goto and continue lines and
 * several declarations are missing from this listing, so comments below only
 * describe the statements that are visible.
 */
252 fbt_provide_probe(struct modctl
*ctl
, uintptr_t instrLow
, uintptr_t instrHigh
, char *modname
, char* symbolName
, machine_inst_t
* symbolStart
)
/* doenable gates the fbt_enable calls below; where it is set is not visible. */
255 unsigned int doenable
= 0;
258 fbt_probe_t
*newfbt
, *retfbt
, *entryfbt
;
259 machine_inst_t
*instr
, *limit
, theInstr
, i1
, i2
, i3
;
263 * Guard against null symbols
265 if (!symbolStart
|| !instrLow
|| !instrHigh
) {
266 kprintf("dtrace: %s has an invalid address\n", symbolName
);
/*
 * Examine at most four instructions from symbolStart, staying inside
 * [instrLow, instrHigh), looking for push %rbp or a ret form.
 * The loop step and body lines are only partly visible.
 */
270 for (j
= 0, instr
= symbolStart
, theInstr
= 0;
271 (j
< 4) && ((uintptr_t)instr
>= instrLow
) && (instrHigh
> (uintptr_t)(instr
+ 2));
274 if (theInstr
== FBT_PUSH_RBP
|| theInstr
== FBT_RET
|| theInstr
== FBT_RET_IMM16
)
/* A non-positive size means the instruction could not be decoded. */
277 if ((size
= dtrace_instr_size(instr
)) <= 0)
/* The action taken when no push %rbp was found is not visible here. */
283 if (theInstr
!= FBT_PUSH_RBP
)
290 limit
= (machine_inst_t
*)instrHigh
;
/* Look for the three-byte mov %rsp,%rbp right after the push. */
292 if (i1
== FBT_REX_RSP_RBP
&& i2
== FBT_MOV_RSP_RBP0
&& i3
== FBT_MOV_RSP_RBP1
) {
293 instr
+= 1; /* Advance to the mov %rsp,%rbp */
301 * Sometimes, the compiler will schedule an intervening instruction
302 * in the function prologue. Example:
305 * 000006d8 pushl %ebp
306 * 000006d9 movl $0x00000004,%edx
307 * 000006de movl %esp,%ebp
309 * Try the next instruction, to see if it is a movl %esp,%ebp
312 instr
+= 1; /* Advance past the pushl %ebp */
313 if ((size
= dtrace_instr_size(instr
)) <= 0)
318 if ((instr
+ 1) >= limit
)
/* Neither two-byte movl %esp,%ebp encoding matched. */
324 if (!(i1
== FBT_MOVL_ESP_EBP0_V0
&& i2
== FBT_MOVL_ESP_EBP1_V0
) &&
325 !(i1
== FBT_MOVL_ESP_EBP0_V1
&& i2
== FBT_MOVL_ESP_EBP1_V1
))
328 /* instr already points at the movl %esp,%ebp */
/* Look for an existing entry probe with this module and symbol name. */
332 thisid
= dtrace_probe_lookup(fbt_id
, modname
, symbolName
, FBT_ENTRY
);
/* Zero-filled allocation: fields not assigned below, such as fbtp_roffset, stay 0. */
333 newfbt
= kmem_zalloc(sizeof (fbt_probe_t
), KM_SLEEP
);
334 strlcpy( (char *)&(newfbt
->fbtp_name
), symbolName
, MAX_FBTP_NAME_CHARS
);
338 * The dtrace_probe previously existed, so we have to hook
339 * the newfbt entry onto the end of the existing fbt's chain.
340 * If we find an fbt entry that was previously patched to
341 * fire, (as indicated by the current patched value), then
342 * we want to enable this newfbt on the spot.
344 entryfbt
= dtrace_probe_arg (fbt_id
, thisid
);
345 ASSERT (entryfbt
!= NULL
);
/* Walk the fbtp_next chain; the new probe is appended at its tail and shares the id. */
346 for(; entryfbt
!= NULL
; entryfbt
= entryfbt
->fbtp_next
) {
347 if (entryfbt
->fbtp_currentval
== entryfbt
->fbtp_patchval
)
350 if (entryfbt
->fbtp_next
== NULL
) {
351 entryfbt
->fbtp_next
= newfbt
;
352 newfbt
->fbtp_id
= entryfbt
->fbtp_id
;
359 * The dtrace_probe did not previously exist, so we
360 * create it and hook in the newfbt. Since the probe is
361 * new, we obviously do not need to enable it on the spot.
363 newfbt
->fbtp_id
= dtrace_probe_create(fbt_id
, modname
, symbolName
, FBT_ENTRY
, FBT_AFRAMES_ENTRY
, newfbt
);
/*
 * Fill in the entry probe: patch point, owning module, the emulation code
 * fbt_invop will return, and the saved and patch bytes. Then push the probe
 * onto the head of its fbt_probetab hash bucket.
 */
367 newfbt
->fbtp_patchpoint
= instr
;
368 newfbt
->fbtp_ctl
= ctl
;
369 newfbt
->fbtp_loadcnt
= ctl
->mod_loadcnt
;
370 newfbt
->fbtp_rval
= DTRACE_INVOP_MOV_RSP_RBP
;
371 newfbt
->fbtp_savedval
= theInstr
;
372 newfbt
->fbtp_patchval
= FBT_PATCHVAL
;
373 newfbt
->fbtp_currentval
= 0;
374 newfbt
->fbtp_hashnext
= fbt_probetab
[FBT_ADDR2NDX(instr
)];
375 fbt_probetab
[FBT_ADDR2NDX(instr
)] = newfbt
;
/* Presumably guarded by doenable (the condition line is not visible). */
378 fbt_enable(NULL
, newfbt
->fbtp_id
, newfbt
);
381 * The fbt entry chain is in place, one entry point per symbol.
382 * The fbt return chain can have multiple return points per symbol.
383 * Here we find the end of the fbt return chain.
/* Look for an existing return probe with this module and symbol name. */
388 thisid
= dtrace_probe_lookup(fbt_id
, modname
, symbolName
, FBT_RETURN
);
390 /* The dtrace_probe previously existed, so we have to
391 * find the end of the existing fbt chain. If we find
392 * an fbt return that was previously patched to fire,
393 * (as indicated by the current patched value), then
394 * we want to enable any new fbts on the spot.
396 retfbt
= dtrace_probe_arg (fbt_id
, thisid
);
397 ASSERT(retfbt
!= NULL
);
398 for (; retfbt
!= NULL
; retfbt
= retfbt
->fbtp_next
) {
399 if (retfbt
->fbtp_currentval
== retfbt
->fbtp_patchval
)
401 if(retfbt
->fbtp_next
== NULL
)
415 * If this disassembly fails, then we've likely walked off into
416 * a jump table or some other unsuitable area. Bail out of the
419 if ((size
= dtrace_instr_size(instr
)) <= 0)
423 * We (desperately) want to avoid erroneously instrumenting a
424 * jump table, especially given that our markers are pretty
425 * short: two bytes on x86, and just one byte on amd64. To
426 * determine if we're looking at a true instruction sequence
427 * or an inline jump table that happens to contain the same
428 * byte sequences, we resort to some heuristic sleaze: we
429 * treat this instruction as being contained within a pointer,
430 * and see if that pointer points to within the body of the
431 * function. If it does, we refuse to instrument it.
433 for (j
= 0; j
< sizeof (uintptr_t); j
++) {
434 uintptr_t check
= (uintptr_t)instr
- j
;
437 if (check
< (uintptr_t)symbolStart
)
440 if (check
+ sizeof (uintptr_t) > (uintptr_t)limit
)
443 ptr
= *(uint8_t **)check
;
445 if (ptr
>= (uint8_t *)symbolStart
&& ptr
< limit
) {
452 * OK, it's an instruction.
456 /* Walked onto the start of the next routine? If so, bail out of this function. */
457 if (theInstr
== FBT_PUSH_RBP
)
/* Only a one-byte pop %rbp or leave can become a return patch point. */
460 if (!(size
== 1 && (theInstr
== FBT_POP_RBP
|| theInstr
== FBT_LEAVE
))) {
466 * Found the pop %rbp; or leave.
/* Remember the candidate; instr moves on to the instruction that follows it. */
468 machine_inst_t
*patch_instr
= instr
;
471 * Scan forward for a "ret", or "jmp".
477 size
= dtrace_instr_size(instr
);
478 if (size
<= 0) /* Failed instruction decode? */
/* Opcode and decoded length must both match one of the ret or jmp forms. */
483 if (!(size
== FBT_RET_LEN
&& (theInstr
== FBT_RET
)) &&
484 !(size
== FBT_RET_IMM16_LEN
&& (theInstr
== FBT_RET_IMM16
)) &&
485 !(size
== FBT_JMP_SHORT_REL_LEN
&& (theInstr
== FBT_JMP_SHORT_REL
)) &&
486 !(size
== FBT_JMP_NEAR_REL_LEN
&& (theInstr
== FBT_JMP_NEAR_REL
)) &&
487 !(size
== FBT_JMP_FAR_ABS_LEN
&& (theInstr
== FBT_JMP_FAR_ABS
)))
491 * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner!
493 newfbt
= kmem_zalloc(sizeof (fbt_probe_t
), KM_SLEEP
);
494 strlcpy( (char *)&(newfbt
->fbtp_name
), symbolName
, MAX_FBTP_NAME_CHARS
);
/* No return chain yet: create the probe. Otherwise append and share the id. */
496 if (retfbt
== NULL
) {
497 newfbt
->fbtp_id
= dtrace_probe_create(fbt_id
, modname
,
498 symbolName
, FBT_RETURN
, FBT_AFRAMES_RETURN
, newfbt
);
500 retfbt
->fbtp_next
= newfbt
;
501 newfbt
->fbtp_id
= retfbt
->fbtp_id
;
505 newfbt
->fbtp_patchpoint
= patch_instr
;
506 newfbt
->fbtp_ctl
= ctl
;
507 newfbt
->fbtp_loadcnt
= ctl
->mod_loadcnt
;
/* Pick the emulation code from the byte actually found at the patch point. */
509 if (*patch_instr
== FBT_POP_RBP
) {
510 newfbt
->fbtp_rval
= DTRACE_INVOP_POP_RBP
;
512 ASSERT(*patch_instr
== FBT_LEAVE
);
513 newfbt
->fbtp_rval
= DTRACE_INVOP_LEAVE
;
/*
 * Byte offset of the patch point from the start of the symbol. fbt_invop
 * treats a zero fbtp_roffset as the entry-probe case and passes a non-zero
 * one to dtrace_probe as the first argument.
 */
515 newfbt
->fbtp_roffset
=
516 (uintptr_t)(patch_instr
- (uint8_t *)symbolStart
);
/* Save the original byte, record the patch byte, and link into the hash bucket. */
518 newfbt
->fbtp_savedval
= *patch_instr
;
519 newfbt
->fbtp_patchval
= FBT_PATCHVAL
;
520 newfbt
->fbtp_hashnext
= fbt_probetab
[FBT_ADDR2NDX(patch_instr
)];
521 fbt_probetab
[FBT_ADDR2NDX(patch_instr
)] = newfbt
;
/* Presumably guarded by doenable (the condition line is not visible). */
524 fbt_enable(NULL
, newfbt
->fbtp_id
, newfbt
);
531 fbt_provide_module_kernel_syms(struct modctl
*ctl
)
533 kernel_mach_header_t
*mh
;
534 struct load_command
*cmd
;
535 kernel_segment_command_t
*orig_ts
= NULL
, *orig_le
= NULL
;
536 struct symtab_command
*orig_st
= NULL
;
537 kernel_nlist_t
*sym
= NULL
;
539 uintptr_t instrLow
, instrHigh
;
543 mh
= (kernel_mach_header_t
*)(ctl
->mod_address
);
544 modname
= ctl
->mod_modname
;
546 if (mh
->magic
!= MH_MAGIC_KERNEL
)
549 cmd
= (struct load_command
*) &mh
[1];
550 for (i
= 0; i
< mh
->ncmds
; i
++) {
551 if (cmd
->cmd
== LC_SEGMENT_KERNEL
) {
552 kernel_segment_command_t
*orig_sg
= (kernel_segment_command_t
*) cmd
;
554 if (LIT_STRNEQL(orig_sg
->segname
, SEG_TEXT
))
556 else if (LIT_STRNEQL(orig_sg
->segname
, SEG_LINKEDIT
))
558 else if (LIT_STRNEQL(orig_sg
->segname
, ""))
559 orig_ts
= orig_sg
; /* kexts have a single unnamed segment */
561 else if (cmd
->cmd
== LC_SYMTAB
)
562 orig_st
= (struct symtab_command
*) cmd
;
564 cmd
= (struct load_command
*) ((caddr_t
) cmd
+ cmd
->cmdsize
);
567 if ((orig_ts
== NULL
) || (orig_st
== NULL
) || (orig_le
== NULL
))
570 sym
= (kernel_nlist_t
*)(orig_le
->vmaddr
+ orig_st
->symoff
- orig_le
->fileoff
);
571 strings
= (char *)(orig_le
->vmaddr
+ orig_st
->stroff
- orig_le
->fileoff
);
573 /* Find extent of the TEXT section */
574 instrLow
= (uintptr_t)orig_ts
->vmaddr
;
575 instrHigh
= (uintptr_t)(orig_ts
->vmaddr
+ orig_ts
->vmsize
);
577 for (i
= 0; i
< orig_st
->nsyms
; i
++) {
578 uint8_t n_type
= sym
[i
].n_type
& (N_TYPE
| N_EXT
);
579 char *name
= strings
+ sym
[i
].n_un
.n_strx
;
581 /* Check that the symbol is a global and that it has a name. */
582 if (((N_SECT
| N_EXT
) != n_type
&& (N_ABS
| N_EXT
) != n_type
))
585 if (0 == sym
[i
].n_un
.n_strx
) /* iff a null, "", name. */
588 /* Lop off omnipresent leading underscore. */
593 * We're only blacklisting functions in the kernel for now.
595 if (MOD_IS_MACH_KERNEL(ctl
) && fbt_excluded(name
))
598 fbt_provide_probe(ctl
, instrLow
, instrHigh
, modname
, name
, (machine_inst_t
*)sym
[i
].n_value
);