4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License"). You may not use this file except in compliance
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 /* #pragma ident "@(#)fbt.c 1.15 05/09/19 SMI" */
31 #define _KERNEL /* Solaris vs. Darwin */
35 #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from mach/ppc/thread_status.h */
36 #include <kern/thread.h>
37 #include <mach/thread_status.h>
38 #include <mach/vm_param.h>
39 #include <mach-o/loader.h>
40 #include <mach-o/nlist.h>
41 #include <libkern/kernel_mach_header.h>
42 #include <libkern/OSAtomic.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/errno.h>
48 #include <sys/ioctl.h>
50 #include <sys/fcntl.h>
51 #include <miscfs/devfs/devfs.h>
53 #include <sys/dtrace.h>
54 #include <sys/dtrace_impl.h>
57 #include <sys/dtrace_glue.h>
59 #include <san/kasan.h>
61 #define DTRACE_INVOP_NOP_SKIP 1
62 #define DTRACE_INVOP_MOVL_ESP_EBP 10
63 #define DTRACE_INVOP_MOVL_ESP_EBP_SKIP 2
64 #define DTRACE_INVOP_MOV_RSP_RBP 11
65 #define DTRACE_INVOP_MOV_RSP_RBP_SKIP 3
66 #define DTRACE_INVOP_POP_RBP 12
67 #define DTRACE_INVOP_POP_RBP_SKIP 1
68 #define DTRACE_INVOP_LEAVE_SKIP 1
70 #define FBT_PUSHL_EBP 0x55
71 #define FBT_MOVL_ESP_EBP0_V0 0x8b
72 #define FBT_MOVL_ESP_EBP1_V0 0xec
73 #define FBT_MOVL_ESP_EBP0_V1 0x89
74 #define FBT_MOVL_ESP_EBP1_V1 0xe5
76 #define FBT_PUSH_RBP 0x55
77 #define FBT_REX_RSP_RBP 0x48
78 #define FBT_MOV_RSP_RBP0 0x89
79 #define FBT_MOV_RSP_RBP1 0xe5
80 #define FBT_POP_RBP 0x5d
82 #define FBT_POPL_EBP 0x5d
84 #define FBT_RET_IMM16 0xc2
85 #define FBT_LEAVE 0xc9
86 #define FBT_JMP_SHORT_REL 0xeb /* Jump short, relative, displacement relative to next instr. */
87 #define FBT_JMP_NEAR_REL 0xe9 /* Jump near, relative, displacement relative to next instr. */
88 #define FBT_JMP_FAR_ABS 0xea /* Jump far, absolute, address given in operand */
90 #define FBT_RET_IMM16_LEN 3
91 #define FBT_JMP_SHORT_REL_LEN 2
92 #define FBT_JMP_NEAR_REL_LEN 5
93 #define FBT_JMP_FAR_ABS_LEN 5
95 #define FBT_PATCHVAL 0xf0
96 #define FBT_AFRAMES_ENTRY 7
97 #define FBT_AFRAMES_RETURN 6
99 #define FBT_ENTRY "entry"
100 #define FBT_RETURN "return"
101 #define FBT_ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
103 extern dtrace_provider_id_t fbt_id
;
104 extern fbt_probe_t
**fbt_probetab
;
105 extern int fbt_probetab_mask
;
107 kern_return_t
fbt_perfCallback(int, x86_saved_state_t
*, uintptr_t *, __unused
int);
/*
 * fbt_invop -- DTrace invalid-opcode handler for fbt/sdt probe points.
 *
 * Reached (via dtrace_invop()) when the CPU faults on the FBT_PATCHVAL
 * byte (0xf0) that fbt planted over a probe point.  Looks the faulting
 * address up in fbt_probetab; on a hit, fires the probe and returns the
 * DTRACE_INVOP_* code telling the trap handler which original
 * instruction to emulate.
 *
 *   addr  -- faulting instruction address.
 *   state -- the interrupted thread's x86_saved_state64_t, passed as an
 *            opaque uintptr_t * and cast back below.
 *   rval  -- %rax at the fault, used as arg1 of return probes.
 *
 * NOTE(review): this extract is missing the function's return-type line,
 * the opening/closing braces, the "else" introducing the return-probe
 * arm, and the fall-through "return (0)" for a miss -- confirm the exact
 * control flow against the complete original before relying on it.
 */
fbt_invop(uintptr_t addr, uintptr_t *state, uintptr_t rval)
	/* Hash the faulting address and walk that bucket's collision chain. */
	fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)];

	for (; fbt != NULL; fbt = fbt->fbtp_hashnext) {
		if ((uintptr_t)fbt->fbtp_patchpoint == addr) {
			if (fbt->fbtp_roffset == 0) {
				/* roffset == 0 identifies an entry probe. */
				x86_saved_state64_t *regs = (x86_saved_state64_t *)state;

				/*
				 * Record the probed function's caller so ustack/stack
				 * walkers see it; at the patched push %rbp the return
				 * address sits one word above the interrupt-time %rsp.
				 */
				CPU->cpu_dtrace_caller = *(uintptr_t *)(((uintptr_t)(regs->isf.rsp)) + sizeof(uint64_t)); // 8(%rsp)
				/* 64-bit ABI, arguments passed in registers. */
				dtrace_probe(fbt->fbtp_id, regs->rdi, regs->rsi, regs->rdx, regs->rcx, regs->r8);
				CPU->cpu_dtrace_caller = 0;

				/*
				 * NOTE(review): the original's "} else {" (return-probe
				 * arm) is missing from this extract; the two calls below
				 * belong to that arm.  arg0 is the offset of the return
				 * site within the function, arg1 the returned %rax.
				 */
				dtrace_probe(fbt->fbtp_id, fbt->fbtp_roffset, rval, 0, 0, 0);
				CPU->cpu_dtrace_caller = 0;

			/* Tell the trap handler which instruction to emulate. */
			return (fbt->fbtp_rval);
137 #define IS_USER_TRAP(regs) (regs && (((regs)->isf.cs & 3) != 0))
138 #define T_INVALID_OPCODE 6
139 #define FBT_EXCEPTION_CODE T_INVALID_OPCODE
140 #define T_PREEMPT 255
/*
 * fbt_perfCallback -- kernel trap callback for DTrace probe faults.
 *
 * Per the prototype earlier in this file the full signature is
 * presumably:
 *   kern_return_t fbt_perfCallback(int trapno, x86_saved_state_t *tagged_regs,
 *                                  uintptr_t *lo_spp, __unused int unused2)
 * Called from the invalid-opcode trap path; if the fault was one of ours,
 * fires the probe via dtrace_invop() and emulates the instruction that
 * FBT_PATCHVAL overwrote.  Returns KERN_SUCCESS when handled.
 *
 * NOTE(review): the first lines of the definition (return type, name,
 * leading parameters, opening brace) are missing from this extract, as
 * are the __asm__ wrappers, the "switch (emul)" header, break statements
 * and several closing braces flagged below.
 */
		x86_saved_state_t *tagged_regs,
		__unused int unused2)
	kern_return_t retval = KERN_FAILURE;
	x86_saved_state64_t *saved_state = saved_state64(tagged_regs);

	/* Only handle kernel-mode invalid-opcode traps; everything else is not ours. */
	if (FBT_EXCEPTION_CODE == trapno && !IS_USER_TRAP(saved_state)) {
		uint64_t rsp_probe, fp, delta = 0;

		/* Probe context must not be interrupted (declaration of oldlevel not visible here). */
		oldlevel = ml_set_interrupts_enabled(FALSE);

		/* Calculate where the stack pointer was when the probe instruction "fired." */
		rsp_probe = saved_state->isf.rsp;	/* Easy, x86_64 establishes this value in idt64.s */

		/*
		 * NOTE(review): the string fragments below are operands of an
		 * __asm__ volatile block (not visible in this extract) that
		 * exports the address just before the dtrace_invop() call, so
		 * stack walkers can recognize this frame.
		 */
		"Ldtrace_invop_callsite_pre_label:\n"
		".private_extern _dtrace_invop_callsite_pre\n"
		"_dtrace_invop_callsite_pre:\n"
		" .quad Ldtrace_invop_callsite_pre_label\n"

		/* Ask the providers which original instruction must be emulated. */
		emul = dtrace_invop( saved_state->isf.rip, (uintptr_t *)saved_state, saved_state->rax );

		/* Companion label exported just after the call (same NOTE as above). */
		"Ldtrace_invop_callsite_post_label:\n"
		".private_extern _dtrace_invop_callsite_post\n"
		"_dtrace_invop_callsite_post:\n"
		" .quad Ldtrace_invop_callsite_post_label\n"

		/* NOTE(review): a "switch (emul)" header is missing from this extract. */
		case DTRACE_INVOP_NOP:
			saved_state->isf.rip += DTRACE_INVOP_NOP_SKIP;	/* Skip over the patched NOP (planted by sdt). */
			retval = KERN_SUCCESS;

		case DTRACE_INVOP_MOV_RSP_RBP:
			saved_state->rbp = rsp_probe;		/* Emulate patched mov %rsp,%rbp */
			saved_state->isf.rip += DTRACE_INVOP_MOV_RSP_RBP_SKIP;	/* Skip over the bytes of the patched mov %rsp,%rbp */
			retval = KERN_SUCCESS;

		case DTRACE_INVOP_POP_RBP:
		case DTRACE_INVOP_LEAVE:
			/*
			 * Emulate first micro-op of patched leave: mov %rbp,%rsp
			 * fp points just below the return address slot for target's ret
			 * and at the slot holding the frame pointer saved by the target's prologue.
			 */
			fp = saved_state->rbp;
			/* Emulate second micro-op of patched leave: patched pop %rbp
			 * savearea rbp is set for the frame of the caller to target
			 * The *live* %rsp will be adjusted below for pop increment(s)
			 */
			saved_state->rbp = *(uint64_t *)fp;
			/* Skip over the patched leave */
			saved_state->isf.rip += DTRACE_INVOP_LEAVE_SKIP;
			/*
			 * Lift the stack to account for the emulated leave
			 * Account for words local in this frame
			 * (in "case DTRACE_INVOP_POPL_EBP:" this is zero.)
			 */
			delta = ((uint32_t *)fp) - ((uint32_t *)rsp_probe); /* delta is a *word* increment */
			/* Account for popping off the rbp (just accomplished by the emulation
			 * above) -- NOTE(review): the remainder of this comment is missing.
			 */
			saved_state->isf.rsp += (delta << 2);

			/* Obtain the stack pointer recorded by the trampolines
			 * (old_sp / pDst declarations are not visible in this extract). */
			/* Shift contents of stack */
			for (pDst = (uint32_t *)fp;
			    pDst > (((uint32_t *)old_sp));
			    /* NOTE(review): loop decrement clause missing here. */)
				*pDst = pDst[-delta];

			/*
			 * The above has moved stack objects so they are no longer in sync
			 * with the KASan shadow; re-validate the lifted region.
			 */
			uintptr_t base = (uintptr_t)((uint32_t *)old_sp - delta);
			uintptr_t size = (uintptr_t)fp - base;
			if (base >= VM_MIN_KERNEL_AND_KEXT_ADDRESS) {
				kasan_unpoison_stack(base, size);

			/* Track the stack lift in "saved_state". */
			saved_state = (x86_saved_state64_t *) (((uintptr_t)saved_state) + (delta << 2));
			/* Adjust the stack pointer utilized by the trampolines */
			*lo_spp = old_sp + (delta << 2);

			retval = KERN_SUCCESS;

			/* NOTE(review): presumably the default: arm of the switch. */
			retval = KERN_FAILURE;

		/* Trick trap_from_kernel into not attempting to handle pending AST_URGENT */
		saved_state->isf.trapno = T_PREEMPT;

		ml_set_interrupts_enabled(oldlevel);
/*
 * fbt_provide_probe -- scan one kernel/kext function and create fbt probes.
 *
 * Walks the instruction stream of the symbol [symbolStart, instrHigh):
 * the entry probe is planted on the prologue's "mov %rsp,%rbp" (after
 * "push %rbp"), and a return probe is planted on each epilogue
 * "pop %rbp"/"leave" that is immediately followed by a ret or jmp
 * (tail call).  Each probe point is recorded in fbt_probetab keyed by
 * its address; the byte at the patch point will later be replaced by
 * FBT_PATCHVAL.
 *
 *   ctl         -- module control block the symbol belongs to.
 *   instrLow    -- lowest valid text address for this module.
 *   instrHigh   -- one past the highest valid text address.
 *   modname     -- module name for dtrace_probe_create().
 *   symbolName  -- function name (probe "func" component).
 *   symbolStart -- first instruction of the function.
 *
 * NOTE(review): this extract is missing the return-type line, opening
 * brace, several loop increments/bodies, `else` arms, `return`/`continue`
 * statements and closing braces; gaps are flagged inline below.
 */
fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart)
	unsigned int doenable = 0;
	fbt_probe_t *newfbt, *retfbt, *entryfbt;
	machine_inst_t *instr, *limit, theInstr, i1, i2, i3;

	/*
	 * Guard against null symbols
	 */
	if (!symbolStart || !instrLow || !instrHigh) {
		kprintf("dtrace: %s has an invalid address\n", symbolName);

	/*
	 * Look for the canonical "push %rbp" within the first few
	 * instructions of the function (NOTE(review): the loop increment
	 * and body are missing from this extract).
	 */
	for (j = 0, instr = symbolStart, theInstr = 0;
	    (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2));
		/* Stop scanning at a prologue push or an immediate return. */
		if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16)
		/* Undecodable instruction: give up on this symbol. */
		if ((size = dtrace_instr_size(instr)) <= 0)

	/* No "push %rbp" prologue found -- nothing to instrument. */
	if (theInstr != FBT_PUSH_RBP)

	limit = (machine_inst_t *)instrHigh;

	/* i1..i3 presumably hold the next three bytes after the push (loads not visible here). */
	if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) {
		instr += 1; /* Advance to the mov %rsp,%rbp */
		/*
		 * Sometimes, the compiler will schedule an intervening instruction
		 * in the function prologue. Example:
		 *
		 * 000006d8 pushl %ebp
		 * 000006d9 movl $0x00000004,%edx
		 * 000006de movl %esp,%ebp
		 *
		 * Try the next instruction, to see if it is a movl %esp,%ebp
		 */
		instr += 1; /* Advance past the pushl %ebp */
		if ((size = dtrace_instr_size(instr)) <= 0)
		if ((instr + 1) >= limit)
		if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) &&
		    !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1))
		/* instr already points at the movl %esp,%ebp */

	/* --- Entry probe creation --- */
	thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY);
	newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
	strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );

	/*
	 * The dtrace_probe previously existed, so we have to hook
	 * the newfbt entry onto the end of the existing fbt's chain.
	 * If we find an fbt entry that was previously patched to
	 * fire, (as indicated by the current patched value), then
	 * we want to enable this newfbt on the spot.
	 */
	entryfbt = dtrace_probe_arg (fbt_id, thisid);
	ASSERT (entryfbt != NULL);
	for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) {
		if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval)
		if (entryfbt->fbtp_next == NULL) {
			entryfbt->fbtp_next = newfbt;
			newfbt->fbtp_id = entryfbt->fbtp_id;
	/*
	 * The dtrace_probe did not previously exist, so we
	 * create it and hook in the newfbt. Since the probe is
	 * new, we obviously do not need to enable it on the spot.
	 */
	newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt);

	/* Common entry-probe bookkeeping: record where and what to patch. */
	newfbt->fbtp_patchpoint = instr;
	newfbt->fbtp_ctl = ctl;
	newfbt->fbtp_loadcnt = ctl->mod_loadcnt;
	newfbt->fbtp_rval = DTRACE_INVOP_MOV_RSP_RBP;
	newfbt->fbtp_savedval = theInstr;
	newfbt->fbtp_patchval = FBT_PATCHVAL;
	newfbt->fbtp_currentval = 0;
	newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
	fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt;

	/* Presumably guarded by doenable in the original. */
	fbt_enable(NULL, newfbt->fbtp_id, newfbt);

	/*
	 * The fbt entry chain is in place, one entry point per symbol.
	 * The fbt return chain can have multiple return points per symbol.
	 * Here we find the end of the fbt return chain.
	 */
	thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);

	/* The dtrace_probe previously existed, so we have to
	 * find the end of the existing fbt chain. If we find
	 * an fbt return that was previously patched to fire,
	 * (as indicated by the current patched value), then
	 * we want to enable any new fbts on the spot.
	 */
	retfbt = dtrace_probe_arg (fbt_id, thisid);
	ASSERT(retfbt != NULL);
	for (; retfbt != NULL; retfbt = retfbt->fbtp_next) {
		if (retfbt->fbtp_currentval == retfbt->fbtp_patchval)
		if(retfbt->fbtp_next == NULL)

	/*
	 * If this disassembly fails, then we've likely walked off into
	 * a jump table or some other unsuitable area. Bail out of the
	 * disassembly now.
	 */
	if ((size = dtrace_instr_size(instr)) <= 0)

	/*
	 * We (desperately) want to avoid erroneously instrumenting a
	 * jump table, especially given that our markers are pretty
	 * short: two bytes on x86, and just one byte on amd64. To
	 * determine if we're looking at a true instruction sequence
	 * or an inline jump table that happens to contain the same
	 * byte sequences, we resort to some heuristic sleeze: we
	 * treat this instruction as being contained within a pointer,
	 * and see if that pointer points to within the body of the
	 * function. If it does, we refuse to instrument it.
	 */
	for (j = 0; j < sizeof (uintptr_t); j++) {
		uintptr_t check = (uintptr_t)instr - j;

		/* Candidate pointer would start before the function: stop. */
		if (check < (uintptr_t)symbolStart)
		/* Candidate pointer would read past the text limit: stop. */
		if (check + sizeof (uintptr_t) > (uintptr_t)limit)

		ptr = *(uint8_t **)check;
		/* Looks like an inline jump-table entry: skip this site. */
		if (ptr >= (uint8_t *)symbolStart && ptr < limit) {

	/*
	 * OK, it's an instruction.
	 */

	/* Walked onto the start of the next routine? If so, bail out of this function. */
	if (theInstr == FBT_PUSH_RBP)

	/* Only a one-byte pop %rbp or leave can start an epilogue we handle. */
	if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) {

	/*
	 * Found the pop %rbp; or leave.
	 */
	machine_inst_t *patch_instr = instr;

	/*
	 * Scan forward for a "ret", or "jmp".
	 */
	size = dtrace_instr_size(instr);
	if (size <= 0) /* Failed instruction decode? */

	/* The epilogue instruction must be followed by a return or a tail-call jump. */
	if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) &&
	    !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) &&
	    !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) &&
	    !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) &&
	    !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS)))

	/*
	 * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner!
	 */
	newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
	strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );

	if (retfbt == NULL) {
		/* First return probe for this symbol: create the dtrace probe. */
		newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
		    symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt);
	/* NOTE(review): "} else {" missing here -- chain onto the existing return probe. */
	retfbt->fbtp_next = newfbt;
	newfbt->fbtp_id = retfbt->fbtp_id;

	/* Common return-probe bookkeeping. */
	newfbt->fbtp_patchpoint = patch_instr;
	newfbt->fbtp_ctl = ctl;
	newfbt->fbtp_loadcnt = ctl->mod_loadcnt;

	if (*patch_instr == FBT_POP_RBP) {
		newfbt->fbtp_rval = DTRACE_INVOP_POP_RBP;
	/* NOTE(review): "} else {" missing here. */
	ASSERT(*patch_instr == FBT_LEAVE);
	newfbt->fbtp_rval = DTRACE_INVOP_LEAVE;

	/* Offset of the return site within the function (arg0 of the return probe). */
	newfbt->fbtp_roffset =
	    (uintptr_t)(patch_instr - (uint8_t *)symbolStart);

	newfbt->fbtp_savedval = *patch_instr;
	newfbt->fbtp_patchval = FBT_PATCHVAL;
	newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)];
	fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt;

	/* Presumably guarded by doenable in the original. */
	fbt_enable(NULL, newfbt->fbtp_id, newfbt);
/*
 * fbt_provide_module_kernel_syms -- provide fbt probes for one loaded image.
 *
 * Parses the in-memory Mach-O header of the kernel/kext described by ctl:
 * locates the __TEXT and __LINKEDIT segments and the LC_SYMTAB command,
 * then walks the nlist symbol table and calls fbt_provide_probe() for each
 * global, named symbol that is not excluded by the blacklist.
 *
 *   ctl -- module control block; mod_address points at the Mach-O header.
 *
 * NOTE(review): this extract is missing the return-type line, opening
 * brace, declarations of `i` and `strings`, the SEG_TEXT/SEG_LINKEDIT
 * assignment statements, `return`/`continue` statements and several
 * closing braces; gaps are flagged inline below.
 */
fbt_provide_module_kernel_syms(struct modctl *ctl)
	kernel_mach_header_t *mh;
	struct load_command *cmd;
	kernel_segment_command_t *orig_ts = NULL, *orig_le = NULL;
	struct symtab_command *orig_st = NULL;
	kernel_nlist_t *sym = NULL;
	uintptr_t instrLow, instrHigh;

	mh = (kernel_mach_header_t *)(ctl->mod_address);
	modname = ctl->mod_modname;

	/* Bail on anything that is not a kernel-format Mach-O image. */
	if (mh->magic != MH_MAGIC_KERNEL)

	/* Load commands immediately follow the Mach-O header. */
	cmd = (struct load_command *) &mh[1];
	for (i = 0; i < mh->ncmds; i++) {
		if (cmd->cmd == LC_SEGMENT_KERNEL) {
			kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;

			/* NOTE(review): the assignments to orig_ts / orig_le for the
			 * first two cases are missing from this extract. */
			if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
			else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
			else if (LIT_STRNEQL(orig_sg->segname, ""))
				orig_ts = orig_sg; /* kexts have a single unnamed segment */
		else if (cmd->cmd == LC_SYMTAB)
			orig_st = (struct symtab_command *) cmd;

		/* Advance to the next load command (cmdsize is per-command). */
		cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize);

	/* Without text, symtab and linkedit we cannot locate symbols. */
	if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))

	/* Translate symtab/string-table file offsets into linkedit VM addresses. */
	sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
	strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);

	/* Find extent of the TEXT section */
	instrLow = (uintptr_t)orig_ts->vmaddr;
	instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize);

	for (i = 0; i < orig_st->nsyms; i++) {
		uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT);
		char *name = strings + sym[i].n_un.n_strx;

		/* Check that the symbol is a global and that it has a name. */
		if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))

		if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */

		/* Lop off omnipresent leading underscore. */

		/*
		 * We're only blacklisting functions in the kernel for now.
		 */
		if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))

		/* Hand the symbol to the scanner to plant entry/return probes. */
		fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value);