/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
26 #include <kern/cpu_data.h>
27 #include <kern/thread.h>
28 #include <kern/assert.h>
29 #include <mach/thread_status.h>
31 #include <sys/param.h>
32 #include <sys/systm.h>
33 #include <sys/errno.h>
35 #include <sys/ioctl.h>
37 #include <sys/fcntl.h>
38 #include <miscfs/devfs/devfs.h>
40 #include <sys/dtrace.h>
41 #include <sys/dtrace_impl.h>
43 #include <sys/dtrace_glue.h>
45 #include <machine/pal_routines.h>
47 #if defined(__x86_64__)
48 extern x86_saved_state_t
*find_kern_regs(thread_t
);
49 #elif defined (__arm__) || defined(__arm64__)
50 extern struct arm_saved_state
*find_kern_regs(thread_t
);
52 #error Unknown architecture
56 #define ASSERT(x) do {} while(0)
58 extern void profile_init(void);
60 static dtrace_provider_id_t profile_id
;
/*
 * Regardless of platform, the stack frames look like this in the case of the
 * profile provider:
 *
 *	profile_fire
 *	cyclic_expire
 *	cyclic_fire
 *	[ interrupt code ]
 *
 * On x86, there are five frames from the generic interrupt code; further, the
 * interrupted instruction appears as its own stack frame, giving us a total of
 * 10.
 *
 * On SPARC, the picture is further complicated because the compiler
 * optimizes away tail-calls -- so the following frames are optimized away:
 *
 *	profile_fire
 *	cyclic_expire
 *
 * This gives three frames. However, on DEBUG kernels, the cyclic_expire
 * frame cannot be tail-call eliminated, yielding four frames in this case.
 *
 * All of the above constraints lead to the mess below. Yes, the profile
 * provider should ideally figure this out on-the-fly by hitting one of its own
 * probes and then walking its own stack trace. This is complicated, however,
 * and the static definition doesn't seem to be overly brittle. Still, we
 * allow for a manual override in case we get it completely wrong.
 */
/*
 * Number of artificial stack frames (interrupt glue + provider internals)
 * to skip when walking the stack from a profile probe; see comment above.
 * May be overridden at runtime via profile_aframes.
 */
#if defined(__x86_64__)
#define PROF_ARTIFICIAL_FRAMES  9
#elif defined(__arm__) || defined(__arm64__)
#define PROF_ARTIFICIAL_FRAMES  8
#else
#error Unknown architecture
#endif
#define PROF_NAMELEN            15      /* max probe-name length, incl. NUL */

/* Probe kinds: per-CPU profiling cyclic vs. single timer tick. */
#define PROF_PROFILE            0
#define PROF_TICK               1       /* restored: used by profile_create() et al. */

#define PROF_PREFIX_PROFILE     "profile-"
#define PROF_PREFIX_TICK        "tick-"
107 typedef struct profile_probe
{
108 char prof_name
[PROF_NAMELEN
];
111 hrtime_t prof_interval
;
112 cyclic_id_t prof_cyclic
;
115 typedef struct profile_probe_percpu
{
116 hrtime_t profc_expected
;
117 hrtime_t profc_interval
;
118 profile_probe_t
*profc_probe
;
119 } profile_probe_percpu_t
;
121 hrtime_t profile_interval_min
= NANOSEC
/ 5000; /* 5000 hz */
122 int profile_aframes
= 0; /* override */
/*
 * Default rates (in hz) and tick periods offered via profile_provide() when
 * no probe description is given.  Zero slots are placeholders that are
 * skipped; they leave room for tunable additions.
 */
static int profile_rates[] = {
	97, 199, 499, 997, 1999,
	4001, 4999, 0, 0, 0,
	0, 0, 0, 0, 0,
	0, 0, 0, 0, 0
};

static int profile_ticks[] = {
	1, 10, 100, 500, 1000,
	5000, 0, 0, 0, 0,
	0, 0, 0, 0, 0
};
138 * profile_max defines the upper bound on the number of profile probes that
139 * can exist (this is to prevent malicious or clumsy users from exhausing
140 * system resources by creating a slew of profile probes). At mod load time,
141 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
142 * present in the profile.conf file.
144 #define PROFILE_MAX_DEFAULT 1000 /* default max. number of probes */
145 static uint32_t profile_max
; /* maximum number of profile probes */
146 static uint32_t profile_total
; /* current number of profile probes */
149 profile_fire(void *arg
)
151 profile_probe_percpu_t
*pcpu
= arg
;
152 profile_probe_t
*prof
= pcpu
->profc_probe
;
155 late
= dtrace_gethrtime() - pcpu
->profc_expected
;
156 pcpu
->profc_expected
+= pcpu
->profc_interval
;
158 #if defined(__x86_64__)
159 x86_saved_state_t
*kern_regs
= find_kern_regs(current_thread());
161 if (NULL
!= kern_regs
) {
162 /* Kernel was interrupted. */
163 dtrace_probe(prof
->prof_id
, saved_state64(kern_regs
)->isf
.rip
, 0x0, late
, 0, 0);
165 pal_register_cache_state(current_thread(), VALID
);
166 /* Possibly a user interrupt */
167 x86_saved_state_t
*tagged_regs
= (x86_saved_state_t
*)find_user_regs(current_thread());
169 if (NULL
== tagged_regs
) {
170 /* Too bad, so sad, no useful interrupt state. */
171 dtrace_probe(prof
->prof_id
, 0xcafebabe,
172 0x0, late
, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
173 } else if (is_saved_state64(tagged_regs
)) {
174 x86_saved_state64_t
*regs
= saved_state64(tagged_regs
);
176 dtrace_probe(prof
->prof_id
, 0x0, regs
->isf
.rip
, late
, 0, 0);
178 x86_saved_state32_t
*regs
= saved_state32(tagged_regs
);
180 dtrace_probe(prof
->prof_id
, 0x0, regs
->eip
, late
, 0, 0);
183 #elif defined(__arm__)
185 arm_saved_state_t
*arm_kern_regs
= (arm_saved_state_t
*) find_kern_regs(current_thread());
187 // We should only come in here from interrupt context, so we should always have valid kernel regs
188 assert(NULL
!= arm_kern_regs
);
190 if (arm_kern_regs
->cpsr
& 0xF) {
191 /* Kernel was interrupted. */
192 dtrace_probe(prof
->prof_id
, arm_kern_regs
->pc
, 0x0, late
, 0, 0);
194 /* Possibly a user interrupt */
195 arm_saved_state_t
*arm_user_regs
= (arm_saved_state_t
*)find_user_regs(current_thread());
197 if (NULL
== arm_user_regs
) {
198 /* Too bad, so sad, no useful interrupt state. */
199 dtrace_probe(prof
->prof_id
, 0xcafebabe, 0x0, late
, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
201 dtrace_probe(prof
->prof_id
, 0x0, arm_user_regs
->pc
, late
, 0, 0);
205 #elif defined(__arm64__)
207 arm_saved_state_t
*arm_kern_regs
= (arm_saved_state_t
*) find_kern_regs(current_thread());
209 // We should only come in here from interrupt context, so we should always have valid kernel regs
210 assert(NULL
!= arm_kern_regs
);
212 if (saved_state64(arm_kern_regs
)->cpsr
& 0xF) {
213 /* Kernel was interrupted. */
214 dtrace_probe(prof
->prof_id
, saved_state64(arm_kern_regs
)->pc
, 0x0, late
, 0, 0);
216 /* Possibly a user interrupt */
217 arm_saved_state_t
*arm_user_regs
= (arm_saved_state_t
*)find_user_regs(current_thread());
219 if (NULL
== arm_user_regs
) {
220 /* Too bad, so sad, no useful interrupt state. */
221 dtrace_probe(prof
->prof_id
, 0xcafebabe, 0x0, late
, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
223 dtrace_probe(prof
->prof_id
, 0x0, get_saved_state_pc(arm_user_regs
), late
, 0, 0);
228 #error Unknown architecture
233 profile_tick(void *arg
)
235 profile_probe_t
*prof
= arg
;
237 #if defined(__x86_64__)
238 x86_saved_state_t
*kern_regs
= find_kern_regs(current_thread());
240 if (NULL
!= kern_regs
) {
241 /* Kernel was interrupted. */
242 dtrace_probe(prof
->prof_id
, saved_state64(kern_regs
)->isf
.rip
, 0x0, 0, 0, 0);
244 pal_register_cache_state(current_thread(), VALID
);
245 /* Possibly a user interrupt */
246 x86_saved_state_t
*tagged_regs
= (x86_saved_state_t
*)find_user_regs(current_thread());
248 if (NULL
== tagged_regs
) {
249 /* Too bad, so sad, no useful interrupt state. */
250 dtrace_probe(prof
->prof_id
, 0xcafebabe,
251 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
252 } else if (is_saved_state64(tagged_regs
)) {
253 x86_saved_state64_t
*regs
= saved_state64(tagged_regs
);
255 dtrace_probe(prof
->prof_id
, 0x0, regs
->isf
.rip
, 0, 0, 0);
257 x86_saved_state32_t
*regs
= saved_state32(tagged_regs
);
259 dtrace_probe(prof
->prof_id
, 0x0, regs
->eip
, 0, 0, 0);
262 #elif defined(__arm__)
264 arm_saved_state_t
*arm_kern_regs
= (arm_saved_state_t
*) find_kern_regs(current_thread());
266 if (NULL
!= arm_kern_regs
) {
267 /* Kernel was interrupted. */
268 dtrace_probe(prof
->prof_id
, arm_kern_regs
->pc
, 0x0, 0, 0, 0);
270 /* Possibly a user interrupt */
271 arm_saved_state_t
*arm_user_regs
= (arm_saved_state_t
*)find_user_regs(current_thread());
273 if (NULL
== arm_user_regs
) {
274 /* Too bad, so sad, no useful interrupt state. */
275 dtrace_probe(prof
->prof_id
, 0xcafebabe, 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
277 dtrace_probe(prof
->prof_id
, 0x0, arm_user_regs
->pc
, 0, 0, 0);
281 #elif defined(__arm64__)
283 arm_saved_state_t
*arm_kern_regs
= (arm_saved_state_t
*) find_kern_regs(current_thread());
285 if (NULL
!= arm_kern_regs
) {
286 /* Kernel was interrupted. */
287 dtrace_probe(prof
->prof_id
, saved_state64(arm_kern_regs
)->pc
, 0x0, 0, 0, 0);
289 /* Possibly a user interrupt */
290 arm_saved_state_t
*arm_user_regs
= (arm_saved_state_t
*)find_user_regs(current_thread());
292 if (NULL
== arm_user_regs
) {
293 /* Too bad, so sad, no useful interrupt state. */
294 dtrace_probe(prof
->prof_id
, 0xcafebabe, 0x0, 0, 0, 0); /* XXX_BOGUS also see profile_usermode() below. */
296 dtrace_probe(prof
->prof_id
, 0x0, get_saved_state_pc(arm_user_regs
), 0, 0, 0);
302 #error Unknown architecture
307 profile_create(hrtime_t interval
, const char *name
, int kind
)
309 profile_probe_t
*prof
;
311 if (interval
< profile_interval_min
) {
315 if (dtrace_probe_lookup(profile_id
, NULL
, NULL
, name
) != 0) {
319 os_atomic_inc(&profile_total
, relaxed
);
320 if (profile_total
> profile_max
) {
321 os_atomic_dec(&profile_total
, relaxed
);
325 if (PROF_TICK
== kind
) {
326 prof
= kmem_zalloc(sizeof(profile_probe_t
), KM_SLEEP
);
328 prof
= kmem_zalloc(sizeof(profile_probe_t
) + NCPU
* sizeof(profile_probe_percpu_t
), KM_SLEEP
);
331 (void) strlcpy(prof
->prof_name
, name
, sizeof(prof
->prof_name
));
332 prof
->prof_interval
= interval
;
333 prof
->prof_cyclic
= CYCLIC_NONE
;
334 prof
->prof_kind
= kind
;
335 prof
->prof_id
= dtrace_probe_create(profile_id
,
337 profile_aframes
? profile_aframes
: PROF_ARTIFICIAL_FRAMES
, prof
);
342 profile_provide(void *arg
, const dtrace_probedesc_t
*desc
)
344 #pragma unused(arg) /* __APPLE__ */
345 int i
, j
, rate
, kind
;
346 hrtime_t val
= 0, mult
= 1, len
;
347 const char *name
, *suffix
= NULL
;
353 { PROF_PREFIX_PROFILE
, PROF_PROFILE
},
354 { PROF_PREFIX_TICK
, PROF_TICK
},
362 { "ns", NANOSEC
/ NANOSEC
},
363 { "nsec", NANOSEC
/ NANOSEC
},
364 { "us", NANOSEC
/ MICROSEC
},
365 { "usec", NANOSEC
/ MICROSEC
},
366 { "ms", NANOSEC
/ MILLISEC
},
367 { "msec", NANOSEC
/ MILLISEC
},
368 { "s", NANOSEC
/ SEC
},
369 { "sec", NANOSEC
/ SEC
},
370 { "m", NANOSEC
* (hrtime_t
)60 },
371 { "min", NANOSEC
* (hrtime_t
)60 },
372 { "h", NANOSEC
* (hrtime_t
)(60 * 60) },
373 { "hour", NANOSEC
* (hrtime_t
)(60 * 60) },
374 { "d", NANOSEC
* (hrtime_t
)(24 * 60 * 60) },
375 { "day", NANOSEC
* (hrtime_t
)(24 * 60 * 60) },
381 char n
[PROF_NAMELEN
];
384 * If no description was provided, provide all of our probes.
386 for (i
= 0; i
< (int)(sizeof(profile_rates
) / sizeof(int)); i
++) {
387 if ((rate
= profile_rates
[i
]) == 0) {
391 (void) snprintf(n
, PROF_NAMELEN
, "%s%d",
392 PROF_PREFIX_PROFILE
, rate
);
393 profile_create(NANOSEC
/ rate
, n
, PROF_PROFILE
);
396 for (i
= 0; i
< (int)(sizeof(profile_ticks
) / sizeof(int)); i
++) {
397 if ((rate
= profile_ticks
[i
]) == 0) {
401 (void) snprintf(n
, PROF_NAMELEN
, "%s%d",
402 PROF_PREFIX_TICK
, rate
);
403 profile_create(NANOSEC
/ rate
, n
, PROF_TICK
);
409 name
= desc
->dtpd_name
;
411 for (i
= 0; types
[i
].prefix
!= NULL
; i
++) {
412 len
= strlen(types
[i
].prefix
);
414 if (strncmp(name
, types
[i
].prefix
, len
) != 0) {
420 if (types
[i
].prefix
== NULL
) {
424 kind
= types
[i
].kind
;
425 j
= strlen(name
) - len
;
428 * We need to start before any time suffix.
430 for (j
= strlen(name
); j
>= len
; j
--) {
431 if (name
[j
] >= '0' && name
[j
] <= '9') {
437 ASSERT(suffix
!= NULL
);
440 * Now determine the numerical value present in the probe name.
442 for (; j
>= len
; j
--) {
443 if (name
[j
] < '0' || name
[j
] > '9') {
447 val
+= (name
[j
] - '0') * mult
;
448 mult
*= (hrtime_t
)10;
456 * Look-up the suffix to determine the multiplier.
458 for (i
= 0, mult
= 0; suffixes
[i
].name
!= NULL
; i
++) {
459 /* APPLE NOTE: Darwin employs size bounded string operations */
460 if (strncasecmp(suffixes
[i
].name
, suffix
, strlen(suffixes
[i
].name
) + 1) == 0) {
461 mult
= suffixes
[i
].mult
;
466 if (suffixes
[i
].name
== NULL
&& *suffix
!= '\0') {
472 * The default is frequency-per-second.
479 profile_create(val
, name
, kind
);
484 profile_destroy(void *arg
, dtrace_id_t id
, void *parg
)
486 #pragma unused(arg,id) /* __APPLE__ */
487 profile_probe_t
*prof
= parg
;
489 ASSERT(prof
->prof_cyclic
== CYCLIC_NONE
);
491 if (prof
->prof_kind
== PROF_TICK
) {
492 kmem_free(prof
, sizeof(profile_probe_t
));
494 kmem_free(prof
, sizeof(profile_probe_t
) + NCPU
* sizeof(profile_probe_percpu_t
));
497 ASSERT(profile_total
>= 1);
498 os_atomic_dec(&profile_total
, relaxed
);
503 profile_online(void *arg
, dtrace_cpu_t
*cpu
, cyc_handler_t
*hdlr
, cyc_time_t
*when
)
505 #pragma unused(cpu) /* __APPLE__ */
506 profile_probe_t
*prof
= arg
;
507 profile_probe_percpu_t
*pcpu
;
509 pcpu
= ((profile_probe_percpu_t
*)(&(prof
[1]))) + cpu_number();
510 pcpu
->profc_probe
= prof
;
512 hdlr
->cyh_func
= profile_fire
;
513 hdlr
->cyh_arg
= pcpu
;
514 hdlr
->cyh_level
= CY_HIGH_LEVEL
;
516 when
->cyt_interval
= prof
->prof_interval
;
517 when
->cyt_when
= dtrace_gethrtime() + when
->cyt_interval
;
519 pcpu
->profc_expected
= when
->cyt_when
;
520 pcpu
->profc_interval
= when
->cyt_interval
;
525 profile_offline(void *arg
, dtrace_cpu_t
*cpu
, void *oarg
)
527 profile_probe_percpu_t
*pcpu
= oarg
;
529 ASSERT(pcpu
->profc_probe
== arg
);
530 #pragma unused(pcpu,arg,cpu) /* __APPLE__ */
535 profile_enable(void *arg
, dtrace_id_t id
, void *parg
)
537 #pragma unused(arg,id) /* __APPLE__ */
538 profile_probe_t
*prof
= parg
;
539 cyc_omni_handler_t omni
;
543 ASSERT(prof
->prof_interval
!= 0);
544 ASSERT(MUTEX_HELD(&cpu_lock
));
546 if (prof
->prof_kind
== PROF_TICK
) {
547 hdlr
.cyh_func
= profile_tick
;
549 hdlr
.cyh_level
= CY_HIGH_LEVEL
;
551 when
.cyt_interval
= prof
->prof_interval
;
552 #if !defined(__APPLE__)
553 when
.cyt_when
= dtrace_gethrtime() + when
.cyt_interval
;
556 #endif /* __APPLE__ */
558 ASSERT(prof
->prof_kind
== PROF_PROFILE
);
559 omni
.cyo_online
= profile_online
;
560 omni
.cyo_offline
= profile_offline
;
564 if (prof
->prof_kind
== PROF_TICK
) {
565 prof
->prof_cyclic
= cyclic_timer_add(&hdlr
, &when
);
567 prof
->prof_cyclic
= (cyclic_id_t
)cyclic_add_omni(&omni
); /* cast puns cyclic_id_list_t with cyclic_id_t */
575 profile_disable(void *arg
, dtrace_id_t id
, void *parg
)
577 profile_probe_t
*prof
= parg
;
579 ASSERT(prof
->prof_cyclic
!= CYCLIC_NONE
);
580 ASSERT(MUTEX_HELD(&cpu_lock
));
582 #pragma unused(arg,id)
583 if (prof
->prof_kind
== PROF_TICK
) {
584 cyclic_timer_remove(prof
->prof_cyclic
);
586 cyclic_remove_omni((cyclic_id_list_t
)prof
->prof_cyclic
); /* cast puns cyclic_id_list_t with cyclic_id_t */
588 prof
->prof_cyclic
= CYCLIC_NONE
;
592 profile_getarg(void *arg
, dtrace_id_t id
, void *parg
, int argno
, int aframes
)
594 #pragma unused(arg, id, parg, argno, aframes)
596 * All the required arguments for the profile probe are passed directly
597 * to dtrace_probe, and we do not go through dtrace_getarg which doesn't
598 * know how to hop to the kernel stack from the interrupt stack like
605 profile_getargdesc(void *arg
, dtrace_id_t id
, void *parg
, dtrace_argdesc_t
*desc
)
607 #pragma unused(arg, id)
608 profile_probe_t
*prof
= parg
;
609 const char *argdesc
= NULL
;
610 switch (desc
->dtargd_ndx
) {
615 argdesc
= "user_addr_t";
618 if (prof
->prof_kind
== PROF_PROFILE
) {
619 argdesc
= "hrtime_t";
624 strlcpy(desc
->dtargd_native
, argdesc
, DTRACE_ARGTYPELEN
);
626 desc
->dtargd_ndx
= DTRACE_ARGNONE
;
631 * APPLE NOTE: profile_usermode call not supported.
634 profile_usermode(void *arg
, dtrace_id_t id
, void *parg
)
636 #pragma unused(arg,id,parg)
637 return 1; /* XXX_BOGUS */
640 static dtrace_pattr_t profile_attr
= {
641 { DTRACE_STABILITY_EVOLVING
, DTRACE_STABILITY_EVOLVING
, DTRACE_CLASS_COMMON
},
642 { DTRACE_STABILITY_UNSTABLE
, DTRACE_STABILITY_UNSTABLE
, DTRACE_CLASS_UNKNOWN
},
643 { DTRACE_STABILITY_PRIVATE
, DTRACE_STABILITY_PRIVATE
, DTRACE_CLASS_UNKNOWN
},
644 { DTRACE_STABILITY_EVOLVING
, DTRACE_STABILITY_EVOLVING
, DTRACE_CLASS_COMMON
},
645 { DTRACE_STABILITY_EVOLVING
, DTRACE_STABILITY_EVOLVING
, DTRACE_CLASS_COMMON
},
648 static dtrace_pops_t profile_pops
= {
649 .dtps_provide
= profile_provide
,
650 .dtps_provide_module
= NULL
,
651 .dtps_enable
= profile_enable
,
652 .dtps_disable
= profile_disable
,
653 .dtps_suspend
= NULL
,
655 .dtps_getargdesc
= profile_getargdesc
,
656 .dtps_getargval
= profile_getarg
,
657 .dtps_usermode
= profile_usermode
,
658 .dtps_destroy
= profile_destroy
662 profile_attach(dev_info_t
*devi
)
664 if (ddi_create_minor_node(devi
, "profile", S_IFCHR
, 0,
665 DDI_PSEUDO
, 0) == DDI_FAILURE
||
666 dtrace_register("profile", &profile_attr
,
667 DTRACE_PRIV_KERNEL
| DTRACE_PRIV_USER
, NULL
,
668 &profile_pops
, NULL
, &profile_id
) != 0) {
669 ddi_remove_minor_node(devi
, NULL
);
673 profile_max
= PROFILE_MAX_DEFAULT
;
679 * APPLE NOTE: profile_detach not implemented
681 #if !defined(__APPLE__)
683 profile_detach(dev_info_t
*devi
, ddi_detach_cmd_t cmd
)
694 if (dtrace_unregister(profile_id
) != 0) {
698 ddi_remove_minor_node(devi
, NULL
);
701 #endif /* __APPLE__ */
703 d_open_t _profile_open
;
706 _profile_open(dev_t dev
, int flags
, int devtype
, struct proc
*p
)
708 #pragma unused(dev,flags,devtype,p)
712 #define PROFILE_MAJOR -24 /* let the kernel pick the device number */
714 static const struct cdevsw profile_cdevsw
=
716 .d_open
= _profile_open
,
719 .d_write
= eno_rdwrt
,
720 .d_ioctl
= eno_ioctl
,
721 .d_stop
= (stop_fcn_t
*)nulldev
,
722 .d_reset
= (reset_fcn_t
*)nulldev
,
723 .d_select
= eno_select
,
725 .d_strategy
= eno_strat
,
726 .d_reserved_1
= eno_getc
,
727 .d_reserved_2
= eno_putc
,
733 int majdevno
= cdevsw_add(PROFILE_MAJOR
, &profile_cdevsw
);
736 printf("profile_init: failed to allocate a major number!\n");
740 profile_attach((dev_info_t
*)(uintptr_t)majdevno
);