1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Portions copyright (c) 2011, Joyent, Inc. All rights reserved.
24 */
25
26 /*
27 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
28 * Use is subject to license terms.
29 */
30
31 /* #pragma ident "@(#)dtrace.c 1.65 08/07/02 SMI" */
32
33 /*
34 * DTrace - Dynamic Tracing for Solaris
35 *
36 * This is the implementation of the Solaris Dynamic Tracing framework
37 * (DTrace). The user-visible interface to DTrace is described at length in
38 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
39 * library, the in-kernel DTrace framework, and the DTrace providers are
40 * described in the block comments in the <sys/dtrace.h> header file. The
41 * internal architecture of DTrace is described in the block comments in the
42 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
43 * implementation very much assume mastery of all of these sources; if one has
44 * an unanswered question about the implementation, one should consult them
45 * first.
46 *
47 * The functions here are ordered roughly as follows:
48 *
49 * - Probe context functions
50 * - Probe hashing functions
51 * - Non-probe context utility functions
52 * - Matching functions
53 * - Provider-to-Framework API functions
54 * - Probe management functions
55 * - DIF object functions
56 * - Format functions
57 * - Predicate functions
58 * - ECB functions
59 * - Buffer functions
60 * - Enabling functions
61 * - DOF functions
62 * - Anonymous enabling functions
63 * - Consumer state functions
64 * - Helper functions
65 * - Hook functions
66 * - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72 #include <sys/errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <sys/conf.h>
76 #include <sys/systm.h>
77 #include <sys/dtrace_impl.h>
78 #include <sys/param.h>
79 #include <sys/proc_internal.h>
80 #include <sys/ioctl.h>
81 #include <sys/fcntl.h>
82 #include <miscfs/devfs/devfs.h>
83 #include <sys/malloc.h>
84 #include <sys/kernel_types.h>
85 #include <sys/proc_internal.h>
86 #include <sys/uio_internal.h>
87 #include <sys/kauth.h>
88 #include <vm/pmap.h>
89 #include <sys/user.h>
90 #include <mach/exception_types.h>
91 #include <sys/signalvar.h>
92 #include <mach/task.h>
93 #include <kern/zalloc.h>
94 #include <kern/ast.h>
95 #include <kern/task.h>
96 #include <netinet/in.h>
97
98 #include <kern/cpu_data.h>
99 extern uint32_t pmap_find_phys(void *, uint64_t);
100 extern boolean_t pmap_valid_page(uint32_t);
101 extern void OSKextRegisterKextsWithDTrace(void);
102 extern kmod_info_t g_kernel_kmod_info;
103
104 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
105 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
106
107 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
108
109 extern void dtrace_suspend(void);
110 extern void dtrace_resume(void);
111 extern void dtrace_init(void);
112 extern void helper_init(void);
113 extern void fasttrap_init(void);
114 extern void dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
115 extern void dtrace_lazy_dofs_destroy(proc_t *);
116 extern void dtrace_postinit(void);
117
118 #include "../../../osfmk/chud/chud_dtrace.h"
119
120 extern kern_return_t chudxnu_dtrace_callback
121 (uint64_t selector, uint64_t *args, uint32_t count);
122
123 /* Import this function to retrieve the size of physical memory. */
124 extern int kernel_sysctlbyname(const char *name, void *oldp,
125 size_t *oldlenp, void *newp, size_t newlen);
126
127 /*
128 * DTrace Tunable Variables
129 *
130 * The following variables may be dynamically tuned by using sysctl(8), the
131 * variables being stored in the kern.dtrace namespace. For example:
132 * sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
133 *
134 * In general, the only variables that one should be tuning this way are those
135 * that affect system-wide DTrace behavior, and for which the default behavior
136 * is undesirable. Most of these variables are tunable on a per-consumer
137 * basis using DTrace options, and need not be tuned on a system-wide basis.
138 * When tuning these variables, avoid pathological values; while some attempt
139 * is made to verify the integrity of these variables, they are not considered
140 * part of the supported interface to DTrace, and they are therefore not
141 * checked comprehensively.
142 */
143 uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
144 uint64_t dtrace_buffer_memory_inuse = 0;
145 int dtrace_destructive_disallow = 0;
146 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
147 size_t dtrace_difo_maxsize = (256 * 1024);
148 dtrace_optval_t dtrace_dof_maxsize = (384 * 1024);
149 size_t dtrace_global_maxsize = (16 * 1024);
150 size_t dtrace_actions_max = (16 * 1024);
151 size_t dtrace_retain_max = 1024;
152 dtrace_optval_t dtrace_helper_actions_max = 32;
153 dtrace_optval_t dtrace_helper_providers_max = 64;
154 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
155 size_t dtrace_strsize_default = 256;
156 dtrace_optval_t dtrace_cleanrate_default = 990099000; /* 1.1 hz */
157 dtrace_optval_t dtrace_cleanrate_min = 20000000; /* 50 hz */
158 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
159 dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
160 dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
161 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
162 dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
163 dtrace_optval_t dtrace_nspec_default = 1;
164 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
165 dtrace_optval_t dtrace_stackframes_default = 20;
166 dtrace_optval_t dtrace_ustackframes_default = 20;
167 dtrace_optval_t dtrace_jstackframes_default = 50;
168 dtrace_optval_t dtrace_jstackstrsize_default = 512;
169 int dtrace_msgdsize_max = 128;
170 hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
171 hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
172 int dtrace_devdepth_max = 32;
173 int dtrace_err_verbose;
174 int dtrace_provide_private_probes = 0;
175 hrtime_t dtrace_deadman_interval = NANOSEC;
176 hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
177 hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
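
/*
 * Illustrative sketch (not part of the implementation): the tunables above
 * surface in the kern.dtrace sysctl namespace and can be read from user
 * space with sysctlbyname(3). A minimal example for kern.dtrace.dof_maxsize,
 * which is registered as a CTLTYPE_QUAD OID later in this file:
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int64_t maxsize = 0;
 *		size_t len = sizeof(maxsize);
 *
 *		if (sysctlbyname("kern.dtrace.dof_maxsize",
 *		    &maxsize, &len, NULL, 0) == 0)
 *			printf("dof_maxsize = %lld bytes\n", (long long)maxsize);
 *		return (0);
 *	}
 */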
178
179 /*
180 * DTrace External Variables
181 *
182 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
183 * available to DTrace consumers via the backtick (`) syntax. One of these,
184 * dtrace_zero, is made deliberately so: it is provided as a source of
185 * well-known, zero-filled memory. While this variable is not documented,
186 * it is used by some translators as an implementation detail.
187 */
188 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
189 unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */
190 /*
191 * DTrace Internal Variables
192 */
193 static dev_info_t *dtrace_devi; /* device info */
194 static vmem_t *dtrace_arena; /* probe ID arena */
195 static vmem_t *dtrace_minor; /* minor number arena */
196 static taskq_t *dtrace_taskq; /* task queue */
197 static dtrace_probe_t **dtrace_probes; /* array of all probes */
198 static int dtrace_nprobes; /* number of probes */
199 static dtrace_provider_t *dtrace_provider; /* provider list */
200 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
201 static int dtrace_opens; /* number of opens */
202 static int dtrace_helpers; /* number of helpers */
203 static void *dtrace_softstate; /* softstate pointer */
204 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
205 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
206 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
207 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
208 static int dtrace_toxranges; /* number of toxic ranges */
209 static int dtrace_toxranges_max; /* size of toxic range array */
210 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
211 static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
212 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
213 static kthread_t *dtrace_panicked; /* panicking thread */
214 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
215 static dtrace_genid_t dtrace_probegen; /* current probe generation */
216 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
217 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
218 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
219 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
220
221 static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */
222
223 /*
224 * This doesn't quite fit as an internal variable, as it must be accessed in
225 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
226 */
227 int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
228
229
230 /*
231 * To save memory, some common memory allocations are given a
232 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
233 * which means it would fall into the kalloc.128 bucket. With
234 * 20k elements allocated, the space saved is substantial.
235 */
236
237 struct zone *dtrace_probe_t_zone;
238
239 static int dtrace_module_unloaded(struct kmod_info *kmod);
240
241 /*
242 * DTrace Locking
243 * DTrace is protected by three (relatively coarse-grained) locks:
244 *
245 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
246 * including enabling state, probes, ECBs, consumer state, helper state,
247 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
248 * probe context is lock-free -- synchronization is handled via the
249 * dtrace_sync() cross call mechanism.
250 *
251 * (2) dtrace_provider_lock is required when manipulating provider state, or
252 * when provider state must be held constant.
253 *
254 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
255 * when meta provider state must be held constant.
256 *
257 * The lock ordering between these three locks is dtrace_meta_lock before
258 * dtrace_provider_lock before dtrace_lock. (In particular, there are
259 * several places where dtrace_provider_lock is held by the framework as it
260 * calls into the providers -- which then call back into the framework,
261 * grabbing dtrace_lock.)
262 *
263 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
264 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
265 * role as a coarse-grained lock; it is acquired before both of these locks.
266 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
267 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
268 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
269 * acquired _between_ dtrace_provider_lock and dtrace_lock.
270 */
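
/*
 * Illustrative sketch (not part of the implementation): the ordering
 * described above, written out as an acquisition sequence. A given code
 * path takes only the locks it needs, but when several are needed they
 * must be acquired in this order (and dropped in the reverse order):
 *
 *	lck_mtx_lock(&dtrace_meta_lock);	outermost
 *	lck_mtx_lock(&cpu_lock);
 *	lck_mtx_lock(&dtrace_provider_lock);
 *	lck_mtx_lock(&mod_lock);
 *	lck_mtx_lock(&dtrace_lock);		innermost
 */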
271
272
273 /*
274 * APPLE NOTE:
275 *
276 * For porting purposes, all kmutex_t vars have been changed
277 * to lck_mtx_t, which require explicit initialization.
278 *
279 * kmutex_t becomes lck_mtx_t
280 * mutex_enter() becomes lck_mtx_lock()
281 * mutex_exit() becomes lck_mtx_unlock()
282 *
283 * Lock asserts are changed like this:
284 *
285 * ASSERT(MUTEX_HELD(&cpu_lock));
286 * becomes:
287 * lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
288 *
289 */
290 static lck_mtx_t dtrace_lock; /* probe state lock */
291 static lck_mtx_t dtrace_provider_lock; /* provider state lock */
292 static lck_mtx_t dtrace_meta_lock; /* meta-provider state lock */
293 static lck_rw_t dtrace_dof_mode_lock; /* dof mode lock */
294
295 /*
296 * DTrace Provider Variables
297 *
298 * These are the variables relating to DTrace as a provider (that is, the
299 * provider of the BEGIN, END, and ERROR probes).
300 */
301 static dtrace_pattr_t dtrace_provider_attr = {
302 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
303 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
304 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
305 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
306 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
307 };
308
309 static void
310 dtrace_nullop(void)
311 {}
312
313 static int
314 dtrace_enable_nullop(void)
315 {
316 return (0);
317 }
318
319 static dtrace_pops_t dtrace_provider_ops = {
320 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
321 (void (*)(void *, struct modctl *))dtrace_nullop,
322 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
323 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
324 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
325 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
326 NULL,
327 NULL,
328 NULL,
329 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
330 };
331
332 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
333 static dtrace_id_t dtrace_probeid_end; /* special END probe */
334 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
335
336 /*
337 * DTrace Helper Tracing Variables
338 */
339 uint32_t dtrace_helptrace_next = 0;
340 uint32_t dtrace_helptrace_nlocals;
341 char *dtrace_helptrace_buffer;
342 size_t dtrace_helptrace_bufsize = 512 * 1024;
343
344 #if DEBUG
345 int dtrace_helptrace_enabled = 1;
346 #else
347 int dtrace_helptrace_enabled = 0;
348 #endif
349
350
351 /*
352 * DTrace Error Hashing
353 *
354 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
355 * table. This is very useful for checking coverage of tests that are
356 * expected to induce DIF or DOF processing errors, and may be useful for
357 * debugging problems in the DIF code generator or in DOF generation. The
358 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
359 */
360 #if DEBUG
361 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
362 static const char *dtrace_errlast;
363 static kthread_t *dtrace_errthread;
364 static lck_mtx_t dtrace_errlock;
365 #endif
366
367 /*
368 * DTrace Macros and Constants
369 *
370 * These are various macros that are useful in various spots in the
371 * implementation, along with a few random constants that have no meaning
372 * outside of the implementation. There is no real structure to this cpp
373 * mishmash -- but is there ever?
374 */
375 #define DTRACE_HASHSTR(hash, probe) \
376 dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
377
378 #define DTRACE_HASHNEXT(hash, probe) \
379 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
380
381 #define DTRACE_HASHPREV(hash, probe) \
382 (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
383
384 #define DTRACE_HASHEQ(hash, lhs, rhs) \
385 (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
386 *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
387
388 #define DTRACE_AGGHASHSIZE_SLEW 17
389
390 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
391
392 /*
393 * The key for a thread-local variable consists of the lower 61 bits of the
394 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
395 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
396 * equal to a variable identifier. This is necessary (but not sufficient) to
397 * assure that global associative arrays never collide with thread-local
398 * variables. To guarantee that they cannot collide, we must also define the
399 * order for keying dynamic variables. That order is:
400 *
401 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
402 *
403 * Because the variable-key and the tls-key are in orthogonal spaces, there is
404 * no way for a global variable key signature to match a thread-local key
405 * signature.
406 */
407 #if defined (__x86_64__)
408 /* FIXME: two function calls!! */
409 #define DTRACE_TLS_THRKEY(where) { \
410 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
411 uint64_t thr = (uintptr_t)current_thread(); \
412 ASSERT(intr < (1 << 3)); \
413 (where) = ((thr + DIF_VARIABLE_MAX) & \
414 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
415 }
416 #else
417 #error Unknown architecture
418 #endif
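
/*
 * Illustrative sketch (not part of the implementation): the key produced by
 * DTRACE_TLS_THRKEY() is laid out as follows on this port:
 *
 *	uint64_t key;
 *	DTRACE_TLS_THRKEY(key);
 *
 *	bits 63..61:	interrupt indicator from ml_at_interrupt_context()
 *			(effectively 0 or 1 here, despite the 3-bit field)
 *	bits 60..0:	(uintptr_t)current_thread() + DIF_VARIABLE_MAX,
 *			truncated to 61 bits
 *
 * Adding DIF_VARIABLE_MAX keeps thread keys disjoint from variable
 * identifiers, as explained in the block comment above.
 */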
419
420 #define DT_BSWAP_8(x) ((x) & 0xff)
421 #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
422 #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
423 #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
424
425 #define DT_MASK_LO 0x00000000FFFFFFFFULL
426
427 #define DTRACE_STORE(type, tomax, offset, what) \
428 *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
429
430
431 #define DTRACE_ALIGNCHECK(addr, size, flags) \
432 if (addr & (MIN(size,4) - 1)) { \
433 *flags |= CPU_DTRACE_BADALIGN; \
434 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
435 return (0); \
436 }
437
438 /*
439 * Test whether a range of memory starting at testaddr of size testsz falls
440 * within the range of memory described by addr, sz. We take care to avoid
441 * problems with overflow and underflow of the unsigned quantities, and
442 * disallow all negative sizes. Ranges of size 0 are allowed.
443 */
444 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
445 ((testaddr) - (baseaddr) < (basesz) && \
446 (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
447 (testaddr) + (testsz) >= (testaddr))
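
/*
 * Worked example (editor's illustration): with baseaddr = 0x1000 and
 * basesz = 0x100, the unsigned arithmetic above rejects bad ranges without
 * any signed comparisons:
 *
 *	testaddr = 0x0f00, testsz = 0x10:
 *		(0x0f00 - 0x1000) wraps to a huge unsigned value, so the
 *		first clause fails and the out-of-range start is rejected.
 *
 *	testaddr = 0xfffffffffffffff0, testsz = 0x20:
 *		(testaddr + testsz) wraps past zero, so the third clause
 *		fails and the overflowing range is rejected.
 */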
448
449 /*
450 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
451 * alloc_sz on the righthand side of the comparison in order to avoid overflow
452 * or underflow in the comparison with it. This is simpler than the INRANGE
453 * check above, because we know that the dtms_scratch_ptr is valid in the
454 * range. Allocations of size zero are allowed.
455 */
456 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
457 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
458 (mstate)->dtms_scratch_ptr >= (alloc_sz))
459
460 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
461
462 #if defined (__x86_64__)
463 #define DTRACE_LOADFUNC(bits) \
464 /*CSTYLED*/ \
465 uint##bits##_t dtrace_load##bits(uintptr_t addr); \
466 \
467 uint##bits##_t \
468 dtrace_load##bits(uintptr_t addr) \
469 { \
470 size_t size = bits / NBBY; \
471 /*CSTYLED*/ \
472 uint##bits##_t rval = 0; \
473 int i; \
474 volatile uint16_t *flags = (volatile uint16_t *) \
475 &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
476 \
477 DTRACE_ALIGNCHECK(addr, size, flags); \
478 \
479 for (i = 0; i < dtrace_toxranges; i++) { \
480 if (addr >= dtrace_toxrange[i].dtt_limit) \
481 continue; \
482 \
483 if (addr + size <= dtrace_toxrange[i].dtt_base) \
484 continue; \
485 \
486 /* \
487 * This address falls within a toxic region; return 0. \
488 */ \
489 *flags |= CPU_DTRACE_BADADDR; \
490 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
491 return (0); \
492 } \
493 \
494 { \
495 volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \
496 *flags |= CPU_DTRACE_NOFAULT; \
497 recover = dtrace_set_thread_recover(current_thread(), recover); \
498 /*CSTYLED*/ \
499 /* \
500 * PR6394061 - avoid device memory that is unpredictably \
501 * mapped and unmapped \
502 */ \
503 if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr))) \
504 rval = *((volatile uint##bits##_t *)addr); \
505 RECOVER_LABEL(bits); \
506 (void)dtrace_set_thread_recover(current_thread(), recover); \
507 *flags &= ~CPU_DTRACE_NOFAULT; \
508 } \
509 \
510 return (rval); \
511 }
512 #else /* all other architectures */
513 #error Unknown Architecture
514 #endif
515
516 #ifdef __LP64__
517 #define dtrace_loadptr dtrace_load64
518 #else
519 #define dtrace_loadptr dtrace_load32
520 #endif
521
522 #define DTRACE_DYNHASH_FREE 0
523 #define DTRACE_DYNHASH_SINK 1
524 #define DTRACE_DYNHASH_VALID 2
525
526 #define DTRACE_MATCH_FAIL -1
527 #define DTRACE_MATCH_NEXT 0
528 #define DTRACE_MATCH_DONE 1
529 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
530 #define DTRACE_STATE_ALIGN 64
531
532 #define DTRACE_FLAGS2FLT(flags) \
533 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
534 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
535 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
536 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
537 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
538 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
539 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
540 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
541 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
542 DTRACEFLT_UNKNOWN)
543
544 #define DTRACEACT_ISSTRING(act) \
545 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
546 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
547
548
549 static size_t dtrace_strlen(const char *, size_t);
550 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
551 static void dtrace_enabling_provide(dtrace_provider_t *);
552 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
553 static void dtrace_enabling_matchall(void);
554 static dtrace_state_t *dtrace_anon_grab(void);
555 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
556 dtrace_state_t *, uint64_t, uint64_t);
557 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
558 static void dtrace_buffer_drop(dtrace_buffer_t *);
559 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
560 dtrace_state_t *, dtrace_mstate_t *);
561 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
562 dtrace_optval_t);
563 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
564 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
565
566
567 /*
568 * DTrace sysctl handlers
569 *
570 * These declarations and functions are used for deeper DTrace configuration.
571 * Most of them are not per-consumer and may impact other DTrace consumers.
572 * Not every value is validated for correctness, so be careful about the
573 * values you use.
574 */
575
576 SYSCTL_DECL(_kern_dtrace);
577 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
578
579 static int
580 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
581 {
582 #pragma unused(oidp, arg2)
583 int changed, error;
584 int value = *(int *) arg1;
585
586 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
587 if (error || !changed)
588 return (error);
589
590 if (value != 0 && value != 1)
591 return (ERANGE);
592
593 lck_mtx_lock(&dtrace_lock);
594 dtrace_err_verbose = value;
595 lck_mtx_unlock(&dtrace_lock);
596
597 return (0);
598 }
599
600 /*
601 * kern.dtrace.err_verbose
602 *
603 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
604 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
605 */
606 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
607 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
608 &dtrace_err_verbose, 0,
609 sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
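
/*
 * Illustrative sketch (not part of the implementation): toggling this OID
 * from user space with sysctlbyname(3). The handler above accepts only 0 or
 * 1, and writing it requires appropriate privileges:
 *
 *	int on = 1;
 *
 *	if (sysctlbyname("kern.dtrace.err_verbose", NULL, NULL,
 *	    &on, sizeof(on)) != 0)
 *		perror("kern.dtrace.err_verbose");
 */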
610
611 static int
612 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
613 {
614 #pragma unused(oidp, arg2, req)
615 int changed, error;
616 uint64_t value = *(uint64_t *) arg1;
617
618 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
619 if (error || !changed)
620 return (error);
621
622 if (value <= dtrace_buffer_memory_inuse)
623 return (ERANGE);
624
625 lck_mtx_lock(&dtrace_lock);
626 dtrace_buffer_memory_maxsize = value;
627 lck_mtx_unlock(&dtrace_lock);
628
629 return (0);
630 }
631
632 /*
633 * kern.dtrace.buffer_memory_maxsize
634 *
635 * Set the maximum size, in bytes, that DTrace may use for all the consumers' state
636 * buffers. By default the limit is PHYS_MEM / 3 for *all* consumers. Attempting to set
637 * a zero or negative value, or a value <= dtrace_buffer_memory_inuse, will fail.
638 */
639 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
640 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
641 &dtrace_buffer_memory_maxsize, 0,
642 sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
643
644 /*
645 * kern.dtrace.buffer_memory_inuse
646 *
647 * Current state buffer memory used, in bytes, by all the DTrace consumers.
648 * This value is read-only.
649 */
650 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
651 &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
652
653 static int
654 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
655 {
656 #pragma unused(oidp, arg2, req)
657 int changed, error;
658 size_t value = *(size_t*) arg1;
659
660 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
661 if (error || !changed)
662 return (error);
663
664 if (value <= 0)
665 return (ERANGE);
666
667 lck_mtx_lock(&dtrace_lock);
668 dtrace_difo_maxsize = value;
669 lck_mtx_unlock(&dtrace_lock);
670
671 return (0);
672 }
673
674 /*
675 * kern.dtrace.difo_maxsize
676 *
677 * Set the DIFO max size in bytes; see the definition of dtrace_difo_maxsize
678 * for the default value. Attempting to set a zero or negative size will
679 * result in a failure.
680 */
681 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
682 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
683 &dtrace_difo_maxsize, 0,
684 sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
685
686 static int
687 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
688 {
689 #pragma unused(oidp, arg2, req)
690 int changed, error;
691 dtrace_optval_t value = *(dtrace_optval_t *) arg1;
692
693 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
694 if (error || !changed)
695 return (error);
696
697 if (value <= 0)
698 return (ERANGE);
699
700 lck_mtx_lock(&dtrace_lock);
701 dtrace_dof_maxsize = value;
702 lck_mtx_unlock(&dtrace_lock);
703
704 return (0);
705 }
706
707 /*
708 * kern.dtrace.dof_maxsize
709 *
710 * Set the DOF max size in bytes; see the definition of dtrace_dof_maxsize for
711 * the default value. Attempting to set a zero or negative size will result
712 * in a failure.
713 */
714 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
715 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
716 &dtrace_dof_maxsize, 0,
717 sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
718
719 static int
720 sysctl_dtrace_global_maxsize SYSCTL_HANDLER_ARGS
721 {
722 #pragma unused(oidp, arg2, req)
723 int changed, error;
724 dtrace_optval_t value = *(dtrace_optval_t*) arg1;
725
726 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
727 if (error || !changed)
728 return (error);
729
730 if (value <= 0)
731 return (ERANGE);
732
733 lck_mtx_lock(&dtrace_lock);
734 dtrace_global_maxsize = value;
735 lck_mtx_unlock(&dtrace_lock);
736
737 return (0);
738 }
739
740 /*
741 * kern.dtrace.global_maxsize
742 *
743 * Set the global variable max size in bytes; see the definition of
744 * dtrace_global_maxsize for the default value. Attempting to set a zero or
745 * negative size will result in a failure.
746 */
747 SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
748 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
749 &dtrace_global_maxsize, 0,
750 sysctl_dtrace_global_maxsize, "Q", "dtrace global maxsize");
751
752 static int
753 sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS
754 {
755 #pragma unused(oidp, arg2)
756 int error;
757 int value = *(int *) arg1;
758
759 error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
760 if (error)
761 return (error);
762
763 if (value != 0 && value != 1)
764 return (ERANGE);
765
766 lck_mtx_lock(&dtrace_lock);
767 dtrace_provide_private_probes = value;
768 lck_mtx_unlock(&dtrace_lock);
769
770 return (0);
771 }
772
773 /*
774 * kern.dtrace.provide_private_probes
775 *
776 * Set whether the providers must provide the private probes. This is
777 * mainly used by the FBT provider to request probes for the private/static
778 * symbols.
779 */
780 SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
781 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
782 &dtrace_provide_private_probes, 0,
783 sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");
784
785 /*
786 * DTrace Probe Context Functions
787 *
788 * These functions are called from probe context. Because probe context is
789 * any context in which C may be called: arbitrary locks may be held,
790 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
791 * As a result, functions called from probe context may only call other DTrace
792 * support functions -- they may not interact at all with the system at large.
793 * (Note that the ASSERT macro is made probe-context safe by redefining it in
794 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
795 * loads are to be performed from probe context, they _must_ be in terms of
796 * the safe dtrace_load*() variants.
797 *
798 * Some functions in this block are not actually called from probe context;
799 * for these functions, there will be a comment above the function reading
800 * "Note: not called from probe context."
801 */
802
803 int
804 dtrace_assfail(const char *a, const char *f, int l)
805 {
806 panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
807
808 /*
809 * We just need something here that even the most clever compiler
810 * cannot optimize away.
811 */
812 return (a[(uintptr_t)f]);
813 }
814
815 /*
816 * Atomically increment a specified error counter from probe context.
817 */
818 static void
819 dtrace_error(uint32_t *counter)
820 {
821 /*
822 * Most counters stored to in probe context are per-CPU counters.
823 * However, there are some error conditions that are sufficiently
824 * arcane that they don't merit per-CPU storage. If these counters
825 * are incremented concurrently on different CPUs, scalability will be
826 * adversely affected -- but we don't expect them to be white-hot in a
827 * correctly constructed enabling...
828 */
829 uint32_t oval, nval;
830
831 do {
832 oval = *counter;
833
834 if ((nval = oval + 1) == 0) {
835 /*
836 * If the counter would wrap, set it to 1 -- assuring
837 * that the counter is never zero when we have seen
838 * errors. (The counter must be 32-bits because we
839 * aren't guaranteed a 64-bit compare&swap operation.)
840 * To save this code both the infamy of being fingered
841 * by a priggish news story and the indignity of being
842 * the target of a neo-puritan witch trial, we're
843 * carefully avoiding any colorful description of the
844 * likelihood of this condition -- but suffice it to
845 * say that it is only slightly more likely than the
846 * overflow of predicate cache IDs, as discussed in
847 * dtrace_predicate_create().
848 */
849 nval = 1;
850 }
851 } while (dtrace_cas32(counter, oval, nval) != oval);
852 }
853
854 /*
855 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
856 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
857 */
858 DTRACE_LOADFUNC(8)
859 DTRACE_LOADFUNC(16)
860 DTRACE_LOADFUNC(32)
861 DTRACE_LOADFUNC(64)
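
/*
 * Illustrative sketch (not part of the implementation): typical probe-context
 * use of the generated loaders. Raw dereferences are never safe in probe
 * context; the usual pattern is roughly:
 *
 *	uint64_t val = dtrace_load64(addr);
 *
 *	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT)) {
 *		... the load faulted or hit a toxic range; bail out ...
 *	}
 *
 * dtrace_loadptr maps to the pointer-sized variant, per the #define earlier
 * in this file.
 */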
862
863 static int
864 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
865 {
866 if (dest < mstate->dtms_scratch_base)
867 return (0);
868
869 if (dest + size < dest)
870 return (0);
871
872 if (dest + size > mstate->dtms_scratch_ptr)
873 return (0);
874
875 return (1);
876 }
877
878 static int
879 dtrace_canstore_statvar(uint64_t addr, size_t sz,
880 dtrace_statvar_t **svars, int nsvars)
881 {
882 int i;
883
884 for (i = 0; i < nsvars; i++) {
885 dtrace_statvar_t *svar = svars[i];
886
887 if (svar == NULL || svar->dtsv_size == 0)
888 continue;
889
890 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
891 return (1);
892 }
893
894 return (0);
895 }
896
897 /*
898 * Check to see if the address is within a memory region to which a store may
899 * be issued. This includes the DTrace scratch areas, and any DTrace variable
900 * region. The caller of dtrace_canstore() is responsible for performing any
901 * alignment checks that are needed before stores are actually executed.
902 */
903 static int
904 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
905 dtrace_vstate_t *vstate)
906 {
907 /*
908 * First, check to see if the address is in scratch space...
909 */
910 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
911 mstate->dtms_scratch_size))
912 return (1);
913
914 /*
915 * Now check to see if it's a dynamic variable. This check will pick
916 * up both thread-local variables and any global dynamically-allocated
917 * variables.
918 */
919 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
920 vstate->dtvs_dynvars.dtds_size)) {
921 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
922 uintptr_t base = (uintptr_t)dstate->dtds_base +
923 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
924 uintptr_t chunkoffs;
925
926 /*
927 * Before we assume that we can store here, we need to make
928 * sure that it isn't in our metadata -- storing to our
929 * dynamic variable metadata would corrupt our state. For
930 * the range to not include any dynamic variable metadata,
931 * it must:
932 *
933 * (1) Start above the hash table that is at the base of
934 * the dynamic variable space
935 *
936 * (2) Have a starting chunk offset that is beyond the
937 * dtrace_dynvar_t that is at the base of every chunk
938 *
939 * (3) Not span a chunk boundary
940 *
941 */
942 if (addr < base)
943 return (0);
944
945 chunkoffs = (addr - base) % dstate->dtds_chunksize;
946
947 if (chunkoffs < sizeof (dtrace_dynvar_t))
948 return (0);
949
950 if (chunkoffs + sz > dstate->dtds_chunksize)
951 return (0);
952
953 return (1);
954 }
955
956 /*
957 * Finally, check the static local and global variables. These checks
958 * take the longest, so we perform them last.
959 */
960 if (dtrace_canstore_statvar(addr, sz,
961 vstate->dtvs_locals, vstate->dtvs_nlocals))
962 return (1);
963
964 if (dtrace_canstore_statvar(addr, sz,
965 vstate->dtvs_globals, vstate->dtvs_nglobals))
966 return (1);
967
968 return (0);
969 }
970
971
972 /*
973 * Convenience routine to check to see if the address is within a memory
974 * region in which a load may be issued given the user's privilege level;
975 * if not, it sets the appropriate error flags and loads 'addr' into the
976 * illegal value slot.
977 *
978 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
979 * appropriate memory access protection.
980 */
981 static int
982 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
983 dtrace_vstate_t *vstate)
984 {
985 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
986
987 /*
988 * If we hold the privilege to read from kernel memory, then
989 * everything is readable.
990 */
991 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
992 return (1);
993
994 /*
995 * You can obviously read that which you can store.
996 */
997 if (dtrace_canstore(addr, sz, mstate, vstate))
998 return (1);
999
1000 /*
1001 * We're allowed to read from our own string table.
1002 */
1003 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1004 mstate->dtms_difo->dtdo_strlen))
1005 return (1);
1006
1007 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1008 *illval = addr;
1009 return (0);
1010 }
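
/*
 * Illustrative sketch (not part of the implementation): the typical way a
 * DIF_SUBR_* implementation uses this check before touching memory handed to
 * it by DIF, assuming the usual emulator locals (regs, rd):
 *
 *	if (!dtrace_canload(addr, size, mstate, vstate)) {
 *		regs[rd] = 0;
 *		break;
 *	}
 *
 * On failure, dtrace_canload() has already set CPU_DTRACE_KPRIV and recorded
 * the offending address, so the caller only needs to bail out.
 */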
1011
1012 /*
1013 * Convenience routine to check to see if a given string is within a memory
1014 * region in which a load may be issued given the user's privilege level;
1015 * this exists so that we don't need to issue unnecessary dtrace_strlen()
1016 * calls in the event that the user has all privileges.
1017 */
1018 static int
1019 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1020 dtrace_vstate_t *vstate)
1021 {
1022 size_t strsz;
1023
1024 /*
1025 * If we hold the privilege to read from kernel memory, then
1026 * everything is readable.
1027 */
1028 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
1029 return (1);
1030
1031 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
1032 if (dtrace_canload(addr, strsz, mstate, vstate))
1033 return (1);
1034
1035 return (0);
1036 }
1037
1038 /*
1039 * Convenience routine to check to see if a given variable is within a memory
1040 * region in which a load may be issued given the user's privilege level.
1041 */
1042 static int
1043 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
1044 dtrace_vstate_t *vstate)
1045 {
1046 size_t sz;
1047 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1048
1049 /*
1050 * If we hold the privilege to read from kernel memory, then
1051 * everything is readable.
1052 */
1053 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
1054 return (1);
1055
1056 if (type->dtdt_kind == DIF_TYPE_STRING)
1057 sz = dtrace_strlen(src,
1058 vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
1059 else
1060 sz = type->dtdt_size;
1061
1062 return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
1063 }
1064
1065 /*
1066 * Compare two strings using safe loads.
1067 */
1068 static int
1069 dtrace_strncmp(char *s1, char *s2, size_t limit)
1070 {
1071 uint8_t c1, c2;
1072 volatile uint16_t *flags;
1073
1074 if (s1 == s2 || limit == 0)
1075 return (0);
1076
1077 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1078
1079 do {
1080 if (s1 == NULL) {
1081 c1 = '\0';
1082 } else {
1083 c1 = dtrace_load8((uintptr_t)s1++);
1084 }
1085
1086 if (s2 == NULL) {
1087 c2 = '\0';
1088 } else {
1089 c2 = dtrace_load8((uintptr_t)s2++);
1090 }
1091
1092 if (c1 != c2)
1093 return (c1 - c2);
1094 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1095
1096 return (0);
1097 }
1098
1099 /*
1100 * Compute strlen(s) for a string using safe memory accesses. The additional
1101 * len parameter is used to specify a maximum length to ensure completion.
1102 */
1103 static size_t
1104 dtrace_strlen(const char *s, size_t lim)
1105 {
1106 uint_t len;
1107
1108 for (len = 0; len != lim; len++) {
1109 if (dtrace_load8((uintptr_t)s++) == '\0')
1110 break;
1111 }
1112
1113 return (len);
1114 }
1115
1116 /*
1117 * Check if an address falls within a toxic region.
1118 */
1119 static int
1120 dtrace_istoxic(uintptr_t kaddr, size_t size)
1121 {
1122 uintptr_t taddr, tsize;
1123 int i;
1124
1125 for (i = 0; i < dtrace_toxranges; i++) {
1126 taddr = dtrace_toxrange[i].dtt_base;
1127 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1128
1129 if (kaddr - taddr < tsize) {
1130 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1131 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1132 return (1);
1133 }
1134
1135 if (taddr - kaddr < size) {
1136 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1137 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1138 return (1);
1139 }
1140 }
1141
1142 return (0);
1143 }
1144
1145 /*
1146 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
1147 * memory specified by the DIF program. The dst is assumed to be safe memory
1148 * that we can store to directly because it is managed by DTrace. As with
1149 * standard bcopy, overlapping copies are handled properly.
1150 */
1151 static void
1152 dtrace_bcopy(const void *src, void *dst, size_t len)
1153 {
1154 if (len != 0) {
1155 uint8_t *s1 = dst;
1156 const uint8_t *s2 = src;
1157
1158 if (s1 <= s2) {
1159 do {
1160 *s1++ = dtrace_load8((uintptr_t)s2++);
1161 } while (--len != 0);
1162 } else {
1163 s2 += len;
1164 s1 += len;
1165
1166 do {
1167 *--s1 = dtrace_load8((uintptr_t)--s2);
1168 } while (--len != 0);
1169 }
1170 }
1171 }
1172
1173 /*
1174 * Copy src to dst using safe memory accesses, up to either the specified
1175 * length, or the point that a nul byte is encountered. The src is assumed to
1176 * be unsafe memory specified by the DIF program. The dst is assumed to be
1177 * safe memory that we can store to directly because it is managed by DTrace.
1178 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1179 */
1180 static void
1181 dtrace_strcpy(const void *src, void *dst, size_t len)
1182 {
1183 if (len != 0) {
1184 uint8_t *s1 = dst, c;
1185 const uint8_t *s2 = src;
1186
1187 do {
1188 *s1++ = c = dtrace_load8((uintptr_t)s2++);
1189 } while (--len != 0 && c != '\0');
1190 }
1191 }
1192
1193 /*
1194 * Copy src to dst, deriving the size and type from the specified (BYREF)
1195 * variable type. The src is assumed to be unsafe memory specified by the DIF
1196 * program. The dst is assumed to be DTrace variable memory that is of the
1197 * specified type; we assume that we can store to directly.
1198 */
1199 static void
1200 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1201 {
1202 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1203
1204 if (type->dtdt_kind == DIF_TYPE_STRING) {
1205 dtrace_strcpy(src, dst, type->dtdt_size);
1206 } else {
1207 dtrace_bcopy(src, dst, type->dtdt_size);
1208 }
1209 }
1210
1211 /*
1212 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1213 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1214 * safe memory that we can access directly because it is managed by DTrace.
1215 */
1216 static int
1217 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1218 {
1219 volatile uint16_t *flags;
1220
1221 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1222
1223 if (s1 == s2)
1224 return (0);
1225
1226 if (s1 == NULL || s2 == NULL)
1227 return (1);
1228
1229 if (s1 != s2 && len != 0) {
1230 const uint8_t *ps1 = s1;
1231 const uint8_t *ps2 = s2;
1232
1233 do {
1234 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1235 return (1);
1236 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1237 }
1238 return (0);
1239 }
1240
1241 /*
1242 * Zero the specified region using a simple byte-by-byte loop. Note that this
1243 * is for safe DTrace-managed memory only.
1244 */
1245 static void
1246 dtrace_bzero(void *dst, size_t len)
1247 {
1248 uchar_t *cp;
1249
1250 for (cp = dst; len != 0; len--)
1251 *cp++ = 0;
1252 }
1253
1254 static void
1255 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1256 {
1257 uint64_t result[2];
1258
1259 result[0] = addend1[0] + addend2[0];
1260 result[1] = addend1[1] + addend2[1] +
1261 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1262
1263 sum[0] = result[0];
1264 sum[1] = result[1];
1265 }
1266
1267 /*
1268 * Shift the 128-bit value in a by b. If b is positive, shift left.
1269 * If b is negative, shift right.
1270 */
1271 static void
1272 dtrace_shift_128(uint64_t *a, int b)
1273 {
1274 uint64_t mask;
1275
1276 if (b == 0)
1277 return;
1278
1279 if (b < 0) {
1280 b = -b;
1281 if (b >= 64) {
1282 a[0] = a[1] >> (b - 64);
1283 a[1] = 0;
1284 } else {
1285 a[0] >>= b;
1286 mask = 1LL << (64 - b);
1287 mask -= 1;
1288 a[0] |= ((a[1] & mask) << (64 - b));
1289 a[1] >>= b;
1290 }
1291 } else {
1292 if (b >= 64) {
1293 a[1] = a[0] << (b - 64);
1294 a[0] = 0;
1295 } else {
1296 a[1] <<= b;
1297 mask = a[0] >> (64 - b);
1298 a[1] |= mask;
1299 a[0] <<= b;
1300 }
1301 }
1302 }
1303
1304 /*
1305 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1306 * use native multiplication on those, and then re-combine into the
1307 * resulting 128-bit value.
1308 *
1309 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1310 * hi1 * hi2 << 64 +
1311 * hi1 * lo2 << 32 +
1312 * hi2 * lo1 << 32 +
1313 * lo1 * lo2
1314 */
1315 static void
1316 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1317 {
1318 uint64_t hi1, hi2, lo1, lo2;
1319 uint64_t tmp[2];
1320
1321 hi1 = factor1 >> 32;
1322 hi2 = factor2 >> 32;
1323
1324 lo1 = factor1 & DT_MASK_LO;
1325 lo2 = factor2 & DT_MASK_LO;
1326
1327 product[0] = lo1 * lo2;
1328 product[1] = hi1 * hi2;
1329
1330 tmp[0] = hi1 * lo2;
1331 tmp[1] = 0;
1332 dtrace_shift_128(tmp, 32);
1333 dtrace_add_128(product, tmp, product);
1334
1335 tmp[0] = hi2 * lo1;
1336 tmp[1] = 0;
1337 dtrace_shift_128(tmp, 32);
1338 dtrace_add_128(product, tmp, product);
1339 }
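
/*
 * Illustrative sketch (not part of the implementation): the decomposition in
 * the comment above can be sanity-checked in a standalone user-space program
 * against a compiler-provided 128-bit type (assumes __uint128_t support, as
 * in clang/gcc on x86_64):
 *
 *	#include <stdint.h>
 *	#include <assert.h>
 *
 *	int
 *	main(void)
 *	{
 *		uint64_t f1 = 0x123456789abcdef0ULL, f2 = 0x0fedcba987654321ULL;
 *		uint64_t hi1 = f1 >> 32, lo1 = f1 & 0xffffffffULL;
 *		uint64_t hi2 = f2 >> 32, lo2 = f2 & 0xffffffffULL;
 *
 *		__uint128_t expect = (__uint128_t)f1 * f2;
 *		__uint128_t got =
 *		    ((__uint128_t)(hi1 * hi2) << 64) +
 *		    ((__uint128_t)(hi1 * lo2) << 32) +
 *		    ((__uint128_t)(hi2 * lo1) << 32) +
 *		     (__uint128_t)(lo1 * lo2);
 *
 *		assert(got == expect);
 *		return (0);
 *	}
 */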
1340
1341 /*
1342 * This privilege check should be used by actions and subroutines to
1343 * verify that the user credentials of the process that enabled the
1344 * invoking ECB match the target credentials
1345 */
1346 static int
1347 dtrace_priv_proc_common_user(dtrace_state_t *state)
1348 {
1349 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1350
1351 /*
1352 * We should always have a non-NULL state cred here, since if cred
1353 * is null (anonymous tracing), we fast-path bypass this routine.
1354 */
1355 ASSERT(s_cr != NULL);
1356
1357 if ((cr = dtrace_CRED()) != NULL &&
1358 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1359 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1360 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1361 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1362 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1363 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1364 return (1);
1365
1366 return (0);
1367 }
1368
1369 /*
1370 * This privilege check should be used by actions and subroutines to
1371 * verify that the zone of the process that enabled the invoking ECB
1372 * matches the target credentials
1373 */
1374 static int
1375 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1376 {
1377 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1378 #pragma unused(cr, s_cr, state) /* __APPLE__ */
1379
1380 /*
1381 * We should always have a non-NULL state cred here, since if cred
1382 * is null (anonymous tracing), we fast-path bypass this routine.
1383 */
1384 ASSERT(s_cr != NULL);
1385
1386 return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1387 }
1388
1389 /*
1390 * This privilege check should be used by actions and subroutines to
1391 * verify that the process has not setuid or changed credentials.
1392 */
1393 static int
1394 dtrace_priv_proc_common_nocd(void)
1395 {
1396 return 1; /* Darwin omits "No Core Dump" flag. */
1397 }
1398
1399 static int
1400 dtrace_priv_proc_destructive(dtrace_state_t *state)
1401 {
1402 int action = state->dts_cred.dcr_action;
1403
1404 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1405 goto bad;
1406
1407 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1408 goto bad;
1409
1410 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1411 dtrace_priv_proc_common_zone(state) == 0)
1412 goto bad;
1413
1414 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1415 dtrace_priv_proc_common_user(state) == 0)
1416 goto bad;
1417
1418 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1419 dtrace_priv_proc_common_nocd() == 0)
1420 goto bad;
1421
1422 return (1);
1423
1424 bad:
1425 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1426
1427 return (0);
1428 }
1429
1430 static int
1431 dtrace_priv_proc_control(dtrace_state_t *state)
1432 {
1433 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1434 goto bad;
1435
1436 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1437 goto bad;
1438
1439 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1440 return (1);
1441
1442 if (dtrace_priv_proc_common_zone(state) &&
1443 dtrace_priv_proc_common_user(state) &&
1444 dtrace_priv_proc_common_nocd())
1445 return (1);
1446
1447 bad:
1448 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1449
1450 return (0);
1451 }
1452
1453 static int
1454 dtrace_priv_proc(dtrace_state_t *state)
1455 {
1456 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1457 goto bad;
1458
1459 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1460 goto bad;
1461
1462 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1463 return (1);
1464
1465 bad:
1466 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1467
1468 return (0);
1469 }
1470
1471 /*
1472 * The P_LNOATTACH check is an Apple specific check.
1473 * We need a version of dtrace_priv_proc() that omits
1474 * that check for PID and EXECNAME accesses
1475 */
1476 static int
1477 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1478 {
1479
1480 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1481 return (1);
1482
1483 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1484
1485 return (0);
1486 }
1487
1488 static int
1489 dtrace_priv_kernel(dtrace_state_t *state)
1490 {
1491 if (dtrace_is_restricted())
1492 goto bad;
1493
1494 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1495 return (1);
1496
1497 bad:
1498 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1499
1500 return (0);
1501 }
1502
1503 static int
1504 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1505 {
1506 if (dtrace_is_restricted())
1507 goto bad;
1508
1509 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1510 return (1);
1511
1512 bad:
1513 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1514
1515 return (0);
1516 }
1517
1518 /*
1519 * Note: not called from probe context. This function is called
1520 * asynchronously (and at a regular interval) from outside of probe context to
1521 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1522 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1523 */
1524 static void
1525 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1526 {
1527 dtrace_dynvar_t *dirty;
1528 dtrace_dstate_percpu_t *dcpu;
1529 int i, work = 0;
1530
1531 for (i = 0; i < (int)NCPU; i++) {
1532 dcpu = &dstate->dtds_percpu[i];
1533
1534 ASSERT(dcpu->dtdsc_rinsing == NULL);
1535
1536 /*
1537 * If the dirty list is NULL, there is no dirty work to do.
1538 */
1539 if (dcpu->dtdsc_dirty == NULL)
1540 continue;
1541
1542 /*
1543 * If the clean list is non-NULL, then we're not going to do
1544 * any work for this CPU -- it means that there has not been
1545 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1546 * since the last time we cleaned house.
1547 */
1548 if (dcpu->dtdsc_clean != NULL)
1549 continue;
1550
1551 work = 1;
1552
1553 /*
1554 * Atomically move the dirty list aside.
1555 */
1556 do {
1557 dirty = dcpu->dtdsc_dirty;
1558
1559 /*
1560 * Before we zap the dirty list, set the rinsing list.
1561 * (This allows for a potential assertion in
1562 * dtrace_dynvar(): if a free dynamic variable appears
1563 * on a hash chain, either the dirty list or the
1564 * rinsing list for some CPU must be non-NULL.)
1565 */
1566 dcpu->dtdsc_rinsing = dirty;
1567 dtrace_membar_producer();
1568 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1569 dirty, NULL) != dirty);
1570 }
1571
1572 if (!work) {
1573 /*
1574 * We have no work to do; we can simply return.
1575 */
1576 return;
1577 }
1578
1579 dtrace_sync();
1580
1581 for (i = 0; i < (int)NCPU; i++) {
1582 dcpu = &dstate->dtds_percpu[i];
1583
1584 if (dcpu->dtdsc_rinsing == NULL)
1585 continue;
1586
1587 /*
1588 * We are now guaranteed that no hash chain contains a pointer
1589 * into this dirty list; we can make it clean.
1590 */
1591 ASSERT(dcpu->dtdsc_clean == NULL);
1592 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1593 dcpu->dtdsc_rinsing = NULL;
1594 }
1595
1596 /*
1597 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1598 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1599 * This prevents a race whereby a CPU incorrectly decides that
1600 * the state should be something other than DTRACE_DSTATE_CLEAN
1601 * after dtrace_dynvar_clean() has completed.
1602 */
1603 dtrace_sync();
1604
1605 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1606 }
1607
1608 /*
1609 * Depending on the value of the op parameter, this function looks up,
1610 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1611 * allocation is requested, this function will return a pointer to a
1612 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1613 * variable can be allocated. If NULL is returned, the appropriate counter
1614 * will be incremented.
1615 */
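
/*
 * Illustrative sketch (not part of the implementation): a typical allocation
 * call, roughly as made by the DIF emulation code when storing to a dynamic
 * variable:
 *
 *	dvar = dtrace_dynvar(dstate, nkeys, key, dsize,
 *	    DTRACE_DYNVAR_ALLOC, mstate, vstate);
 *
 *	if (dvar == NULL) {
 *		... no variable could be allocated; the appropriate counter
 *		    has already been incremented, per the comment above ...
 *	}
 */
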
1616 static dtrace_dynvar_t *
1617 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1618 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1619 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1620 {
1621 uint64_t hashval = DTRACE_DYNHASH_VALID;
1622 dtrace_dynhash_t *hash = dstate->dtds_hash;
1623 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1624 processorid_t me = CPU->cpu_id, cpu = me;
1625 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1626 size_t bucket, ksize;
1627 size_t chunksize = dstate->dtds_chunksize;
1628 uintptr_t kdata, lock, nstate;
1629 uint_t i;
1630
1631 ASSERT(nkeys != 0);
1632
1633 /*
1634 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1635 * algorithm. For the by-value portions, we perform the algorithm in
1636 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1637 * bit, and seems to have only a minute effect on distribution. For
1638 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1639 * over each referenced byte. It's painful to do this, but it's much
1640 * better than pathological hash distribution. The efficacy of the
1641 * hashing algorithm (and a comparison with other algorithms) may be
1642 * found by running the ::dtrace_dynstat MDB dcmd.
1643 */
1644 for (i = 0; i < nkeys; i++) {
1645 if (key[i].dttk_size == 0) {
1646 uint64_t val = key[i].dttk_value;
1647
1648 hashval += (val >> 48) & 0xffff;
1649 hashval += (hashval << 10);
1650 hashval ^= (hashval >> 6);
1651
1652 hashval += (val >> 32) & 0xffff;
1653 hashval += (hashval << 10);
1654 hashval ^= (hashval >> 6);
1655
1656 hashval += (val >> 16) & 0xffff;
1657 hashval += (hashval << 10);
1658 hashval ^= (hashval >> 6);
1659
1660 hashval += val & 0xffff;
1661 hashval += (hashval << 10);
1662 hashval ^= (hashval >> 6);
1663 } else {
1664 /*
1665 * This is incredibly painful, but it beats the hell
1666 * out of the alternative.
1667 */
1668 uint64_t j, size = key[i].dttk_size;
1669 uintptr_t base = (uintptr_t)key[i].dttk_value;
1670
1671 if (!dtrace_canload(base, size, mstate, vstate))
1672 break;
1673
1674 for (j = 0; j < size; j++) {
1675 hashval += dtrace_load8(base + j);
1676 hashval += (hashval << 10);
1677 hashval ^= (hashval >> 6);
1678 }
1679 }
1680 }
1681
1682 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1683 return (NULL);
1684
1685 hashval += (hashval << 3);
1686 hashval ^= (hashval >> 11);
1687 hashval += (hashval << 15);
1688
1689 /*
1690 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1691 * comes out to be one of our two sentinel hash values. If this
1692 * actually happens, we set the hashval to be a value known to be a
1693 * non-sentinel value.
1694 */
1695 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1696 hashval = DTRACE_DYNHASH_VALID;
1697
1698 /*
1699 * Yes, it's painful to do a divide here. If the cycle count becomes
1700 * important here, tricks can be pulled to reduce it. (However, it's
1701 * critical that hash collisions be kept to an absolute minimum;
1702 * they're much more painful than a divide.) It's better to have a
1703 * solution that generates few collisions and still keeps things
1704 * relatively simple.
1705 */
1706 bucket = hashval % dstate->dtds_hashsize;
1707
1708 if (op == DTRACE_DYNVAR_DEALLOC) {
1709 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1710
1711 for (;;) {
1712 while ((lock = *lockp) & 1)
1713 continue;
1714
1715 if (dtrace_casptr((void *)(uintptr_t)lockp,
1716 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1717 break;
1718 }
1719
1720 dtrace_membar_producer();
1721 }
1722
1723 top:
1724 prev = NULL;
1725 lock = hash[bucket].dtdh_lock;
1726
1727 dtrace_membar_consumer();
1728
1729 start = hash[bucket].dtdh_chain;
1730 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1731 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1732 op != DTRACE_DYNVAR_DEALLOC));
1733
1734 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1735 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1736 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1737
1738 if (dvar->dtdv_hashval != hashval) {
1739 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1740 /*
1741 * We've reached the sink, and therefore the
1742 * end of the hash chain; we can kick out of
1743 * the loop knowing that we have seen a valid
1744 * snapshot of state.
1745 */
1746 ASSERT(dvar->dtdv_next == NULL);
1747 ASSERT(dvar == &dtrace_dynhash_sink);
1748 break;
1749 }
1750
1751 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1752 /*
1753 * We've gone off the rails: somewhere along
1754 * the line, one of the members of this hash
1755 * chain was deleted. Note that we could also
1756 * detect this by simply letting this loop run
1757 * to completion, as we would eventually hit
1758 * the end of the dirty list. However, we
1759 * want to avoid running the length of the
1760 * dirty list unnecessarily (it might be quite
1761 * long), so we catch this as early as
1762 * possible by detecting the hash marker. In
1763 * this case, we simply set dvar to NULL and
1764 * break; the conditional after the loop will
1765 * send us back to top.
1766 */
1767 dvar = NULL;
1768 break;
1769 }
1770
1771 goto next;
1772 }
1773
1774 if (dtuple->dtt_nkeys != nkeys)
1775 goto next;
1776
1777 for (i = 0; i < nkeys; i++, dkey++) {
1778 if (dkey->dttk_size != key[i].dttk_size)
1779 goto next; /* size or type mismatch */
1780
1781 if (dkey->dttk_size != 0) {
1782 if (dtrace_bcmp(
1783 (void *)(uintptr_t)key[i].dttk_value,
1784 (void *)(uintptr_t)dkey->dttk_value,
1785 dkey->dttk_size))
1786 goto next;
1787 } else {
1788 if (dkey->dttk_value != key[i].dttk_value)
1789 goto next;
1790 }
1791 }
1792
1793 if (op != DTRACE_DYNVAR_DEALLOC)
1794 return (dvar);
1795
1796 ASSERT(dvar->dtdv_next == NULL ||
1797 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1798
1799 if (prev != NULL) {
1800 ASSERT(hash[bucket].dtdh_chain != dvar);
1801 ASSERT(start != dvar);
1802 ASSERT(prev->dtdv_next == dvar);
1803 prev->dtdv_next = dvar->dtdv_next;
1804 } else {
1805 if (dtrace_casptr(&hash[bucket].dtdh_chain,
1806 start, dvar->dtdv_next) != start) {
1807 /*
1808 * We have failed to atomically swing the
1809 * hash table head pointer, presumably because
1810 * of a conflicting allocation on another CPU.
1811 * We need to reread the hash chain and try
1812 * again.
1813 */
1814 goto top;
1815 }
1816 }
1817
1818 dtrace_membar_producer();
1819
1820 /*
1821 * Now set the hash value to indicate that it's free.
1822 */
1823 ASSERT(hash[bucket].dtdh_chain != dvar);
1824 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1825
1826 dtrace_membar_producer();
1827
1828 /*
1829 * Set the next pointer to point at the dirty list, and
1830 * atomically swing the dirty pointer to the newly freed dvar.
1831 */
1832 do {
1833 next = dcpu->dtdsc_dirty;
1834 dvar->dtdv_next = next;
1835 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1836
1837 /*
1838 * Finally, unlock this hash bucket.
1839 */
1840 ASSERT(hash[bucket].dtdh_lock == lock);
1841 ASSERT(lock & 1);
1842 hash[bucket].dtdh_lock++;
1843
1844 return (NULL);
1845 next:
1846 prev = dvar;
1847 continue;
1848 }
1849
1850 if (dvar == NULL) {
1851 /*
1852 * If dvar is NULL, it is because we went off the rails:
1853 * one of the elements that we traversed in the hash chain
1854 * was deleted while we were traversing it. In this case,
1855 * we assert that we aren't doing a dealloc (deallocs lock
1856 * the hash bucket to prevent themselves from racing with
1857 * one another), and retry the hash chain traversal.
1858 */
1859 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1860 goto top;
1861 }
1862
1863 if (op != DTRACE_DYNVAR_ALLOC) {
1864 /*
1865 * If we are not to allocate a new variable, we want to
1866 * return NULL now. Before we return, check that the value
1867 * of the lock word hasn't changed. If it has, we may have
1868 * seen an inconsistent snapshot.
1869 */
1870 if (op == DTRACE_DYNVAR_NOALLOC) {
1871 if (hash[bucket].dtdh_lock != lock)
1872 goto top;
1873 } else {
1874 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1875 ASSERT(hash[bucket].dtdh_lock == lock);
1876 ASSERT(lock & 1);
1877 hash[bucket].dtdh_lock++;
1878 }
1879
1880 return (NULL);
1881 }
1882
1883 /*
1884 * We need to allocate a new dynamic variable. The size we need is the
1885 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1886 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1887 * the size of any referred-to data (dsize). We then round the final
1888 * size up to the chunksize for allocation.
1889 */
1890 for (ksize = 0, i = 0; i < nkeys; i++)
1891 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1892
1893 /*
1894 * This should be pretty much impossible, but could happen if, say,
1895 * strange DIF specified the tuple. Ideally, this should be an
1896 * assertion and not an error condition -- but that requires that the
1897 * chunksize calculation in dtrace_difo_chunksize() be absolutely
1898 * bullet-proof. (That is, it must not be able to be fooled by
1899 * malicious DIF.) Given the lack of backwards branches in DIF,
1900 * solving this would presumably not amount to solving the Halting
1901 * Problem -- but it still seems awfully hard.
1902 */
1903 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1904 ksize + dsize > chunksize) {
1905 dcpu->dtdsc_drops++;
1906 return (NULL);
1907 }
1908
1909 nstate = DTRACE_DSTATE_EMPTY;
1910
1911 do {
1912 retry:
1913 free = dcpu->dtdsc_free;
1914
1915 if (free == NULL) {
1916 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1917 void *rval;
1918
1919 if (clean == NULL) {
1920 /*
1921 * We're out of dynamic variable space on
1922 * this CPU. Unless we have tried all CPUs,
1923 * we'll try to allocate from a different
1924 * CPU.
1925 */
1926 switch (dstate->dtds_state) {
1927 case DTRACE_DSTATE_CLEAN: {
1928 void *sp = &dstate->dtds_state;
1929
1930 if (++cpu >= (int)NCPU)
1931 cpu = 0;
1932
1933 if (dcpu->dtdsc_dirty != NULL &&
1934 nstate == DTRACE_DSTATE_EMPTY)
1935 nstate = DTRACE_DSTATE_DIRTY;
1936
1937 if (dcpu->dtdsc_rinsing != NULL)
1938 nstate = DTRACE_DSTATE_RINSING;
1939
1940 dcpu = &dstate->dtds_percpu[cpu];
1941
1942 if (cpu != me)
1943 goto retry;
1944
1945 (void) dtrace_cas32(sp,
1946 DTRACE_DSTATE_CLEAN, nstate);
1947
1948 /*
1949 * To increment the correct bean
1950 * counter, take another lap.
1951 */
1952 goto retry;
1953 }
1954
1955 case DTRACE_DSTATE_DIRTY:
1956 dcpu->dtdsc_dirty_drops++;
1957 break;
1958
1959 case DTRACE_DSTATE_RINSING:
1960 dcpu->dtdsc_rinsing_drops++;
1961 break;
1962
1963 case DTRACE_DSTATE_EMPTY:
1964 dcpu->dtdsc_drops++;
1965 break;
1966 }
1967
1968 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1969 return (NULL);
1970 }
1971
1972 /*
1973 * The clean list appears to be non-empty. We want to
1974 * move the clean list to the free list; we start by
1975 * moving the clean pointer aside.
1976 */
1977 if (dtrace_casptr(&dcpu->dtdsc_clean,
1978 clean, NULL) != clean) {
1979 /*
1980 * We are in one of two situations:
1981 *
1982 * (a) The clean list was switched to the
1983 * free list by another CPU.
1984 *
1985 * (b) The clean list was added to by the
1986 * cleansing cyclic.
1987 *
1988 * In either of these situations, we can
1989 * just reattempt the free list allocation.
1990 */
1991 goto retry;
1992 }
1993
1994 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1995
1996 /*
1997 * Now we'll move the clean list to the free list.
1998 * It's impossible for this to fail: the only way
1999 * the free list can be updated is through this
2000 * code path, and only one CPU can own the clean list.
2001 * Thus, it would only be possible for this to fail if
2002 * this code were racing with dtrace_dynvar_clean().
2003 * (That is, if dtrace_dynvar_clean() updated the clean
2004 * list, and we ended up racing to update the free
2005 * list.) This race is prevented by the dtrace_sync()
2006 * in dtrace_dynvar_clean() -- which flushes the
2007 * owners of the clean lists out before resetting
2008 * the clean lists.
2009 */
2010 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2011 ASSERT(rval == NULL);
2012 goto retry;
2013 }
2014
2015 dvar = free;
2016 new_free = dvar->dtdv_next;
2017 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2018
2019 /*
2020 * We have now allocated a new chunk. We copy the tuple keys into the
2021 * tuple array and copy any referenced key data into the data space
2022 * following the tuple array. As we do this, we relocate dttk_value
2023 * in the final tuple to point to the key data address in the chunk.
2024 */
2025 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2026 dvar->dtdv_data = (void *)(kdata + ksize);
2027 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2028
2029 for (i = 0; i < nkeys; i++) {
2030 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2031 size_t kesize = key[i].dttk_size;
2032
2033 if (kesize != 0) {
2034 dtrace_bcopy(
2035 (const void *)(uintptr_t)key[i].dttk_value,
2036 (void *)kdata, kesize);
2037 dkey->dttk_value = kdata;
2038 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2039 } else {
2040 dkey->dttk_value = key[i].dttk_value;
2041 }
2042
2043 dkey->dttk_size = kesize;
2044 }
2045
2046 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2047 dvar->dtdv_hashval = hashval;
2048 dvar->dtdv_next = start;
2049
2050 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2051 return (dvar);
2052
2053 /*
2054 * The cas has failed. Either another CPU is adding an element to
2055 * this hash chain, or another CPU is deleting an element from this
2056 * hash chain. The simplest way to deal with both of these cases
2057 * (though not necessarily the most efficient) is to free our
2058 * allocated block and tail-call ourselves. Note that the free is
2059 * to the dirty list and _not_ to the free list. This is to prevent
2060 * races with allocators, above.
2061 */
2062 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2063
2064 dtrace_membar_producer();
2065
2066 do {
2067 free = dcpu->dtdsc_dirty;
2068 dvar->dtdv_next = free;
2069 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2070
2071 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2072 }
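
/*
 * Editorial note (not part of the original source): the hashing performed in
 * dtrace_dynvar() above -- and again in dtrace_aggregate() below -- is Bob
 * Jenkins' "One-at-a-time" hash, with by-value keys consumed in 16-bit chunks
 * rather than single bytes. A minimal, self-contained sketch of the canonical
 * byte-at-a-time form, including the same finalization steps, is shown here
 * for reference only; the function name is illustrative and appears nowhere
 * else in this file.
 */
#if 0 /* illustrative sketch only */
static uint64_t
example_jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
uint64_t hashval = 0;
size_t i;

/* Mix each byte into the running hash value. */
for (i = 0; i < len; i++) {
hashval += data[i];
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
}

/* Final avalanche, identical to the steps used above. */
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);

return (hashval);
}
#endif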
2073
2074 /*ARGSUSED*/
2075 static void
2076 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2077 {
2078 #pragma unused(arg) /* __APPLE__ */
2079 if ((int64_t)nval < (int64_t)*oval)
2080 *oval = nval;
2081 }
2082
2083 /*ARGSUSED*/
2084 static void
2085 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2086 {
2087 #pragma unused(arg) /* __APPLE__ */
2088 if ((int64_t)nval > (int64_t)*oval)
2089 *oval = nval;
2090 }
2091
2092 static void
2093 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2094 {
2095 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2096 int64_t val = (int64_t)nval;
2097
2098 if (val < 0) {
2099 for (i = 0; i < zero; i++) {
2100 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2101 quanta[i] += incr;
2102 return;
2103 }
2104 }
2105 } else {
2106 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2107 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2108 quanta[i - 1] += incr;
2109 return;
2110 }
2111 }
2112
2113 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2114 return;
2115 }
2116
2117 ASSERT(0);
2118 }
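
/*
 * Editorial note (not part of the original source): as a worked example of
 * the power-of-two bucketing above -- assuming the usual layout in which
 * DTRACE_QUANTIZE_BUCKETVAL() yields ..., -2, -1, 0, 1, 2, 4, 8, ... -- a
 * non-negative value is counted in the bucket labelled with the largest
 * power of two that does not exceed it: nval = 7 increments the bucket
 * labelled 4 (covering [4, 8)), nval = 8 increments the bucket labelled 8,
 * and nval = 0 increments the zero bucket.
 */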
2119
2120 static void
2121 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2122 {
2123 uint64_t arg = *lquanta++;
2124 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2125 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2126 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2127 int32_t val = (int32_t)nval, level;
2128
2129 ASSERT(step != 0);
2130 ASSERT(levels != 0);
2131
2132 if (val < base) {
2133 /*
2134 * This is an underflow.
2135 */
2136 lquanta[0] += incr;
2137 return;
2138 }
2139
2140 level = (val - base) / step;
2141
2142 if (level < levels) {
2143 lquanta[level + 1] += incr;
2144 return;
2145 }
2146
2147 /*
2148 * This is an overflow.
2149 */
2150 lquanta[levels + 1] += incr;
2151 }
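
/*
 * Editorial note (not part of the original source): the lquanta[] layout
 * implied above is: slot 0 counts underflows (val < base), slots 1..levels
 * count values whose level (val - base) / step is 0..levels - 1, and slot
 * levels + 1 counts overflows. For example, with base = 0, step = 10 and
 * levels = 6, a value of 37 yields level 3 and therefore increments
 * lquanta[4], the slot covering values in [30, 40).
 */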
2152
2153 static int
2154 dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
2155 int16_t nsteps, int64_t value)
2156 {
2157 int64_t this = 1, last, next;
2158 int base = 1, order;
2159
2160 for (order = 0; order < low; ++order)
2161 this *= factor;
2162
2163 /*
2164 * If our value is less than our factor taken to the power of the
2165 * low order of magnitude, it goes into the zeroth bucket.
2166 */
2167 if (value < this)
2168 return 0;
2169 else
2170 last = this;
2171
2172 for (this *= factor; order <= high; ++order) {
2173 int nbuckets = this > nsteps ? nsteps : this;
2174
2175 /*
2176 * We should not generally get log/linear quantizations
2177 * with a high magnitude that allows 64-bits to
2178 * overflow, but we nonetheless protect against this
2179 * by explicitly checking for overflow, and clamping
2180 * our value accordingly.
2181 */
2182 next = this * factor;
2183 if (next < this) {
2184 value = this - 1;
2185 }
2186
2187 /*
2188 * If our value lies within this order of magnitude,
2189 * determine its position by taking the offset within
2190 * the order of magnitude, dividing by the bucket
2191 * width, and adding to our (accumulated) base.
2192 */
2193 if (value < this) {
2194 return (base + (value - last) / (this / nbuckets));
2195 }
2196
2197 base += nbuckets - (nbuckets / factor);
2198 last = this;
2199 this = next;
2200 }
2201
2202 /*
2203 * Our value is greater than or equal to our factor taken to the
2204 * power of one plus the high magnitude -- return the top bucket.
2205 */
2206 return base;
2207 }
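
/*
 * Editorial note (not part of the original source): a sample trace of the
 * bucketing above, assuming the purely illustrative parameters factor = 10,
 * low = 0, high = 2 and nsteps = 10: values below 1 land in bucket 0;
 * values 1-9 land in buckets 1-9 (width 1); values 10-99 land in buckets
 * 10-18 (width 10, e.g. 42 maps to bucket 13); values 100-999 land in
 * buckets 19-27 (width 100); and values of 1000 or more land in the
 * overflow bucket, 28. In general the layout comprises
 * (high - low + 1) * (nsteps - nsteps / factor) + 2 buckets.
 */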
2208
2209 static void
2210 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2211 {
2212 uint64_t arg = *llquanta++;
2213 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2214 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2215 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2216 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2217
2218 llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
2219 }
2220
2221 /*ARGSUSED*/
2222 static void
2223 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2224 {
2225 #pragma unused(arg) /* __APPLE__ */
2226 data[0]++;
2227 data[1] += nval;
2228 }
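
/*
 * Editorial note (not part of the original source): for avg(), data[0]
 * accumulates the sample count and data[1] the running sum; the consumer
 * derives the average as data[1] / data[0] when the aggregation is reported.
 */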
2229
2230 /*ARGSUSED*/
2231 static void
2232 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2233 {
2234 #pragma unused(arg) /* __APPLE__ */
2235 int64_t snval = (int64_t)nval;
2236 uint64_t tmp[2];
2237
2238 data[0]++;
2239 data[1] += nval;
2240
2241 /*
2242 * What we want to say here is:
2243 *
2244 * data[2] += nval * nval;
2245 *
2246 * But given that nval is 64-bit, we could easily overflow, so
2247 * we do this as 128-bit arithmetic.
2248 */
2249 if (snval < 0)
2250 snval = -snval;
2251
2252 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2253 dtrace_add_128(data + 2, tmp, data + 2);
2254 }
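
/*
 * Editorial note (not part of the original source): for stddev(), data[0]
 * holds the sample count n, data[1] the sum of the samples and data[2..3]
 * the 128-bit sum of their squares. The consumer can then recover the
 * (population) standard deviation as
 *
 * sqrt((n * sum(x^2) - (sum(x))^2) / n^2)
 *
 * so that no floating-point arithmetic is ever needed in probe context.
 */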
2255
2256 /*ARGSUSED*/
2257 static void
2258 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2259 {
2260 #pragma unused(nval, arg) /* __APPLE__ */
2261 *oval = *oval + 1;
2262 }
2263
2264 /*ARGSUSED*/
2265 static void
2266 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2267 {
2268 #pragma unused(arg) /* __APPLE__ */
2269 *oval += nval;
2270 }
2271
2272 /*
2273 * Aggregate given the tuple in the principal data buffer, and the aggregating
2274 * action denoted by the specified dtrace_aggregation_t. The aggregation
2275 * buffer is specified as the buf parameter. This routine does not return
2276 * failure; if there is no space in the aggregation buffer, the data will be
2277 * dropped, and a corresponding counter incremented.
2278 */
2279 static void
2280 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2281 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2282 {
2283 #pragma unused(arg)
2284 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2285 uint32_t i, ndx, size, fsize;
2286 uint32_t align = sizeof (uint64_t) - 1;
2287 dtrace_aggbuffer_t *agb;
2288 dtrace_aggkey_t *key;
2289 uint32_t hashval = 0, limit, isstr;
2290 caddr_t tomax, data, kdata;
2291 dtrace_actkind_t action;
2292 dtrace_action_t *act;
2293 uintptr_t offs;
2294
2295 if (buf == NULL)
2296 return;
2297
2298 if (!agg->dtag_hasarg) {
2299 /*
2300 * Currently, only quantize(), lquantize() and llquantize() take additional
2301 * arguments, and they have the same semantics: an increment
2302 * value that defaults to 1 when not present. If additional
2303 * aggregating actions take arguments, the setting of the
2304 * default argument value will presumably have to become more
2305 * sophisticated...
2306 */
2307 arg = 1;
2308 }
2309
2310 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2311 size = rec->dtrd_offset - agg->dtag_base;
2312 fsize = size + rec->dtrd_size;
2313
2314 ASSERT(dbuf->dtb_tomax != NULL);
2315 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2316
2317 if ((tomax = buf->dtb_tomax) == NULL) {
2318 dtrace_buffer_drop(buf);
2319 return;
2320 }
2321
2322 /*
2323 * The metastructure is always at the bottom of the buffer.
2324 */
2325 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2326 sizeof (dtrace_aggbuffer_t));
2327
2328 if (buf->dtb_offset == 0) {
2329 /*
2330 * We just kludge up approximately 1/8th of the size to be
2331 * buckets. If this guess ends up being routinely
2332 * off-the-mark, we may need to dynamically readjust this
2333 * based on past performance.
2334 */
2335 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2336
2337 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2338 (uintptr_t)tomax || hashsize == 0) {
2339 /*
2340 * We've been given a ludicrously small buffer;
2341 * increment our drop count and leave.
2342 */
2343 dtrace_buffer_drop(buf);
2344 return;
2345 }
2346
2347 /*
2348 * And now, a pathetic attempt to try to get an odd (or
2349 * perchance, a prime) hash size for better hash distribution.
2350 */
2351 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2352 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2353
2354 agb->dtagb_hashsize = hashsize;
2355 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2356 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2357 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2358
2359 for (i = 0; i < agb->dtagb_hashsize; i++)
2360 agb->dtagb_hash[i] = NULL;
2361 }
2362
2363 ASSERT(agg->dtag_first != NULL);
2364 ASSERT(agg->dtag_first->dta_intuple);
2365
2366 /*
2367 * Calculate the hash value based on the key. Note that we _don't_
2368 * include the aggid in the hashing (but we will store it as part of
2369 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2370 * algorithm: a simple, quick algorithm that has no known funnels, and
2371 * gets good distribution in practice. The efficacy of the hashing
2372 * algorithm (and a comparison with other algorithms) may be found by
2373 * running the ::dtrace_aggstat MDB dcmd.
2374 */
2375 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2376 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2377 limit = i + act->dta_rec.dtrd_size;
2378 ASSERT(limit <= size);
2379 isstr = DTRACEACT_ISSTRING(act);
2380
2381 for (; i < limit; i++) {
2382 hashval += data[i];
2383 hashval += (hashval << 10);
2384 hashval ^= (hashval >> 6);
2385
2386 if (isstr && data[i] == '\0')
2387 break;
2388 }
2389 }
2390
2391 hashval += (hashval << 3);
2392 hashval ^= (hashval >> 11);
2393 hashval += (hashval << 15);
2394
2395 /*
2396 * Yes, the divide here is expensive -- but it's generally the least
2397 * of the performance issues given the amount of data that we iterate
2398 * over to compute hash values, compare data, etc.
2399 */
2400 ndx = hashval % agb->dtagb_hashsize;
2401
2402 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2403 ASSERT((caddr_t)key >= tomax);
2404 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2405
2406 if (hashval != key->dtak_hashval || key->dtak_size != size)
2407 continue;
2408
2409 kdata = key->dtak_data;
2410 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2411
2412 for (act = agg->dtag_first; act->dta_intuple;
2413 act = act->dta_next) {
2414 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2415 limit = i + act->dta_rec.dtrd_size;
2416 ASSERT(limit <= size);
2417 isstr = DTRACEACT_ISSTRING(act);
2418
2419 for (; i < limit; i++) {
2420 if (kdata[i] != data[i])
2421 goto next;
2422
2423 if (isstr && data[i] == '\0')
2424 break;
2425 }
2426 }
2427
2428 if (action != key->dtak_action) {
2429 /*
2430 * We are aggregating on the same value in the same
2431 * aggregation with two different aggregating actions.
2432 * (This should have been picked up in the compiler,
2433 * so we may be dealing with errant or devious DIF.)
2434 * This is an error condition; we indicate as much,
2435 * and return.
2436 */
2437 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2438 return;
2439 }
2440
2441 /*
2442 * This is a hit: we need to apply the aggregator to
2443 * the value at this key.
2444 */
2445 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2446 return;
2447 next:
2448 continue;
2449 }
2450
2451 /*
2452 * We didn't find it. We need to allocate some zero-filled space,
2453 * link it into the hash table appropriately, and apply the aggregator
2454 * to the (zero-filled) value.
2455 */
2456 offs = buf->dtb_offset;
2457 while (offs & (align - 1))
2458 offs += sizeof (uint32_t);
2459
2460 /*
2461 * If we don't have enough room to both allocate a new key _and_
2462 * its associated data, increment the drop count and return.
2463 */
2464 if ((uintptr_t)tomax + offs + fsize >
2465 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2466 dtrace_buffer_drop(buf);
2467 return;
2468 }
2469
2470 /*CONSTCOND*/
2471 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2472 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2473 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2474
2475 key->dtak_data = kdata = tomax + offs;
2476 buf->dtb_offset = offs + fsize;
2477
2478 /*
2479 * Now copy the data across.
2480 */
2481 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2482
2483 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2484 kdata[i] = data[i];
2485
2486 /*
2487 * Because strings are not zeroed out by default, we need to iterate
2488 * looking for actions that store strings, and we need to explicitly
2489 * pad these strings out with zeroes.
2490 */
2491 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2492 int nul;
2493
2494 if (!DTRACEACT_ISSTRING(act))
2495 continue;
2496
2497 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2498 limit = i + act->dta_rec.dtrd_size;
2499 ASSERT(limit <= size);
2500
2501 for (nul = 0; i < limit; i++) {
2502 if (nul) {
2503 kdata[i] = '\0';
2504 continue;
2505 }
2506
2507 if (data[i] != '\0')
2508 continue;
2509
2510 nul = 1;
2511 }
2512 }
2513
2514 for (i = size; i < fsize; i++)
2515 kdata[i] = 0;
2516
2517 key->dtak_hashval = hashval;
2518 key->dtak_size = size;
2519 key->dtak_action = action;
2520 key->dtak_next = agb->dtagb_hash[ndx];
2521 agb->dtagb_hash[ndx] = key;
2522
2523 /*
2524 * Finally, apply the aggregator.
2525 */
2526 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2527 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2528 }
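
/*
 * Editorial note (not part of the original source): the aggregation buffer
 * managed above is thus split in two -- key and value data grow upward from
 * the start of the buffer (tracked by dtb_offset), while the
 * dtrace_aggbuffer_t metastructure, its hash table and the dtrace_aggkey_t
 * entries grow downward from the end (tracked by dtagb_free). A drop is
 * recorded whenever storing a new key and its data would cause the two
 * regions to collide.
 */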
2529
2530 /*
2531 * Given consumer state, this routine finds a speculation in the INACTIVE
2532 * state and transitions it into the ACTIVE state. If there is no speculation
2533 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2534 * incremented -- it is up to the caller to take appropriate action.
2535 */
2536 static int
2537 dtrace_speculation(dtrace_state_t *state)
2538 {
2539 int i = 0;
2540 dtrace_speculation_state_t current;
2541 uint32_t *stat = &state->dts_speculations_unavail, count;
2542
2543 while (i < state->dts_nspeculations) {
2544 dtrace_speculation_t *spec = &state->dts_speculations[i];
2545
2546 current = spec->dtsp_state;
2547
2548 if (current != DTRACESPEC_INACTIVE) {
2549 if (current == DTRACESPEC_COMMITTINGMANY ||
2550 current == DTRACESPEC_COMMITTING ||
2551 current == DTRACESPEC_DISCARDING)
2552 stat = &state->dts_speculations_busy;
2553 i++;
2554 continue;
2555 }
2556
2557 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2558 current, DTRACESPEC_ACTIVE) == current)
2559 return (i + 1);
2560 }
2561
2562 /*
2563 * We couldn't find a speculation. If we found as much as a single
2564 * busy speculation buffer, we'll attribute this failure as "busy"
2565 * instead of "unavail".
2566 */
2567 do {
2568 count = *stat;
2569 } while (dtrace_cas32(stat, count, count + 1) != count);
2570
2571 return (0);
2572 }
2573
2574 /*
2575 * This routine commits an active speculation. If the specified speculation
2576 * is not in a valid state to perform a commit(), this routine will silently do
2577 * nothing. The state of the specified speculation is transitioned according
2578 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2579 */
2580 static void
2581 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2582 dtrace_specid_t which)
2583 {
2584 dtrace_speculation_t *spec;
2585 dtrace_buffer_t *src, *dest;
2586 uintptr_t daddr, saddr, dlimit;
2587 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2588 intptr_t offs;
2589
2590 if (which == 0)
2591 return;
2592
2593 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2594 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2595 return;
2596 }
2597
2598 spec = &state->dts_speculations[which - 1];
2599 src = &spec->dtsp_buffer[cpu];
2600 dest = &state->dts_buffer[cpu];
2601
2602 do {
2603 current = spec->dtsp_state;
2604
2605 if (current == DTRACESPEC_COMMITTINGMANY)
2606 break;
2607
2608 switch (current) {
2609 case DTRACESPEC_INACTIVE:
2610 case DTRACESPEC_DISCARDING:
2611 return;
2612
2613 case DTRACESPEC_COMMITTING:
2614 /*
2615 * This is only possible if we are (a) commit()'ing
2616 * without having done a prior speculate() on this CPU
2617 * and (b) racing with another commit() on a different
2618 * CPU. There's nothing to do -- we just assert that
2619 * our offset is 0.
2620 */
2621 ASSERT(src->dtb_offset == 0);
2622 return;
2623
2624 case DTRACESPEC_ACTIVE:
2625 new = DTRACESPEC_COMMITTING;
2626 break;
2627
2628 case DTRACESPEC_ACTIVEONE:
2629 /*
2630 * This speculation is active on one CPU. If our
2631 * buffer offset is non-zero, we know that the one CPU
2632 * must be us. Otherwise, we are committing on a
2633 * different CPU from the speculate(), and we must
2634 * rely on being asynchronously cleaned.
2635 */
2636 if (src->dtb_offset != 0) {
2637 new = DTRACESPEC_COMMITTING;
2638 break;
2639 }
2640 /*FALLTHROUGH*/
2641
2642 case DTRACESPEC_ACTIVEMANY:
2643 new = DTRACESPEC_COMMITTINGMANY;
2644 break;
2645
2646 default:
2647 ASSERT(0);
2648 }
2649 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2650 current, new) != current);
2651
2652 /*
2653 * We have set the state to indicate that we are committing this
2654 * speculation. Now reserve the necessary space in the destination
2655 * buffer.
2656 */
2657 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2658 sizeof (uint64_t), state, NULL)) < 0) {
2659 dtrace_buffer_drop(dest);
2660 goto out;
2661 }
2662
2663 /*
2664 * We have the space; copy the buffer across. (Note that this is a
2665 * highly suboptimal bcopy(); in the unlikely event that this becomes
2666 * a serious performance issue, a high-performance DTrace-specific
2667 * bcopy() should obviously be invented.)
2668 */
2669 daddr = (uintptr_t)dest->dtb_tomax + offs;
2670 dlimit = daddr + src->dtb_offset;
2671 saddr = (uintptr_t)src->dtb_tomax;
2672
2673 /*
2674 * First, the aligned portion.
2675 */
2676 while (dlimit - daddr >= sizeof (uint64_t)) {
2677 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2678
2679 daddr += sizeof (uint64_t);
2680 saddr += sizeof (uint64_t);
2681 }
2682
2683 /*
2684 * Now any left-over bit...
2685 */
2686 while (dlimit - daddr)
2687 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2688
2689 /*
2690 * Finally, commit the reserved space in the destination buffer.
2691 */
2692 dest->dtb_offset = offs + src->dtb_offset;
2693
2694 out:
2695 /*
2696 * If we're lucky enough to be the only active CPU on this speculation
2697 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2698 */
2699 if (current == DTRACESPEC_ACTIVE ||
2700 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2701 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2702 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2703 #pragma unused(rval) /* __APPLE__ */
2704
2705 ASSERT(rval == DTRACESPEC_COMMITTING);
2706 }
2707
2708 src->dtb_offset = 0;
2709 src->dtb_xamot_drops += src->dtb_drops;
2710 src->dtb_drops = 0;
2711 }
2712
2713 /*
2714 * This routine discards an active speculation. If the specified speculation
2715 * is not in a valid state to perform a discard(), this routine will silently
2716 * do nothing. The state of the specified speculation is transitioned
2717 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2718 */
2719 static void
2720 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2721 dtrace_specid_t which)
2722 {
2723 dtrace_speculation_t *spec;
2724 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2725 dtrace_buffer_t *buf;
2726
2727 if (which == 0)
2728 return;
2729
2730 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2731 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2732 return;
2733 }
2734
2735 spec = &state->dts_speculations[which - 1];
2736 buf = &spec->dtsp_buffer[cpu];
2737
2738 do {
2739 current = spec->dtsp_state;
2740
2741 switch (current) {
2742 case DTRACESPEC_INACTIVE:
2743 case DTRACESPEC_COMMITTINGMANY:
2744 case DTRACESPEC_COMMITTING:
2745 case DTRACESPEC_DISCARDING:
2746 return;
2747
2748 case DTRACESPEC_ACTIVE:
2749 case DTRACESPEC_ACTIVEMANY:
2750 new = DTRACESPEC_DISCARDING;
2751 break;
2752
2753 case DTRACESPEC_ACTIVEONE:
2754 if (buf->dtb_offset != 0) {
2755 new = DTRACESPEC_INACTIVE;
2756 } else {
2757 new = DTRACESPEC_DISCARDING;
2758 }
2759 break;
2760
2761 default:
2762 ASSERT(0);
2763 }
2764 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2765 current, new) != current);
2766
2767 buf->dtb_offset = 0;
2768 buf->dtb_drops = 0;
2769 }
2770
2771 /*
2772 * Note: not called from probe context. This function is called
2773 * asynchronously from cross call context to clean any speculations that are
2774 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
2775 * transitioned back to the INACTIVE state until all CPUs have cleaned the
2776 * speculation.
2777 */
2778 static void
2779 dtrace_speculation_clean_here(dtrace_state_t *state)
2780 {
2781 dtrace_icookie_t cookie;
2782 processorid_t cpu = CPU->cpu_id;
2783 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2784 dtrace_specid_t i;
2785
2786 cookie = dtrace_interrupt_disable();
2787
2788 if (dest->dtb_tomax == NULL) {
2789 dtrace_interrupt_enable(cookie);
2790 return;
2791 }
2792
2793 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2794 dtrace_speculation_t *spec = &state->dts_speculations[i];
2795 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2796
2797 if (src->dtb_tomax == NULL)
2798 continue;
2799
2800 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2801 src->dtb_offset = 0;
2802 continue;
2803 }
2804
2805 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2806 continue;
2807
2808 if (src->dtb_offset == 0)
2809 continue;
2810
2811 dtrace_speculation_commit(state, cpu, i + 1);
2812 }
2813
2814 dtrace_interrupt_enable(cookie);
2815 }
2816
2817 /*
2818 * Note: not called from probe context. This function is called
2819 * asynchronously (and at a regular interval) to clean any speculations that
2820 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
2821 * is work to be done, it cross calls all CPUs to perform that work;
2822 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
2823 * INACTIVE state until they have been cleaned by all CPUs.
2824 */
2825 static void
2826 dtrace_speculation_clean(dtrace_state_t *state)
2827 {
2828 int work = 0;
2829 uint32_t rv;
2830 dtrace_specid_t i;
2831
2832 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2833 dtrace_speculation_t *spec = &state->dts_speculations[i];
2834
2835 ASSERT(!spec->dtsp_cleaning);
2836
2837 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2838 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2839 continue;
2840
2841 work++;
2842 spec->dtsp_cleaning = 1;
2843 }
2844
2845 if (!work)
2846 return;
2847
2848 dtrace_xcall(DTRACE_CPUALL,
2849 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2850
2851 /*
2852 * We now know that all CPUs have committed or discarded their
2853 * speculation buffers, as appropriate. We can now set the state
2854 * to inactive.
2855 */
2856 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2857 dtrace_speculation_t *spec = &state->dts_speculations[i];
2858 dtrace_speculation_state_t current, new;
2859
2860 if (!spec->dtsp_cleaning)
2861 continue;
2862
2863 current = spec->dtsp_state;
2864 ASSERT(current == DTRACESPEC_DISCARDING ||
2865 current == DTRACESPEC_COMMITTINGMANY);
2866
2867 new = DTRACESPEC_INACTIVE;
2868
2869 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2870 ASSERT(rv == current);
2871 spec->dtsp_cleaning = 0;
2872 }
2873 }
2874
2875 /*
2876 * Called as part of a speculate() to get the speculative buffer associated
2877 * with a given speculation. Returns NULL if the specified speculation is not
2878 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
2879 * the active CPU is not the specified CPU -- the speculation will be
2880 * atomically transitioned into the ACTIVEMANY state.
2881 */
2882 static dtrace_buffer_t *
2883 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2884 dtrace_specid_t which)
2885 {
2886 dtrace_speculation_t *spec;
2887 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2888 dtrace_buffer_t *buf;
2889
2890 if (which == 0)
2891 return (NULL);
2892
2893 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2894 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2895 return (NULL);
2896 }
2897
2898 spec = &state->dts_speculations[which - 1];
2899 buf = &spec->dtsp_buffer[cpuid];
2900
2901 do {
2902 current = spec->dtsp_state;
2903
2904 switch (current) {
2905 case DTRACESPEC_INACTIVE:
2906 case DTRACESPEC_COMMITTINGMANY:
2907 case DTRACESPEC_DISCARDING:
2908 return (NULL);
2909
2910 case DTRACESPEC_COMMITTING:
2911 ASSERT(buf->dtb_offset == 0);
2912 return (NULL);
2913
2914 case DTRACESPEC_ACTIVEONE:
2915 /*
2916 * This speculation is currently active on one CPU.
2917 * Check the offset in the buffer; if it's non-zero,
2918 * that CPU must be us (and we leave the state alone).
2919 * If it's zero, assume that we're starting on a new
2920 * CPU -- and change the state to indicate that the
2921 * speculation is active on more than one CPU.
2922 */
2923 if (buf->dtb_offset != 0)
2924 return (buf);
2925
2926 new = DTRACESPEC_ACTIVEMANY;
2927 break;
2928
2929 case DTRACESPEC_ACTIVEMANY:
2930 return (buf);
2931
2932 case DTRACESPEC_ACTIVE:
2933 new = DTRACESPEC_ACTIVEONE;
2934 break;
2935
2936 default:
2937 ASSERT(0);
2938 }
2939 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2940 current, new) != current);
2941
2942 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2943 return (buf);
2944 }
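
/*
 * Editorial note (not part of the original source): taken together, the
 * routines above implement the speculation state machine described in
 * <sys/dtrace_impl.h>. In summary: dtrace_speculation() moves a speculation
 * from INACTIVE to ACTIVE; dtrace_speculation_buffer() refines ACTIVE to
 * ACTIVEONE on first use and ACTIVEONE to ACTIVEMANY when a second CPU
 * speculates; dtrace_speculation_commit() moves these to COMMITTING (the
 * single-CPU case, returned to INACTIVE once the copy completes) or to
 * COMMITTINGMANY; dtrace_speculation_discard() moves them to DISCARDING (or
 * directly back to INACTIVE for a single-CPU speculation); and
 * dtrace_speculation_clean() returns COMMITTINGMANY and DISCARDING
 * speculations to INACTIVE once every CPU has been cleaned.
 */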
2945
2946 /*
2947 * Return a string. In the event that the user lacks the privilege to access
2948 * arbitrary kernel memory, we copy the string out to scratch memory so that we
2949 * don't fail access checking.
2950 *
2951 * dtrace_dif_variable() uses this routine as a helper for various
2952 * builtin values such as 'execname' and 'probefunc.'
2953 */
2954 static
2955 uintptr_t
2956 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2957 dtrace_mstate_t *mstate)
2958 {
2959 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2960 uintptr_t ret;
2961 size_t strsz;
2962
2963 /*
2964 * The easy case: this probe is allowed to read all of memory, so
2965 * we can just return this as a vanilla pointer.
2966 */
2967 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2968 return (addr);
2969
2970 /*
2971 * This is the tougher case: we copy the string in question from
2972 * kernel memory into scratch memory and return it that way: this
2973 * ensures that we won't trip up when access checking tests the
2974 * BYREF return value.
2975 */
2976 strsz = dtrace_strlen((char *)addr, size) + 1;
2977
2978 if (mstate->dtms_scratch_ptr + strsz >
2979 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2980 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2981 return (0);
2982 }
2983
2984 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2985 strsz);
2986 ret = mstate->dtms_scratch_ptr;
2987 mstate->dtms_scratch_ptr += strsz;
2988 return (ret);
2989 }
2990
2991 /*
2992 * This function implements the DIF emulator's variable lookups. The emulator
2993 * passes a reserved variable identifier and optional built-in array index.
2994 */
2995 static uint64_t
2996 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2997 uint64_t ndx)
2998 {
2999 /*
3000 * If we're accessing one of the uncached arguments, we'll turn this
3001 * into a reference in the args array.
3002 */
3003 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3004 ndx = v - DIF_VAR_ARG0;
3005 v = DIF_VAR_ARGS;
3006 }
3007
3008 switch (v) {
3009 case DIF_VAR_ARGS:
3010 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3011 if (ndx >= sizeof (mstate->dtms_arg) /
3012 sizeof (mstate->dtms_arg[0])) {
3013 /*
3014 * APPLE NOTE: Account for introduction of __dtrace_probe()
3015 */
3016 int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3017 dtrace_provider_t *pv;
3018 uint64_t val;
3019
3020 pv = mstate->dtms_probe->dtpr_provider;
3021 if (pv->dtpv_pops.dtps_getargval != NULL)
3022 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3023 mstate->dtms_probe->dtpr_id,
3024 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3025 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3026 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3027 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3028 }
3029
3030 else
3031 val = dtrace_getarg(ndx, aframes);
3032
3033 /*
3034 * This is regrettably required to keep the compiler
3035 * from tail-optimizing the call to dtrace_getarg().
3036 * The condition always evaluates to true, but the
3037 * compiler has no way of figuring that out a priori.
3038 * (None of this would be necessary if the compiler
3039 * could be relied upon to _always_ tail-optimize
3040 * the call to dtrace_getarg() -- but it can't.)
3041 */
3042 if (mstate->dtms_probe != NULL)
3043 return (val);
3044
3045 ASSERT(0);
3046 }
3047
3048 return (mstate->dtms_arg[ndx]);
3049
3050 case DIF_VAR_UREGS: {
3051 thread_t thread;
3052
3053 if (!dtrace_priv_proc(state))
3054 return (0);
3055
3056 if ((thread = current_thread()) == NULL) {
3057 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3058 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3059 return (0);
3060 }
3061
3062 return (dtrace_getreg(find_user_regs(thread), ndx));
3063 }
3064
3065
3066 case DIF_VAR_CURTHREAD:
3067 if (!dtrace_priv_kernel(state))
3068 return (0);
3069
3070 return ((uint64_t)(uintptr_t)current_thread());
3071
3072 case DIF_VAR_TIMESTAMP:
3073 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3074 mstate->dtms_timestamp = dtrace_gethrtime();
3075 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3076 }
3077 return (mstate->dtms_timestamp);
3078
3079 case DIF_VAR_VTIMESTAMP:
3080 ASSERT(dtrace_vtime_references != 0);
3081 return (dtrace_get_thread_vtime(current_thread()));
3082
3083 case DIF_VAR_WALLTIMESTAMP:
3084 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3085 mstate->dtms_walltimestamp = dtrace_gethrestime();
3086 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3087 }
3088 return (mstate->dtms_walltimestamp);
3089
3090 case DIF_VAR_MACHTIMESTAMP:
3091 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3092 mstate->dtms_machtimestamp = mach_absolute_time();
3093 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3094 }
3095 return (mstate->dtms_machtimestamp);
3096
3097 case DIF_VAR_IPL:
3098 if (!dtrace_priv_kernel(state))
3099 return (0);
3100 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3101 mstate->dtms_ipl = dtrace_getipl();
3102 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3103 }
3104 return (mstate->dtms_ipl);
3105
3106 case DIF_VAR_EPID:
3107 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3108 return (mstate->dtms_epid);
3109
3110 case DIF_VAR_ID:
3111 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3112 return (mstate->dtms_probe->dtpr_id);
3113
3114 case DIF_VAR_STACKDEPTH:
3115 if (!dtrace_priv_kernel(state))
3116 return (0);
3117 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3118 /*
3119 * APPLE NOTE: Account for introduction of __dtrace_probe()
3120 */
3121 int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3122
3123 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3124 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3125 }
3126 return (mstate->dtms_stackdepth);
3127
3128 case DIF_VAR_USTACKDEPTH:
3129 if (!dtrace_priv_proc(state))
3130 return (0);
3131 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3132 /*
3133 * See comment in DIF_VAR_PID.
3134 */
3135 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3136 CPU_ON_INTR(CPU)) {
3137 mstate->dtms_ustackdepth = 0;
3138 } else {
3139 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3140 mstate->dtms_ustackdepth =
3141 dtrace_getustackdepth();
3142 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3143 }
3144 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3145 }
3146 return (mstate->dtms_ustackdepth);
3147
3148 case DIF_VAR_CALLER:
3149 if (!dtrace_priv_kernel(state))
3150 return (0);
3151 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3152 /*
3153 * APPLE NOTE: Account for introduction of __dtrace_probe()
3154 */
3155 int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3156
3157 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3158 /*
3159 * If this is an unanchored probe, we are
3160 * required to go through the slow path:
3161 * dtrace_caller() only guarantees correct
3162 * results for anchored probes.
3163 */
3164 pc_t caller[2];
3165
3166 dtrace_getpcstack(caller, 2, aframes,
3167 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3168 mstate->dtms_caller = caller[1];
3169 } else if ((mstate->dtms_caller =
3170 dtrace_caller(aframes)) == (uintptr_t)-1) {
3171 /*
3172 * We have failed to do this the quick way;
3173 * we must resort to the slower approach of
3174 * calling dtrace_getpcstack().
3175 */
3176 pc_t caller;
3177
3178 dtrace_getpcstack(&caller, 1, aframes, NULL);
3179 mstate->dtms_caller = caller;
3180 }
3181
3182 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3183 }
3184 return (mstate->dtms_caller);
3185
3186 case DIF_VAR_UCALLER:
3187 if (!dtrace_priv_proc(state))
3188 return (0);
3189
3190 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3191 uint64_t ustack[3];
3192
3193 /*
3194 * dtrace_getupcstack() fills in the first uint64_t
3195 * with the current PID. The second uint64_t will
3196 * be the program counter at user-level. The third
3197 * uint64_t will contain the caller, which is what
3198 * we're after.
3199 */
3200 ustack[2] = 0;
3201 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3202 dtrace_getupcstack(ustack, 3);
3203 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3204 mstate->dtms_ucaller = ustack[2];
3205 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3206 }
3207
3208 return (mstate->dtms_ucaller);
3209
3210 case DIF_VAR_PROBEPROV:
3211 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3212 return (dtrace_dif_varstr(
3213 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3214 state, mstate));
3215
3216 case DIF_VAR_PROBEMOD:
3217 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3218 return (dtrace_dif_varstr(
3219 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3220 state, mstate));
3221
3222 case DIF_VAR_PROBEFUNC:
3223 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3224 return (dtrace_dif_varstr(
3225 (uintptr_t)mstate->dtms_probe->dtpr_func,
3226 state, mstate));
3227
3228 case DIF_VAR_PROBENAME:
3229 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3230 return (dtrace_dif_varstr(
3231 (uintptr_t)mstate->dtms_probe->dtpr_name,
3232 state, mstate));
3233
3234 case DIF_VAR_PID:
3235 if (!dtrace_priv_proc_relaxed(state))
3236 return (0);
3237
3238 /*
3239 * Note that we are assuming that an unanchored probe is
3240 * always due to a high-level interrupt. (And we're assuming
3241 * that there is only a single high level interrupt.)
3242 */
3243 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3244 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3245 return 0;
3246
3247 return ((uint64_t)dtrace_proc_selfpid());
3248
3249 case DIF_VAR_PPID:
3250 if (!dtrace_priv_proc_relaxed(state))
3251 return (0);
3252
3253 /*
3254 * See comment in DIF_VAR_PID.
3255 */
3256 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3257 return (0);
3258
3259 return ((uint64_t)dtrace_proc_selfppid());
3260
3261 case DIF_VAR_TID:
3262 /* We do not need to check for null current_thread() */
3263 return thread_tid(current_thread()); /* globally unique */
3264
3265 case DIF_VAR_PTHREAD_SELF:
3266 if (!dtrace_priv_proc(state))
3267 return (0);
3268
3269 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3270 return 0;
3271
3272 case DIF_VAR_DISPATCHQADDR:
3273 if (!dtrace_priv_proc(state))
3274 return (0);
3275
3276 /* We do not need to check for null current_thread() */
3277 return thread_dispatchqaddr(current_thread());
3278
3279 case DIF_VAR_EXECNAME:
3280 {
3281 char *xname = (char *)mstate->dtms_scratch_ptr;
3282 size_t scratch_size = MAXCOMLEN+1;
3283
3284 /* The scratch allocation's lifetime is that of the clause. */
3285 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3286 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3287 return 0;
3288 }
3289
3290 if (!dtrace_priv_proc_relaxed(state))
3291 return (0);
3292
3293 mstate->dtms_scratch_ptr += scratch_size;
3294 proc_selfname( xname, MAXCOMLEN );
3295
3296 return ((uint64_t)(uintptr_t)xname);
3297 }
3298
3299
3300 case DIF_VAR_ZONENAME:
3301 {
3302 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3303 char *zname = (char *)mstate->dtms_scratch_ptr;
3304 size_t scratch_size = 6 + 1;
3305
3306 if (!dtrace_priv_proc(state))
3307 return (0);
3308
3309 /* The scratch allocation's lifetime is that of the clause. */
3310 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3311 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3312 return 0;
3313 }
3314
3315 mstate->dtms_scratch_ptr += scratch_size;
3316
3317 /* The kernel does not provide zonename; it will always return 'global'. */
3318 strlcpy(zname, "global", scratch_size);
3319
3320 return ((uint64_t)(uintptr_t)zname);
3321 }
3322
3323 case DIF_VAR_UID:
3324 if (!dtrace_priv_proc_relaxed(state))
3325 return (0);
3326
3327 /*
3328 * See comment in DIF_VAR_PID.
3329 */
3330 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3331 return (0);
3332
3333 return ((uint64_t) dtrace_proc_selfruid());
3334
3335 case DIF_VAR_GID:
3336 if (!dtrace_priv_proc(state))
3337 return (0);
3338
3339 /*
3340 * See comment in DIF_VAR_PID.
3341 */
3342 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3343 return (0);
3344
3345 if (dtrace_CRED() != NULL)
3346 /* Credential does not require lazy initialization. */
3347 return ((uint64_t)kauth_getgid());
3348 else {
3349 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3350 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3351 return -1ULL;
3352 }
3353
3354 case DIF_VAR_ERRNO: {
3355 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3356 if (!dtrace_priv_proc(state))
3357 return (0);
3358
3359 /*
3360 * See comment in DIF_VAR_PID.
3361 */
3362 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3363 return (0);
3364
3365 if (uthread)
3366 return (uint64_t)uthread->t_dtrace_errno;
3367 else {
3368 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3369 return -1ULL;
3370 }
3371 }
3372
3373 default:
3374 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3375 return (0);
3376 }
3377 }
3378
3379 /*
3380 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3381 * Notice that we don't bother validating the proper number of arguments or
3382 * their types in the tuple stack. This isn't needed because all argument
3383 * interpretation is safe because of our load safety -- the worst that can
3384 * happen is that a bogus program can obtain bogus results.
3385 */
3386 static void
3387 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3388 dtrace_key_t *tupregs, int nargs,
3389 dtrace_mstate_t *mstate, dtrace_state_t *state)
3390 {
3391 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3392 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3393 dtrace_vstate_t *vstate = &state->dts_vstate;
3394
3395 #if !defined(__APPLE__)
3396 union {
3397 mutex_impl_t mi;
3398 uint64_t mx;
3399 } m;
3400
3401 union {
3402 krwlock_t ri;
3403 uintptr_t rw;
3404 } r;
3405 #else
3406 /* FIXME: awaits lock/mutex work */
3407 #endif /* __APPLE__ */
3408
3409 switch (subr) {
3410 case DIF_SUBR_RAND:
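/*
 * Editorial note (not part of the original source): this is a cheap
 * linear-congruential-style value derived from the high-resolution
 * timestamp -- inexpensive and safe in probe context, but in no way a
 * cryptographic-quality source of randomness.
 */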
3411 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3412 break;
3413
3414 #if !defined(__APPLE__)
3415 case DIF_SUBR_MUTEX_OWNED:
3416 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3417 mstate, vstate)) {
3418 regs[rd] = 0;
3419 break;
3420 }
3421
3422 m.mx = dtrace_load64(tupregs[0].dttk_value);
3423 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3424 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3425 else
3426 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3427 break;
3428
3429 case DIF_SUBR_MUTEX_OWNER:
3430 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3431 mstate, vstate)) {
3432 regs[rd] = 0;
3433 break;
3434 }
3435
3436 m.mx = dtrace_load64(tupregs[0].dttk_value);
3437 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3438 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3439 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3440 else
3441 regs[rd] = 0;
3442 break;
3443
3444 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3445 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3446 mstate, vstate)) {
3447 regs[rd] = 0;
3448 break;
3449 }
3450
3451 m.mx = dtrace_load64(tupregs[0].dttk_value);
3452 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3453 break;
3454
3455 case DIF_SUBR_MUTEX_TYPE_SPIN:
3456 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3457 mstate, vstate)) {
3458 regs[rd] = 0;
3459 break;
3460 }
3461
3462 m.mx = dtrace_load64(tupregs[0].dttk_value);
3463 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3464 break;
3465
3466 case DIF_SUBR_RW_READ_HELD: {
3467 uintptr_t tmp;
3468
3469 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3470 mstate, vstate)) {
3471 regs[rd] = 0;
3472 break;
3473 }
3474
3475 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3476 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3477 break;
3478 }
3479
3480 case DIF_SUBR_RW_WRITE_HELD:
3481 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3482 mstate, vstate)) {
3483 regs[rd] = 0;
3484 break;
3485 }
3486
3487 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3488 regs[rd] = _RW_WRITE_HELD(&r.ri);
3489 break;
3490
3491 case DIF_SUBR_RW_ISWRITER:
3492 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3493 mstate, vstate)) {
3494 regs[rd] = 0;
3495 break;
3496 }
3497
3498 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3499 regs[rd] = _RW_ISWRITER(&r.ri);
3500 break;
3501 #else
3502 /* FIXME: awaits lock/mutex work */
3503 #endif /* __APPLE__ */
3504
3505 case DIF_SUBR_BCOPY: {
3506 /*
3507 * We need to be sure that the destination is in the scratch
3508 * region -- no other region is allowed.
3509 */
3510 uintptr_t src = tupregs[0].dttk_value;
3511 uintptr_t dest = tupregs[1].dttk_value;
3512 size_t size = tupregs[2].dttk_value;
3513
3514 if (!dtrace_inscratch(dest, size, mstate)) {
3515 *flags |= CPU_DTRACE_BADADDR;
3516 *illval = regs[rd];
3517 break;
3518 }
3519
3520 if (!dtrace_canload(src, size, mstate, vstate)) {
3521 regs[rd] = 0;
3522 break;
3523 }
3524
3525 dtrace_bcopy((void *)src, (void *)dest, size);
3526 break;
3527 }
3528
3529 case DIF_SUBR_ALLOCA:
3530 case DIF_SUBR_COPYIN: {
3531 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3532 uint64_t size =
3533 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3534 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3535
3536 /*
3537 * This action doesn't require any credential checks since
3538 * probes will not activate in user contexts to which the
3539 * enabling user does not have permissions.
3540 */
3541
3542 /*
3543 * Rounding up the user allocation size could have overflowed
3544 * a large, bogus allocation (like -1ULL) to 0.
3545 */
3546 if (scratch_size < size ||
3547 !DTRACE_INSCRATCH(mstate, scratch_size)) {
3548 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3549 regs[rd] = 0;
3550 break;
3551 }
3552
3553 if (subr == DIF_SUBR_COPYIN) {
3554 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3555 if (dtrace_priv_proc(state))
3556 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3557 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3558 }
3559
3560 mstate->dtms_scratch_ptr += scratch_size;
3561 regs[rd] = dest;
3562 break;
3563 }
3564
3565 case DIF_SUBR_COPYINTO: {
3566 uint64_t size = tupregs[1].dttk_value;
3567 uintptr_t dest = tupregs[2].dttk_value;
3568
3569 /*
3570 * This action doesn't require any credential checks since
3571 * probes will not activate in user contexts to which the
3572 * enabling user does not have permissions.
3573 */
3574 if (!dtrace_inscratch(dest, size, mstate)) {
3575 *flags |= CPU_DTRACE_BADADDR;
3576 *illval = regs[rd];
3577 break;
3578 }
3579
3580 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3581 if (dtrace_priv_proc(state))
3582 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3583 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3584 break;
3585 }
3586
3587 case DIF_SUBR_COPYINSTR: {
3588 uintptr_t dest = mstate->dtms_scratch_ptr;
3589 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3590
3591 if (nargs > 1 && tupregs[1].dttk_value < size)
3592 size = tupregs[1].dttk_value + 1;
3593
3594 /*
3595 * This action doesn't require any credential checks since
3596 * probes will not activate in user contexts to which the
3597 * enabling user does not have permissions.
3598 */
3599 if (!DTRACE_INSCRATCH(mstate, size)) {
3600 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3601 regs[rd] = 0;
3602 break;
3603 }
3604
3605 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3606 if (dtrace_priv_proc(state))
3607 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3608 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3609
3610 ((char *)dest)[size - 1] = '\0';
3611 mstate->dtms_scratch_ptr += size;
3612 regs[rd] = dest;
3613 break;
3614 }
3615
3616 case DIF_SUBR_MSGSIZE:
3617 case DIF_SUBR_MSGDSIZE: {
3618 /* Darwin does not implement SysV streams messages */
3619 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3620 regs[rd] = 0;
3621 break;
3622 }
3623
3624 case DIF_SUBR_PROGENYOF: {
3625 pid_t pid = tupregs[0].dttk_value;
3626 struct proc *p = current_proc();
3627 int rval = 0, lim = nprocs;
3628
3629 while(p && (lim-- > 0)) {
3630 pid_t ppid;
3631
3632 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3633 if (*flags & CPU_DTRACE_FAULT)
3634 break;
3635
3636 if (ppid == pid) {
3637 rval = 1;
3638 break;
3639 }
3640
3641 if (ppid == 0)
3642 break; /* Can't climb process tree any further. */
3643
3644 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3645 if (*flags & CPU_DTRACE_FAULT)
3646 break;
3647 }
3648
3649 regs[rd] = rval;
3650 break;
3651 }
3652
3653 case DIF_SUBR_SPECULATION:
3654 regs[rd] = dtrace_speculation(state);
3655 break;
3656
3657
3658 case DIF_SUBR_COPYOUT: {
3659 uintptr_t kaddr = tupregs[0].dttk_value;
3660 user_addr_t uaddr = tupregs[1].dttk_value;
3661 uint64_t size = tupregs[2].dttk_value;
3662
3663 if (!dtrace_destructive_disallow &&
3664 dtrace_priv_proc_control(state) &&
3665 !dtrace_istoxic(kaddr, size)) {
3666 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3667 dtrace_copyout(kaddr, uaddr, size, flags);
3668 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3669 }
3670 break;
3671 }
3672
3673 case DIF_SUBR_COPYOUTSTR: {
3674 uintptr_t kaddr = tupregs[0].dttk_value;
3675 user_addr_t uaddr = tupregs[1].dttk_value;
3676 uint64_t size = tupregs[2].dttk_value;
3677
3678 if (!dtrace_destructive_disallow &&
3679 dtrace_priv_proc_control(state) &&
3680 !dtrace_istoxic(kaddr, size)) {
3681 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3682 dtrace_copyoutstr(kaddr, uaddr, size, flags);
3683 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3684 }
3685 break;
3686 }
3687
3688 case DIF_SUBR_STRLEN: {
3689 size_t sz;
3690 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3691 sz = dtrace_strlen((char *)addr,
3692 state->dts_options[DTRACEOPT_STRSIZE]);
3693
3694 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3695 regs[rd] = 0;
3696 break;
3697 }
3698
3699 regs[rd] = sz;
3700
3701 break;
3702 }
3703
3704 case DIF_SUBR_STRCHR:
3705 case DIF_SUBR_STRRCHR: {
3706 /*
3707 * We're going to iterate over the string looking for the
3708 * specified character. We will iterate until we have reached
3709 * the string length or we have found the character. If this
3710 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3711 * of the specified character instead of the first.
3712 */
3713 uintptr_t saddr = tupregs[0].dttk_value;
3714 uintptr_t addr = tupregs[0].dttk_value;
3715 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
3716 char c, target = (char)tupregs[1].dttk_value;
3717
3718 for (regs[rd] = 0; addr < limit; addr++) {
3719 if ((c = dtrace_load8(addr)) == target) {
3720 regs[rd] = addr;
3721
3722 if (subr == DIF_SUBR_STRCHR)
3723 break;
3724 }
3725
3726 if (c == '\0')
3727 break;
3728 }
3729
3730 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
3731 regs[rd] = 0;
3732 break;
3733 }
3734
3735 break;
3736 }
3737
3738 case DIF_SUBR_STRSTR:
3739 case DIF_SUBR_INDEX:
3740 case DIF_SUBR_RINDEX: {
3741 /*
3742 * We're going to iterate over the string looking for the
3743 * specified string. We will iterate until we have reached
3744 * the string length or we have found the string. (Yes, this
3745 * is done in the most naive way possible -- but considering
3746 * that the string we're searching for is likely to be
3747 * relatively short, the complexity of Rabin-Karp or similar
3748 * hardly seems merited.)
3749 */
3750 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
3751 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
3752 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3753 size_t len = dtrace_strlen(addr, size);
3754 size_t sublen = dtrace_strlen(substr, size);
3755 char *limit = addr + len, *orig = addr;
3756 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
3757 int inc = 1;
3758
3759 regs[rd] = notfound;
3760
3761 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
3762 regs[rd] = 0;
3763 break;
3764 }
3765
3766 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
3767 vstate)) {
3768 regs[rd] = 0;
3769 break;
3770 }
3771
3772 /*
3773 * strstr() and index()/rindex() have similar semantics if
3774 * both strings are the empty string: strstr() returns a
3775 * pointer to the (empty) string, and index() and rindex()
3776 * both return index 0 (regardless of any position argument).
3777 */
3778 if (sublen == 0 && len == 0) {
3779 if (subr == DIF_SUBR_STRSTR)
3780 regs[rd] = (uintptr_t)addr;
3781 else
3782 regs[rd] = 0;
3783 break;
3784 }
3785
3786 if (subr != DIF_SUBR_STRSTR) {
3787 if (subr == DIF_SUBR_RINDEX) {
3788 limit = orig - 1;
3789 addr += len;
3790 inc = -1;
3791 }
3792
3793 /*
3794 * Both index() and rindex() take an optional position
3795 * argument that denotes the starting position.
3796 */
3797 if (nargs == 3) {
3798 int64_t pos = (int64_t)tupregs[2].dttk_value;
3799
3800 /*
3801 * If the position argument to index() is
3802 * negative, Perl implicitly clamps it at
3803 * zero. This semantic is a little surprising
3804 * given the special meaning of negative
3805 * positions to similar Perl functions like
3806 * substr(), but it appears to reflect a
3807 * notion that index() can start from a
3808 * negative index and increment its way up to
3809 * the string. Given this notion, Perl's
3810 * rindex() is at least self-consistent in
3811 * that it implicitly clamps positions greater
3812 * than the string length to be the string
3813 * length. Where Perl completely loses
3814 * coherence, however, is when the specified
3815 * substring is the empty string (""). In
3816 * this case, even if the position is
3817 * negative, rindex() returns 0 -- and even if
3818 * the position is greater than the length,
3819 * index() returns the string length. These
3820 * semantics violate the notion that index()
3821 * should never return a value less than the
3822 * specified position and that rindex() should
3823 * never return a value greater than the
3824 * specified position. (One assumes that
3825 * these semantics are artifacts of Perl's
3826 * implementation and not the results of
3827 * deliberate design -- it beggars belief that
3828 * even Larry Wall could desire such oddness.)
3829 * While in the abstract one would wish for
3830 * consistent position semantics across
3831 * substr(), index() and rindex() -- or at the
3832 * very least self-consistent position
3833 * semantics for index() and rindex() -- we
3834 * instead opt to keep with the extant Perl
3835 * semantics, in all their broken glory. (Do
3836 * we have more desire to maintain Perl's
3837 * semantics than Perl does? Probably.)
3838 */
3839 if (subr == DIF_SUBR_RINDEX) {
3840 if (pos < 0) {
3841 if (sublen == 0)
3842 regs[rd] = 0;
3843 break;
3844 }
3845
3846 if ((size_t)pos > len)
3847 pos = len;
3848 } else {
3849 if (pos < 0)
3850 pos = 0;
3851
3852 if ((size_t)pos >= len) {
3853 if (sublen == 0)
3854 regs[rd] = len;
3855 break;
3856 }
3857 }
3858
3859 addr = orig + pos;
3860 }
3861 }
3862
3863 for (regs[rd] = notfound; addr != limit; addr += inc) {
3864 if (dtrace_strncmp(addr, substr, sublen) == 0) {
3865 if (subr != DIF_SUBR_STRSTR) {
3866 /*
3867 * As D index() and rindex() are
3868 * modeled on Perl (and not on awk),
3869 * we return a zero-based (and not a
3870 * one-based) index. (For you Perl
3871 * weenies: no, we're not going to add
3872 * $[ -- and shouldn't you be at a con
3873 * or something?)
3874 */
3875 regs[rd] = (uintptr_t)(addr - orig);
3876 break;
3877 }
3878
3879 ASSERT(subr == DIF_SUBR_STRSTR);
3880 regs[rd] = (uintptr_t)addr;
3881 break;
3882 }
3883 }
3884
3885 break;
3886 }
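/*
 * Worked examples (illustrative, derived from the clamping logic above)
 * of the Perl-compatible position handling in D:
 *
 *	strstr("frobnicate", "nic")	-> pointer to "nicate"
 *	index("frobnicate", "nic")	-> 4	(zero-based, unlike awk)
 *	index("frobnicate", "xyz")	-> -1	(not found)
 *	index("frobnicate", "", 99)	-> 10	(empty substring; a position
 *						past the end clamps to the
 *						string length)
 *	rindex("frobnicate", "", -5)	-> 0	(empty substring; a negative
 *						position clamps to zero)
 */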
3887
3888 case DIF_SUBR_STRTOK: {
3889 uintptr_t addr = tupregs[0].dttk_value;
3890 uintptr_t tokaddr = tupregs[1].dttk_value;
3891 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3892 uintptr_t limit, toklimit = tokaddr + size;
3893 char *dest = (char *)mstate->dtms_scratch_ptr;
3894 uint8_t c = '\0', tokmap[32]; /* 256 / 8 */
3895 uint64_t i = 0;
3896
3897 /*
3898 * Check both the token buffer and (later) the input buffer,
3899 * since both could be non-scratch addresses.
3900 */
3901 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
3902 regs[rd] = 0;
3903 break;
3904 }
3905
3906 if (!DTRACE_INSCRATCH(mstate, size)) {
3907 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3908 regs[rd] = 0;
3909 break;
3910 }
3911
3912 if (addr == 0) {
3913 /*
3914 * If the address specified is NULL, we use our saved
3915 * strtok pointer from the mstate. Note that this
3916 * means that the saved strtok pointer is _only_
3917 * valid within multiple enablings of the same probe --
3918 * it behaves like an implicit clause-local variable.
3919 */
3920 addr = mstate->dtms_strtok;
3921 } else {
3922 /*
3923 * If the user-specified address is non-NULL we must
3924 * access check it. This is the only time we have
3925 * a chance to do so, since this address may reside
3926 * in the string table of this clause -- future calls
3927 * (when we fetch addr from mstate->dtms_strtok)
3928 * would fail this access check.
3929 */
3930 if (!dtrace_strcanload(addr, size, mstate, vstate)) {
3931 regs[rd] = 0;
3932 break;
3933 }
3934 }
3935
3936 /*
3937 * First, zero the token map, and then process the token
3938 * string -- setting a bit in the map for every character
3939 * found in the token string.
3940 */
3941 for (i = 0; i < (int)sizeof (tokmap); i++)
3942 tokmap[i] = 0;
3943
3944 for (; tokaddr < toklimit; tokaddr++) {
3945 if ((c = dtrace_load8(tokaddr)) == '\0')
3946 break;
3947
3948 ASSERT((c >> 3) < sizeof (tokmap));
3949 tokmap[c >> 3] |= (1 << (c & 0x7));
3950 }
3951
3952 for (limit = addr + size; addr < limit; addr++) {
3953 /*
3954 * We're looking for a character that is _not_ contained
3955 * in the token string.
3956 */
3957 if ((c = dtrace_load8(addr)) == '\0')
3958 break;
3959
3960 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
3961 break;
3962 }
3963
3964 if (c == '\0') {
3965 /*
3966 * We reached the end of the string without finding
3967 * any character that was not in the token string.
3968 * We return NULL in this case, and we set the saved
3969 * address to NULL as well.
3970 */
3971 regs[rd] = 0;
3972 mstate->dtms_strtok = 0;
3973 break;
3974 }
3975
3976 /*
3977 * From here on, we're copying into the destination string.
3978 */
3979 for (i = 0; addr < limit && i < size - 1; addr++) {
3980 if ((c = dtrace_load8(addr)) == '\0')
3981 break;
3982
3983 if (tokmap[c >> 3] & (1 << (c & 0x7)))
3984 break;
3985
3986 ASSERT(i < size);
3987 dest[i++] = c;
3988 }
3989
3990 ASSERT(i < size);
3991 dest[i] = '\0';
3992 regs[rd] = (uintptr_t)dest;
3993 mstate->dtms_scratch_ptr += size;
3994 mstate->dtms_strtok = addr;
3995 break;
3996 }
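/*
 * Illustrative sketch (not part of the original implementation): a
 * user-space analogue of the 256-bit token map built above.  Each byte
 * of the token string sets one bit, so the scan loops can test set
 * membership with a single mask:
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static int
 *	is_token_char(const uint8_t tokmap[32], char ch)
 *	{
 *		uint8_t c = (uint8_t)ch;
 *		return ((tokmap[c >> 3] & (1 << (c & 0x7))) != 0);
 *	}
 *
 *	// Build the map from a delimiter string, e.g. " /:".
 *	uint8_t tokmap[32];
 *	memset(tokmap, 0, sizeof (tokmap));
 *	for (const char *t = " /:"; *t != '\0'; t++)
 *		tokmap[(uint8_t)*t >> 3] |= (1 << ((uint8_t)*t & 0x7));
 */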
3997
3998 case DIF_SUBR_SUBSTR: {
3999 uintptr_t s = tupregs[0].dttk_value;
4000 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4001 char *d = (char *)mstate->dtms_scratch_ptr;
4002 int64_t index = (int64_t)tupregs[1].dttk_value;
4003 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4004 size_t len = dtrace_strlen((char *)s, size);
4005 int64_t i = 0;
4006
4007 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4008 regs[rd] = 0;
4009 break;
4010 }
4011
4012 if (!DTRACE_INSCRATCH(mstate, size)) {
4013 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4014 regs[rd] = 0;
4015 break;
4016 }
4017
4018 if (nargs <= 2)
4019 remaining = (int64_t)size;
4020
4021 if (index < 0) {
4022 index += len;
4023
4024 if (index < 0 && index + remaining > 0) {
4025 remaining += index;
4026 index = 0;
4027 }
4028 }
4029
4030 if ((size_t)index >= len || index < 0) {
4031 remaining = 0;
4032 } else if (remaining < 0) {
4033 remaining += len - index;
4034 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4035 remaining = size - index;
4036 }
4037
4038 for (i = 0; i < remaining; i++) {
4039 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4040 break;
4041 }
4042
4043 d[i] = '\0';
4044
4045 mstate->dtms_scratch_ptr += size;
4046 regs[rd] = (uintptr_t)d;
4047 break;
4048 }
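/*
 * Worked examples (illustrative) of the index/remaining clamping
 * performed above, with "dtrace" as the subject string (len == 6):
 *
 *	substr("dtrace", 2)	-> "race"	(no length: copy to the NUL)
 *	substr("dtrace", -3, 2)	-> "ac"		(index -3 + len == 3)
 *	substr("dtrace", 9, 4)	-> ""		(index beyond the string
 *						clamps remaining to 0)
 *	substr("dtrace", 1, -2)	-> "tra"	(negative length counts back
 *						from the end of the string)
 */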
4049
4050 case DIF_SUBR_GETMAJOR:
4051 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4052 break;
4053
4054 case DIF_SUBR_GETMINOR:
4055 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4056 break;
4057
4058 case DIF_SUBR_DDI_PATHNAME: {
4059 /* APPLE NOTE: currently unsupported on Darwin */
4060 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4061 regs[rd] = 0;
4062 break;
4063 }
4064
4065 case DIF_SUBR_STRJOIN: {
4066 char *d = (char *)mstate->dtms_scratch_ptr;
4067 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4068 uintptr_t s1 = tupregs[0].dttk_value;
4069 uintptr_t s2 = tupregs[1].dttk_value;
4070 uint64_t i = 0;
4071
4072 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4073 !dtrace_strcanload(s2, size, mstate, vstate)) {
4074 regs[rd] = 0;
4075 break;
4076 }
4077
4078 if (!DTRACE_INSCRATCH(mstate, size)) {
4079 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4080 regs[rd] = 0;
4081 break;
4082 }
4083
4084 for (;;) {
4085 if (i >= size) {
4086 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4087 regs[rd] = 0;
4088 break;
4089 }
4090
4091 if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4092 i--;
4093 break;
4094 }
4095 }
4096
4097 for (;;) {
4098 if (i >= size) {
4099 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4100 regs[rd] = 0;
4101 break;
4102 }
4103
4104 if ((d[i++] = dtrace_load8(s2++)) == '\0')
4105 break;
4106 }
4107
4108 if (i < size) {
4109 mstate->dtms_scratch_ptr += i;
4110 regs[rd] = (uintptr_t)d;
4111 }
4112
4113 break;
4114 }
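/*
 * Worked example (illustrative): the "i--" in the first copy loop above
 * backs up over the terminating NUL of the first string so the second
 * string is appended in place, e.g.:
 *
 *	strjoin("dt", "race")	-> "dtrace"
 *	strjoin("", "race")	-> "race"	(i backs up to 0 immediately)
 */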
4115
4116 case DIF_SUBR_LLTOSTR: {
4117 int64_t i = (int64_t)tupregs[0].dttk_value;
4118 int64_t val = i < 0 ? i * -1 : i;
4119 uint64_t size = 22; /* enough room for 2^64 in decimal */
4120 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4121
4122 if (!DTRACE_INSCRATCH(mstate, size)) {
4123 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4124 regs[rd] = 0;
4125 break;
4126 }
4127
4128 for (*end-- = '\0'; val; val /= 10)
4129 *end-- = '0' + (val % 10);
4130
4131 if (i == 0)
4132 *end-- = '0';
4133
4134 if (i < 0)
4135 *end-- = '-';
4136
4137 regs[rd] = (uintptr_t)end + 1;
4138 mstate->dtms_scratch_ptr += size;
4139 break;
4140 }
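/*
 * Illustrative sketch (not part of the original implementation): the
 * conversion above writes digits backwards from the end of a fixed
 * 22-byte window (20 decimal digits for 2^64, an optional sign, and the
 * NUL), then returns a pointer to the first character actually written.
 * A user-space analogue, assuming only <stdint.h>:
 *
 *	char buf[22], *end = &buf[21];
 *	int64_t i = -42, val = (i < 0) ? -i : i;
 *
 *	for (*end-- = '\0'; val; val /= 10)
 *		*end-- = '0' + (val % 10);
 *	if (i == 0)
 *		*end-- = '0';
 *	if (i < 0)
 *		*end-- = '-';
 *	// end + 1 now points at "-42"
 */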
4141
4142 case DIF_SUBR_HTONS:
4143 case DIF_SUBR_NTOHS:
4144 #ifdef _BIG_ENDIAN
4145 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4146 #else
4147 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4148 #endif
4149 break;
4150
4151
4152 case DIF_SUBR_HTONL:
4153 case DIF_SUBR_NTOHL:
4154 #ifdef _BIG_ENDIAN
4155 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4156 #else
4157 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4158 #endif
4159 break;
4160
4161
4162 case DIF_SUBR_HTONLL:
4163 case DIF_SUBR_NTOHLL:
4164 #ifdef _BIG_ENDIAN
4165 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4166 #else
4167 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4168 #endif
4169 break;
4170
4171
4172 case DIF_SUBR_DIRNAME:
4173 case DIF_SUBR_BASENAME: {
4174 char *dest = (char *)mstate->dtms_scratch_ptr;
4175 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4176 uintptr_t src = tupregs[0].dttk_value;
4177 int i, j, len = dtrace_strlen((char *)src, size);
4178 int lastbase = -1, firstbase = -1, lastdir = -1;
4179 int start, end;
4180
4181 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4182 regs[rd] = 0;
4183 break;
4184 }
4185
4186 if (!DTRACE_INSCRATCH(mstate, size)) {
4187 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4188 regs[rd] = 0;
4189 break;
4190 }
4191
4192 /*
4193 * The basename and dirname for a zero-length string are
4194 * defined to be "."
4195 */
4196 if (len == 0) {
4197 len = 1;
4198 src = (uintptr_t)".";
4199 }
4200
4201 /*
4202 * Start from the back of the string, moving back toward the
4203 * front until we see a character that isn't a slash. That
4204 * character is the last character in the basename.
4205 */
4206 for (i = len - 1; i >= 0; i--) {
4207 if (dtrace_load8(src + i) != '/')
4208 break;
4209 }
4210
4211 if (i >= 0)
4212 lastbase = i;
4213
4214 /*
4215 * Starting from the last character in the basename, move
4216 * towards the front until we find a slash. The character
4217 * that we processed immediately before that is the first
4218 * character in the basename.
4219 */
4220 for (; i >= 0; i--) {
4221 if (dtrace_load8(src + i) == '/')
4222 break;
4223 }
4224
4225 if (i >= 0)
4226 firstbase = i + 1;
4227
4228 /*
4229 * Now keep going until we find a non-slash character. That
4230 * character is the last character in the dirname.
4231 */
4232 for (; i >= 0; i--) {
4233 if (dtrace_load8(src + i) != '/')
4234 break;
4235 }
4236
4237 if (i >= 0)
4238 lastdir = i;
4239
4240 ASSERT(!(lastbase == -1 && firstbase != -1));
4241 ASSERT(!(firstbase == -1 && lastdir != -1));
4242
4243 if (lastbase == -1) {
4244 /*
4245 * We didn't find a non-slash character. We know that
4246 * the length is non-zero, so the whole string must be
4247 * slashes. In either the dirname or the basename
4248 * case, we return '/'.
4249 */
4250 ASSERT(firstbase == -1);
4251 firstbase = lastbase = lastdir = 0;
4252 }
4253
4254 if (firstbase == -1) {
4255 /*
4256 * The entire string consists only of a basename
4257 * component. If we're looking for dirname, we need
4258 * to change our string to be just "."; if we're
4259 * looking for a basename, we'll just set the first
4260 * character of the basename to be 0.
4261 */
4262 if (subr == DIF_SUBR_DIRNAME) {
4263 ASSERT(lastdir == -1);
4264 src = (uintptr_t)".";
4265 lastdir = 0;
4266 } else {
4267 firstbase = 0;
4268 }
4269 }
4270
4271 if (subr == DIF_SUBR_DIRNAME) {
4272 if (lastdir == -1) {
4273 /*
4274 * We know that we have a slash in the name --
4275 * or lastdir would be set to 0, above. And
4276 * because lastdir is -1, we know that this
4277 * slash must be the first character. (That
4278 * is, the full string must be of the form
4279 * "/basename".) In this case, the last
4280 * character of the directory name is 0.
4281 */
4282 lastdir = 0;
4283 }
4284
4285 start = 0;
4286 end = lastdir;
4287 } else {
4288 ASSERT(subr == DIF_SUBR_BASENAME);
4289 ASSERT(firstbase != -1 && lastbase != -1);
4290 start = firstbase;
4291 end = lastbase;
4292 }
4293
4294 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4295 dest[j] = dtrace_load8(src + i);
4296
4297 dest[j] = '\0';
4298 regs[rd] = (uintptr_t)dest;
4299 mstate->dtms_scratch_ptr += size;
4300 break;
4301 }
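/*
 * Worked example (illustrative) of the three backward scans above,
 * using src = "/usr/lib//" (len == 10):
 *
 *	scan 1: skip trailing slashes		-> lastbase  == 7  ('b')
 *	scan 2: back up to the next slash	-> firstbase == 5  ('l')
 *	scan 3: skip that run of slashes	-> lastdir   == 3  ('r')
 *
 *	basename("/usr/lib//")	-> "lib"	(chars firstbase..lastbase)
 *	dirname("/usr/lib//")	-> "/usr"	(chars 0..lastdir)
 */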
4302
4303 case DIF_SUBR_CLEANPATH: {
4304 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4305 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4306 uintptr_t src = tupregs[0].dttk_value;
4307 int i = 0, j = 0;
4308
4309 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4310 regs[rd] = 0;
4311 break;
4312 }
4313
4314 if (!DTRACE_INSCRATCH(mstate, size)) {
4315 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4316 regs[rd] = 0;
4317 break;
4318 }
4319
4320 /*
4321 * Move forward, loading each character.
4322 */
4323 do {
4324 c = dtrace_load8(src + i++);
4325 next:
4326 if ((uint64_t)(j + 5) >= size) /* 5 = room for "/..c" plus NUL */
4327 break;
4328
4329 if (c != '/') {
4330 dest[j++] = c;
4331 continue;
4332 }
4333
4334 c = dtrace_load8(src + i++);
4335
4336 if (c == '/') {
4337 /*
4338 * We have two slashes -- we can just advance
4339 * to the next character.
4340 */
4341 goto next;
4342 }
4343
4344 if (c != '.') {
4345 /*
4346 * This is not "." and it's not ".." -- we can
4347 * just store the "/" and this character and
4348 * drive on.
4349 */
4350 dest[j++] = '/';
4351 dest[j++] = c;
4352 continue;
4353 }
4354
4355 c = dtrace_load8(src + i++);
4356
4357 if (c == '/') {
4358 /*
4359 * This is a "/./" component. We're not going
4360 * to store anything in the destination buffer;
4361 * we're just going to go to the next component.
4362 */
4363 goto next;
4364 }
4365
4366 if (c != '.') {
4367 /*
4368 * This is not ".." -- we can just store the
4369 * "/." and this character and continue
4370 * processing.
4371 */
4372 dest[j++] = '/';
4373 dest[j++] = '.';
4374 dest[j++] = c;
4375 continue;
4376 }
4377
4378 c = dtrace_load8(src + i++);
4379
4380 if (c != '/' && c != '\0') {
4381 /*
4382 * This is not ".." -- it's "..[mumble]".
4383 * We'll store the "/.." and this character
4384 * and continue processing.
4385 */
4386 dest[j++] = '/';
4387 dest[j++] = '.';
4388 dest[j++] = '.';
4389 dest[j++] = c;
4390 continue;
4391 }
4392
4393 /*
4394 * This is "/../" or "/..\0". We need to back up
4395 * our destination pointer until we find a "/".
4396 */
4397 i--;
4398 while (j != 0 && dest[--j] != '/')
4399 continue;
4400
4401 if (c == '\0')
4402 dest[++j] = '/';
4403 } while (c != '\0');
4404
4405 dest[j] = '\0';
4406 regs[rd] = (uintptr_t)dest;
4407 mstate->dtms_scratch_ptr += size;
4408 break;
4409 }
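/*
 * Worked example (illustrative): cleanpath() is purely lexical -- it
 * collapses redundant components without consulting the filesystem or
 * resolving symlinks.  Tracing the loop above on one input:
 *
 *	cleanpath("/foo//bar/./baz/../qux")	-> "/foo/bar/qux"
 *
 * "//" and "/./" emit nothing, and "/../" backs the destination pointer
 * up to the previous '/' before continuing.
 */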
4410
4411 case DIF_SUBR_INET_NTOA:
4412 case DIF_SUBR_INET_NTOA6:
4413 case DIF_SUBR_INET_NTOP: {
4414 size_t size;
4415 int af, argi, i;
4416 char *base, *end;
4417
4418 if (subr == DIF_SUBR_INET_NTOP) {
4419 af = (int)tupregs[0].dttk_value;
4420 argi = 1;
4421 } else {
4422 af = subr == DIF_SUBR_INET_NTOA ? AF_INET : AF_INET6;
4423 argi = 0;
4424 }
4425
4426 if (af == AF_INET) {
4427 #if !defined(__APPLE__)
4428 ipaddr_t ip4;
4429 #else
4430 uint32_t ip4;
4431 #endif /* __APPLE__ */
4432 uint8_t *ptr8, val;
4433
4434 /*
4435 * Safely load the IPv4 address.
4436 */
4437 #if !defined(__APPLE__)
4438 ip4 = dtrace_load32(tupregs[argi].dttk_value);
4439 #else
4440 dtrace_bcopy(
4441 (void *)(uintptr_t)tupregs[argi].dttk_value,
4442 (void *)(uintptr_t)&ip4, sizeof (ip4));
4443 #endif /* __APPLE__ */
4444 /*
4445 * Check an IPv4 string will fit in scratch.
4446 */
4447 #if !defined(__APPLE__)
4448 size = INET_ADDRSTRLEN;
4449 #else
4450 size = MAX_IPv4_STR_LEN;
4451 #endif /* __APPLE__ */
4452 if (!DTRACE_INSCRATCH(mstate, size)) {
4453 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4454 regs[rd] = 0;
4455 break;
4456 }
4457 base = (char *)mstate->dtms_scratch_ptr;
4458 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4459
4460 /*
4461 * Stringify as a dotted decimal quad.
4462 */
4463 *end-- = '\0';
4464 ptr8 = (uint8_t *)&ip4;
4465 for (i = 3; i >= 0; i--) {
4466 val = ptr8[i];
4467
4468 if (val == 0) {
4469 *end-- = '0';
4470 } else {
4471 for (; val; val /= 10) {
4472 *end-- = '0' + (val % 10);
4473 }
4474 }
4475
4476 if (i > 0)
4477 *end-- = '.';
4478 }
4479 ASSERT(end + 1 >= base);
4480
4481 } else if (af == AF_INET6) {
4482 #if defined(__APPLE__)
4483 #define _S6_un __u6_addr
4484 #define _S6_u8 __u6_addr8
4485 #endif /* __APPLE__ */
4486 struct in6_addr ip6;
4487 int firstzero, tryzero, numzero, v6end;
4488 uint16_t val;
4489 const char digits[] = "0123456789abcdef";
4490
4491 /*
4492 * Stringify using RFC 1884 convention 2 -- 16-bit
4493 * hexadecimal values with a zero-run compression.
4494 * Lowercase hexadecimal digits are used,
4495 * e.g., fe80::214:4fff:fe0b:76c8.
4496 * The IPv4 embedded form is returned for inet_ntop,
4497 * just the IPv4 string is returned for inet_ntoa6.
4498 */
4499
4500 /*
4501 * Safely load the IPv6 address.
4502 */
4503 dtrace_bcopy(
4504 (void *)(uintptr_t)tupregs[argi].dttk_value,
4505 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4506
4507 /*
4508 * Check an IPv6 string will fit in scratch.
4509 */
4510 size = INET6_ADDRSTRLEN;
4511 if (!DTRACE_INSCRATCH(mstate, size)) {
4512 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4513 regs[rd] = 0;
4514 break;
4515 }
4516 base = (char *)mstate->dtms_scratch_ptr;
4517 end = (char *)mstate->dtms_scratch_ptr + size - 1;
4518 *end-- = '\0';
4519
4520 /*
4521 * Find the longest run of 16 bit zero values
4522 * for the single allowed zero compression - "::".
4523 */
4524 firstzero = -1;
4525 tryzero = -1;
4526 numzero = 1;
4527 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
4528 if (ip6._S6_un._S6_u8[i] == 0 &&
4529 tryzero == -1 && i % 2 == 0) {
4530 tryzero = i;
4531 continue;
4532 }
4533
4534 if (tryzero != -1 &&
4535 (ip6._S6_un._S6_u8[i] != 0 ||
4536 i == sizeof (struct in6_addr) - 1)) {
4537
4538 if (i - tryzero <= numzero) {
4539 tryzero = -1;
4540 continue;
4541 }
4542
4543 firstzero = tryzero;
4544 numzero = i - i % 2 - tryzero;
4545 tryzero = -1;
4546
4547 if (ip6._S6_un._S6_u8[i] == 0 &&
4548 i == sizeof (struct in6_addr) - 1)
4549 numzero += 2;
4550 }
4551 }
4552 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
4553
4554 /*
4555 * Check for an IPv4 embedded address.
4556 */
4557 v6end = sizeof (struct in6_addr) - 2;
4558 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4559 IN6_IS_ADDR_V4COMPAT(&ip6)) {
4560 for (i = sizeof (struct in6_addr) - 1;
4561 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
4562 ASSERT(end >= base);
4563
4564 val = ip6._S6_un._S6_u8[i];
4565
4566 if (val == 0) {
4567 *end-- = '0';
4568 } else {
4569 for (; val; val /= 10) {
4570 *end-- = '0' + val % 10;
4571 }
4572 }
4573
4574 if (i > (int)DTRACE_V4MAPPED_OFFSET)
4575 *end-- = '.';
4576 }
4577
4578 if (subr == DIF_SUBR_INET_NTOA6)
4579 goto inetout;
4580
4581 /*
4582 * Set v6end to skip the IPv4 address that
4583 * we have already stringified.
4584 */
4585 v6end = 10;
4586 }
4587
4588 /*
4589 * Build the IPv6 string by working through the
4590 * address in reverse.
4591 */
4592 for (i = v6end; i >= 0; i -= 2) {
4593 ASSERT(end >= base);
4594
4595 if (i == firstzero + numzero - 2) {
4596 *end-- = ':';
4597 *end-- = ':';
4598 i -= numzero - 2;
4599 continue;
4600 }
4601
4602 if (i < 14 && i != firstzero - 2)
4603 *end-- = ':';
4604
4605 val = (ip6._S6_un._S6_u8[i] << 8) +
4606 ip6._S6_un._S6_u8[i + 1];
4607
4608 if (val == 0) {
4609 *end-- = '0';
4610 } else {
4611 for (; val; val /= 16) {
4612 *end-- = digits[val % 16];
4613 }
4614 }
4615 }
4616 ASSERT(end + 1 >= base);
4617
4618 #if defined(__APPLE__)
4619 #undef _S6_un
4620 #undef _S6_u8
4621 #endif /* __APPLE__ */
4622 } else {
4623 /*
4624 * The user didn't use AF_INET or AF_INET6.
4625 */
4626 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4627 regs[rd] = 0;
4628 break;
4629 }
4630
4631 inetout: regs[rd] = (uintptr_t)end + 1;
4632 mstate->dtms_scratch_ptr += size;
4633 break;
4634 }
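/*
 * Worked example (illustrative, not part of the implementation): for
 * the loopback address ::1 -- fifteen zero bytes followed by 0x01 --
 * the scan above settles on firstzero == 0 and numzero == 14, so the
 * builder emits "::" for that run followed by the final group, giving
 * "::1".  For a V4-mapped address such as ::ffff:10.1.2.3, inet_ntop()
 * keeps the dotted-quad tail ("::ffff:10.1.2.3") while inet_ntoa6()
 * returns just the embedded IPv4 string ("10.1.2.3"), as the comment
 * above describes.
 */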
4635
4636 case DIF_SUBR_TOUPPER:
4637 case DIF_SUBR_TOLOWER: {
4638 uintptr_t src = tupregs[0].dttk_value;
4639 char *dest = (char *)mstate->dtms_scratch_ptr;
4640 char lower, upper, base, c;
4641 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4642 size_t len = dtrace_strlen((char*) src, size);
4643 size_t i = 0;
4644
4645 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
4646 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
4647 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
4648
4649 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4650 regs[rd] = 0;
4651 break;
4652 }
4653
4654 if (!DTRACE_INSCRATCH(mstate, size)) {
4655 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4656 regs[rd] = 0;
4657 break;
4658 }
4659
4660 for (i = 0; i < size - 1; ++i) {
4661 if ((c = dtrace_load8(src + i)) == '\0')
4662 break;
4663 if (c >= lower && c <= upper)
4664 c = base + (c - lower);
4665 dest[i] = c;
4666 }
4667
4668 ASSERT(i < size);
4669
4670 dest[i] = '\0';
4671 regs[rd] = (uintptr_t) dest;
4672 mstate->dtms_scratch_ptr += size;
4673
4674 break;
4675 }
4676
4677 /*
4678 * APPLE NOTE:
4679 * CoreProfile callback ('core_profile (uint64_t, [uint64_t], [uint64_t] ...)')
4680 */
4681 case DIF_SUBR_COREPROFILE: {
4682 uint64_t selector = tupregs[0].dttk_value;
4683 uint64_t args[DIF_DTR_NREGS-1] = {0ULL};
4684 uint32_t ii;
4685 uint32_t count = (uint32_t)nargs;
4686
4687 if (count < 1) {
4688 regs[rd] = KERN_FAILURE;
4689 break;
4690 }
4691
4692 if (count > DIF_DTR_NREGS)
4693 count = DIF_DTR_NREGS;
4694
4695 /* copy in any variadic argument list, bounded by DIF_DTR_NREGS */
4696 for (ii = 0; ii < count - 1; ii++) {
4697 args[ii] = tupregs[ii+1].dttk_value;
4698 }
4699
4700 kern_return_t ret =
4701 chudxnu_dtrace_callback(selector, args, count-1);
4702 if (KERN_SUCCESS != ret) {
4703 /* error */
4704 }
4705
4706 regs[rd] = ret;
4707 break;
4708 }
4709 }
4710 }
4711
4712 /*
4713 * Emulate the execution of DTrace IR instructions specified by the given
4714 * DIF object. This function is deliberately void of assertions as all of
4715 * the necessary checks are handled by a call to dtrace_difo_validate().
4716 */
4717 static uint64_t
4718 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
4719 dtrace_vstate_t *vstate, dtrace_state_t *state)
4720 {
4721 const dif_instr_t *text = difo->dtdo_buf;
4722 const uint_t textlen = difo->dtdo_len;
4723 const char *strtab = difo->dtdo_strtab;
4724 const uint64_t *inttab = difo->dtdo_inttab;
4725
4726 uint64_t rval = 0;
4727 dtrace_statvar_t *svar;
4728 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
4729 dtrace_difv_t *v;
4730 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4731 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4732
4733 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
4734 uint64_t regs[DIF_DIR_NREGS];
4735 uint64_t *tmp;
4736
4737 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
4738 int64_t cc_r;
4739 uint_t pc = 0, id, opc = 0;
4740 uint8_t ttop = 0;
4741 dif_instr_t instr;
4742 uint_t r1, r2, rd;
4743
4744 /*
4745 * We stash the current DIF object into the machine state: we need it
4746 * for subsequent access checking.
4747 */
4748 mstate->dtms_difo = difo;
4749
4750 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
4751
4752 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
4753 opc = pc;
4754
4755 instr = text[pc++];
4756 r1 = DIF_INSTR_R1(instr);
4757 r2 = DIF_INSTR_R2(instr);
4758 rd = DIF_INSTR_RD(instr);
4759
4760 switch (DIF_INSTR_OP(instr)) {
4761 case DIF_OP_OR:
4762 regs[rd] = regs[r1] | regs[r2];
4763 break;
4764 case DIF_OP_XOR:
4765 regs[rd] = regs[r1] ^ regs[r2];
4766 break;
4767 case DIF_OP_AND:
4768 regs[rd] = regs[r1] & regs[r2];
4769 break;
4770 case DIF_OP_SLL:
4771 regs[rd] = regs[r1] << regs[r2];
4772 break;
4773 case DIF_OP_SRL:
4774 regs[rd] = regs[r1] >> regs[r2];
4775 break;
4776 case DIF_OP_SUB:
4777 regs[rd] = regs[r1] - regs[r2];
4778 break;
4779 case DIF_OP_ADD:
4780 regs[rd] = regs[r1] + regs[r2];
4781 break;
4782 case DIF_OP_MUL:
4783 regs[rd] = regs[r1] * regs[r2];
4784 break;
4785 case DIF_OP_SDIV:
4786 if (regs[r2] == 0) {
4787 regs[rd] = 0;
4788 *flags |= CPU_DTRACE_DIVZERO;
4789 } else {
4790 regs[rd] = (int64_t)regs[r1] /
4791 (int64_t)regs[r2];
4792 }
4793 break;
4794
4795 case DIF_OP_UDIV:
4796 if (regs[r2] == 0) {
4797 regs[rd] = 0;
4798 *flags |= CPU_DTRACE_DIVZERO;
4799 } else {
4800 regs[rd] = regs[r1] / regs[r2];
4801 }
4802 break;
4803
4804 case DIF_OP_SREM:
4805 if (regs[r2] == 0) {
4806 regs[rd] = 0;
4807 *flags |= CPU_DTRACE_DIVZERO;
4808 } else {
4809 regs[rd] = (int64_t)regs[r1] %
4810 (int64_t)regs[r2];
4811 }
4812 break;
4813
4814 case DIF_OP_UREM:
4815 if (regs[r2] == 0) {
4816 regs[rd] = 0;
4817 *flags |= CPU_DTRACE_DIVZERO;
4818 } else {
4819 regs[rd] = regs[r1] % regs[r2];
4820 }
4821 break;
4822
4823 case DIF_OP_NOT:
4824 regs[rd] = ~regs[r1];
4825 break;
4826 case DIF_OP_MOV:
4827 regs[rd] = regs[r1];
4828 break;
4829 case DIF_OP_CMP:
4830 cc_r = regs[r1] - regs[r2];
4831 cc_n = cc_r < 0;
4832 cc_z = cc_r == 0;
4833 cc_v = 0;
4834 cc_c = regs[r1] < regs[r2];
4835 break;
4836 case DIF_OP_TST:
4837 cc_n = cc_v = cc_c = 0;
4838 cc_z = regs[r1] == 0;
4839 break;
4840 case DIF_OP_BA:
4841 pc = DIF_INSTR_LABEL(instr);
4842 break;
4843 case DIF_OP_BE:
4844 if (cc_z)
4845 pc = DIF_INSTR_LABEL(instr);
4846 break;
4847 case DIF_OP_BNE:
4848 if (cc_z == 0)
4849 pc = DIF_INSTR_LABEL(instr);
4850 break;
4851 case DIF_OP_BG:
4852 if ((cc_z | (cc_n ^ cc_v)) == 0)
4853 pc = DIF_INSTR_LABEL(instr);
4854 break;
4855 case DIF_OP_BGU:
4856 if ((cc_c | cc_z) == 0)
4857 pc = DIF_INSTR_LABEL(instr);
4858 break;
4859 case DIF_OP_BGE:
4860 if ((cc_n ^ cc_v) == 0)
4861 pc = DIF_INSTR_LABEL(instr);
4862 break;
4863 case DIF_OP_BGEU:
4864 if (cc_c == 0)
4865 pc = DIF_INSTR_LABEL(instr);
4866 break;
4867 case DIF_OP_BL:
4868 if (cc_n ^ cc_v)
4869 pc = DIF_INSTR_LABEL(instr);
4870 break;
4871 case DIF_OP_BLU:
4872 if (cc_c)
4873 pc = DIF_INSTR_LABEL(instr);
4874 break;
4875 case DIF_OP_BLE:
4876 if (cc_z | (cc_n ^ cc_v))
4877 pc = DIF_INSTR_LABEL(instr);
4878 break;
4879 case DIF_OP_BLEU:
4880 if (cc_c | cc_z)
4881 pc = DIF_INSTR_LABEL(instr);
4882 break;
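/*
 * Worked example (illustrative): how the condition codes set by
 * DIF_OP_CMP above drive the signed vs. unsigned branches.  For
 * regs[r1] == 3 and regs[r2] == 5:
 *
 *	cc_r = -2  ->  cc_n = 1, cc_z = 0, cc_v = 0, cc_c = 1
 *
 *	BL  (signed <)		taken:	cc_n ^ cc_v == 1
 *	BLU (unsigned <)	taken:	cc_c == 1
 *	BGE (signed >=)		not taken
 *	BE  (==)		not taken
 *
 * For regs[r1] == 3 and regs[r2] == (uint64_t)-5 the signed and
 * unsigned outcomes diverge: BG is taken (3 > -5 signed) while BGU is
 * not (3 < 0xfff...fb unsigned).
 */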
4883 case DIF_OP_RLDSB:
4884 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4885 *flags |= CPU_DTRACE_KPRIV;
4886 *illval = regs[r1];
4887 break;
4888 }
4889 /*FALLTHROUGH*/
4890 case DIF_OP_LDSB:
4891 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
4892 break;
4893 case DIF_OP_RLDSH:
4894 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4895 *flags |= CPU_DTRACE_KPRIV;
4896 *illval = regs[r1];
4897 break;
4898 }
4899 /*FALLTHROUGH*/
4900 case DIF_OP_LDSH:
4901 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
4902 break;
4903 case DIF_OP_RLDSW:
4904 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4905 *flags |= CPU_DTRACE_KPRIV;
4906 *illval = regs[r1];
4907 break;
4908 }
4909 /*FALLTHROUGH*/
4910 case DIF_OP_LDSW:
4911 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
4912 break;
4913 case DIF_OP_RLDUB:
4914 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
4915 *flags |= CPU_DTRACE_KPRIV;
4916 *illval = regs[r1];
4917 break;
4918 }
4919 /*FALLTHROUGH*/
4920 case DIF_OP_LDUB:
4921 regs[rd] = dtrace_load8(regs[r1]);
4922 break;
4923 case DIF_OP_RLDUH:
4924 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
4925 *flags |= CPU_DTRACE_KPRIV;
4926 *illval = regs[r1];
4927 break;
4928 }
4929 /*FALLTHROUGH*/
4930 case DIF_OP_LDUH:
4931 regs[rd] = dtrace_load16(regs[r1]);
4932 break;
4933 case DIF_OP_RLDUW:
4934 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
4935 *flags |= CPU_DTRACE_KPRIV;
4936 *illval = regs[r1];
4937 break;
4938 }
4939 /*FALLTHROUGH*/
4940 case DIF_OP_LDUW:
4941 regs[rd] = dtrace_load32(regs[r1]);
4942 break;
4943 case DIF_OP_RLDX:
4944 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
4945 *flags |= CPU_DTRACE_KPRIV;
4946 *illval = regs[r1];
4947 break;
4948 }
4949 /*FALLTHROUGH*/
4950 case DIF_OP_LDX:
4951 regs[rd] = dtrace_load64(regs[r1]);
4952 break;
4953 /*
4954 * Darwin 32-bit kernel may fetch from 64-bit user.
4955 * Do not cast regs to uintptr_t
4956 * DIF_OP_ULDSB,DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB
4957 * DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX
4958 */
4959 case DIF_OP_ULDSB:
4960 regs[rd] = (int8_t)
4961 dtrace_fuword8(regs[r1]);
4962 break;
4963 case DIF_OP_ULDSH:
4964 regs[rd] = (int16_t)
4965 dtrace_fuword16(regs[r1]);
4966 break;
4967 case DIF_OP_ULDSW:
4968 regs[rd] = (int32_t)
4969 dtrace_fuword32(regs[r1]);
4970 break;
4971 case DIF_OP_ULDUB:
4972 regs[rd] =
4973 dtrace_fuword8(regs[r1]);
4974 break;
4975 case DIF_OP_ULDUH:
4976 regs[rd] =
4977 dtrace_fuword16(regs[r1]);
4978 break;
4979 case DIF_OP_ULDUW:
4980 regs[rd] =
4981 dtrace_fuword32(regs[r1]);
4982 break;
4983 case DIF_OP_ULDX:
4984 regs[rd] =
4985 dtrace_fuword64(regs[r1]);
4986 break;
4987 case DIF_OP_RET:
4988 rval = regs[rd];
4989 pc = textlen;
4990 break;
4991 case DIF_OP_NOP:
4992 break;
4993 case DIF_OP_SETX:
4994 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
4995 break;
4996 case DIF_OP_SETS:
4997 regs[rd] = (uint64_t)(uintptr_t)
4998 (strtab + DIF_INSTR_STRING(instr));
4999 break;
5000 case DIF_OP_SCMP: {
5001 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5002 uintptr_t s1 = regs[r1];
5003 uintptr_t s2 = regs[r2];
5004
5005 if (s1 != 0 &&
5006 !dtrace_strcanload(s1, sz, mstate, vstate))
5007 break;
5008 if (s2 != 0 &&
5009 !dtrace_strcanload(s2, sz, mstate, vstate))
5010 break;
5011
5012 cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5013
5014 cc_n = cc_r < 0;
5015 cc_z = cc_r == 0;
5016 cc_v = cc_c = 0;
5017 break;
5018 }
5019 case DIF_OP_LDGA:
5020 regs[rd] = dtrace_dif_variable(mstate, state,
5021 r1, regs[r2]);
5022 break;
5023 case DIF_OP_LDGS:
5024 id = DIF_INSTR_VAR(instr);
5025
5026 if (id >= DIF_VAR_OTHER_UBASE) {
5027 uintptr_t a;
5028
5029 id -= DIF_VAR_OTHER_UBASE;
5030 svar = vstate->dtvs_globals[id];
5031 ASSERT(svar != NULL);
5032 v = &svar->dtsv_var;
5033
5034 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5035 regs[rd] = svar->dtsv_data;
5036 break;
5037 }
5038
5039 a = (uintptr_t)svar->dtsv_data;
5040
5041 if (*(uint8_t *)a == UINT8_MAX) {
5042 /*
5043 * If the 0th byte is set to UINT8_MAX
5044 * then this is to be treated as a
5045 * reference to a NULL variable.
5046 */
5047 regs[rd] = 0;
5048 } else {
5049 regs[rd] = a + sizeof (uint64_t);
5050 }
5051
5052 break;
5053 }
5054
5055 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5056 break;
5057
5058 case DIF_OP_STGS:
5059 id = DIF_INSTR_VAR(instr);
5060
5061 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5062 id -= DIF_VAR_OTHER_UBASE;
5063
5064 svar = vstate->dtvs_globals[id];
5065 ASSERT(svar != NULL);
5066 v = &svar->dtsv_var;
5067
5068 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5069 uintptr_t a = (uintptr_t)svar->dtsv_data;
5070
5071 ASSERT(a != 0);
5072 ASSERT(svar->dtsv_size != 0);
5073
5074 if (regs[rd] == 0) {
5075 *(uint8_t *)a = UINT8_MAX;
5076 break;
5077 } else {
5078 *(uint8_t *)a = 0;
5079 a += sizeof (uint64_t);
5080 }
5081 if (!dtrace_vcanload(
5082 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5083 mstate, vstate))
5084 break;
5085
5086 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5087 (void *)a, &v->dtdv_type);
5088 break;
5089 }
5090
5091 svar->dtsv_data = regs[rd];
5092 break;
5093
5094 case DIF_OP_LDTA:
5095 /*
5096 * There are no DTrace built-in thread-local arrays at
5097 * present. This opcode is saved for future work.
5098 */
5099 *flags |= CPU_DTRACE_ILLOP;
5100 regs[rd] = 0;
5101 break;
5102
5103 case DIF_OP_LDLS:
5104 id = DIF_INSTR_VAR(instr);
5105
5106 if (id < DIF_VAR_OTHER_UBASE) {
5107 /*
5108 * For now, this has no meaning.
5109 */
5110 regs[rd] = 0;
5111 break;
5112 }
5113
5114 id -= DIF_VAR_OTHER_UBASE;
5115
5116 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5117 ASSERT(vstate->dtvs_locals != NULL);
5118 svar = vstate->dtvs_locals[id];
5119 ASSERT(svar != NULL);
5120 v = &svar->dtsv_var;
5121
5122 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5123 uintptr_t a = (uintptr_t)svar->dtsv_data;
5124 size_t sz = v->dtdv_type.dtdt_size;
5125
5126 sz += sizeof (uint64_t);
5127 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5128 a += CPU->cpu_id * sz;
5129
5130 if (*(uint8_t *)a == UINT8_MAX) {
5131 /*
5132 * If the 0th byte is set to UINT8_MAX
5133 * then this is to be treated as a
5134 * reference to a NULL variable.
5135 */
5136 regs[rd] = 0;
5137 } else {
5138 regs[rd] = a + sizeof (uint64_t);
5139 }
5140
5141 break;
5142 }
5143
5144 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5145 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5146 regs[rd] = tmp[CPU->cpu_id];
5147 break;
5148
5149 case DIF_OP_STLS:
5150 id = DIF_INSTR_VAR(instr);
5151
5152 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5153 id -= DIF_VAR_OTHER_UBASE;
5154 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5155 ASSERT(vstate->dtvs_locals != NULL);
5156 svar = vstate->dtvs_locals[id];
5157 ASSERT(svar != NULL);
5158 v = &svar->dtsv_var;
5159
5160 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5161 uintptr_t a = (uintptr_t)svar->dtsv_data;
5162 size_t sz = v->dtdv_type.dtdt_size;
5163
5164 sz += sizeof (uint64_t);
5165 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5166 a += CPU->cpu_id * sz;
5167
5168 if (regs[rd] == 0) {
5169 *(uint8_t *)a = UINT8_MAX;
5170 break;
5171 } else {
5172 *(uint8_t *)a = 0;
5173 a += sizeof (uint64_t);
5174 }
5175
5176 if (!dtrace_vcanload(
5177 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5178 mstate, vstate))
5179 break;
5180
5181 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5182 (void *)a, &v->dtdv_type);
5183 break;
5184 }
5185
5186 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5187 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5188 tmp[CPU->cpu_id] = regs[rd];
5189 break;
5190
5191 case DIF_OP_LDTS: {
5192 dtrace_dynvar_t *dvar;
5193 dtrace_key_t *key;
5194
5195 id = DIF_INSTR_VAR(instr);
5196 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5197 id -= DIF_VAR_OTHER_UBASE;
5198 v = &vstate->dtvs_tlocals[id];
5199
5200 key = &tupregs[DIF_DTR_NREGS];
5201 key[0].dttk_value = (uint64_t)id;
5202 key[0].dttk_size = 0;
5203 DTRACE_TLS_THRKEY(key[1].dttk_value);
5204 key[1].dttk_size = 0;
5205
5206 dvar = dtrace_dynvar(dstate, 2, key,
5207 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5208 mstate, vstate);
5209
5210 if (dvar == NULL) {
5211 regs[rd] = 0;
5212 break;
5213 }
5214
5215 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5216 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5217 } else {
5218 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5219 }
5220
5221 break;
5222 }
5223
5224 case DIF_OP_STTS: {
5225 dtrace_dynvar_t *dvar;
5226 dtrace_key_t *key;
5227
5228 id = DIF_INSTR_VAR(instr);
5229 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5230 id -= DIF_VAR_OTHER_UBASE;
5231
5232 key = &tupregs[DIF_DTR_NREGS];
5233 key[0].dttk_value = (uint64_t)id;
5234 key[0].dttk_size = 0;
5235 DTRACE_TLS_THRKEY(key[1].dttk_value);
5236 key[1].dttk_size = 0;
5237 v = &vstate->dtvs_tlocals[id];
5238
5239 dvar = dtrace_dynvar(dstate, 2, key,
5240 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5241 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5242 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5243 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5244
5245 /*
5246 * Given that we're storing to thread-local data,
5247 * we need to flush our predicate cache.
5248 */
5249 dtrace_set_thread_predcache(current_thread(), 0);
5250
5251 if (dvar == NULL)
5252 break;
5253
5254 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5255 if (!dtrace_vcanload(
5256 (void *)(uintptr_t)regs[rd],
5257 &v->dtdv_type, mstate, vstate))
5258 break;
5259
5260 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5261 dvar->dtdv_data, &v->dtdv_type);
5262 } else {
5263 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5264 }
5265
5266 break;
5267 }
5268
5269 case DIF_OP_SRA:
5270 regs[rd] = (int64_t)regs[r1] >> regs[r2];
5271 break;
5272
5273 case DIF_OP_CALL:
5274 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5275 regs, tupregs, ttop, mstate, state);
5276 break;
5277
5278 case DIF_OP_PUSHTR:
5279 if (ttop == DIF_DTR_NREGS) {
5280 *flags |= CPU_DTRACE_TUPOFLOW;
5281 break;
5282 }
5283
5284 if (r1 == DIF_TYPE_STRING) {
5285 /*
5286 * If this is a string type and the size is 0,
5287 * we'll use the system-wide default string
5288 * size. Note that we are _not_ looking at
5289 * the value of the DTRACEOPT_STRSIZE option;
5290 * had this been set, we would expect to have
5291 * a non-zero size value in the "pushtr".
5292 */
5293 tupregs[ttop].dttk_size =
5294 dtrace_strlen((char *)(uintptr_t)regs[rd],
5295 regs[r2] ? regs[r2] :
5296 dtrace_strsize_default) + 1;
5297 } else {
5298 tupregs[ttop].dttk_size = regs[r2];
5299 }
5300
5301 tupregs[ttop++].dttk_value = regs[rd];
5302 break;
5303
5304 case DIF_OP_PUSHTV:
5305 if (ttop == DIF_DTR_NREGS) {
5306 *flags |= CPU_DTRACE_TUPOFLOW;
5307 break;
5308 }
5309
5310 tupregs[ttop].dttk_value = regs[rd];
5311 tupregs[ttop++].dttk_size = 0;
5312 break;
5313
5314 case DIF_OP_POPTS:
5315 if (ttop != 0)
5316 ttop--;
5317 break;
5318
5319 case DIF_OP_FLUSHTS:
5320 ttop = 0;
5321 break;
5322
5323 case DIF_OP_LDGAA:
5324 case DIF_OP_LDTAA: {
5325 dtrace_dynvar_t *dvar;
5326 dtrace_key_t *key = tupregs;
5327 uint_t nkeys = ttop;
5328
5329 id = DIF_INSTR_VAR(instr);
5330 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5331 id -= DIF_VAR_OTHER_UBASE;
5332
5333 key[nkeys].dttk_value = (uint64_t)id;
5334 key[nkeys++].dttk_size = 0;
5335
5336 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5337 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5338 key[nkeys++].dttk_size = 0;
5339 v = &vstate->dtvs_tlocals[id];
5340 } else {
5341 v = &vstate->dtvs_globals[id]->dtsv_var;
5342 }
5343
5344 dvar = dtrace_dynvar(dstate, nkeys, key,
5345 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5346 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5347 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5348
5349 if (dvar == NULL) {
5350 regs[rd] = 0;
5351 break;
5352 }
5353
5354 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5355 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5356 } else {
5357 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5358 }
5359
5360 break;
5361 }
5362
5363 case DIF_OP_STGAA:
5364 case DIF_OP_STTAA: {
5365 dtrace_dynvar_t *dvar;
5366 dtrace_key_t *key = tupregs;
5367 uint_t nkeys = ttop;
5368
5369 id = DIF_INSTR_VAR(instr);
5370 ASSERT(id >= DIF_VAR_OTHER_UBASE);
5371 id -= DIF_VAR_OTHER_UBASE;
5372
5373 key[nkeys].dttk_value = (uint64_t)id;
5374 key[nkeys++].dttk_size = 0;
5375
5376 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5377 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5378 key[nkeys++].dttk_size = 0;
5379 v = &vstate->dtvs_tlocals[id];
5380 } else {
5381 v = &vstate->dtvs_globals[id]->dtsv_var;
5382 }
5383
5384 dvar = dtrace_dynvar(dstate, nkeys, key,
5385 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5386 v->dtdv_type.dtdt_size : sizeof (uint64_t),
5387 regs[rd] ? DTRACE_DYNVAR_ALLOC :
5388 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5389
5390 if (dvar == NULL)
5391 break;
5392
5393 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5394 if (!dtrace_vcanload(
5395 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5396 mstate, vstate))
5397 break;
5398
5399 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5400 dvar->dtdv_data, &v->dtdv_type);
5401 } else {
5402 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5403 }
5404
5405 break;
5406 }
5407
5408 case DIF_OP_ALLOCS: {
5409 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5410 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5411
5412 /*
5413 * Rounding up the user allocation size could have
5414 * overflowed large, bogus allocations (like -1ULL) to
5415 * 0.
5416 */
5417 if (size < regs[r1] ||
5418 !DTRACE_INSCRATCH(mstate, size)) {
5419 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5420 regs[rd] = 0;
5421 break;
5422 }
5423
5424 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5425 mstate->dtms_scratch_ptr += size;
5426 regs[rd] = ptr;
5427 break;
5428 }
5429
5430 case DIF_OP_COPYS:
5431 if (!dtrace_canstore(regs[rd], regs[r2],
5432 mstate, vstate)) {
5433 *flags |= CPU_DTRACE_BADADDR;
5434 *illval = regs[rd];
5435 break;
5436 }
5437
5438 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5439 break;
5440
5441 dtrace_bcopy((void *)(uintptr_t)regs[r1],
5442 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5443 break;
5444
5445 case DIF_OP_STB:
5446 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5447 *flags |= CPU_DTRACE_BADADDR;
5448 *illval = regs[rd];
5449 break;
5450 }
5451 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5452 break;
5453
5454 case DIF_OP_STH:
5455 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5456 *flags |= CPU_DTRACE_BADADDR;
5457 *illval = regs[rd];
5458 break;
5459 }
5460 if (regs[rd] & 1) {
5461 *flags |= CPU_DTRACE_BADALIGN;
5462 *illval = regs[rd];
5463 break;
5464 }
5465 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5466 break;
5467
5468 case DIF_OP_STW:
5469 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5470 *flags |= CPU_DTRACE_BADADDR;
5471 *illval = regs[rd];
5472 break;
5473 }
5474 if (regs[rd] & 3) {
5475 *flags |= CPU_DTRACE_BADALIGN;
5476 *illval = regs[rd];
5477 break;
5478 }
5479 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5480 break;
5481
5482 case DIF_OP_STX:
5483 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5484 *flags |= CPU_DTRACE_BADADDR;
5485 *illval = regs[rd];
5486 break;
5487 }
5488
5489 /*
5490 * Darwin kmem_zalloc() called from
5491 * dtrace_difo_init() is 4-byte aligned.
5492 */
5493 if (regs[rd] & 3) {
5494 *flags |= CPU_DTRACE_BADALIGN;
5495 *illval = regs[rd];
5496 break;
5497 }
5498 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5499 break;
5500 }
5501 }
5502
5503 if (!(*flags & CPU_DTRACE_FAULT))
5504 return (rval);
5505
5506 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5507 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5508
5509 return (0);
5510 }
5511
5512 static void
5513 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5514 {
5515 dtrace_probe_t *probe = ecb->dte_probe;
5516 dtrace_provider_t *prov = probe->dtpr_provider;
5517 char c[DTRACE_FULLNAMELEN + 80], *str;
5518 const char *msg = "dtrace: breakpoint action at probe ";
5519 const char *ecbmsg = " (ecb ";
5520 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5521 uintptr_t val = (uintptr_t)ecb;
5522 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5523
5524 if (dtrace_destructive_disallow)
5525 return;
5526
5527 /*
5528 * It's impossible to be taking action on the NULL probe.
5529 */
5530 ASSERT(probe != NULL);
5531
5532 /*
5533 * This is a poor man's (destitute man's?) sprintf(): we want to
5534 * print the provider name, module name, function name and name of
5535 * the probe, along with the hex address of the ECB with the breakpoint
5536 * action -- all of which we must place in the character buffer by
5537 * hand.
5538 */
5539 while (*msg != '\0')
5540 c[i++] = *msg++;
5541
5542 for (str = prov->dtpv_name; *str != '\0'; str++)
5543 c[i++] = *str;
5544 c[i++] = ':';
5545
5546 for (str = probe->dtpr_mod; *str != '\0'; str++)
5547 c[i++] = *str;
5548 c[i++] = ':';
5549
5550 for (str = probe->dtpr_func; *str != '\0'; str++)
5551 c[i++] = *str;
5552 c[i++] = ':';
5553
5554 for (str = probe->dtpr_name; *str != '\0'; str++)
5555 c[i++] = *str;
5556
5557 while (*ecbmsg != '\0')
5558 c[i++] = *ecbmsg++;
5559
5560 while (shift >= 0) {
5561 mask = (uintptr_t)0xf << shift;
5562
5563 if (val >= ((uintptr_t)1 << shift))
5564 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5565 shift -= 4;
5566 }
5567
5568 c[i++] = ')';
5569 c[i] = '\0';
5570
5571 debug_enter(c);
5572 }
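/*
 * Illustrative sketch (not part of the original implementation): the
 * nibble loop above prints the ECB address in hex without leading
 * zeroes, because a digit is emitted only once val >= (1 << shift).
 * A user-space analogue:
 *
 *	uintptr_t val = 0x2a5c;
 *	int shift = (sizeof (uintptr_t) * 8) - 4;
 *	char buf[2 * sizeof (uintptr_t) + 1], *p = buf;
 *
 *	for (; shift >= 0; shift -= 4) {
 *		uintptr_t mask = (uintptr_t)0xf << shift;
 *		if (val >= ((uintptr_t)1 << shift))
 *			*p++ = "0123456789abcdef"[(val & mask) >> shift];
 *	}
 *	*p = '\0';	// buf now holds "2a5c"
 */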
5573
5574 static void
5575 dtrace_action_panic(dtrace_ecb_t *ecb)
5576 {
5577 dtrace_probe_t *probe = ecb->dte_probe;
5578
5579 /*
5580 * It's impossible to be taking action on the NULL probe.
5581 */
5582 ASSERT(probe != NULL);
5583
5584 if (dtrace_destructive_disallow)
5585 return;
5586
5587 if (dtrace_panicked != NULL)
5588 return;
5589
5590 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
5591 return;
5592
5593 /*
5594 * We won the right to panic. (We want to be sure that only one
5595 * thread calls panic() from dtrace_probe(), and that panic() is
5596 * called exactly once.)
5597 */
5598 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5599 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5600 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5601
5602 /*
5603 * APPLE NOTE: this was for an old Mac OS X debug feature
5604 * allowing a return from panic(). Revisit someday.
5605 */
5606 dtrace_panicked = NULL;
5607 }
5608
5609 static void
5610 dtrace_action_raise(uint64_t sig)
5611 {
5612 if (dtrace_destructive_disallow)
5613 return;
5614
5615 if (sig >= NSIG) {
5616 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5617 return;
5618 }
5619
5620 /*
5621 * raise() has a queue depth of 1 -- we ignore all subsequent
5622 * invocations of the raise() action.
5623 */
5624
5625 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5626
5627 if (uthread && uthread->t_dtrace_sig == 0) {
5628 uthread->t_dtrace_sig = sig;
5629 act_set_astbsd(current_thread());
5630 }
5631 }
5632
5633 static void
5634 dtrace_action_stop(void)
5635 {
5636 if (dtrace_destructive_disallow)
5637 return;
5638
5639 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5640 if (uthread) {
5641 /*
5642 * The currently running process will be task_suspend()ed
5643 * when it next leaves the kernel.
5644 */
5645 uthread->t_dtrace_stop = 1;
5646 act_set_astbsd(current_thread());
5647 }
5648 }
5649
5650
5651 /*
5652 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
5653 * Both activate only when the currently running process next leaves the
5654 * kernel.
5655 */
5656 static void
5657 dtrace_action_pidresume(uint64_t pid)
5658 {
5659 if (dtrace_destructive_disallow)
5660 return;
5661
5662 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5663 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5664 return;
5665 }
5666 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5667
5668 /*
5669 * When the currently running process leaves the kernel, it attempts to
5670 * task_resume the process (denoted by pid), if that pid appears to have
5671 * been stopped by dtrace_action_stop().
5672 * The currently running process has a pidresume() queue depth of 1 --
5673 * subsequent invocations of the pidresume() action are ignored.
5674 */
5675
5676 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
5677 uthread->t_dtrace_resumepid = pid;
5678 act_set_astbsd(current_thread());
5679 }
5680 }
5681
5682 static void
5683 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
5684 {
5685 hrtime_t now;
5686 volatile uint16_t *flags;
5687 dtrace_cpu_t *cpu = CPU;
5688
5689 if (dtrace_destructive_disallow)
5690 return;
5691
5692 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
5693
5694 now = dtrace_gethrtime();
5695
5696 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
5697 /*
5698 * We need to advance the mark to the current time.
5699 */
5700 cpu->cpu_dtrace_chillmark = now;
5701 cpu->cpu_dtrace_chilled = 0;
5702 }
5703
5704 /*
5705 * Now check to see if the requested chill time would take us over
5706 * the maximum amount of time allowed in the chill interval. (Or
5707 * worse, if the calculation itself induces overflow.)
5708 */
5709 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
5710 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
5711 *flags |= CPU_DTRACE_ILLOP;
5712 return;
5713 }
5714
5715 while (dtrace_gethrtime() - now < val)
5716 continue;
5717
5718 /*
5719 * Normally, we assure that the value of the variable "timestamp" does
5720 * not change within an ECB. The presence of chill() represents an
5721 * exception to this rule, however.
5722 */
5723 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
5724 cpu->cpu_dtrace_chilled += val;
5725 }
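/*
 * Illustrative note (values hypothetical, for exposition only): the
 * accounting above enforces a per-CPU busy-wait budget per interval.
 * Suppose dtrace_chill_interval were one second and dtrace_chill_max
 * 500 microseconds: once cpu_dtrace_chilled accumulates 500us worth of
 * chill() calls within one interval, further chill() requests set
 * CPU_DTRACE_ILLOP and return without spinning; when the interval
 * elapses, cpu_dtrace_chillmark is advanced and the budget resets.
 */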
5726
5727 static void
5728 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
5729 uint64_t *buf, uint64_t arg)
5730 {
5731 int nframes = DTRACE_USTACK_NFRAMES(arg);
5732 int strsize = DTRACE_USTACK_STRSIZE(arg);
5733 uint64_t *pcs = &buf[1], *fps;
5734 char *str = (char *)&pcs[nframes];
5735 int size, offs = 0, i, j;
5736 uintptr_t old = mstate->dtms_scratch_ptr, saved;
5737 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5738 char *sym;
5739
5740 /*
5741 * Should be taking a faster path if string space has not been
5742 * allocated.
5743 */
5744 ASSERT(strsize != 0);
5745
5746 /*
5747 * We will first allocate some temporary space for the frame pointers.
5748 */
5749 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5750 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
5751 (nframes * sizeof (uint64_t));
5752
5753 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
5754 /*
5755 * Not enough room for our frame pointers -- need to indicate
5756 * that we ran out of scratch space.
5757 */
5758 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5759 return;
5760 }
5761
5762 mstate->dtms_scratch_ptr += size;
5763 saved = mstate->dtms_scratch_ptr;
5764
5765 /*
5766 * Now get a stack with both program counters and frame pointers.
5767 */
5768 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5769 dtrace_getufpstack(buf, fps, nframes + 1);
5770 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5771
5772 /*
5773 * If that faulted, we're cooked.
5774 */
5775 if (*flags & CPU_DTRACE_FAULT)
5776 goto out;
5777
5778 /*
5779 * Now we want to walk up the stack, calling the USTACK helper. For
5780 * each iteration, we restore the scratch pointer.
5781 */
5782 for (i = 0; i < nframes; i++) {
5783 mstate->dtms_scratch_ptr = saved;
5784
5785 if (offs >= strsize)
5786 break;
5787
5788 sym = (char *)(uintptr_t)dtrace_helper(
5789 DTRACE_HELPER_ACTION_USTACK,
5790 mstate, state, pcs[i], fps[i]);
5791
5792 /*
5793 * If we faulted while running the helper, we're going to
5794 * clear the fault and null out the corresponding string.
5795 */
5796 if (*flags & CPU_DTRACE_FAULT) {
5797 *flags &= ~CPU_DTRACE_FAULT;
5798 str[offs++] = '\0';
5799 continue;
5800 }
5801
5802 if (sym == NULL) {
5803 str[offs++] = '\0';
5804 continue;
5805 }
5806
5807 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
5808
5809 /*
5810 * Now copy in the string that the helper returned to us.
5811 */
5812 for (j = 0; offs + j < strsize; j++) {
5813 if ((str[offs + j] = sym[j]) == '\0')
5814 break;
5815 }
5816
5817 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
5818
5819 offs += j + 1;
5820 }
5821
5822 if (offs >= strsize) {
5823 /*
5824 * If we didn't have room for all of the strings, we don't
5825 * abort processing -- this needn't be a fatal error -- but we
5826 * still want to increment a counter (dts_stkstroverflows) to
5827 * allow this condition to be warned about. (If this is from
5828 * a jstack() action, it is easily tuned via jstackstrsize.)
5829 */
5830 dtrace_error(&state->dts_stkstroverflows);
5831 }
5832
5833 while (offs < strsize)
5834 str[offs++] = '\0';
5835
5836 out:
5837 mstate->dtms_scratch_ptr = old;
5838 }
5839
5840 /*
5841 * If you're looking for the epicenter of DTrace, you just found it. This
5842 * is the function called by the provider to fire a probe -- from which all
5843 * subsequent probe-context DTrace activity emanates.
5844 */
5845 static void
5846 __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
5847 uint64_t arg2, uint64_t arg3, uint64_t arg4)
5848 {
5849 processorid_t cpuid;
5850 dtrace_icookie_t cookie;
5851 dtrace_probe_t *probe;
5852 dtrace_mstate_t mstate;
5853 dtrace_ecb_t *ecb;
5854 dtrace_action_t *act;
5855 intptr_t offs;
5856 size_t size;
5857 int vtime, onintr;
5858 volatile uint16_t *flags;
5859 hrtime_t now;
5860
5861 cookie = dtrace_interrupt_disable();
5862 probe = dtrace_probes[id - 1];
5863 cpuid = CPU->cpu_id;
5864 onintr = CPU_ON_INTR(CPU);
5865
5866 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
5867 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
5868 /*
5869 * We have hit in the predicate cache; we know that
5870 * this predicate would evaluate to be false.
5871 */
5872 dtrace_interrupt_enable(cookie);
5873 return;
5874 }
5875
5876 if (panic_quiesce) {
5877 /*
5878 * We don't trace anything if we're panicking.
5879 */
5880 dtrace_interrupt_enable(cookie);
5881 return;
5882 }
5883
5884 #if !defined(__APPLE__)
5885 now = dtrace_gethrtime();
5886 vtime = dtrace_vtime_references != 0;
5887
5888 if (vtime && curthread->t_dtrace_start)
5889 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
5890 #else
5891 /*
5892 * APPLE NOTE: The time spent entering DTrace and arriving
5893 * at this point is attributed to the current thread.
5894 * Instead, it should accrue to DTrace. FIXME
5895 */
5896 vtime = dtrace_vtime_references != 0;
5897
5898 if (vtime)
5899 {
5900 int64_t dtrace_accum_time, recent_vtime;
5901 thread_t thread = current_thread();
5902
5903 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
5904
5905 if (dtrace_accum_time >= 0) {
5906 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
5907
5908 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
5909
5910 dtrace_set_thread_vtime(thread, recent_vtime);
5911 }
5912 }
5913
5914 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
5915 #endif /* __APPLE__ */
5916
5917 /*
5918 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
5919 * dtrace_probe() in some circumstances. See, e.g., fasttrap_isa.c.
5920 * However, the provider has no access to ECB context, so it passes
5921 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
5922 * Detect that here and cons up a viable state (from the probe_id).
5923 */
5924 if (dtrace_probeid_error == id && 0 == arg0) {
5925 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
5926 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
5927 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
5928
5929 if (NULL != ftp_ecb) {
5930 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
5931
5932 arg0 = (uint64_t)(uintptr_t)ftp_state;
5933 arg1 = ftp_ecb->dte_epid;
5934 /*
5935 * args[2-4] established by caller.
5936 */
5937 ftp_state->dts_arg_error_illval = -1; /* arg5 */
5938 }
5939 }
5940
5941 mstate.dtms_difo = NULL;
5942 mstate.dtms_probe = probe;
5943 mstate.dtms_strtok = 0;
5944 mstate.dtms_arg[0] = arg0;
5945 mstate.dtms_arg[1] = arg1;
5946 mstate.dtms_arg[2] = arg2;
5947 mstate.dtms_arg[3] = arg3;
5948 mstate.dtms_arg[4] = arg4;
5949
5950 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
5951
5952 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
5953 dtrace_predicate_t *pred = ecb->dte_predicate;
5954 dtrace_state_t *state = ecb->dte_state;
5955 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
5956 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
5957 dtrace_vstate_t *vstate = &state->dts_vstate;
5958 dtrace_provider_t *prov = probe->dtpr_provider;
5959 uint64_t tracememsize = 0;
5960 int committed = 0;
5961 caddr_t tomax;
5962
5963 /*
5964 * A little subtlety with the following (seemingly innocuous)
5965 * declaration of the automatic 'val': by looking at the
5966 * code, you might think that it could be declared in the
5967 * action processing loop, below. (That is, it's only used in
5968 * the action processing loop.) However, it must be declared
5969 * out of that scope because in the case of DIF expression
5970 * arguments to aggregating actions, one iteration of the
5971 * action loop will use the last iteration's value.
5972 */
5973 #ifdef lint
5974 uint64_t val = 0;
5975 #else
5976 uint64_t val = 0;
5977 #endif
5978
5979 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
5980 *flags &= ~CPU_DTRACE_ERROR;
5981
5982 if (prov == dtrace_provider) {
5983 /*
5984 * If dtrace itself is the provider of this probe,
5985 * we're only going to continue processing the ECB if
5986 * arg0 (the dtrace_state_t) is equal to the ECB's
5987 * creating state. (This prevents disjoint consumers
5988 * from seeing one another's metaprobes.)
5989 */
5990 if (arg0 != (uint64_t)(uintptr_t)state)
5991 continue;
5992 }
5993
5994 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
5995 /*
5996 * We're not currently active. If our provider isn't
5997 * the dtrace pseudo provider, we're not interested.
5998 */
5999 if (prov != dtrace_provider)
6000 continue;
6001
6002 /*
6003 * Now we must further check if we are in the BEGIN
6004 * probe. If we are, we will only continue processing
6005 * if we're still in WARMUP -- if one BEGIN enabling
6006 * has invoked the exit() action, we don't want to
6007 * evaluate subsequent BEGIN enablings.
6008 */
6009 if (probe->dtpr_id == dtrace_probeid_begin &&
6010 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6011 ASSERT(state->dts_activity ==
6012 DTRACE_ACTIVITY_DRAINING);
6013 continue;
6014 }
6015 }
6016
6017 if (ecb->dte_cond) {
6018 /*
6019 * If the dte_cond bits indicate that this
6020 * consumer is only allowed to see user-mode firings
6021 * of this probe, call the provider's dtps_usermode()
6022 * entry point to check that the probe was fired
6023 * while in a user context. Skip this ECB if that's
6024 * not the case.
6025 */
6026 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6027 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6028 probe->dtpr_id, probe->dtpr_arg) == 0)
6029 continue;
6030
6031 /*
6032 * This is more subtle than it looks. We have to be
6033 * absolutely certain that CRED() isn't going to
6034 * change out from under us, so it's only legit to
6035 * examine that structure if we're in constrained
6036 * situations. Currently, the only time we'll perform this
6037 * check is if a non-super-user has enabled the
6038 * profile or syscall providers -- providers that
6039 * allow visibility of all processes. For the
6040 * profile case, the check above will ensure that
6041 * we're examining a user context.
6042 */
6043 if (ecb->dte_cond & DTRACE_COND_OWNER) {
6044 cred_t *cr;
6045 cred_t *s_cr =
6046 ecb->dte_state->dts_cred.dcr_cred;
6047 proc_t *proc;
6048 #pragma unused(proc) /* __APPLE__ */
6049
6050 ASSERT(s_cr != NULL);
6051
6052 /*
6053 * XXX this is hackish, but so is setting a variable
6054 * XXX in a McCarthy OR...
6055 */
6056 if ((cr = dtrace_CRED()) == NULL ||
6057 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6058 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6059 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6060 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6061 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6062 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6063 #if !defined(__APPLE__)
6064 (proc = ttoproc(curthread)) == NULL ||
6065 (proc->p_flag & SNOCD))
6066 #else
6067 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
6068 #endif /* __APPLE__ */
6069 continue;
6070 }
6071
6072 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6073 cred_t *cr;
6074 cred_t *s_cr =
6075 ecb->dte_state->dts_cred.dcr_cred;
6076 #pragma unused(cr, s_cr) /* __APPLE__ */
6077
6078 ASSERT(s_cr != NULL);
6079
6080 #if !defined(__APPLE__)
6081 if ((cr = CRED()) == NULL ||
6082 s_cr->cr_zone->zone_id !=
6083 cr->cr_zone->zone_id)
6084 continue;
6085 #else
6086 /* APPLE NOTE: Darwin doesn't do zones. */
6087 #endif /* __APPLE__ */
6088 }
6089 }
6090
6091 if (now - state->dts_alive > dtrace_deadman_timeout) {
6092 /*
6093 * We seem to be dead. Unless we (a) have kernel
6094 * destructive permissions, (b) have explicitly enabled
6095 * destructive actions, and (c) destructive actions have
6096 * not been disabled, we're going to transition into
6097 * the KILLED state, from which no further processing
6098 * on this state will be performed.
6099 */
6100 if (!dtrace_priv_kernel_destructive(state) ||
6101 !state->dts_cred.dcr_destructive ||
6102 dtrace_destructive_disallow) {
6103 void *activity = &state->dts_activity;
6104 dtrace_activity_t current;
6105
6106 do {
6107 current = state->dts_activity;
6108 } while (dtrace_cas32(activity, current,
6109 DTRACE_ACTIVITY_KILLED) != current);
6110
6111 continue;
6112 }
6113 }
6114
6115 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6116 ecb->dte_alignment, state, &mstate)) < 0)
6117 continue;
6118
6119 tomax = buf->dtb_tomax;
6120 ASSERT(tomax != NULL);
6121
6122 if (ecb->dte_size != 0)
6123 DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6124
6125 mstate.dtms_epid = ecb->dte_epid;
6126 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6127
6128 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6129 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6130 else
6131 mstate.dtms_access = 0;
6132
6133 if (pred != NULL) {
6134 dtrace_difo_t *dp = pred->dtp_difo;
6135 int rval;
6136
6137 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6138
6139 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6140 dtrace_cacheid_t cid = probe->dtpr_predcache;
6141
6142 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6143 /*
6144 * Update the predicate cache...
6145 */
6146 ASSERT(cid == pred->dtp_cacheid);
6147
6148 dtrace_set_thread_predcache(current_thread(), cid);
6149 }
6150
6151 continue;
6152 }
6153 }
6154
6155 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6156 act != NULL; act = act->dta_next) {
6157 size_t valoffs;
6158 dtrace_difo_t *dp;
6159 dtrace_recdesc_t *rec = &act->dta_rec;
6160
6161 size = rec->dtrd_size;
6162 valoffs = offs + rec->dtrd_offset;
6163
6164 if (DTRACEACT_ISAGG(act->dta_kind)) {
6165 uint64_t v = 0xbad;
6166 dtrace_aggregation_t *agg;
6167
6168 agg = (dtrace_aggregation_t *)act;
6169
6170 if ((dp = act->dta_difo) != NULL)
6171 v = dtrace_dif_emulate(dp,
6172 &mstate, vstate, state);
6173
6174 if (*flags & CPU_DTRACE_ERROR)
6175 continue;
6176
6177 /*
6178 * Note that we always pass the expression
6179 * value from the previous iteration of the
6180 * action loop. This value will only be used
6181 * if there is an expression argument to the
6182 * aggregating action, denoted by the
6183 * dtag_hasarg field.
6184 */
6185 dtrace_aggregate(agg, buf,
6186 offs, aggbuf, v, val);
6187 continue;
6188 }
6189
6190 switch (act->dta_kind) {
6191 case DTRACEACT_STOP:
6192 if (dtrace_priv_proc_destructive(state))
6193 dtrace_action_stop();
6194 continue;
6195
6196 case DTRACEACT_BREAKPOINT:
6197 if (dtrace_priv_kernel_destructive(state))
6198 dtrace_action_breakpoint(ecb);
6199 continue;
6200
6201 case DTRACEACT_PANIC:
6202 if (dtrace_priv_kernel_destructive(state))
6203 dtrace_action_panic(ecb);
6204 continue;
6205
6206 case DTRACEACT_STACK:
6207 if (!dtrace_priv_kernel(state))
6208 continue;
6209
6210 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6211 size / sizeof (pc_t), probe->dtpr_aframes,
6212 DTRACE_ANCHORED(probe) ? NULL :
6213 (uint32_t *)(uintptr_t)arg0);
6214 continue;
6215
6216 case DTRACEACT_JSTACK:
6217 case DTRACEACT_USTACK:
6218 if (!dtrace_priv_proc(state))
6219 continue;
6220
6221 /*
6222 * See comment in DIF_VAR_PID.
6223 */
6224 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6225 CPU_ON_INTR(CPU)) {
6226 int depth = DTRACE_USTACK_NFRAMES(
6227 rec->dtrd_arg) + 1;
6228
6229 dtrace_bzero((void *)(tomax + valoffs),
6230 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6231 + depth * sizeof (uint64_t));
6232
6233 continue;
6234 }
6235
6236 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6237 curproc->p_dtrace_helpers != NULL) {
6238 /*
6239 * This is the slow path -- we have
6240 * allocated string space, and we're
6241 * getting the stack of a process that
6242 * has helpers. Call into a separate
6243 * routine to perform this processing.
6244 */
6245 dtrace_action_ustack(&mstate, state,
6246 (uint64_t *)(tomax + valoffs),
6247 rec->dtrd_arg);
6248 continue;
6249 }
6250
6251 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6252 dtrace_getupcstack((uint64_t *)
6253 (tomax + valoffs),
6254 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6255 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6256 continue;
6257
6258 default:
6259 break;
6260 }
6261
6262 dp = act->dta_difo;
6263 ASSERT(dp != NULL);
6264
6265 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6266
6267 if (*flags & CPU_DTRACE_ERROR)
6268 continue;
6269
6270 switch (act->dta_kind) {
6271 case DTRACEACT_SPECULATE:
6272 ASSERT(buf == &state->dts_buffer[cpuid]);
6273 buf = dtrace_speculation_buffer(state,
6274 cpuid, val);
6275
6276 if (buf == NULL) {
6277 *flags |= CPU_DTRACE_DROP;
6278 continue;
6279 }
6280
6281 offs = dtrace_buffer_reserve(buf,
6282 ecb->dte_needed, ecb->dte_alignment,
6283 state, NULL);
6284
6285 if (offs < 0) {
6286 *flags |= CPU_DTRACE_DROP;
6287 continue;
6288 }
6289
6290 tomax = buf->dtb_tomax;
6291 ASSERT(tomax != NULL);
6292
6293 if (ecb->dte_size != 0)
6294 DTRACE_STORE(uint32_t, tomax, offs,
6295 ecb->dte_epid);
6296 continue;
6297
6298 case DTRACEACT_CHILL:
6299 if (dtrace_priv_kernel_destructive(state))
6300 dtrace_action_chill(&mstate, val);
6301 continue;
6302
6303 case DTRACEACT_RAISE:
6304 if (dtrace_priv_proc_destructive(state))
6305 dtrace_action_raise(val);
6306 continue;
6307
6308 case DTRACEACT_PIDRESUME: /* __APPLE__ */
6309 if (dtrace_priv_proc_destructive(state))
6310 dtrace_action_pidresume(val);
6311 continue;
6312
6313 case DTRACEACT_COMMIT:
6314 ASSERT(!committed);
6315
6316 /*
6317 * We need to commit our buffer state.
6318 */
6319 if (ecb->dte_size)
6320 buf->dtb_offset = offs + ecb->dte_size;
6321 buf = &state->dts_buffer[cpuid];
6322 dtrace_speculation_commit(state, cpuid, val);
6323 committed = 1;
6324 continue;
6325
6326 case DTRACEACT_DISCARD:
6327 dtrace_speculation_discard(state, cpuid, val);
6328 continue;
6329
6330 case DTRACEACT_DIFEXPR:
6331 case DTRACEACT_LIBACT:
6332 case DTRACEACT_PRINTF:
6333 case DTRACEACT_PRINTA:
6334 case DTRACEACT_SYSTEM:
6335 case DTRACEACT_FREOPEN:
6336 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
6337 case DTRACEACT_TRACEMEM:
6338 break;
6339
6340 case DTRACEACT_TRACEMEM_DYNSIZE:
6341 tracememsize = val;
6342 break;
6343
6344 case DTRACEACT_SYM:
6345 case DTRACEACT_MOD:
6346 if (!dtrace_priv_kernel(state))
6347 continue;
6348 break;
6349
6350 case DTRACEACT_USYM:
6351 case DTRACEACT_UMOD:
6352 case DTRACEACT_UADDR: {
6353 if (!dtrace_priv_proc(state))
6354 continue;
6355
6356 DTRACE_STORE(uint64_t, tomax,
6357 valoffs, (uint64_t)dtrace_proc_selfpid());
6358 DTRACE_STORE(uint64_t, tomax,
6359 valoffs + sizeof (uint64_t), val);
6360
6361 continue;
6362 }
6363
6364 case DTRACEACT_EXIT: {
6365 /*
6366 * For the exit action, we are going to attempt
6367 * to atomically set our activity to be
6368 * draining. If this fails (either because
6369 * another CPU has beat us to the exit action,
6370 * or because our current activity is something
6371 * other than ACTIVE or WARMUP), we will
6372 * continue. This assures that the exit action
6373 * can be successfully recorded at most once
6374 * when we're in the ACTIVE state. If we're
6375 * encountering the exit() action while in
6376 * COOLDOWN, however, we want to honor the new
6377 * status code. (We know that we're the only
6378 * thread in COOLDOWN, so there is no race.)
6379 */
6380 void *activity = &state->dts_activity;
6381 dtrace_activity_t current = state->dts_activity;
6382
6383 if (current == DTRACE_ACTIVITY_COOLDOWN)
6384 break;
6385
6386 if (current != DTRACE_ACTIVITY_WARMUP)
6387 current = DTRACE_ACTIVITY_ACTIVE;
6388
6389 if (dtrace_cas32(activity, current,
6390 DTRACE_ACTIVITY_DRAINING) != current) {
6391 *flags |= CPU_DTRACE_DROP;
6392 continue;
6393 }
6394
6395 break;
6396 }
6397
6398 default:
6399 ASSERT(0);
6400 }
6401
6402 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
6403 uintptr_t end = valoffs + size;
6404
6405 if (tracememsize != 0 &&
6406 valoffs + tracememsize < end)
6407 {
6408 end = valoffs + tracememsize;
6409 tracememsize = 0;
6410 }
6411
6412 if (!dtrace_vcanload((void *)(uintptr_t)val,
6413 &dp->dtdo_rtype, &mstate, vstate))
6414 continue;
6415
6416 /*
6417 * If this is a string, we're going to only
6418 * load until we find the zero byte -- after
6419 * which we'll store zero bytes.
6420 */
6421 if (dp->dtdo_rtype.dtdt_kind ==
6422 DIF_TYPE_STRING) {
6423 char c = '\0' + 1;
6424 int intuple = act->dta_intuple;
6425 size_t s;
6426
6427 for (s = 0; s < size; s++) {
6428 if (c != '\0')
6429 c = dtrace_load8(val++);
6430
6431 DTRACE_STORE(uint8_t, tomax,
6432 valoffs++, c);
6433
6434 if (c == '\0' && intuple)
6435 break;
6436 }
6437
6438 continue;
6439 }
6440
6441 while (valoffs < end) {
6442 DTRACE_STORE(uint8_t, tomax, valoffs++,
6443 dtrace_load8(val++));
6444 }
6445
6446 continue;
6447 }
6448
6449 switch (size) {
6450 case 0:
6451 break;
6452
6453 case sizeof (uint8_t):
6454 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6455 break;
6456 case sizeof (uint16_t):
6457 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6458 break;
6459 case sizeof (uint32_t):
6460 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6461 break;
6462 case sizeof (uint64_t):
6463 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6464 break;
6465 default:
6466 /*
6467 * Any other size should have been returned by
6468 * reference, not by value.
6469 */
6470 ASSERT(0);
6471 break;
6472 }
6473 }
6474
6475 if (*flags & CPU_DTRACE_DROP)
6476 continue;
6477
6478 if (*flags & CPU_DTRACE_FAULT) {
6479 int ndx;
6480 dtrace_action_t *err;
6481
6482 buf->dtb_errors++;
6483
6484 if (probe->dtpr_id == dtrace_probeid_error) {
6485 /*
6486 * There's nothing we can do -- we had an
6487 * error on the error probe. We bump an
6488 * error counter to at least indicate that
6489 * this condition happened.
6490 */
6491 dtrace_error(&state->dts_dblerrors);
6492 continue;
6493 }
6494
6495 if (vtime) {
6496 /*
6497 * Before recursing on dtrace_probe(), we
6498 * need to explicitly clear out our start
6499 * time to prevent it from being accumulated
6500 * into t_dtrace_vtime.
6501 */
6502
6503 /*
6504 * Darwin sets the sign bit on t_dtrace_tracing
6505 * to suspend accumulation to it.
6506 */
6507 dtrace_set_thread_tracing(current_thread(),
6508 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
6509
6510 }
6511
6512 /*
6513 * Iterate over the actions to figure out which action
6514 * we were processing when we experienced the error.
6515 * Note that act points _past_ the faulting action; if
6516 * act is ecb->dte_action, the fault was in the
6517 * predicate; if it's ecb->dte_action->dta_next it's
6518 * in action #1, and so on.
6519 */
6520 for (err = ecb->dte_action, ndx = 0;
6521 err != act; err = err->dta_next, ndx++)
6522 continue;
6523
6524 dtrace_probe_error(state, ecb->dte_epid, ndx,
6525 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6526 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6527 cpu_core[cpuid].cpuc_dtrace_illval);
6528
6529 continue;
6530 }
6531
6532 if (!committed)
6533 buf->dtb_offset = offs + ecb->dte_size;
6534 }
6535
6536 /* FIXME: On Darwin, the time spent leaving DTrace from this point to the rti is attributed
6537 to the current thread. Instead, it should accrue to DTrace. */
6538 if (vtime) {
6539 thread_t thread = current_thread();
6540 int64_t t = dtrace_get_thread_tracing(thread);
6541
6542 if (t >= 0) {
6543 /* Usual case, accumulate time spent here into t_dtrace_tracing */
6544 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
6545 } else {
6546 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
6547 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
6548 }
6549 }
6550
6551 dtrace_interrupt_enable(cookie);
6552 }
6553
6554 /*
6555 * APPLE NOTE: Don't allow a thread to re-enter dtrace_probe().
6556 * This could occur if a probe is encountered on some function in the
6557 * transitive closure of the call to dtrace_probe().
6558 * Solaris has some strong guarantees that this won't happen.
6559 * The Darwin implementation is not so mature as to make those guarantees.
6560 * Hence, the introduction of __dtrace_probe() on xnu.
6561 */
6562
6563 void
6564 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6565 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6566 {
6567 thread_t thread = current_thread();
6568 disable_preemption();
6569 if (id == dtrace_probeid_error) {
6570 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6571 dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
6572 } else if (!dtrace_get_thread_reentering(thread)) {
6573 dtrace_set_thread_reentering(thread, TRUE);
6574 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6575 dtrace_set_thread_reentering(thread, FALSE);
6576 }
6577 #if DEBUG
6578 else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
6579 #endif
6580 enable_preemption();
6581 }
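
/*
 * As a usage sketch (illustrative only -- the probe id and arguments below
 * are hypothetical): once a provider has created a probe and the framework
 * has enabled it, the provider fires it from its instrumentation site by
 * calling dtrace_probe() with the id returned by dtrace_probe_create() and
 * up to five argument values:
 *
 *	dtrace_probe(my_probe_id, (uint64_t)(uintptr_t)some_ptr,
 *	    (uint64_t)some_count, 0, 0, 0);
 */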
6582
6583 /*
6584 * DTrace Probe Hashing Functions
6585 *
6586 * The functions in this section (and indeed, the functions in the remaining
6587 * sections) are not _called_ from probe context. (Any exceptions to this are
6588 * marked with a "Note:".) Rather, they are called from elsewhere in the
6589 * DTrace framework to look up probes in, add probes to, and remove probes from
6590 * the DTrace probe hashes. (Each probe is hashed by each element of the
6591 * probe tuple -- allowing for fast lookups, regardless of what was
6592 * specified.)
6593 */
6594 static uint_t
6595 dtrace_hash_str(const char *p)
6596 {
6597 unsigned int g;
6598 uint_t hval = 0;
6599
6600 while (*p) {
6601 hval = (hval << 4) + *p++;
6602 if ((g = (hval & 0xf0000000)) != 0)
6603 hval ^= g >> 24;
6604 hval &= ~g;
6605 }
6606 return (hval);
6607 }
6608
6609 static dtrace_hash_t *
6610 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
6611 {
6612 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
6613
6614 hash->dth_stroffs = stroffs;
6615 hash->dth_nextoffs = nextoffs;
6616 hash->dth_prevoffs = prevoffs;
6617
6618 hash->dth_size = 1;
6619 hash->dth_mask = hash->dth_size - 1;
6620
6621 hash->dth_tab = kmem_zalloc(hash->dth_size *
6622 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
6623
6624 return (hash);
6625 }
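
/*
 * As a usage sketch: each probe hash is keyed on one string member of
 * dtrace_probe_t, with per-probe chain links supplied via their structure
 * offsets. The by-module hash, for example, is created along these lines
 * (the actual call sites appear later in this file):
 *
 *	dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */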
6626
6627 /*
6628 * APPLE NOTE: dtrace_hash_destroy is not used.
6629 * It would only be called by dtrace_detach, which is not
6630 * currently implemented. Revisit someday.
6631 */
6632 #if !defined(__APPLE__)
6633 static void
6634 dtrace_hash_destroy(dtrace_hash_t *hash)
6635 {
6636 #if DEBUG
6637 int i;
6638
6639 for (i = 0; i < hash->dth_size; i++)
6640 ASSERT(hash->dth_tab[i] == NULL);
6641 #endif
6642
6643 kmem_free(hash->dth_tab,
6644 hash->dth_size * sizeof (dtrace_hashbucket_t *));
6645 kmem_free(hash, sizeof (dtrace_hash_t));
6646 }
6647 #endif /* __APPLE__ */
6648
6649 static void
6650 dtrace_hash_resize(dtrace_hash_t *hash)
6651 {
6652 int size = hash->dth_size, i, ndx;
6653 int new_size = hash->dth_size << 1;
6654 int new_mask = new_size - 1;
6655 dtrace_hashbucket_t **new_tab, *bucket, *next;
6656
6657 ASSERT((new_size & new_mask) == 0);
6658
6659 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
6660
6661 for (i = 0; i < size; i++) {
6662 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
6663 dtrace_probe_t *probe = bucket->dthb_chain;
6664
6665 ASSERT(probe != NULL);
6666 ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
6667
6668 next = bucket->dthb_next;
6669 bucket->dthb_next = new_tab[ndx];
6670 new_tab[ndx] = bucket;
6671 }
6672 }
6673
6674 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
6675 hash->dth_tab = new_tab;
6676 hash->dth_size = new_size;
6677 hash->dth_mask = new_mask;
6678 }
6679
6680 static void
6681 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
6682 {
6683 int hashval = DTRACE_HASHSTR(hash, new);
6684 int ndx = hashval & hash->dth_mask;
6685 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6686 dtrace_probe_t **nextp, **prevp;
6687
6688 for (; bucket != NULL; bucket = bucket->dthb_next) {
6689 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
6690 goto add;
6691 }
6692
6693 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
6694 dtrace_hash_resize(hash);
6695 dtrace_hash_add(hash, new);
6696 return;
6697 }
6698
6699 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
6700 bucket->dthb_next = hash->dth_tab[ndx];
6701 hash->dth_tab[ndx] = bucket;
6702 hash->dth_nbuckets++;
6703
6704 add:
6705 nextp = DTRACE_HASHNEXT(hash, new);
6706 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
6707 *nextp = bucket->dthb_chain;
6708
6709 if (bucket->dthb_chain != NULL) {
6710 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
6711 ASSERT(*prevp == NULL);
6712 *prevp = new;
6713 }
6714
6715 bucket->dthb_chain = new;
6716 bucket->dthb_len++;
6717 }
6718
6719 static dtrace_probe_t *
6720 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
6721 {
6722 int hashval = DTRACE_HASHSTR(hash, template);
6723 int ndx = hashval & hash->dth_mask;
6724 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6725
6726 for (; bucket != NULL; bucket = bucket->dthb_next) {
6727 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6728 return (bucket->dthb_chain);
6729 }
6730
6731 return (NULL);
6732 }
6733
6734 static int
6735 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
6736 {
6737 int hashval = DTRACE_HASHSTR(hash, template);
6738 int ndx = hashval & hash->dth_mask;
6739 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6740
6741 for (; bucket != NULL; bucket = bucket->dthb_next) {
6742 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
6743 return (bucket->dthb_len);
6744 }
6745
6746 return (0);
6747 }
6748
6749 static void
6750 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
6751 {
6752 int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
6753 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
6754
6755 dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
6756 dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
6757
6758 /*
6759 * Find the bucket that we're removing this probe from.
6760 */
6761 for (; bucket != NULL; bucket = bucket->dthb_next) {
6762 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
6763 break;
6764 }
6765
6766 ASSERT(bucket != NULL);
6767
6768 if (*prevp == NULL) {
6769 if (*nextp == NULL) {
6770 /*
6771 * The removed probe was the only probe on this
6772 * bucket; we need to remove the bucket.
6773 */
6774 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
6775
6776 ASSERT(bucket->dthb_chain == probe);
6777 ASSERT(b != NULL);
6778
6779 if (b == bucket) {
6780 hash->dth_tab[ndx] = bucket->dthb_next;
6781 } else {
6782 while (b->dthb_next != bucket)
6783 b = b->dthb_next;
6784 b->dthb_next = bucket->dthb_next;
6785 }
6786
6787 ASSERT(hash->dth_nbuckets > 0);
6788 hash->dth_nbuckets--;
6789 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
6790 return;
6791 }
6792
6793 bucket->dthb_chain = *nextp;
6794 } else {
6795 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
6796 }
6797
6798 if (*nextp != NULL)
6799 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
6800 }
6801
6802 /*
6803 * DTrace Utility Functions
6804 *
6805 * These are random utility functions that are _not_ called from probe context.
6806 */
6807 static int
6808 dtrace_badattr(const dtrace_attribute_t *a)
6809 {
6810 return (a->dtat_name > DTRACE_STABILITY_MAX ||
6811 a->dtat_data > DTRACE_STABILITY_MAX ||
6812 a->dtat_class > DTRACE_CLASS_MAX);
6813 }
6814
6815 /*
6816 * Return a duplicate of a string. If the specified string is NULL,
6817 * this function returns a zero-length string.
6818 * APPLE NOTE: Darwin employs size bounded string operation.
6819 */
6820 static char *
6821 dtrace_strdup(const char *str)
6822 {
6823 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
6824 char *new = kmem_zalloc(bufsize, KM_SLEEP);
6825
6826 if (str != NULL)
6827 (void) strlcpy(new, str, bufsize);
6828
6829 return (new);
6830 }
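
/*
 * For example, dtrace_strdup(NULL) returns a freshly allocated empty string
 * ("") rather than NULL, so callers need not special-case a NULL result.
 */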
6831
6832 #define DTRACE_ISALPHA(c) \
6833 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
6834
6835 static int
6836 dtrace_badname(const char *s)
6837 {
6838 char c;
6839
6840 if (s == NULL || (c = *s++) == '\0')
6841 return (0);
6842
6843 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
6844 return (1);
6845
6846 while ((c = *s++) != '\0') {
6847 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
6848 c != '-' && c != '_' && c != '.' && c != '`')
6849 return (1);
6850 }
6851
6852 return (0);
6853 }
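
/*
 * For example, names such as "syscall", "mach_trap" or "fbt" are accepted,
 * whereas "4grep" (leading digit) or "my provider" (embedded space) would be
 * rejected as bad names.
 */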
6854
6855 static void
6856 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
6857 {
6858 uint32_t priv;
6859
6860 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
6861 /*
6862 * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
6863 */
6864 priv = DTRACE_PRIV_ALL;
6865 } else {
6866 *uidp = crgetuid(cr);
6867 *zoneidp = crgetzoneid(cr);
6868
6869 priv = 0;
6870 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
6871 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
6872 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
6873 priv |= DTRACE_PRIV_USER;
6874 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
6875 priv |= DTRACE_PRIV_PROC;
6876 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
6877 priv |= DTRACE_PRIV_OWNER;
6878 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
6879 priv |= DTRACE_PRIV_ZONEOWNER;
6880 }
6881
6882 *privp = priv;
6883 }
6884
6885 #ifdef DTRACE_ERRDEBUG
6886 static void
6887 dtrace_errdebug(const char *str)
6888 {
6889 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
6890 int occupied = 0;
6891
6892 lck_mtx_lock(&dtrace_errlock);
6893 dtrace_errlast = str;
6894 dtrace_errthread = (kthread_t *)current_thread();
6895
6896 while (occupied++ < DTRACE_ERRHASHSZ) {
6897 if (dtrace_errhash[hval].dter_msg == str) {
6898 dtrace_errhash[hval].dter_count++;
6899 goto out;
6900 }
6901
6902 if (dtrace_errhash[hval].dter_msg != NULL) {
6903 hval = (hval + 1) % DTRACE_ERRHASHSZ;
6904 continue;
6905 }
6906
6907 dtrace_errhash[hval].dter_msg = str;
6908 dtrace_errhash[hval].dter_count = 1;
6909 goto out;
6910 }
6911
6912 panic("dtrace: undersized error hash");
6913 out:
6914 lck_mtx_unlock(&dtrace_errlock);
6915 }
6916 #endif
6917
6918 /*
6919 * DTrace Matching Functions
6920 *
6921 * These functions are used to match groups of probes, given some elements of
6922 * a probe tuple, or some globbed expressions for elements of a probe tuple.
6923 */
6924 static int
6925 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
6926 zoneid_t zoneid)
6927 {
6928 if (priv != DTRACE_PRIV_ALL) {
6929 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
6930 uint32_t match = priv & ppriv;
6931
6932 /*
6933 * No PRIV_DTRACE_* privileges...
6934 */
6935 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
6936 DTRACE_PRIV_KERNEL)) == 0)
6937 return (0);
6938
6939 /*
6940 * No matching bits, but there were bits to match...
6941 */
6942 if (match == 0 && ppriv != 0)
6943 return (0);
6944
6945 /*
6946 * Need to have permissions to the process, but don't...
6947 */
6948 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
6949 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
6950 return (0);
6951 }
6952
6953 /*
6954 * Need to be in the same zone unless we possess the
6955 * privilege to examine all zones.
6956 */
6957 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
6958 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
6959 return (0);
6960 }
6961 }
6962
6963 return (1);
6964 }
6965
6966 /*
6967 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
6968 * consists of input pattern strings and an ops-vector to evaluate them.
6969 * This function returns >0 for match, 0 for no match, and <0 for error.
6970 */
6971 static int
6972 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
6973 uint32_t priv, uid_t uid, zoneid_t zoneid)
6974 {
6975 dtrace_provider_t *pvp = prp->dtpr_provider;
6976 int rv;
6977
6978 if (pvp->dtpv_defunct)
6979 return (0);
6980
6981 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
6982 return (rv);
6983
6984 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
6985 return (rv);
6986
6987 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
6988 return (rv);
6989
6990 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
6991 return (rv);
6992
6993 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
6994 return (0);
6995
6996 return (rv);
6997 }
6998
6999 /*
7000 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7001 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
7002 * libc's version, the kernel version only applies to 8-bit ASCII strings.
7003 * In addition, all of the recursion cases except for '*' matching have been
7004 * unwound. For '*', we still implement recursive evaluation, but a depth
7005 * counter is maintained and matching is aborted if we recurse too deep.
7006 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
7007 */
7008 static int
7009 dtrace_match_glob(const char *s, const char *p, int depth)
7010 {
7011 const char *olds;
7012 char s1, c;
7013 int gs;
7014
7015 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7016 return (-1);
7017
7018 if (s == NULL)
7019 s = ""; /* treat NULL as empty string */
7020
7021 top:
7022 olds = s;
7023 s1 = *s++;
7024
7025 if (p == NULL)
7026 return (0);
7027
7028 if ((c = *p++) == '\0')
7029 return (s1 == '\0');
7030
7031 switch (c) {
7032 case '[': {
7033 int ok = 0, notflag = 0;
7034 char lc = '\0';
7035
7036 if (s1 == '\0')
7037 return (0);
7038
7039 if (*p == '!') {
7040 notflag = 1;
7041 p++;
7042 }
7043
7044 if ((c = *p++) == '\0')
7045 return (0);
7046
7047 do {
7048 if (c == '-' && lc != '\0' && *p != ']') {
7049 if ((c = *p++) == '\0')
7050 return (0);
7051 if (c == '\\' && (c = *p++) == '\0')
7052 return (0);
7053
7054 if (notflag) {
7055 if (s1 < lc || s1 > c)
7056 ok++;
7057 else
7058 return (0);
7059 } else if (lc <= s1 && s1 <= c)
7060 ok++;
7061
7062 } else if (c == '\\' && (c = *p++) == '\0')
7063 return (0);
7064
7065 lc = c; /* save left-hand 'c' for next iteration */
7066
7067 if (notflag) {
7068 if (s1 != c)
7069 ok++;
7070 else
7071 return (0);
7072 } else if (s1 == c)
7073 ok++;
7074
7075 if ((c = *p++) == '\0')
7076 return (0);
7077
7078 } while (c != ']');
7079
7080 if (ok)
7081 goto top;
7082
7083 return (0);
7084 }
7085
7086 case '\\':
7087 if ((c = *p++) == '\0')
7088 return (0);
7089 /*FALLTHRU*/
7090
7091 default:
7092 if (c != s1)
7093 return (0);
7094 /*FALLTHRU*/
7095
7096 case '?':
7097 if (s1 != '\0')
7098 goto top;
7099 return (0);
7100
7101 case '*':
7102 while (*p == '*')
7103 p++; /* consecutive *'s are identical to a single one */
7104
7105 if (*p == '\0')
7106 return (1);
7107
7108 for (s = olds; *s != '\0'; s++) {
7109 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7110 return (gs);
7111 }
7112
7113 return (0);
7114 }
7115 }
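
/*
 * A few illustrative evaluations (a return value greater than zero denotes
 * a match):
 *
 *	dtrace_match_glob("mach_msg_trap", "mach_*", 0)    matches
 *	dtrace_match_glob("read", "r?a*", 0)               matches
 *	dtrace_match_glob("write", "[a-f]*", 0)            does not match
 */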
7116
7117 /*ARGSUSED*/
7118 static int
7119 dtrace_match_string(const char *s, const char *p, int depth)
7120 {
7121 #pragma unused(depth) /* __APPLE__ */
7122
7123 /* APPLE NOTE: Darwin employs size bounded string operation. */
7124 return (s != NULL && strncmp(s, p, strlen(s) + 1) == 0);
7125 }
7126
7127 /*ARGSUSED*/
7128 static int
7129 dtrace_match_nul(const char *s, const char *p, int depth)
7130 {
7131 #pragma unused(s, p, depth) /* __APPLE__ */
7132 return (1); /* always match the empty pattern */
7133 }
7134
7135 /*ARGSUSED*/
7136 static int
7137 dtrace_match_nonzero(const char *s, const char *p, int depth)
7138 {
7139 #pragma unused(p, depth) /* __APPLE__ */
7140 return (s != NULL && s[0] != '\0');
7141 }
7142
7143 static int
7144 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7145 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7146 {
7147 dtrace_probe_t template, *probe;
7148 dtrace_hash_t *hash = NULL;
7149 int len, rc, best = INT_MAX, nmatched = 0;
7150 dtrace_id_t i;
7151
7152 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7153
7154 /*
7155 * If the probe ID is specified in the key, just lookup by ID and
7156 * invoke the match callback once if a matching probe is found.
7157 */
7158 if (pkp->dtpk_id != DTRACE_IDNONE) {
7159 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7160 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7161 if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
7162 return (DTRACE_MATCH_FAIL);
7163 nmatched++;
7164 }
7165 return (nmatched);
7166 }
7167
7168 template.dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod;
7169 template.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func;
7170 template.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name;
7171
7172 /*
7173 * We want to find the most distinct of the module name, function
7174 * name, and name. So for each one that is not a glob pattern or
7175 * empty string, we perform a lookup in the corresponding hash and
7176 * use the hash table with the fewest collisions to do our search.
7177 */
7178 if (pkp->dtpk_mmatch == &dtrace_match_string &&
7179 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7180 best = len;
7181 hash = dtrace_bymod;
7182 }
7183
7184 if (pkp->dtpk_fmatch == &dtrace_match_string &&
7185 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7186 best = len;
7187 hash = dtrace_byfunc;
7188 }
7189
7190 if (pkp->dtpk_nmatch == &dtrace_match_string &&
7191 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7192 best = len;
7193 hash = dtrace_byname;
7194 }
7195
7196 /*
7197 * If we did not select a hash table, iterate over every probe and
7198 * invoke our callback for each one that matches our input probe key.
7199 */
7200 if (hash == NULL) {
7201 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7202 if ((probe = dtrace_probes[i]) == NULL ||
7203 dtrace_match_probe(probe, pkp, priv, uid,
7204 zoneid) <= 0)
7205 continue;
7206
7207 nmatched++;
7208
7209 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7210 if (rc == DTRACE_MATCH_FAIL)
7211 return (DTRACE_MATCH_FAIL);
7212 break;
7213 }
7214 }
7215
7216 return (nmatched);
7217 }
7218
7219 /*
7220 * If we selected a hash table, iterate over each probe of the same key
7221 * name and invoke the callback for every probe that matches the other
7222 * attributes of our input probe key.
7223 */
7224 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7225 probe = *(DTRACE_HASHNEXT(hash, probe))) {
7226
7227 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7228 continue;
7229
7230 nmatched++;
7231
7232 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7233 if (rc == DTRACE_MATCH_FAIL)
7234 return (DTRACE_MATCH_FAIL);
7235 break;
7236 }
7237 }
7238
7239 return (nmatched);
7240 }
7241
7242 /*
7243 * Return the match function that dtrace_match_probe() should use to compare
7244 * the specified pattern with a string. For NULL or empty patterns, we select
7245 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
7246 * For non-empty non-glob strings, we use dtrace_match_string().
7247 */
7248 static dtrace_probekey_f *
7249 dtrace_probekey_func(const char *p)
7250 {
7251 char c;
7252
7253 if (p == NULL || *p == '\0')
7254 return (&dtrace_match_nul);
7255
7256 while ((c = *p++) != '\0') {
7257 if (c == '[' || c == '?' || c == '*' || c == '\\')
7258 return (&dtrace_match_glob);
7259 }
7260
7261 return (&dtrace_match_string);
7262 }
7263
7264 /*
7265 * Build a probe comparison key for use with dtrace_match_probe() from the
7266 * given probe description. By convention, a null key only matches anchored
7267 * probes: if each field is the empty string, reset dtpk_fmatch to
7268 * dtrace_match_nonzero().
7269 */
7270 static void
7271 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7272 {
7273 pkp->dtpk_prov = pdp->dtpd_provider;
7274 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7275
7276 pkp->dtpk_mod = pdp->dtpd_mod;
7277 pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7278
7279 pkp->dtpk_func = pdp->dtpd_func;
7280 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7281
7282 pkp->dtpk_name = pdp->dtpd_name;
7283 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7284
7285 pkp->dtpk_id = pdp->dtpd_id;
7286
7287 if (pkp->dtpk_id == DTRACE_IDNONE &&
7288 pkp->dtpk_pmatch == &dtrace_match_nul &&
7289 pkp->dtpk_mmatch == &dtrace_match_nul &&
7290 pkp->dtpk_fmatch == &dtrace_match_nul &&
7291 pkp->dtpk_nmatch == &dtrace_match_nul)
7292 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7293 }
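
/*
 * As a worked example of the above: a description with provider "syscall",
 * module "", function "open" and name "entry" (the D form "syscall::open:entry")
 * yields a key that uses dtrace_match_string() for the provider, function and
 * name and dtrace_match_nul() for the empty module field. A completely empty
 * description instead has dtpk_fmatch reset to dtrace_match_nonzero(), so that
 * it matches only anchored probes.
 */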
7294
7295 /*
7296 * DTrace Provider-to-Framework API Functions
7297 *
7298 * These functions implement much of the Provider-to-Framework API, as
7299 * described in <sys/dtrace.h>. The parts of the API not in this section are
7300 * the functions in the API for probe management (found below), and
7301 * dtrace_probe() itself (found above).
7302 */
7303
7304 /*
7305 * Register the calling provider with the DTrace framework. This should
7306 * generally be called by DTrace providers in their attach(9E) entry point.
7307 */
7308 int
7309 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7310 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7311 {
7312 dtrace_provider_t *provider;
7313
7314 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7315 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7316 "arguments", name ? name : "<NULL>");
7317 return (EINVAL);
7318 }
7319
7320 if (name[0] == '\0' || dtrace_badname(name)) {
7321 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7322 "provider name", name);
7323 return (EINVAL);
7324 }
7325
7326 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7327 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7328 pops->dtps_destroy == NULL ||
7329 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7330 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7331 "provider ops", name);
7332 return (EINVAL);
7333 }
7334
7335 if (dtrace_badattr(&pap->dtpa_provider) ||
7336 dtrace_badattr(&pap->dtpa_mod) ||
7337 dtrace_badattr(&pap->dtpa_func) ||
7338 dtrace_badattr(&pap->dtpa_name) ||
7339 dtrace_badattr(&pap->dtpa_args)) {
7340 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7341 "provider attributes", name);
7342 return (EINVAL);
7343 }
7344
7345 if (priv & ~DTRACE_PRIV_ALL) {
7346 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7347 "privilege attributes", name);
7348 return (EINVAL);
7349 }
7350
7351 if ((priv & DTRACE_PRIV_KERNEL) &&
7352 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7353 pops->dtps_usermode == NULL) {
7354 cmn_err(CE_WARN, "failed to register provider '%s': need "
7355 "dtps_usermode() op for given privilege attributes", name);
7356 return (EINVAL);
7357 }
7358
7359 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7360
7361 /* APPLE NOTE: Darwin employs size bounded string operation. */
7362 {
7363 size_t bufsize = strlen(name) + 1;
7364 provider->dtpv_name = kmem_alloc(bufsize, KM_SLEEP);
7365 (void) strlcpy(provider->dtpv_name, name, bufsize);
7366 }
7367
7368 provider->dtpv_attr = *pap;
7369 provider->dtpv_priv.dtpp_flags = priv;
7370 if (cr != NULL) {
7371 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7372 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7373 }
7374 provider->dtpv_pops = *pops;
7375
7376 if (pops->dtps_provide == NULL) {
7377 ASSERT(pops->dtps_provide_module != NULL);
7378 provider->dtpv_pops.dtps_provide =
7379 (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7380 }
7381
7382 if (pops->dtps_provide_module == NULL) {
7383 ASSERT(pops->dtps_provide != NULL);
7384 provider->dtpv_pops.dtps_provide_module =
7385 (void (*)(void *, struct modctl *))dtrace_nullop;
7386 }
7387
7388 if (pops->dtps_suspend == NULL) {
7389 ASSERT(pops->dtps_resume == NULL);
7390 provider->dtpv_pops.dtps_suspend =
7391 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7392 provider->dtpv_pops.dtps_resume =
7393 (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7394 }
7395
7396 provider->dtpv_arg = arg;
7397 *idp = (dtrace_provider_id_t)provider;
7398
7399 if (pops == &dtrace_provider_ops) {
7400 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7401 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7402 ASSERT(dtrace_anon.dta_enabling == NULL);
7403
7404 /*
7405 * We make sure that the DTrace provider is at the head of
7406 * the provider chain.
7407 */
7408 provider->dtpv_next = dtrace_provider;
7409 dtrace_provider = provider;
7410 return (0);
7411 }
7412
7413 lck_mtx_lock(&dtrace_provider_lock);
7414 lck_mtx_lock(&dtrace_lock);
7415
7416 /*
7417 * If there is at least one provider registered, we'll add this
7418 * provider after the first provider.
7419 */
7420 if (dtrace_provider != NULL) {
7421 provider->dtpv_next = dtrace_provider->dtpv_next;
7422 dtrace_provider->dtpv_next = provider;
7423 } else {
7424 dtrace_provider = provider;
7425 }
7426
7427 if (dtrace_retained != NULL) {
7428 dtrace_enabling_provide(provider);
7429
7430 /*
7431 * Now we need to call dtrace_enabling_matchall() -- which
7432 * will acquire cpu_lock and dtrace_lock. We therefore need
7433 * to drop all of our locks before calling into it...
7434 */
7435 lck_mtx_unlock(&dtrace_lock);
7436 lck_mtx_unlock(&dtrace_provider_lock);
7437 dtrace_enabling_matchall();
7438
7439 return (0);
7440 }
7441
7442 lck_mtx_unlock(&dtrace_lock);
7443 lck_mtx_unlock(&dtrace_provider_lock);
7444
7445 return (0);
7446 }
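
/*
 * A registration sketch for a hypothetical provider (the names and the
 * attribute/ops tables are illustrative; see <sys/dtrace.h> for the structure
 * definitions):
 *
 *	static dtrace_provider_id_t my_provider_id;
 *
 *	if (dtrace_register("myprov", &my_attr, DTRACE_PRIV_KERNEL, NULL,
 *	    &my_pops, NULL, &my_provider_id) != 0)
 *		return;
 *
 * Per the checks above, "my_pops" must supply dtps_provide (or
 * dtps_provide_module), dtps_enable, dtps_disable and dtps_destroy, and must
 * supply dtps_suspend and dtps_resume either both or not at all.
 */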
7447
7448 /*
7449 * Unregister the specified provider from the DTrace framework. This should
7450 * generally be called by DTrace providers in their detach(9E) entry point.
7451 */
7452 int
7453 dtrace_unregister(dtrace_provider_id_t id)
7454 {
7455 dtrace_provider_t *old = (dtrace_provider_t *)id;
7456 dtrace_provider_t *prev = NULL;
7457 int i, self = 0;
7458 dtrace_probe_t *probe, *first = NULL;
7459
7460 if (old->dtpv_pops.dtps_enable ==
7461 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
7462 /*
7463 * If DTrace itself is the provider, we're called with locks
7464 * already held.
7465 */
7466 ASSERT(old == dtrace_provider);
7467 ASSERT(dtrace_devi != NULL);
7468 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7469 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7470 self = 1;
7471
7472 if (dtrace_provider->dtpv_next != NULL) {
7473 /*
7474 * There's another provider here; return failure.
7475 */
7476 return (EBUSY);
7477 }
7478 } else {
7479 lck_mtx_lock(&dtrace_provider_lock);
7480 lck_mtx_lock(&mod_lock);
7481 lck_mtx_lock(&dtrace_lock);
7482 }
7483
7484 /*
7485 * If anyone has /dev/dtrace open, or if there are anonymous enabled
7486 * probes, we refuse to let providers slither away, unless this
7487 * provider has already been explicitly invalidated.
7488 */
7489 if (!old->dtpv_defunct &&
7490 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
7491 dtrace_anon.dta_state->dts_necbs > 0))) {
7492 if (!self) {
7493 lck_mtx_unlock(&dtrace_lock);
7494 lck_mtx_unlock(&mod_lock);
7495 lck_mtx_unlock(&dtrace_provider_lock);
7496 }
7497 return (EBUSY);
7498 }
7499
7500 /*
7501 * Attempt to destroy the probes associated with this provider.
7502 */
7503 if (old->dtpv_ecb_count!=0) {
7504 /*
7505 * We have at least one ECB; we can't remove this provider.
7506 */
7507 if (!self) {
7508 lck_mtx_unlock(&dtrace_lock);
7509 lck_mtx_unlock(&mod_lock);
7510 lck_mtx_unlock(&dtrace_provider_lock);
7511 }
7512 return (EBUSY);
7513 }
7514
7515 /*
7516 * All of the probes for this provider are disabled; we can safely
7517 * remove all of them from their hash chains and from the probe array.
7518 */
7519 for (i = 0; i < dtrace_nprobes && old->dtpv_probe_count!=0; i++) {
7520 if ((probe = dtrace_probes[i]) == NULL)
7521 continue;
7522
7523 if (probe->dtpr_provider != old)
7524 continue;
7525
7526 dtrace_probes[i] = NULL;
7527 old->dtpv_probe_count--;
7528
7529 dtrace_hash_remove(dtrace_bymod, probe);
7530 dtrace_hash_remove(dtrace_byfunc, probe);
7531 dtrace_hash_remove(dtrace_byname, probe);
7532
7533 if (first == NULL) {
7534 first = probe;
7535 probe->dtpr_nextmod = NULL;
7536 } else {
7537 probe->dtpr_nextmod = first;
7538 first = probe;
7539 }
7540 }
7541
7542 /*
7543 * The provider's probes have been removed from the hash chains and
7544 * from the probe array. Now issue a dtrace_sync() to be sure that
7545 * everyone has cleared out from any probe array processing.
7546 */
7547 dtrace_sync();
7548
7549 for (probe = first; probe != NULL; probe = first) {
7550 first = probe->dtpr_nextmod;
7551
7552 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
7553 probe->dtpr_arg);
7554 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7555 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7556 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7557 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
7558 zfree(dtrace_probe_t_zone, probe);
7559 }
7560
7561 if ((prev = dtrace_provider) == old) {
7562 ASSERT(self || dtrace_devi == NULL);
7563 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
7564 dtrace_provider = old->dtpv_next;
7565 } else {
7566 while (prev != NULL && prev->dtpv_next != old)
7567 prev = prev->dtpv_next;
7568
7569 if (prev == NULL) {
7570 panic("attempt to unregister non-existent "
7571 "dtrace provider %p\n", (void *)id);
7572 }
7573
7574 prev->dtpv_next = old->dtpv_next;
7575 }
7576
7577 if (!self) {
7578 lck_mtx_unlock(&dtrace_lock);
7579 lck_mtx_unlock(&mod_lock);
7580 lck_mtx_unlock(&dtrace_provider_lock);
7581 }
7582
7583 kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
7584 kmem_free(old, sizeof (dtrace_provider_t));
7585
7586 return (0);
7587 }
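
/*
 * A typical detach path (with an illustrative provider id) simply attempts
 * the unregistration and backs off if the framework refuses -- for instance
 * when consumers still hold /dev/dtrace open:
 *
 *	if (dtrace_unregister(my_provider_id) != 0)
 *		return (EBUSY);
 */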
7588
7589 /*
7590 * Invalidate the specified provider. All subsequent probe lookups for the
7591 * specified provider will fail, but its probes will not be removed.
7592 */
7593 void
7594 dtrace_invalidate(dtrace_provider_id_t id)
7595 {
7596 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
7597
7598 ASSERT(pvp->dtpv_pops.dtps_enable !=
7599 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7600
7601 lck_mtx_lock(&dtrace_provider_lock);
7602 lck_mtx_lock(&dtrace_lock);
7603
7604 pvp->dtpv_defunct = 1;
7605
7606 lck_mtx_unlock(&dtrace_lock);
7607 lck_mtx_unlock(&dtrace_provider_lock);
7608 }
7609
7610 /*
7611 * Indicate whether or not DTrace has attached.
7612 */
7613 int
7614 dtrace_attached(void)
7615 {
7616 /*
7617 * dtrace_provider will be non-NULL iff the DTrace driver has
7618 * attached. (It's non-NULL because DTrace is always itself a
7619 * provider.)
7620 */
7621 return (dtrace_provider != NULL);
7622 }
7623
7624 /*
7625 * Remove all the unenabled probes for the given provider. This function is
7626 * not unlike dtrace_unregister(), except that it doesn't remove the provider
7627 * -- just as many of its associated probes as it can.
7628 */
7629 int
7630 dtrace_condense(dtrace_provider_id_t id)
7631 {
7632 dtrace_provider_t *prov = (dtrace_provider_t *)id;
7633 int i;
7634 dtrace_probe_t *probe;
7635
7636 /*
7637 * Make sure this isn't the dtrace provider itself.
7638 */
7639 ASSERT(prov->dtpv_pops.dtps_enable !=
7640 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
7641
7642 lck_mtx_lock(&dtrace_provider_lock);
7643 lck_mtx_lock(&dtrace_lock);
7644
7645 /*
7646 * Attempt to destroy the probes associated with this provider.
7647 */
7648 for (i = 0; i < dtrace_nprobes; i++) {
7649 if ((probe = dtrace_probes[i]) == NULL)
7650 continue;
7651
7652 if (probe->dtpr_provider != prov)
7653 continue;
7654
7655 if (probe->dtpr_ecb != NULL)
7656 continue;
7657
7658 dtrace_probes[i] = NULL;
7659 prov->dtpv_probe_count--;
7660
7661 dtrace_hash_remove(dtrace_bymod, probe);
7662 dtrace_hash_remove(dtrace_byfunc, probe);
7663 dtrace_hash_remove(dtrace_byname, probe);
7664
7665 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
7666 probe->dtpr_arg);
7667 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
7668 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
7669 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
7670 zfree(dtrace_probe_t_zone, probe);
7671 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
7672 }
7673
7674 lck_mtx_unlock(&dtrace_lock);
7675 lck_mtx_unlock(&dtrace_provider_lock);
7676
7677 return (0);
7678 }
7679
7680 /*
7681 * DTrace Probe Management Functions
7682 *
7683 * The functions in this section perform the DTrace probe management,
7684 * including functions to create probes, look-up probes, and call into the
7685 * providers to request that probes be provided. Some of these functions are
7686 * in the Provider-to-Framework API; these functions can be identified by the
7687 * fact that they are not declared "static".
7688 */
7689
7690 /*
7691 * Create a probe with the specified module name, function name, and name.
7692 */
7693 dtrace_id_t
7694 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
7695 const char *func, const char *name, int aframes, void *arg)
7696 {
7697 dtrace_probe_t *probe, **probes;
7698 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
7699 dtrace_id_t id;
7700
7701 if (provider == dtrace_provider) {
7702 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7703 } else {
7704 lck_mtx_lock(&dtrace_lock);
7705 }
7706
7707 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
7708 VM_BESTFIT | VM_SLEEP);
7709
7710 probe = zalloc(dtrace_probe_t_zone);
7711 bzero(probe, sizeof (dtrace_probe_t));
7712
7713 probe->dtpr_id = id;
7714 probe->dtpr_gen = dtrace_probegen++;
7715 probe->dtpr_mod = dtrace_strdup(mod);
7716 probe->dtpr_func = dtrace_strdup(func);
7717 probe->dtpr_name = dtrace_strdup(name);
7718 probe->dtpr_arg = arg;
7719 probe->dtpr_aframes = aframes;
7720 probe->dtpr_provider = provider;
7721
7722 dtrace_hash_add(dtrace_bymod, probe);
7723 dtrace_hash_add(dtrace_byfunc, probe);
7724 dtrace_hash_add(dtrace_byname, probe);
7725
7726 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
7727 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
7728 size_t nsize = osize << 1;
7729
7730 if (nsize == 0) {
7731 ASSERT(osize == 0);
7732 ASSERT(dtrace_probes == NULL);
7733 nsize = sizeof (dtrace_probe_t *);
7734 }
7735
7736 probes = kmem_zalloc(nsize, KM_SLEEP);
7737
7738 if (dtrace_probes == NULL) {
7739 ASSERT(osize == 0);
7740 dtrace_probes = probes;
7741 dtrace_nprobes = 1;
7742 } else {
7743 dtrace_probe_t **oprobes = dtrace_probes;
7744
7745 bcopy(oprobes, probes, osize);
7746 dtrace_membar_producer();
7747 dtrace_probes = probes;
7748
7749 dtrace_sync();
7750
7751 /*
7752 * All CPUs are now seeing the new probes array; we can
7753 * safely free the old array.
7754 */
7755 kmem_free(oprobes, osize);
7756 dtrace_nprobes <<= 1;
7757 }
7758
7759 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
7760 }
7761
7762 ASSERT(dtrace_probes[id - 1] == NULL);
7763 dtrace_probes[id - 1] = probe;
7764 provider->dtpv_probe_count++;
7765
7766 if (provider != dtrace_provider)
7767 lck_mtx_unlock(&dtrace_lock);
7768
7769 return (id);
7770 }
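
/*
 * For illustration (hypothetical names): a provider instrumenting a kernel
 * function would typically create an entry probe like so, retaining the
 * returned id to pass to dtrace_probe() when the site fires:
 *
 *	dtrace_id_t id = dtrace_probe_create(my_provider_id, "mach_kernel",
 *	    "some_function", "entry", 0, my_probe_arg);
 */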
7771
7772 static dtrace_probe_t *
7773 dtrace_probe_lookup_id(dtrace_id_t id)
7774 {
7775 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7776
7777 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
7778 return (NULL);
7779
7780 return (dtrace_probes[id - 1]);
7781 }
7782
7783 static int
7784 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
7785 {
7786 *((dtrace_id_t *)arg) = probe->dtpr_id;
7787
7788 return (DTRACE_MATCH_DONE);
7789 }
7790
7791 /*
7792 * Look up a probe based on provider and one or more of module name, function
7793 * name and probe name.
7794 */
7795 dtrace_id_t
7796 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
7797 const char *func, const char *name)
7798 {
7799 dtrace_probekey_t pkey;
7800 dtrace_id_t id;
7801 int match;
7802
7803 pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
7804 pkey.dtpk_pmatch = &dtrace_match_string;
7805 pkey.dtpk_mod = mod;
7806 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
7807 pkey.dtpk_func = func;
7808 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
7809 pkey.dtpk_name = name;
7810 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
7811 pkey.dtpk_id = DTRACE_IDNONE;
7812
7813 lck_mtx_lock(&dtrace_lock);
7814 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
7815 dtrace_probe_lookup_match, &id);
7816 lck_mtx_unlock(&dtrace_lock);
7817
7818 ASSERT(match == 1 || match == 0);
7819 return (match ? id : 0);
7820 }
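
/*
 * Providers commonly use this to avoid creating a probe twice; a sketch with
 * hypothetical names (a return value of 0 means no matching probe exists):
 *
 *	if (dtrace_probe_lookup(my_provider_id, "mach_kernel",
 *	    "some_function", "entry") != 0)
 *		return;
 */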
7821
7822 /*
7823 * Returns the probe argument associated with the specified probe.
7824 */
7825 void *
7826 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
7827 {
7828 dtrace_probe_t *probe;
7829 void *rval = NULL;
7830
7831 lck_mtx_lock(&dtrace_lock);
7832
7833 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
7834 probe->dtpr_provider == (dtrace_provider_t *)id)
7835 rval = probe->dtpr_arg;
7836
7837 lck_mtx_unlock(&dtrace_lock);
7838
7839 return (rval);
7840 }
7841
7842 /*
7843 * Copy a probe into a probe description.
7844 */
7845 static void
7846 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
7847 {
7848 bzero(pdp, sizeof (dtrace_probedesc_t));
7849 pdp->dtpd_id = prp->dtpr_id;
7850
7851 /* APPLE NOTE: Darwin employs size bounded string operation. */
7852 (void) strlcpy(pdp->dtpd_provider,
7853 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
7854
7855 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
7856 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
7857 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
7858 }
7859
7860 /*
7861 * Called to indicate that a probe -- or probes -- should be provided by a
7862 * specified provider. If the specified description is NULL, the provider will
7863 * be told to provide all of its probes. (This is done whenever a new
7864 * consumer comes along, or whenever a retained enabling is to be matched.) If
7865 * the specified description is non-NULL, the provider is given the
7866 * opportunity to dynamically provide the specified probe, allowing providers
7867 * to support the creation of probes on-the-fly. (So-called _autocreated_
7868 * probes.) If the provider is NULL, the operations will be applied to all
7869 * providers; if the provider is non-NULL the operations will only be applied
7870 * to the specified provider. The dtrace_provider_lock must be held, and the
7871 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
7872 * will need to grab the dtrace_lock when it reenters the framework through
7873 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
7874 */
7875 static void
7876 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
7877 {
7878 struct modctl *ctl;
7879 int all = 0;
7880
7881 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7882
7883 if (prv == NULL) {
7884 all = 1;
7885 prv = dtrace_provider;
7886 }
7887
7888 do {
7889 /*
7890 * First, call the blanket provide operation.
7891 */
7892 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
7893
7894 /*
7895 * Now call the per-module provide operation. We will grab
7896 * mod_lock to prevent the list from being modified. Note
7897 * that this also prevents the mod_busy bits from changing.
7898 * (mod_busy can only be changed with mod_lock held.)
7899 */
7900 lck_mtx_lock(&mod_lock);
7901
7902 ctl = dtrace_modctl_list;
7903 while (ctl) {
7904 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
7905 ctl = ctl->mod_next;
7906 }
7907
7908 lck_mtx_unlock(&mod_lock);
7909 } while (all && (prv = prv->dtpv_next) != NULL);
7910 }
7911
7912 /*
7913 * Iterate over each probe, and call the Framework-to-Provider API function
7914 * denoted by offs.
7915 */
7916 static void
7917 dtrace_probe_foreach(uintptr_t offs)
7918 {
7919 dtrace_provider_t *prov;
7920 void (*func)(void *, dtrace_id_t, void *);
7921 dtrace_probe_t *probe;
7922 dtrace_icookie_t cookie;
7923 int i;
7924
7925 /*
7926 * We disable interrupts to walk through the probe array. This is
7927 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
7928 * won't see stale data.
7929 */
7930 cookie = dtrace_interrupt_disable();
7931
7932 for (i = 0; i < dtrace_nprobes; i++) {
7933 if ((probe = dtrace_probes[i]) == NULL)
7934 continue;
7935
7936 if (probe->dtpr_ecb == NULL) {
7937 /*
7938 * This probe isn't enabled -- don't call the function.
7939 */
7940 continue;
7941 }
7942
7943 prov = probe->dtpr_provider;
7944 func = *((void(**)(void *, dtrace_id_t, void *))
7945 ((uintptr_t)&prov->dtpv_pops + offs));
7946
7947 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
7948 }
7949
7950 dtrace_interrupt_enable(cookie);
7951 }
7952
7953 static int
7954 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
7955 {
7956 dtrace_probekey_t pkey;
7957 uint32_t priv;
7958 uid_t uid;
7959 zoneid_t zoneid;
7960
7961 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7962
7963 dtrace_ecb_create_cache = NULL;
7964
7965 if (desc == NULL) {
7966 /*
7967 * If we're passed a NULL description, we're being asked to
7968 * create an ECB with a NULL probe.
7969 */
7970 (void) dtrace_ecb_create_enable(NULL, enab);
7971 return (0);
7972 }
7973
7974 dtrace_probekey(desc, &pkey);
7975 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
7976 &priv, &uid, &zoneid);
7977
7978 return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
7979 enab));
7980 }
7981
7982 /*
7983 * DTrace Helper Provider Functions
7984 */
7985 static void
7986 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
7987 {
7988 attr->dtat_name = DOF_ATTR_NAME(dofattr);
7989 attr->dtat_data = DOF_ATTR_DATA(dofattr);
7990 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
7991 }
7992
7993 static void
7994 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
7995 const dof_provider_t *dofprov, char *strtab)
7996 {
7997 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
7998 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
7999 dofprov->dofpv_provattr);
8000 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8001 dofprov->dofpv_modattr);
8002 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8003 dofprov->dofpv_funcattr);
8004 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8005 dofprov->dofpv_nameattr);
8006 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8007 dofprov->dofpv_argsattr);
8008 }
8009
8010 static void
8011 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8012 {
8013 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8014 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8015 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8016 dof_provider_t *provider;
8017 dof_probe_t *probe;
8018 uint32_t *off, *enoff;
8019 uint8_t *arg;
8020 char *strtab;
8021 uint_t i, nprobes;
8022 dtrace_helper_provdesc_t dhpv;
8023 dtrace_helper_probedesc_t dhpb;
8024 dtrace_meta_t *meta = dtrace_meta_pid;
8025 dtrace_mops_t *mops = &meta->dtm_mops;
8026 void *parg;
8027
8028 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8029 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8030 provider->dofpv_strtab * dof->dofh_secsize);
8031 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8032 provider->dofpv_probes * dof->dofh_secsize);
8033 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8034 provider->dofpv_prargs * dof->dofh_secsize);
8035 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8036 provider->dofpv_proffs * dof->dofh_secsize);
8037
8038 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8039 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8040 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8041 enoff = NULL;
8042
8043 /*
8044 * See dtrace_helper_provider_validate().
8045 */
8046 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8047 provider->dofpv_prenoffs != DOF_SECT_NONE) {
8048 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8049 provider->dofpv_prenoffs * dof->dofh_secsize);
8050 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8051 }
8052
8053 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8054
8055 /*
8056 * Create the provider.
8057 */
8058 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8059
8060 if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8061 return;
8062
8063 meta->dtm_count++;
8064
8065 /*
8066 * Create the probes.
8067 */
8068 for (i = 0; i < nprobes; i++) {
8069 probe = (dof_probe_t *)(uintptr_t)(daddr +
8070 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8071
8072 dhpb.dthpb_mod = dhp->dofhp_mod;
8073 dhpb.dthpb_func = strtab + probe->dofpr_func;
8074 dhpb.dthpb_name = strtab + probe->dofpr_name;
8075 #if !defined(__APPLE__)
8076 dhpb.dthpb_base = probe->dofpr_addr;
8077 #else
8078 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8079 #endif
8080 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8081 dhpb.dthpb_noffs = probe->dofpr_noffs;
8082 if (enoff != NULL) {
8083 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8084 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8085 } else {
8086 dhpb.dthpb_enoffs = NULL;
8087 dhpb.dthpb_nenoffs = 0;
8088 }
8089 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8090 dhpb.dthpb_nargc = probe->dofpr_nargc;
8091 dhpb.dthpb_xargc = probe->dofpr_xargc;
8092 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8093 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8094
8095 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8096 }
8097 }
8098
8099 static void
8100 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8101 {
8102 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8103 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8104 uint32_t i;
8105
8106 lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8107
8108 for (i = 0; i < dof->dofh_secnum; i++) {
8109 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8110 dof->dofh_secoff + i * dof->dofh_secsize);
8111
8112 if (sec->dofs_type != DOF_SECT_PROVIDER)
8113 continue;
8114
8115 dtrace_helper_provide_one(dhp, sec, pid);
8116 }
8117
8118 /*
8119 * We may have just created probes, so we must now rematch against
8120 * any retained enablings. Note that this call will acquire both
8121 * cpu_lock and dtrace_lock; the fact that we are holding
8122 * dtrace_meta_lock now is what defines the ordering with respect to
8123 * these three locks.
8124 */
8125 dtrace_enabling_matchall();
8126 }
8127
8128 static void
8129 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8130 {
8131 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8132 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8133 dof_sec_t *str_sec;
8134 dof_provider_t *provider;
8135 char *strtab;
8136 dtrace_helper_provdesc_t dhpv;
8137 dtrace_meta_t *meta = dtrace_meta_pid;
8138 dtrace_mops_t *mops = &meta->dtm_mops;
8139
8140 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8141 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8142 provider->dofpv_strtab * dof->dofh_secsize);
8143
8144 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8145
8146 /*
8147 * Create the provider.
8148 */
8149 dtrace_dofprov2hprov(&dhpv, provider, strtab);
8150
8151 mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8152
8153 meta->dtm_count--;
8154 }
8155
8156 static void
8157 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8158 {
8159 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8160 dof_hdr_t *dof = (dof_hdr_t *)daddr;
8161 uint32_t i;
8162
8163 lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8164
8165 for (i = 0; i < dof->dofh_secnum; i++) {
8166 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8167 dof->dofh_secoff + i * dof->dofh_secsize);
8168
8169 if (sec->dofs_type != DOF_SECT_PROVIDER)
8170 continue;
8171
8172 dtrace_helper_provider_remove_one(dhp, sec, pid);
8173 }
8174 }
8175
8176 /*
8177 * DTrace Meta Provider-to-Framework API Functions
8178 *
8179 * These functions implement the Meta Provider-to-Framework API, as described
8180 * in <sys/dtrace.h>.
8181 */
8182 int
8183 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8184 dtrace_meta_provider_id_t *idp)
8185 {
8186 dtrace_meta_t *meta;
8187 dtrace_helpers_t *help, *next;
8188 uint_t i;
8189
8190 *idp = DTRACE_METAPROVNONE;
8191
8192 /*
8193 * We strictly don't need the name, but we hold onto it for
8194 * debuggability. All hail error queues!
8195 */
8196 if (name == NULL) {
8197 cmn_err(CE_WARN, "failed to register meta-provider: "
8198 "invalid name");
8199 return (EINVAL);
8200 }
8201
8202 if (mops == NULL ||
8203 mops->dtms_create_probe == NULL ||
8204 mops->dtms_provide_pid == NULL ||
8205 mops->dtms_remove_pid == NULL) {
8206 cmn_err(CE_WARN, "failed to register meta-register %s: "
8207 "invalid ops", name);
8208 return (EINVAL);
8209 }
8210
8211 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8212 meta->dtm_mops = *mops;
8213
8214 /* APPLE NOTE: Darwin employs size bounded string operation. */
8215 {
8216 size_t bufsize = strlen(name) + 1;
8217 meta->dtm_name = kmem_alloc(bufsize, KM_SLEEP);
8218 (void) strlcpy(meta->dtm_name, name, bufsize);
8219 }
8220
8221 meta->dtm_arg = arg;
8222
8223 lck_mtx_lock(&dtrace_meta_lock);
8224 lck_mtx_lock(&dtrace_lock);
8225
8226 if (dtrace_meta_pid != NULL) {
8227 lck_mtx_unlock(&dtrace_lock);
8228 lck_mtx_unlock(&dtrace_meta_lock);
8229 cmn_err(CE_WARN, "failed to register meta-provider %s: "
8230 "user-land meta-provider exists", name);
8231 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8232 kmem_free(meta, sizeof (dtrace_meta_t));
8233 return (EINVAL);
8234 }
8235
8236 dtrace_meta_pid = meta;
8237 *idp = (dtrace_meta_provider_id_t)meta;
8238
8239 /*
8240 * If there are providers and probes ready to go, pass them
8241 * off to the new meta provider now.
8242 */
8243
8244 help = dtrace_deferred_pid;
8245 dtrace_deferred_pid = NULL;
8246
8247 lck_mtx_unlock(&dtrace_lock);
8248
8249 while (help != NULL) {
8250 for (i = 0; i < help->dthps_nprovs; i++) {
8251 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8252 help->dthps_pid);
8253 }
8254
8255 next = help->dthps_next;
8256 help->dthps_next = NULL;
8257 help->dthps_prev = NULL;
8258 help->dthps_deferred = 0;
8259 help = next;
8260 }
8261
8262 lck_mtx_unlock(&dtrace_meta_lock);
8263
8264 return (0);
8265 }
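/*
 * Editorial note: a hedged sketch of the expected call from a user-land
 * meta-provider; the names are hypothetical. The ops vector must supply at
 * least dtms_create_probe, dtms_provide_pid and dtms_remove_pid, or the
 * registration fails with EINVAL as checked above.
 *
 *	static dtrace_mops_t example_mops = {
 *		.dtms_create_probe = example_create_probe,
 *		.dtms_provide_pid = example_provide_pid,
 *		.dtms_remove_pid = example_remove_pid
 *	};
 *	static dtrace_meta_provider_id_t example_meta_id;
 *
 *	int error = dtrace_meta_register("example", &example_mops,
 *	    NULL, &example_meta_id);
 *
 * Only one user-land meta-provider may be registered at a time; a second
 * registration returns EINVAL, and dtrace_meta_unregister() returns EBUSY
 * until the meta-provider's dtm_count drops to zero.
 */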
8266
8267 int
8268 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8269 {
8270 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8271
8272 lck_mtx_lock(&dtrace_meta_lock);
8273 lck_mtx_lock(&dtrace_lock);
8274
8275 if (old == dtrace_meta_pid) {
8276 pp = &dtrace_meta_pid;
8277 } else {
8278 panic("attempt to unregister non-existent "
8279 "dtrace meta-provider %p\n", (void *)old);
8280 }
8281
8282 if (old->dtm_count != 0) {
8283 lck_mtx_unlock(&dtrace_lock);
8284 lck_mtx_unlock(&dtrace_meta_lock);
8285 return (EBUSY);
8286 }
8287
8288 *pp = NULL;
8289
8290 lck_mtx_unlock(&dtrace_lock);
8291 lck_mtx_unlock(&dtrace_meta_lock);
8292
8293 kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
8294 kmem_free(old, sizeof (dtrace_meta_t));
8295
8296 return (0);
8297 }
8298
8299
8300 /*
8301 * DTrace DIF Object Functions
8302 */
8303 static int
8304 dtrace_difo_err(uint_t pc, const char *format, ...)
8305 {
8306 if (dtrace_err_verbose) {
8307 va_list alist;
8308
8309 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8310 va_start(alist, format);
8311 (void) vuprintf(format, alist);
8312 va_end(alist);
8313 }
8314
8315 #ifdef DTRACE_ERRDEBUG
8316 dtrace_errdebug(format);
8317 #endif
8318 return (1);
8319 }
8320
8321 /*
8322 * Validate a DTrace DIF object by checking the IR instructions. The following
8323 * rules are currently enforced by dtrace_difo_validate():
8324 *
8325 * 1. Each instruction must have a valid opcode
8326 * 2. Each register, string, variable, or subroutine reference must be valid
8327 * 3. No instruction can modify register %r0 (must be zero)
8328 * 4. All instruction reserved bits must be set to zero
8329 * 5. The last instruction must be a "ret" instruction
8330 * 6. All branch targets must reference a valid instruction _after_ the branch
8331 */
8332 static int
8333 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8334 cred_t *cr)
8335 {
8336 int err = 0;
8337 uint_t i;
8338
8339 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8340 int kcheckload;
8341 uint_t pc;
8342
8343 kcheckload = cr == NULL ||
8344 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8345
8346 dp->dtdo_destructive = 0;
8347
8348 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8349 dif_instr_t instr = dp->dtdo_buf[pc];
8350
8351 uint_t r1 = DIF_INSTR_R1(instr);
8352 uint_t r2 = DIF_INSTR_R2(instr);
8353 uint_t rd = DIF_INSTR_RD(instr);
8354 uint_t rs = DIF_INSTR_RS(instr);
8355 uint_t label = DIF_INSTR_LABEL(instr);
8356 uint_t v = DIF_INSTR_VAR(instr);
8357 uint_t subr = DIF_INSTR_SUBR(instr);
8358 uint_t type = DIF_INSTR_TYPE(instr);
8359 uint_t op = DIF_INSTR_OP(instr);
8360
8361 switch (op) {
8362 case DIF_OP_OR:
8363 case DIF_OP_XOR:
8364 case DIF_OP_AND:
8365 case DIF_OP_SLL:
8366 case DIF_OP_SRL:
8367 case DIF_OP_SRA:
8368 case DIF_OP_SUB:
8369 case DIF_OP_ADD:
8370 case DIF_OP_MUL:
8371 case DIF_OP_SDIV:
8372 case DIF_OP_UDIV:
8373 case DIF_OP_SREM:
8374 case DIF_OP_UREM:
8375 case DIF_OP_COPYS:
8376 if (r1 >= nregs)
8377 err += efunc(pc, "invalid register %u\n", r1);
8378 if (r2 >= nregs)
8379 err += efunc(pc, "invalid register %u\n", r2);
8380 if (rd >= nregs)
8381 err += efunc(pc, "invalid register %u\n", rd);
8382 if (rd == 0)
8383 err += efunc(pc, "cannot write to %r0\n");
8384 break;
8385 case DIF_OP_NOT:
8386 case DIF_OP_MOV:
8387 case DIF_OP_ALLOCS:
8388 if (r1 >= nregs)
8389 err += efunc(pc, "invalid register %u\n", r1);
8390 if (r2 != 0)
8391 err += efunc(pc, "non-zero reserved bits\n");
8392 if (rd >= nregs)
8393 err += efunc(pc, "invalid register %u\n", rd);
8394 if (rd == 0)
8395 err += efunc(pc, "cannot write to %r0\n");
8396 break;
8397 case DIF_OP_LDSB:
8398 case DIF_OP_LDSH:
8399 case DIF_OP_LDSW:
8400 case DIF_OP_LDUB:
8401 case DIF_OP_LDUH:
8402 case DIF_OP_LDUW:
8403 case DIF_OP_LDX:
8404 if (r1 >= nregs)
8405 err += efunc(pc, "invalid register %u\n", r1);
8406 if (r2 != 0)
8407 err += efunc(pc, "non-zero reserved bits\n");
8408 if (rd >= nregs)
8409 err += efunc(pc, "invalid register %u\n", rd);
8410 if (rd == 0)
8411 err += efunc(pc, "cannot write to %r0\n");
8412 if (kcheckload)
8413 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
8414 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
8415 break;
8416 case DIF_OP_RLDSB:
8417 case DIF_OP_RLDSH:
8418 case DIF_OP_RLDSW:
8419 case DIF_OP_RLDUB:
8420 case DIF_OP_RLDUH:
8421 case DIF_OP_RLDUW:
8422 case DIF_OP_RLDX:
8423 if (r1 >= nregs)
8424 err += efunc(pc, "invalid register %u\n", r1);
8425 if (r2 != 0)
8426 err += efunc(pc, "non-zero reserved bits\n");
8427 if (rd >= nregs)
8428 err += efunc(pc, "invalid register %u\n", rd);
8429 if (rd == 0)
8430 err += efunc(pc, "cannot write to %r0\n");
8431 break;
8432 case DIF_OP_ULDSB:
8433 case DIF_OP_ULDSH:
8434 case DIF_OP_ULDSW:
8435 case DIF_OP_ULDUB:
8436 case DIF_OP_ULDUH:
8437 case DIF_OP_ULDUW:
8438 case DIF_OP_ULDX:
8439 if (r1 >= nregs)
8440 err += efunc(pc, "invalid register %u\n", r1);
8441 if (r2 != 0)
8442 err += efunc(pc, "non-zero reserved bits\n");
8443 if (rd >= nregs)
8444 err += efunc(pc, "invalid register %u\n", rd);
8445 if (rd == 0)
8446 err += efunc(pc, "cannot write to %r0\n");
8447 break;
8448 case DIF_OP_STB:
8449 case DIF_OP_STH:
8450 case DIF_OP_STW:
8451 case DIF_OP_STX:
8452 if (r1 >= nregs)
8453 err += efunc(pc, "invalid register %u\n", r1);
8454 if (r2 != 0)
8455 err += efunc(pc, "non-zero reserved bits\n");
8456 if (rd >= nregs)
8457 err += efunc(pc, "invalid register %u\n", rd);
8458 if (rd == 0)
8459 err += efunc(pc, "cannot write to 0 address\n");
8460 break;
8461 case DIF_OP_CMP:
8462 case DIF_OP_SCMP:
8463 if (r1 >= nregs)
8464 err += efunc(pc, "invalid register %u\n", r1);
8465 if (r2 >= nregs)
8466 err += efunc(pc, "invalid register %u\n", r2);
8467 if (rd != 0)
8468 err += efunc(pc, "non-zero reserved bits\n");
8469 break;
8470 case DIF_OP_TST:
8471 if (r1 >= nregs)
8472 err += efunc(pc, "invalid register %u\n", r1);
8473 if (r2 != 0 || rd != 0)
8474 err += efunc(pc, "non-zero reserved bits\n");
8475 break;
8476 case DIF_OP_BA:
8477 case DIF_OP_BE:
8478 case DIF_OP_BNE:
8479 case DIF_OP_BG:
8480 case DIF_OP_BGU:
8481 case DIF_OP_BGE:
8482 case DIF_OP_BGEU:
8483 case DIF_OP_BL:
8484 case DIF_OP_BLU:
8485 case DIF_OP_BLE:
8486 case DIF_OP_BLEU:
8487 if (label >= dp->dtdo_len) {
8488 err += efunc(pc, "invalid branch target %u\n",
8489 label);
8490 }
8491 if (label <= pc) {
8492 err += efunc(pc, "backward branch to %u\n",
8493 label);
8494 }
8495 break;
8496 case DIF_OP_RET:
8497 if (r1 != 0 || r2 != 0)
8498 err += efunc(pc, "non-zero reserved bits\n");
8499 if (rd >= nregs)
8500 err += efunc(pc, "invalid register %u\n", rd);
8501 break;
8502 case DIF_OP_NOP:
8503 case DIF_OP_POPTS:
8504 case DIF_OP_FLUSHTS:
8505 if (r1 != 0 || r2 != 0 || rd != 0)
8506 err += efunc(pc, "non-zero reserved bits\n");
8507 break;
8508 case DIF_OP_SETX:
8509 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
8510 err += efunc(pc, "invalid integer ref %u\n",
8511 DIF_INSTR_INTEGER(instr));
8512 }
8513 if (rd >= nregs)
8514 err += efunc(pc, "invalid register %u\n", rd);
8515 if (rd == 0)
8516 err += efunc(pc, "cannot write to %r0\n");
8517 break;
8518 case DIF_OP_SETS:
8519 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
8520 err += efunc(pc, "invalid string ref %u\n",
8521 DIF_INSTR_STRING(instr));
8522 }
8523 if (rd >= nregs)
8524 err += efunc(pc, "invalid register %u\n", rd);
8525 if (rd == 0)
8526 err += efunc(pc, "cannot write to %r0\n");
8527 break;
8528 case DIF_OP_LDGA:
8529 case DIF_OP_LDTA:
8530 if (r1 > DIF_VAR_ARRAY_MAX)
8531 err += efunc(pc, "invalid array %u\n", r1);
8532 if (r2 >= nregs)
8533 err += efunc(pc, "invalid register %u\n", r2);
8534 if (rd >= nregs)
8535 err += efunc(pc, "invalid register %u\n", rd);
8536 if (rd == 0)
8537 err += efunc(pc, "cannot write to %r0\n");
8538 break;
8539 case DIF_OP_LDGS:
8540 case DIF_OP_LDTS:
8541 case DIF_OP_LDLS:
8542 case DIF_OP_LDGAA:
8543 case DIF_OP_LDTAA:
8544 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
8545 err += efunc(pc, "invalid variable %u\n", v);
8546 if (rd >= nregs)
8547 err += efunc(pc, "invalid register %u\n", rd);
8548 if (rd == 0)
8549 err += efunc(pc, "cannot write to %r0\n");
8550 break;
8551 case DIF_OP_STGS:
8552 case DIF_OP_STTS:
8553 case DIF_OP_STLS:
8554 case DIF_OP_STGAA:
8555 case DIF_OP_STTAA:
8556 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
8557 err += efunc(pc, "invalid variable %u\n", v);
8558 if (rs >= nregs)
8559 err += efunc(pc, "invalid register %u\n", rd);
8560 break;
8561 case DIF_OP_CALL:
8562 if (subr > DIF_SUBR_MAX)
8563 err += efunc(pc, "invalid subr %u\n", subr);
8564 if (rd >= nregs)
8565 err += efunc(pc, "invalid register %u\n", rd);
8566 if (rd == 0)
8567 err += efunc(pc, "cannot write to %r0\n");
8568
8569 if (subr == DIF_SUBR_COPYOUT ||
8570 subr == DIF_SUBR_COPYOUTSTR) {
8571 dp->dtdo_destructive = 1;
8572 }
8573 break;
8574 case DIF_OP_PUSHTR:
8575 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
8576 err += efunc(pc, "invalid ref type %u\n", type);
8577 if (r2 >= nregs)
8578 err += efunc(pc, "invalid register %u\n", r2);
8579 if (rs >= nregs)
8580 err += efunc(pc, "invalid register %u\n", rs);
8581 break;
8582 case DIF_OP_PUSHTV:
8583 if (type != DIF_TYPE_CTF)
8584 err += efunc(pc, "invalid val type %u\n", type);
8585 if (r2 >= nregs)
8586 err += efunc(pc, "invalid register %u\n", r2);
8587 if (rs >= nregs)
8588 err += efunc(pc, "invalid register %u\n", rs);
8589 break;
8590 default:
8591 err += efunc(pc, "invalid opcode %u\n",
8592 DIF_INSTR_OP(instr));
8593 }
8594 }
8595
8596 if (dp->dtdo_len != 0 &&
8597 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
8598 err += efunc(dp->dtdo_len - 1,
8599 "expected 'ret' as last DIF instruction\n");
8600 }
8601
8602 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
8603 /*
8604 * If we're not returning by reference, the size must be either
8605 * 0 or the size of one of the base types.
8606 */
8607 switch (dp->dtdo_rtype.dtdt_size) {
8608 case 0:
8609 case sizeof (uint8_t):
8610 case sizeof (uint16_t):
8611 case sizeof (uint32_t):
8612 case sizeof (uint64_t):
8613 break;
8614
8615 default:
8616 err += efunc(dp->dtdo_len - 1, "bad return size\n");
8617 }
8618 }
8619
8620 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
8621 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
8622 dtrace_diftype_t *vt, *et;
8623 uint_t id;
8624 int ndx;
8625
8626 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
8627 v->dtdv_scope != DIFV_SCOPE_THREAD &&
8628 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
8629 err += efunc(i, "unrecognized variable scope %d\n",
8630 v->dtdv_scope);
8631 break;
8632 }
8633
8634 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
8635 v->dtdv_kind != DIFV_KIND_SCALAR) {
8636 err += efunc(i, "unrecognized variable type %d\n",
8637 v->dtdv_kind);
8638 break;
8639 }
8640
8641 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
8642 err += efunc(i, "%d exceeds variable id limit\n", id);
8643 break;
8644 }
8645
8646 if (id < DIF_VAR_OTHER_UBASE)
8647 continue;
8648
8649 /*
8650 * For user-defined variables, we need to check that this
8651 * definition is identical to any previous definition that we
8652 * encountered.
8653 */
8654 ndx = id - DIF_VAR_OTHER_UBASE;
8655
8656 switch (v->dtdv_scope) {
8657 case DIFV_SCOPE_GLOBAL:
8658 if (ndx < vstate->dtvs_nglobals) {
8659 dtrace_statvar_t *svar;
8660
8661 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
8662 existing = &svar->dtsv_var;
8663 }
8664
8665 break;
8666
8667 case DIFV_SCOPE_THREAD:
8668 if (ndx < vstate->dtvs_ntlocals)
8669 existing = &vstate->dtvs_tlocals[ndx];
8670 break;
8671
8672 case DIFV_SCOPE_LOCAL:
8673 if (ndx < vstate->dtvs_nlocals) {
8674 dtrace_statvar_t *svar;
8675
8676 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
8677 existing = &svar->dtsv_var;
8678 }
8679
8680 break;
8681 }
8682
8683 vt = &v->dtdv_type;
8684
8685 if (vt->dtdt_flags & DIF_TF_BYREF) {
8686 if (vt->dtdt_size == 0) {
8687 err += efunc(i, "zero-sized variable\n");
8688 break;
8689 }
8690
8691 if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
8692 vt->dtdt_size > dtrace_global_maxsize) {
8693 err += efunc(i, "oversized by-ref global\n");
8694 break;
8695 }
8696 }
8697
8698 if (existing == NULL || existing->dtdv_id == 0)
8699 continue;
8700
8701 ASSERT(existing->dtdv_id == v->dtdv_id);
8702 ASSERT(existing->dtdv_scope == v->dtdv_scope);
8703
8704 if (existing->dtdv_kind != v->dtdv_kind)
8705 err += efunc(i, "%d changed variable kind\n", id);
8706
8707 et = &existing->dtdv_type;
8708
8709 if (vt->dtdt_flags != et->dtdt_flags) {
8710 err += efunc(i, "%d changed variable type flags\n", id);
8711 break;
8712 }
8713
8714 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
8715 err += efunc(i, "%d changed variable type size\n", id);
8716 break;
8717 }
8718 }
8719
8720 return (err);
8721 }
8722
8723 /*
8724 * Validate a DTrace DIF object that is to be used as a helper. Helpers
8725 * are much more constrained than normal DIFOs. Specifically, they may
8726 * not:
8727 *
8728 * 1. Make calls to subroutines other than copyin(), copyinstr() or
8729 * miscellaneous string routines.
8730 * 2. Access DTrace variables other than the args[] array, and the
8731 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
8732 * 3. Have thread-local variables.
8733 * 4. Have dynamic variables.
8734 */
8735 static int
8736 dtrace_difo_validate_helper(dtrace_difo_t *dp)
8737 {
8738 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8739 int err = 0;
8740 uint_t pc;
8741
8742 for (pc = 0; pc < dp->dtdo_len; pc++) {
8743 dif_instr_t instr = dp->dtdo_buf[pc];
8744
8745 uint_t v = DIF_INSTR_VAR(instr);
8746 uint_t subr = DIF_INSTR_SUBR(instr);
8747 uint_t op = DIF_INSTR_OP(instr);
8748
8749 switch (op) {
8750 case DIF_OP_OR:
8751 case DIF_OP_XOR:
8752 case DIF_OP_AND:
8753 case DIF_OP_SLL:
8754 case DIF_OP_SRL:
8755 case DIF_OP_SRA:
8756 case DIF_OP_SUB:
8757 case DIF_OP_ADD:
8758 case DIF_OP_MUL:
8759 case DIF_OP_SDIV:
8760 case DIF_OP_UDIV:
8761 case DIF_OP_SREM:
8762 case DIF_OP_UREM:
8763 case DIF_OP_COPYS:
8764 case DIF_OP_NOT:
8765 case DIF_OP_MOV:
8766 case DIF_OP_RLDSB:
8767 case DIF_OP_RLDSH:
8768 case DIF_OP_RLDSW:
8769 case DIF_OP_RLDUB:
8770 case DIF_OP_RLDUH:
8771 case DIF_OP_RLDUW:
8772 case DIF_OP_RLDX:
8773 case DIF_OP_ULDSB:
8774 case DIF_OP_ULDSH:
8775 case DIF_OP_ULDSW:
8776 case DIF_OP_ULDUB:
8777 case DIF_OP_ULDUH:
8778 case DIF_OP_ULDUW:
8779 case DIF_OP_ULDX:
8780 case DIF_OP_STB:
8781 case DIF_OP_STH:
8782 case DIF_OP_STW:
8783 case DIF_OP_STX:
8784 case DIF_OP_ALLOCS:
8785 case DIF_OP_CMP:
8786 case DIF_OP_SCMP:
8787 case DIF_OP_TST:
8788 case DIF_OP_BA:
8789 case DIF_OP_BE:
8790 case DIF_OP_BNE:
8791 case DIF_OP_BG:
8792 case DIF_OP_BGU:
8793 case DIF_OP_BGE:
8794 case DIF_OP_BGEU:
8795 case DIF_OP_BL:
8796 case DIF_OP_BLU:
8797 case DIF_OP_BLE:
8798 case DIF_OP_BLEU:
8799 case DIF_OP_RET:
8800 case DIF_OP_NOP:
8801 case DIF_OP_POPTS:
8802 case DIF_OP_FLUSHTS:
8803 case DIF_OP_SETX:
8804 case DIF_OP_SETS:
8805 case DIF_OP_LDGA:
8806 case DIF_OP_LDLS:
8807 case DIF_OP_STGS:
8808 case DIF_OP_STLS:
8809 case DIF_OP_PUSHTR:
8810 case DIF_OP_PUSHTV:
8811 break;
8812
8813 case DIF_OP_LDGS:
8814 if (v >= DIF_VAR_OTHER_UBASE)
8815 break;
8816
8817 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
8818 break;
8819
8820 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
8821 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
8822 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
8823 v == DIF_VAR_UID || v == DIF_VAR_GID)
8824 break;
8825
8826 err += efunc(pc, "illegal variable %u\n", v);
8827 break;
8828
8829 case DIF_OP_LDTA:
8830 case DIF_OP_LDTS:
8831 case DIF_OP_LDGAA:
8832 case DIF_OP_LDTAA:
8833 err += efunc(pc, "illegal dynamic variable load\n");
8834 break;
8835
8836 case DIF_OP_STTS:
8837 case DIF_OP_STGAA:
8838 case DIF_OP_STTAA:
8839 err += efunc(pc, "illegal dynamic variable store\n");
8840 break;
8841
8842 case DIF_OP_CALL:
8843 if (subr == DIF_SUBR_ALLOCA ||
8844 subr == DIF_SUBR_BCOPY ||
8845 subr == DIF_SUBR_COPYIN ||
8846 subr == DIF_SUBR_COPYINTO ||
8847 subr == DIF_SUBR_COPYINSTR ||
8848 subr == DIF_SUBR_INDEX ||
8849 subr == DIF_SUBR_INET_NTOA ||
8850 subr == DIF_SUBR_INET_NTOA6 ||
8851 subr == DIF_SUBR_INET_NTOP ||
8852 subr == DIF_SUBR_LLTOSTR ||
8853 subr == DIF_SUBR_RINDEX ||
8854 subr == DIF_SUBR_STRCHR ||
8855 subr == DIF_SUBR_STRJOIN ||
8856 subr == DIF_SUBR_STRRCHR ||
8857 subr == DIF_SUBR_STRSTR ||
8858 subr == DIF_SUBR_COREPROFILE ||
8859 subr == DIF_SUBR_HTONS ||
8860 subr == DIF_SUBR_HTONL ||
8861 subr == DIF_SUBR_HTONLL ||
8862 subr == DIF_SUBR_NTOHS ||
8863 subr == DIF_SUBR_NTOHL ||
8864 subr == DIF_SUBR_NTOHLL)
8865 break;
8866
8867 err += efunc(pc, "invalid subr %u\n", subr);
8868 break;
8869
8870 default:
8871 err += efunc(pc, "invalid opcode %u\n",
8872 DIF_INSTR_OP(instr));
8873 }
8874 }
8875
8876 return (err);
8877 }
8878
8879 /*
8880 * Returns 1 if the expression in the DIF object can be cached on a per-thread
8881 * basis; 0 if not.
8882 */
8883 static int
8884 dtrace_difo_cacheable(dtrace_difo_t *dp)
8885 {
8886 uint_t i;
8887
8888 if (dp == NULL)
8889 return (0);
8890
8891 for (i = 0; i < dp->dtdo_varlen; i++) {
8892 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8893
8894 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
8895 continue;
8896
8897 switch (v->dtdv_id) {
8898 case DIF_VAR_CURTHREAD:
8899 case DIF_VAR_PID:
8900 case DIF_VAR_TID:
8901 case DIF_VAR_EXECNAME:
8902 case DIF_VAR_ZONENAME:
8903 break;
8904
8905 default:
8906 return (0);
8907 }
8908 }
8909
8910 /*
8911 * This DIF object may be cacheable. Now we need to look for any
8912 * array loading instructions, any memory loading instructions, or
8913 * any stores to thread-local variables.
8914 */
8915 for (i = 0; i < dp->dtdo_len; i++) {
8916 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
8917
8918 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
8919 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
8920 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
8921 op == DIF_OP_LDGA || op == DIF_OP_STTS)
8922 return (0);
8923 }
8924
8925 return (1);
8926 }
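/*
 * Editorial note (hedged example, not part of the original source): a
 * predicate such as
 *
 *	/ execname == "launchd" /
 *
 * references only the thread-invariant variables enumerated above and
 * contains no loads, so its result may be cached per-thread. A predicate
 * such as / arg0 == 0 /, or one that dereferences memory, can evaluate
 * differently on every firing, so dtrace_difo_cacheable() returns 0 for it.
 */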
8927
8928 static void
8929 dtrace_difo_hold(dtrace_difo_t *dp)
8930 {
8931 uint_t i;
8932
8933 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8934
8935 dp->dtdo_refcnt++;
8936 ASSERT(dp->dtdo_refcnt != 0);
8937
8938 /*
8939 * We need to check this DIF object for references to the variable
8940 * DIF_VAR_VTIMESTAMP.
8941 */
8942 for (i = 0; i < dp->dtdo_varlen; i++) {
8943 dtrace_difv_t *v = &dp->dtdo_vartab[i];
8944
8945 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
8946 continue;
8947
8948 if (dtrace_vtime_references++ == 0)
8949 dtrace_vtime_enable();
8950 }
8951 }
8952
8953 /*
8954 * This routine calculates the dynamic variable chunksize for a given DIF
8955 * object. The calculation is not fool-proof, and can probably be tricked by
8956 * malicious DIF -- but it works for all compiler-generated DIF. Because this
8957 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
8958 * if a dynamic variable size exceeds the chunksize.
8959 */
8960 static void
8961 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
8962 {
8963 uint64_t sval = 0;
8964 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
8965 const dif_instr_t *text = dp->dtdo_buf;
8966 uint_t pc, srd = 0;
8967 uint_t ttop = 0;
8968 size_t size, ksize;
8969 uint_t id, i;
8970
8971 for (pc = 0; pc < dp->dtdo_len; pc++) {
8972 dif_instr_t instr = text[pc];
8973 uint_t op = DIF_INSTR_OP(instr);
8974 uint_t rd = DIF_INSTR_RD(instr);
8975 uint_t r1 = DIF_INSTR_R1(instr);
8976 uint_t nkeys = 0;
8977 uchar_t scope;
8978
8979 dtrace_key_t *key = tupregs;
8980
8981 switch (op) {
8982 case DIF_OP_SETX:
8983 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
8984 srd = rd;
8985 continue;
8986
8987 case DIF_OP_STTS:
8988 key = &tupregs[DIF_DTR_NREGS];
8989 key[0].dttk_size = 0;
8990 key[1].dttk_size = 0;
8991 nkeys = 2;
8992 scope = DIFV_SCOPE_THREAD;
8993 break;
8994
8995 case DIF_OP_STGAA:
8996 case DIF_OP_STTAA:
8997 nkeys = ttop;
8998
8999 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9000 key[nkeys++].dttk_size = 0;
9001
9002 key[nkeys++].dttk_size = 0;
9003
9004 if (op == DIF_OP_STTAA) {
9005 scope = DIFV_SCOPE_THREAD;
9006 } else {
9007 scope = DIFV_SCOPE_GLOBAL;
9008 }
9009
9010 break;
9011
9012 case DIF_OP_PUSHTR:
9013 if (ttop == DIF_DTR_NREGS)
9014 return;
9015
9016 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9017 /*
9018 * If the register for the size of the "pushtr"
9019 * is %r0 (or the value is 0) and the type is
9020 * a string, we'll use the system-wide default
9021 * string size.
9022 */
9023 tupregs[ttop++].dttk_size =
9024 dtrace_strsize_default;
9025 } else {
9026 if (srd == 0)
9027 return;
9028
9029 tupregs[ttop++].dttk_size = sval;
9030 }
9031
9032 break;
9033
9034 case DIF_OP_PUSHTV:
9035 if (ttop == DIF_DTR_NREGS)
9036 return;
9037
9038 tupregs[ttop++].dttk_size = 0;
9039 break;
9040
9041 case DIF_OP_FLUSHTS:
9042 ttop = 0;
9043 break;
9044
9045 case DIF_OP_POPTS:
9046 if (ttop != 0)
9047 ttop--;
9048 break;
9049 }
9050
9051 sval = 0;
9052 srd = 0;
9053
9054 if (nkeys == 0)
9055 continue;
9056
9057 /*
9058 * We have a dynamic variable allocation; calculate its size.
9059 */
9060 for (ksize = 0, i = 0; i < nkeys; i++)
9061 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9062
9063 size = sizeof (dtrace_dynvar_t);
9064 size += sizeof (dtrace_key_t) * (nkeys - 1);
9065 size += ksize;
9066
9067 /*
9068 * Now we need to determine the size of the stored data.
9069 */
9070 id = DIF_INSTR_VAR(instr);
9071
9072 for (i = 0; i < dp->dtdo_varlen; i++) {
9073 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9074
9075 if (v->dtdv_id == id && v->dtdv_scope == scope) {
9076 size += v->dtdv_type.dtdt_size;
9077 break;
9078 }
9079 }
9080
9081 if (i == dp->dtdo_varlen)
9082 return;
9083
9084 /*
9085 * We have the size. If this is larger than the chunk size
9086 * for our dynamic variable state, reset the chunk size.
9087 */
9088 size = P2ROUNDUP(size, sizeof (uint64_t));
9089
9090 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9091 vstate->dtvs_dynvars.dtds_chunksize = size;
9092 }
9093 }
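/*
 * Editorial note: a hedged worked example of the calculation above. For a
 * thread-local store such as
 *
 *	self->x = timestamp;
 *
 * the stts path contributes two zero-sized keys (the thread pointer and the
 * variable id are supplied at probe time), so the allocation is
 *
 *	size = sizeof (dtrace_dynvar_t) + 1 * sizeof (dtrace_key_t)
 *	    + 0 (key data) + dtdv_type.dtdt_size of self->x,
 *
 * rounded up to a multiple of sizeof (uint64_t); the chunksize becomes the
 * largest such value over all dynamic-variable sites in the enabling.
 */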
9094
9095 static void
9096 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9097 {
9098 int oldsvars, osz, nsz, otlocals, ntlocals;
9099 uint_t i, id;
9100
9101 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9102 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9103
9104 for (i = 0; i < dp->dtdo_varlen; i++) {
9105 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9106 dtrace_statvar_t *svar;
9107 dtrace_statvar_t ***svarp = NULL;
9108 size_t dsize = 0;
9109 uint8_t scope = v->dtdv_scope;
9110 int *np = (int *)NULL;
9111
9112 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9113 continue;
9114
9115 id -= DIF_VAR_OTHER_UBASE;
9116
9117 switch (scope) {
9118 case DIFV_SCOPE_THREAD:
9119 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9120 dtrace_difv_t *tlocals;
9121
9122 if ((ntlocals = (otlocals << 1)) == 0)
9123 ntlocals = 1;
9124
9125 osz = otlocals * sizeof (dtrace_difv_t);
9126 nsz = ntlocals * sizeof (dtrace_difv_t);
9127
9128 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9129
9130 if (osz != 0) {
9131 bcopy(vstate->dtvs_tlocals,
9132 tlocals, osz);
9133 kmem_free(vstate->dtvs_tlocals, osz);
9134 }
9135
9136 vstate->dtvs_tlocals = tlocals;
9137 vstate->dtvs_ntlocals = ntlocals;
9138 }
9139
9140 vstate->dtvs_tlocals[id] = *v;
9141 continue;
9142
9143 case DIFV_SCOPE_LOCAL:
9144 np = &vstate->dtvs_nlocals;
9145 svarp = &vstate->dtvs_locals;
9146
9147 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9148 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9149 sizeof (uint64_t));
9150 else
9151 dsize = (int)NCPU * sizeof (uint64_t);
9152
9153 break;
9154
9155 case DIFV_SCOPE_GLOBAL:
9156 np = &vstate->dtvs_nglobals;
9157 svarp = &vstate->dtvs_globals;
9158
9159 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9160 dsize = v->dtdv_type.dtdt_size +
9161 sizeof (uint64_t);
9162
9163 break;
9164
9165 default:
9166 ASSERT(0);
9167 }
9168
9169 while (id >= (uint_t)(oldsvars = *np)) {
9170 dtrace_statvar_t **statics;
9171 int newsvars, oldsize, newsize;
9172
9173 if ((newsvars = (oldsvars << 1)) == 0)
9174 newsvars = 1;
9175
9176 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9177 newsize = newsvars * sizeof (dtrace_statvar_t *);
9178
9179 statics = kmem_zalloc(newsize, KM_SLEEP);
9180
9181 if (oldsize != 0) {
9182 bcopy(*svarp, statics, oldsize);
9183 kmem_free(*svarp, oldsize);
9184 }
9185
9186 *svarp = statics;
9187 *np = newsvars;
9188 }
9189
9190 if ((svar = (*svarp)[id]) == NULL) {
9191 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9192 svar->dtsv_var = *v;
9193
9194 if ((svar->dtsv_size = dsize) != 0) {
9195 svar->dtsv_data = (uint64_t)(uintptr_t)
9196 kmem_zalloc(dsize, KM_SLEEP);
9197 }
9198
9199 (*svarp)[id] = svar;
9200 }
9201
9202 svar->dtsv_refcnt++;
9203 }
9204
9205 dtrace_difo_chunksize(dp, vstate);
9206 dtrace_difo_hold(dp);
9207 }
9208
9209 static dtrace_difo_t *
9210 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9211 {
9212 dtrace_difo_t *new;
9213 size_t sz;
9214
9215 ASSERT(dp->dtdo_buf != NULL);
9216 ASSERT(dp->dtdo_refcnt != 0);
9217
9218 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9219
9220 ASSERT(dp->dtdo_buf != NULL);
9221 sz = dp->dtdo_len * sizeof (dif_instr_t);
9222 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9223 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9224 new->dtdo_len = dp->dtdo_len;
9225
9226 if (dp->dtdo_strtab != NULL) {
9227 ASSERT(dp->dtdo_strlen != 0);
9228 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9229 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9230 new->dtdo_strlen = dp->dtdo_strlen;
9231 }
9232
9233 if (dp->dtdo_inttab != NULL) {
9234 ASSERT(dp->dtdo_intlen != 0);
9235 sz = dp->dtdo_intlen * sizeof (uint64_t);
9236 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9237 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9238 new->dtdo_intlen = dp->dtdo_intlen;
9239 }
9240
9241 if (dp->dtdo_vartab != NULL) {
9242 ASSERT(dp->dtdo_varlen != 0);
9243 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9244 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9245 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9246 new->dtdo_varlen = dp->dtdo_varlen;
9247 }
9248
9249 dtrace_difo_init(new, vstate);
9250 return (new);
9251 }
9252
9253 static void
9254 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9255 {
9256 uint_t i;
9257
9258 ASSERT(dp->dtdo_refcnt == 0);
9259
9260 for (i = 0; i < dp->dtdo_varlen; i++) {
9261 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9262 dtrace_statvar_t *svar;
9263 dtrace_statvar_t **svarp = NULL;
9264 uint_t id;
9265 uint8_t scope = v->dtdv_scope;
9266 int *np = NULL;
9267
9268 switch (scope) {
9269 case DIFV_SCOPE_THREAD:
9270 continue;
9271
9272 case DIFV_SCOPE_LOCAL:
9273 np = &vstate->dtvs_nlocals;
9274 svarp = vstate->dtvs_locals;
9275 break;
9276
9277 case DIFV_SCOPE_GLOBAL:
9278 np = &vstate->dtvs_nglobals;
9279 svarp = vstate->dtvs_globals;
9280 break;
9281
9282 default:
9283 ASSERT(0);
9284 }
9285
9286 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9287 continue;
9288
9289 id -= DIF_VAR_OTHER_UBASE;
9290
9291 ASSERT(id < (uint_t)*np);
9292
9293 svar = svarp[id];
9294 ASSERT(svar != NULL);
9295 ASSERT(svar->dtsv_refcnt > 0);
9296
9297 if (--svar->dtsv_refcnt > 0)
9298 continue;
9299
9300 if (svar->dtsv_size != 0) {
9301 ASSERT(svar->dtsv_data != 0);
9302 kmem_free((void *)(uintptr_t)svar->dtsv_data,
9303 svar->dtsv_size);
9304 }
9305
9306 kmem_free(svar, sizeof (dtrace_statvar_t));
9307 svarp[id] = NULL;
9308 }
9309
9310 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9311 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9312 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9313 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9314
9315 kmem_free(dp, sizeof (dtrace_difo_t));
9316 }
9317
9318 static void
9319 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9320 {
9321 uint_t i;
9322
9323 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9324 ASSERT(dp->dtdo_refcnt != 0);
9325
9326 for (i = 0; i < dp->dtdo_varlen; i++) {
9327 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9328
9329 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9330 continue;
9331
9332 ASSERT(dtrace_vtime_references > 0);
9333 if (--dtrace_vtime_references == 0)
9334 dtrace_vtime_disable();
9335 }
9336
9337 if (--dp->dtdo_refcnt == 0)
9338 dtrace_difo_destroy(dp, vstate);
9339 }
9340
9341 /*
9342 * DTrace Format Functions
9343 */
9344 static uint16_t
9345 dtrace_format_add(dtrace_state_t *state, char *str)
9346 {
9347 char *fmt, **new;
9348 uint16_t ndx, len = strlen(str) + 1;
9349
9350 fmt = kmem_zalloc(len, KM_SLEEP);
9351 bcopy(str, fmt, len);
9352
9353 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
9354 if (state->dts_formats[ndx] == NULL) {
9355 state->dts_formats[ndx] = fmt;
9356 return (ndx + 1);
9357 }
9358 }
9359
9360 if (state->dts_nformats == USHRT_MAX) {
9361 /*
9362 * This is only likely if a denial-of-service attack is being
9363 * attempted. As such, it's okay to fail silently here.
9364 */
9365 kmem_free(fmt, len);
9366 return (0);
9367 }
9368
9369 /*
9370 * For simplicity, we always resize the formats array to be exactly the
9371 * number of formats.
9372 */
9373 ndx = state->dts_nformats++;
9374 new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
9375
9376 if (state->dts_formats != NULL) {
9377 ASSERT(ndx != 0);
9378 bcopy(state->dts_formats, new, ndx * sizeof (char *));
9379 kmem_free(state->dts_formats, ndx * sizeof (char *));
9380 }
9381
9382 state->dts_formats = new;
9383 state->dts_formats[ndx] = fmt;
9384
9385 return (ndx + 1);
9386 }
9387
9388 static void
9389 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
9390 {
9391 char *fmt;
9392
9393 ASSERT(state->dts_formats != NULL);
9394 ASSERT(format <= state->dts_nformats);
9395 ASSERT(state->dts_formats[format - 1] != NULL);
9396
9397 fmt = state->dts_formats[format - 1];
9398 kmem_free(fmt, strlen(fmt) + 1);
9399 state->dts_formats[format - 1] = NULL;
9400 }
9401
9402 static void
9403 dtrace_format_destroy(dtrace_state_t *state)
9404 {
9405 int i;
9406
9407 if (state->dts_nformats == 0) {
9408 ASSERT(state->dts_formats == NULL);
9409 return;
9410 }
9411
9412 ASSERT(state->dts_formats != NULL);
9413
9414 for (i = 0; i < state->dts_nformats; i++) {
9415 char *fmt = state->dts_formats[i];
9416
9417 if (fmt == NULL)
9418 continue;
9419
9420 kmem_free(fmt, strlen(fmt) + 1);
9421 }
9422
9423 kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
9424 state->dts_nformats = 0;
9425 state->dts_formats = NULL;
9426 }
9427
9428 /*
9429 * DTrace Predicate Functions
9430 */
9431 static dtrace_predicate_t *
9432 dtrace_predicate_create(dtrace_difo_t *dp)
9433 {
9434 dtrace_predicate_t *pred;
9435
9436 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9437 ASSERT(dp->dtdo_refcnt != 0);
9438
9439 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
9440 pred->dtp_difo = dp;
9441 pred->dtp_refcnt = 1;
9442
9443 if (!dtrace_difo_cacheable(dp))
9444 return (pred);
9445
9446 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
9447 /*
9448 * This is only theoretically possible -- we have had 2^32
9449 * cacheable predicates on this machine. We cannot allow any
9450 * more predicates to become cacheable: as unlikely as it is,
9451 * there may be a thread caching a (now stale) predicate cache
9452 * ID. (N.B.: the temptation is being successfully resisted to
9453 * have this cmn_err() "Holy shit -- we executed this code!")
9454 */
9455 return (pred);
9456 }
9457
9458 pred->dtp_cacheid = dtrace_predcache_id++;
9459
9460 return (pred);
9461 }
9462
9463 static void
9464 dtrace_predicate_hold(dtrace_predicate_t *pred)
9465 {
9466 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9467 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
9468 ASSERT(pred->dtp_refcnt > 0);
9469
9470 pred->dtp_refcnt++;
9471 }
9472
9473 static void
9474 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
9475 {
9476 dtrace_difo_t *dp = pred->dtp_difo;
9477 #pragma unused(dp) /* __APPLE__ */
9478
9479 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9480 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
9481 ASSERT(pred->dtp_refcnt > 0);
9482
9483 if (--pred->dtp_refcnt == 0) {
9484 dtrace_difo_release(pred->dtp_difo, vstate);
9485 kmem_free(pred, sizeof (dtrace_predicate_t));
9486 }
9487 }
9488
9489 /*
9490 * DTrace Action Description Functions
9491 */
9492 static dtrace_actdesc_t *
9493 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
9494 uint64_t uarg, uint64_t arg)
9495 {
9496 dtrace_actdesc_t *act;
9497
9498 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
9499 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
9500
9501 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
9502 act->dtad_kind = kind;
9503 act->dtad_ntuple = ntuple;
9504 act->dtad_uarg = uarg;
9505 act->dtad_arg = arg;
9506 act->dtad_refcnt = 1;
9507
9508 return (act);
9509 }
9510
9511 static void
9512 dtrace_actdesc_hold(dtrace_actdesc_t *act)
9513 {
9514 ASSERT(act->dtad_refcnt >= 1);
9515 act->dtad_refcnt++;
9516 }
9517
9518 static void
9519 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
9520 {
9521 dtrace_actkind_t kind = act->dtad_kind;
9522 dtrace_difo_t *dp;
9523
9524 ASSERT(act->dtad_refcnt >= 1);
9525
9526 if (--act->dtad_refcnt != 0)
9527 return;
9528
9529 if ((dp = act->dtad_difo) != NULL)
9530 dtrace_difo_release(dp, vstate);
9531
9532 if (DTRACEACT_ISPRINTFLIKE(kind)) {
9533 char *str = (char *)(uintptr_t)act->dtad_arg;
9534
9535 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
9536 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
9537
9538 if (str != NULL)
9539 kmem_free(str, strlen(str) + 1);
9540 }
9541
9542 kmem_free(act, sizeof (dtrace_actdesc_t));
9543 }
9544
9545 /*
9546 * DTrace ECB Functions
9547 */
9548 static dtrace_ecb_t *
9549 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
9550 {
9551 dtrace_ecb_t *ecb;
9552 dtrace_epid_t epid;
9553
9554 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9555
9556 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
9557 ecb->dte_predicate = NULL;
9558 ecb->dte_probe = probe;
9559
9560 /*
9561 * The default size is the size of the default action: recording
9562 * the epid.
9563 */
9564 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9565 ecb->dte_alignment = sizeof (dtrace_epid_t);
9566
9567 epid = state->dts_epid++;
9568
9569 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
9570 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
9571 int necbs = state->dts_necbs << 1;
9572
9573 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
9574
9575 if (necbs == 0) {
9576 ASSERT(oecbs == NULL);
9577 necbs = 1;
9578 }
9579
9580 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
9581
9582 if (oecbs != NULL)
9583 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
9584
9585 dtrace_membar_producer();
9586 state->dts_ecbs = ecbs;
9587
9588 if (oecbs != NULL) {
9589 /*
9590 * If this state is active, we must dtrace_sync()
9591 * before we can free the old dts_ecbs array: we're
9592 * coming in hot, and there may be active ring
9593 * buffer processing (which indexes into the dts_ecbs
9594 * array) on another CPU.
9595 */
9596 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
9597 dtrace_sync();
9598
9599 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
9600 }
9601
9602 dtrace_membar_producer();
9603 state->dts_necbs = necbs;
9604 }
9605
9606 ecb->dte_state = state;
9607
9608 ASSERT(state->dts_ecbs[epid - 1] == NULL);
9609 dtrace_membar_producer();
9610 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
9611
9612 return (ecb);
9613 }
9614
9615 static int
9616 dtrace_ecb_enable(dtrace_ecb_t *ecb)
9617 {
9618 dtrace_probe_t *probe = ecb->dte_probe;
9619
9620 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
9621 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9622 ASSERT(ecb->dte_next == NULL);
9623
9624 if (probe == NULL) {
9625 /*
9626 * This is the NULL probe -- there's nothing to do.
9627 */
9628 return (0);
9629 }
9630
9631 probe->dtpr_provider->dtpv_ecb_count++;
9632 if (probe->dtpr_ecb == NULL) {
9633 dtrace_provider_t *prov = probe->dtpr_provider;
9634
9635 /*
9636 * We're the first ECB on this probe.
9637 */
9638 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
9639
9640 if (ecb->dte_predicate != NULL)
9641 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
9642
9643 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
9644 probe->dtpr_id, probe->dtpr_arg));
9645 } else {
9646 /*
9647 * This probe is already active. Swing the last pointer to
9648 * point to the new ECB, and issue a dtrace_sync() to assure
9649 * that all CPUs have seen the change.
9650 */
9651 ASSERT(probe->dtpr_ecb_last != NULL);
9652 probe->dtpr_ecb_last->dte_next = ecb;
9653 probe->dtpr_ecb_last = ecb;
9654 probe->dtpr_predcache = 0;
9655
9656 dtrace_sync();
9657 return (0);
9658 }
9659 }
9660
9661 static void
9662 dtrace_ecb_resize(dtrace_ecb_t *ecb)
9663 {
9664 uint32_t maxalign = sizeof (dtrace_epid_t);
9665 uint32_t align = sizeof (uint8_t), offs, diff;
9666 dtrace_action_t *act;
9667 int wastuple = 0;
9668 uint32_t aggbase = UINT32_MAX;
9669 dtrace_state_t *state = ecb->dte_state;
9670
9671 /*
9672 * If we record anything, we always record the epid. (And we always
9673 * record it first.)
9674 */
9675 offs = sizeof (dtrace_epid_t);
9676 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
9677
9678 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
9679 dtrace_recdesc_t *rec = &act->dta_rec;
9680
9681 if ((align = rec->dtrd_alignment) > maxalign)
9682 maxalign = align;
9683
9684 if (!wastuple && act->dta_intuple) {
9685 /*
9686 * This is the first record in a tuple. Align the
9687 * offset to be at offset 4 in an 8-byte aligned
9688 * block.
9689 */
9690 diff = offs + sizeof (dtrace_aggid_t);
9691
9692 if ((diff = (diff & (sizeof (uint64_t) - 1))))
9693 offs += sizeof (uint64_t) - diff;
9694
9695 aggbase = offs - sizeof (dtrace_aggid_t);
9696 ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
9697 }
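/*
 * Editorial worked example (hedged, assuming a 4-byte dtrace_aggid_t):
 * if offs is 8 when the first tuple record is seen, diff starts as 12 and
 * 12 & 7 == 4, so offs is advanced to 12 and aggbase becomes 8 -- the
 * aggregation id lands on an 8-byte boundary and the first tuple record
 * sits at offset 4 within that 8-byte aligned block, as described above.
 */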
9698
9699 /*LINTED*/
9700 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
9701 /*
9702 * The current offset is not properly aligned; align it.
9703 */
9704 offs += align - diff;
9705 }
9706
9707 rec->dtrd_offset = offs;
9708
9709 if (offs + rec->dtrd_size > ecb->dte_needed) {
9710 ecb->dte_needed = offs + rec->dtrd_size;
9711
9712 if (ecb->dte_needed > state->dts_needed)
9713 state->dts_needed = ecb->dte_needed;
9714 }
9715
9716 if (DTRACEACT_ISAGG(act->dta_kind)) {
9717 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9718 dtrace_action_t *first = agg->dtag_first, *prev;
9719
9720 ASSERT(rec->dtrd_size != 0 && first != NULL);
9721 ASSERT(wastuple);
9722 ASSERT(aggbase != UINT32_MAX);
9723
9724 agg->dtag_base = aggbase;
9725
9726 while ((prev = first->dta_prev) != NULL &&
9727 DTRACEACT_ISAGG(prev->dta_kind)) {
9728 agg = (dtrace_aggregation_t *)prev;
9729 first = agg->dtag_first;
9730 }
9731
9732 if (prev != NULL) {
9733 offs = prev->dta_rec.dtrd_offset +
9734 prev->dta_rec.dtrd_size;
9735 } else {
9736 offs = sizeof (dtrace_epid_t);
9737 }
9738 wastuple = 0;
9739 } else {
9740 if (!act->dta_intuple)
9741 ecb->dte_size = offs + rec->dtrd_size;
9742
9743 offs += rec->dtrd_size;
9744 }
9745
9746 wastuple = act->dta_intuple;
9747 }
9748
9749 if ((act = ecb->dte_action) != NULL &&
9750 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
9751 ecb->dte_size == sizeof (dtrace_epid_t)) {
9752 /*
9753 * If the size is still sizeof (dtrace_epid_t), then all
9754 * actions store no data; set the size to 0.
9755 */
9756 ecb->dte_alignment = maxalign;
9757 ecb->dte_size = 0;
9758
9759 /*
9760 * If the needed space is still sizeof (dtrace_epid_t), then
9761 * all actions need no additional space; set the needed
9762 * size to 0.
9763 */
9764 if (ecb->dte_needed == sizeof (dtrace_epid_t))
9765 ecb->dte_needed = 0;
9766
9767 return;
9768 }
9769
9770 /*
9771 * Set our alignment, and make sure that the dte_size and dte_needed
9772 * are aligned to the size of an EPID.
9773 */
9774 ecb->dte_alignment = maxalign;
9775 ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
9776 ~(sizeof (dtrace_epid_t) - 1);
9777 ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
9778 ~(sizeof (dtrace_epid_t) - 1);
9779 ASSERT(ecb->dte_size <= ecb->dte_needed);
9780 }
9781
9782 static dtrace_action_t *
9783 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9784 {
9785 dtrace_aggregation_t *agg;
9786 size_t size = sizeof (uint64_t);
9787 int ntuple = desc->dtad_ntuple;
9788 dtrace_action_t *act;
9789 dtrace_recdesc_t *frec;
9790 dtrace_aggid_t aggid;
9791 dtrace_state_t *state = ecb->dte_state;
9792
9793 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
9794 agg->dtag_ecb = ecb;
9795
9796 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
9797
9798 switch (desc->dtad_kind) {
9799 case DTRACEAGG_MIN:
9800 agg->dtag_initial = INT64_MAX;
9801 agg->dtag_aggregate = dtrace_aggregate_min;
9802 break;
9803
9804 case DTRACEAGG_MAX:
9805 agg->dtag_initial = INT64_MIN;
9806 agg->dtag_aggregate = dtrace_aggregate_max;
9807 break;
9808
9809 case DTRACEAGG_COUNT:
9810 agg->dtag_aggregate = dtrace_aggregate_count;
9811 break;
9812
9813 case DTRACEAGG_QUANTIZE:
9814 agg->dtag_aggregate = dtrace_aggregate_quantize;
9815 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
9816 sizeof (uint64_t);
9817 break;
9818
9819 case DTRACEAGG_LQUANTIZE: {
9820 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
9821 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
9822
9823 agg->dtag_initial = desc->dtad_arg;
9824 agg->dtag_aggregate = dtrace_aggregate_lquantize;
9825
9826 if (step == 0 || levels == 0)
9827 goto err;
9828
9829 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
9830 break;
9831 }
9832
9833 case DTRACEAGG_LLQUANTIZE: {
9834 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
9835 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
9836 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
9837 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
9838 int64_t v;
9839
9840 agg->dtag_initial = desc->dtad_arg;
9841 agg->dtag_aggregate = dtrace_aggregate_llquantize;
9842
9843 if (factor < 2 || low >= high || nsteps < factor)
9844 goto err;
9845
9846 /*
9847 * Now check that the number of steps evenly divides a power
9848 * of the factor. (This assures both integer bucket size and
9849 * linearity within each magnitude.)
9850 */
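/*
 * Editorial worked example (hedged): with factor = 10 and nsteps = 20,
 * the loop below leaves v = 100; since 100 % 20 == 0 and 20 % 10 == 0,
 * the parameters are accepted. With nsteps = 30 instead, 100 % 30 != 0
 * and the aggregation is rejected.
 */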
9851 for (v = factor; v < nsteps; v *= factor)
9852 continue;
9853
9854 if ((v % nsteps) || (nsteps % factor))
9855 goto err;
9856
9857 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
9858 break;
9859 }
9860
9861 case DTRACEAGG_AVG:
9862 agg->dtag_aggregate = dtrace_aggregate_avg;
9863 size = sizeof (uint64_t) * 2;
9864 break;
9865
9866 case DTRACEAGG_STDDEV:
9867 agg->dtag_aggregate = dtrace_aggregate_stddev;
9868 size = sizeof (uint64_t) * 4;
9869 break;
9870
9871 case DTRACEAGG_SUM:
9872 agg->dtag_aggregate = dtrace_aggregate_sum;
9873 break;
9874
9875 default:
9876 goto err;
9877 }
9878
9879 agg->dtag_action.dta_rec.dtrd_size = size;
9880
9881 if (ntuple == 0)
9882 goto err;
9883
9884 /*
9885 * We must make sure that we have enough actions for the n-tuple.
9886 */
9887 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
9888 if (DTRACEACT_ISAGG(act->dta_kind))
9889 break;
9890
9891 if (--ntuple == 0) {
9892 /*
9893 * This is the action with which our n-tuple begins.
9894 */
9895 agg->dtag_first = act;
9896 goto success;
9897 }
9898 }
9899
9900 /*
9901 * This n-tuple is short by ntuple elements. Return failure.
9902 */
9903 ASSERT(ntuple != 0);
9904 err:
9905 kmem_free(agg, sizeof (dtrace_aggregation_t));
9906 return (NULL);
9907
9908 success:
9909 /*
9910 * If the last action in the tuple has a size of zero, it's actually
9911 * an expression argument for the aggregating action.
9912 */
9913 ASSERT(ecb->dte_action_last != NULL);
9914 act = ecb->dte_action_last;
9915
9916 if (act->dta_kind == DTRACEACT_DIFEXPR) {
9917 ASSERT(act->dta_difo != NULL);
9918
9919 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
9920 agg->dtag_hasarg = 1;
9921 }
9922
9923 /*
9924 * We need to allocate an id for this aggregation.
9925 */
9926 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
9927 VM_BESTFIT | VM_SLEEP);
9928
9929 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
9930 dtrace_aggregation_t **oaggs = state->dts_aggregations;
9931 dtrace_aggregation_t **aggs;
9932 int naggs = state->dts_naggregations << 1;
9933 int onaggs = state->dts_naggregations;
9934
9935 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
9936
9937 if (naggs == 0) {
9938 ASSERT(oaggs == NULL);
9939 naggs = 1;
9940 }
9941
9942 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
9943
9944 if (oaggs != NULL) {
9945 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
9946 kmem_free(oaggs, onaggs * sizeof (*aggs));
9947 }
9948
9949 state->dts_aggregations = aggs;
9950 state->dts_naggregations = naggs;
9951 }
9952
9953 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
9954 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
9955
9956 frec = &agg->dtag_first->dta_rec;
9957 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
9958 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
9959
9960 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
9961 ASSERT(!act->dta_intuple);
9962 act->dta_intuple = 1;
9963 }
9964
9965 return (&agg->dtag_action);
9966 }
9967
9968 static void
9969 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
9970 {
9971 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
9972 dtrace_state_t *state = ecb->dte_state;
9973 dtrace_aggid_t aggid = agg->dtag_id;
9974
9975 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
9976 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
9977
9978 ASSERT(state->dts_aggregations[aggid - 1] == agg);
9979 state->dts_aggregations[aggid - 1] = NULL;
9980
9981 kmem_free(agg, sizeof (dtrace_aggregation_t));
9982 }
9983
9984 static int
9985 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
9986 {
9987 dtrace_action_t *action, *last;
9988 dtrace_difo_t *dp = desc->dtad_difo;
9989 uint32_t size = 0, align = sizeof (uint8_t), mask;
9990 uint16_t format = 0;
9991 dtrace_recdesc_t *rec;
9992 dtrace_state_t *state = ecb->dte_state;
9993 dtrace_optval_t *opt = state->dts_options;
9994 dtrace_optval_t nframes=0, strsize;
9995 uint64_t arg = desc->dtad_arg;
9996
9997 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9998 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
9999
10000 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10001 /*
10002 * If this is an aggregating action, there must be neither
10003 * a speculate nor a commit on the action chain.
10004 */
10005 dtrace_action_t *act;
10006
10007 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10008 if (act->dta_kind == DTRACEACT_COMMIT)
10009 return (EINVAL);
10010
10011 if (act->dta_kind == DTRACEACT_SPECULATE)
10012 return (EINVAL);
10013 }
10014
10015 action = dtrace_ecb_aggregation_create(ecb, desc);
10016
10017 if (action == NULL)
10018 return (EINVAL);
10019 } else {
10020 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10021 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10022 dp != NULL && dp->dtdo_destructive)) {
10023 state->dts_destructive = 1;
10024 }
10025
10026 switch (desc->dtad_kind) {
10027 case DTRACEACT_PRINTF:
10028 case DTRACEACT_PRINTA:
10029 case DTRACEACT_SYSTEM:
10030 case DTRACEACT_FREOPEN:
10031 /*
10032 * We know that our arg is a string -- turn it into a
10033 * format.
10034 */
10035 if (arg == 0) {
10036 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
10037 format = 0;
10038 } else {
10039 ASSERT(arg != 0);
10040 ASSERT(arg > KERNELBASE);
10041 format = dtrace_format_add(state,
10042 (char *)(uintptr_t)arg);
10043 }
10044
10045 /*FALLTHROUGH*/
10046 case DTRACEACT_LIBACT:
10047 case DTRACEACT_DIFEXPR:
10048 case DTRACEACT_TRACEMEM:
10049 case DTRACEACT_TRACEMEM_DYNSIZE:
10050 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
10051 if (dp == NULL)
10052 return (EINVAL);
10053
10054 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10055 break;
10056
10057 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10058 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10059 return (EINVAL);
10060
10061 size = opt[DTRACEOPT_STRSIZE];
10062 }
10063
10064 break;
10065
10066 case DTRACEACT_STACK:
10067 if ((nframes = arg) == 0) {
10068 nframes = opt[DTRACEOPT_STACKFRAMES];
10069 ASSERT(nframes > 0);
10070 arg = nframes;
10071 }
10072
10073 size = nframes * sizeof (pc_t);
10074 break;
10075
10076 case DTRACEACT_JSTACK:
10077 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10078 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10079
10080 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10081 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10082
10083 arg = DTRACE_USTACK_ARG(nframes, strsize);
10084
10085 /*FALLTHROUGH*/
10086 case DTRACEACT_USTACK:
10087 if (desc->dtad_kind != DTRACEACT_JSTACK &&
10088 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10089 strsize = DTRACE_USTACK_STRSIZE(arg);
10090 nframes = opt[DTRACEOPT_USTACKFRAMES];
10091 ASSERT(nframes > 0);
10092 arg = DTRACE_USTACK_ARG(nframes, strsize);
10093 }
10094
10095 /*
10096 * Save a slot for the pid.
10097 */
10098 size = (nframes + 1) * sizeof (uint64_t);
10099 size += DTRACE_USTACK_STRSIZE(arg);
10100 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
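/*
 * Given the pid slot noted above, the record presumably lays out as
 * one 64-bit slot for the pid, followed by nframes 64-bit program
 * counters and then strsize bytes of string space, with the total
 * rounded up to pointer alignment. For example, nframes of 20 with
 * no string space yields (20 + 1) * 8 == 168 bytes.
 */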
10101
10102 break;
10103
10104 case DTRACEACT_SYM:
10105 case DTRACEACT_MOD:
10106 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10107 sizeof (uint64_t)) ||
10108 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10109 return (EINVAL);
10110 break;
10111
10112 case DTRACEACT_USYM:
10113 case DTRACEACT_UMOD:
10114 case DTRACEACT_UADDR:
10115 if (dp == NULL ||
10116 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10117 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10118 return (EINVAL);
10119
10120 /*
10121 * We have a slot for the pid, plus a slot for the
10122 * argument. To keep things simple (aligned with
10123 * bitness-neutral sizing), we store each as a 64-bit
10124 * quantity.
10125 */
10126 size = 2 * sizeof (uint64_t);
10127 break;
10128
10129 case DTRACEACT_STOP:
10130 case DTRACEACT_BREAKPOINT:
10131 case DTRACEACT_PANIC:
10132 break;
10133
10134 case DTRACEACT_CHILL:
10135 case DTRACEACT_DISCARD:
10136 case DTRACEACT_RAISE:
10137 case DTRACEACT_PIDRESUME: /* __APPLE__ */
10138 if (dp == NULL)
10139 return (EINVAL);
10140 break;
10141
10142 case DTRACEACT_EXIT:
10143 if (dp == NULL ||
10144 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10145 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10146 return (EINVAL);
10147 break;
10148
10149 case DTRACEACT_SPECULATE:
10150 if (ecb->dte_size > sizeof (dtrace_epid_t))
10151 return (EINVAL);
10152
10153 if (dp == NULL)
10154 return (EINVAL);
10155
10156 state->dts_speculates = 1;
10157 break;
10158
10159 case DTRACEACT_COMMIT: {
10160 dtrace_action_t *act = ecb->dte_action;
10161
10162 for (; act != NULL; act = act->dta_next) {
10163 if (act->dta_kind == DTRACEACT_COMMIT)
10164 return (EINVAL);
10165 }
10166
10167 if (dp == NULL)
10168 return (EINVAL);
10169 break;
10170 }
10171
10172 default:
10173 return (EINVAL);
10174 }
10175
10176 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10177 /*
10178 * If this is a data-storing action or a speculate,
10179 * we must be sure that there isn't a commit on the
10180 * action chain.
10181 */
10182 dtrace_action_t *act = ecb->dte_action;
10183
10184 for (; act != NULL; act = act->dta_next) {
10185 if (act->dta_kind == DTRACEACT_COMMIT)
10186 return (EINVAL);
10187 }
10188 }
10189
10190 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10191 action->dta_rec.dtrd_size = size;
10192 }
10193
10194 action->dta_refcnt = 1;
10195 rec = &action->dta_rec;
10196 size = rec->dtrd_size;
10197
10198 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10199 if (!(size & mask)) {
10200 align = mask + 1;
10201 break;
10202 }
10203 }
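/*
 * The loop above derives the record's alignment from its size: it picks
 * the largest power of two (at most sizeof (uint64_t)) that evenly
 * divides the size. For example, a 16-byte record aligns to 8, a
 * 12-byte record to 4, and an odd-sized record falls back to the byte
 * alignment that align was initialized with.
 */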
10204
10205 action->dta_kind = desc->dtad_kind;
10206
10207 if ((action->dta_difo = dp) != NULL)
10208 dtrace_difo_hold(dp);
10209
10210 rec->dtrd_action = action->dta_kind;
10211 rec->dtrd_arg = arg;
10212 rec->dtrd_uarg = desc->dtad_uarg;
10213 rec->dtrd_alignment = (uint16_t)align;
10214 rec->dtrd_format = format;
10215
10216 if ((last = ecb->dte_action_last) != NULL) {
10217 ASSERT(ecb->dte_action != NULL);
10218 action->dta_prev = last;
10219 last->dta_next = action;
10220 } else {
10221 ASSERT(ecb->dte_action == NULL);
10222 ecb->dte_action = action;
10223 }
10224
10225 ecb->dte_action_last = action;
10226
10227 return (0);
10228 }
10229
10230 static void
10231 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10232 {
10233 dtrace_action_t *act = ecb->dte_action, *next;
10234 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10235 dtrace_difo_t *dp;
10236 uint16_t format;
10237
10238 if (act != NULL && act->dta_refcnt > 1) {
10239 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10240 act->dta_refcnt--;
10241 } else {
10242 for (; act != NULL; act = next) {
10243 next = act->dta_next;
10244 ASSERT(next != NULL || act == ecb->dte_action_last);
10245 ASSERT(act->dta_refcnt == 1);
10246
10247 if ((format = act->dta_rec.dtrd_format) != 0)
10248 dtrace_format_remove(ecb->dte_state, format);
10249
10250 if ((dp = act->dta_difo) != NULL)
10251 dtrace_difo_release(dp, vstate);
10252
10253 if (DTRACEACT_ISAGG(act->dta_kind)) {
10254 dtrace_ecb_aggregation_destroy(ecb, act);
10255 } else {
10256 kmem_free(act, sizeof (dtrace_action_t));
10257 }
10258 }
10259 }
10260
10261 ecb->dte_action = NULL;
10262 ecb->dte_action_last = NULL;
10263 ecb->dte_size = sizeof (dtrace_epid_t);
10264 }
10265
10266 static void
10267 dtrace_ecb_disable(dtrace_ecb_t *ecb)
10268 {
10269 /*
10270 * We disable the ECB by removing it from its probe.
10271 */
10272 dtrace_ecb_t *pecb, *prev = NULL;
10273 dtrace_probe_t *probe = ecb->dte_probe;
10274
10275 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10276
10277 if (probe == NULL) {
10278 /*
10279 * This is the NULL probe; there is nothing to disable.
10280 */
10281 return;
10282 }
10283
10284 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10285 if (pecb == ecb)
10286 break;
10287 prev = pecb;
10288 }
10289
10290 ASSERT(pecb != NULL);
10291
10292 if (prev == NULL) {
10293 probe->dtpr_ecb = ecb->dte_next;
10294 } else {
10295 prev->dte_next = ecb->dte_next;
10296 }
10297
10298 if (ecb == probe->dtpr_ecb_last) {
10299 ASSERT(ecb->dte_next == NULL);
10300 probe->dtpr_ecb_last = prev;
10301 }
10302
10303 probe->dtpr_provider->dtpv_ecb_count--;
10304 /*
10305 * The ECB has been disconnected from the probe; now sync to assure
10306 * that all CPUs have seen the change before returning.
10307 */
10308 dtrace_sync();
10309
10310 if (probe->dtpr_ecb == NULL) {
10311 /*
10312 * That was the last ECB on the probe; clear the predicate
10313 * cache ID for the probe, disable it and sync one more time
10314 * to assure that we'll never hit it again.
10315 */
10316 dtrace_provider_t *prov = probe->dtpr_provider;
10317
10318 ASSERT(ecb->dte_next == NULL);
10319 ASSERT(probe->dtpr_ecb_last == NULL);
10320 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10321 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10322 probe->dtpr_id, probe->dtpr_arg);
10323 dtrace_sync();
10324 } else {
10325 /*
10326 * There is at least one ECB remaining on the probe. If there
10327 * is _exactly_ one, set the probe's predicate cache ID to be
10328 * the predicate cache ID of the remaining ECB.
10329 */
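/*
 * (The point of this cache, presumably, is the fast path in
 * dtrace_probe(): when a probe carries a single ECB whose predicate has
 * already evaluated false for the firing thread, the cached ID lets the
 * probe be dismissed without re-evaluating the predicate.)
 */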
10330 ASSERT(probe->dtpr_ecb_last != NULL);
10331 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10332
10333 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10334 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10335
10336 ASSERT(probe->dtpr_ecb->dte_next == NULL);
10337
10338 if (p != NULL)
10339 probe->dtpr_predcache = p->dtp_cacheid;
10340 }
10341
10342 ecb->dte_next = NULL;
10343 }
10344 }
10345
10346 static void
10347 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10348 {
10349 dtrace_state_t *state = ecb->dte_state;
10350 dtrace_vstate_t *vstate = &state->dts_vstate;
10351 dtrace_predicate_t *pred;
10352 dtrace_epid_t epid = ecb->dte_epid;
10353
10354 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10355 ASSERT(ecb->dte_next == NULL);
10356 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10357
10358 if ((pred = ecb->dte_predicate) != NULL)
10359 dtrace_predicate_release(pred, vstate);
10360
10361 dtrace_ecb_action_remove(ecb);
10362
10363 ASSERT(state->dts_ecbs[epid - 1] == ecb);
10364 state->dts_ecbs[epid - 1] = NULL;
10365
10366 kmem_free(ecb, sizeof (dtrace_ecb_t));
10367 }
10368
10369 static dtrace_ecb_t *
10370 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
10371 dtrace_enabling_t *enab)
10372 {
10373 dtrace_ecb_t *ecb;
10374 dtrace_predicate_t *pred;
10375 dtrace_actdesc_t *act;
10376 dtrace_provider_t *prov;
10377 dtrace_ecbdesc_t *desc = enab->dten_current;
10378
10379 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10380 ASSERT(state != NULL);
10381
10382 ecb = dtrace_ecb_add(state, probe);
10383 ecb->dte_uarg = desc->dted_uarg;
10384
10385 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
10386 dtrace_predicate_hold(pred);
10387 ecb->dte_predicate = pred;
10388 }
10389
10390 if (probe != NULL) {
10391 /*
10392 * If the provider shows more leg than the consumer is old
10393 * enough to see, we need to enable the appropriate implicit
10394 * predicate bits to prevent the ecb from activating at
10395 * revealing times.
10396 *
10397 * Providers specifying DTRACE_PRIV_USER at register time
10398 * are stating that they need the /proc-style privilege
10399 * model to be enforced, and this is what DTRACE_COND_OWNER
10400 * and DTRACE_COND_ZONEOWNER will then do at probe time.
10401 */
10402 prov = probe->dtpr_provider;
10403 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
10404 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10405 ecb->dte_cond |= DTRACE_COND_OWNER;
10406
10407 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
10408 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
10409 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
10410
10411 /*
10412 * If the provider shows us kernel innards and the user
10413 * is lacking sufficient privilege, enable the
10414 * DTRACE_COND_USERMODE implicit predicate.
10415 */
10416 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
10417 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
10418 ecb->dte_cond |= DTRACE_COND_USERMODE;
10419 }
10420
10421 if (dtrace_ecb_create_cache != NULL) {
10422 /*
10423 * If we have a cached ecb, we'll use its action list instead
10424 * of creating our own (saving both time and space).
10425 */
10426 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
10427 dtrace_action_t *act_if = cached->dte_action;
10428
10429 if (act_if != NULL) {
10430 ASSERT(act_if->dta_refcnt > 0);
10431 act_if->dta_refcnt++;
10432 ecb->dte_action = act_if;
10433 ecb->dte_action_last = cached->dte_action_last;
10434 ecb->dte_needed = cached->dte_needed;
10435 ecb->dte_size = cached->dte_size;
10436 ecb->dte_alignment = cached->dte_alignment;
10437 }
10438
10439 return (ecb);
10440 }
10441
10442 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
10443 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
10444 dtrace_ecb_destroy(ecb);
10445 return (NULL);
10446 }
10447 }
10448
10449 dtrace_ecb_resize(ecb);
10450
10451 return (dtrace_ecb_create_cache = ecb);
10452 }
10453
10454 static int
10455 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
10456 {
10457 dtrace_ecb_t *ecb;
10458 dtrace_enabling_t *enab = arg;
10459 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
10460
10461 ASSERT(state != NULL);
10462
10463 if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
10464 /*
10465 * This probe was created in a generation for which this
10466 * enabling has previously created ECBs; we don't want to
10467 * enable it again, so just kick out.
10468 */
10469 return (DTRACE_MATCH_NEXT);
10470 }
10471
10472 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
10473 return (DTRACE_MATCH_DONE);
10474
10475 if (dtrace_ecb_enable(ecb) < 0)
10476 return (DTRACE_MATCH_FAIL);
10477
10478 return (DTRACE_MATCH_NEXT);
10479 }
10480
10481 static dtrace_ecb_t *
10482 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
10483 {
10484 dtrace_ecb_t *ecb;
10485 #pragma unused(ecb) /* __APPLE__ */
10486
10487 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10488
10489 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
10490 return (NULL);
10491
10492 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
10493 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
10494
10495 return (state->dts_ecbs[id - 1]);
10496 }
10497
10498 static dtrace_aggregation_t *
10499 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
10500 {
10501 dtrace_aggregation_t *agg;
10502 #pragma unused(agg) /* __APPLE__ */
10503
10504 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10505
10506 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
10507 return (NULL);
10508
10509 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
10510 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
10511 agg->dtag_id == id);
10512
10513 return (state->dts_aggregations[id - 1]);
10514 }
10515
10516 /*
10517 * DTrace Buffer Functions
10518 *
10519 * The following functions manipulate DTrace buffers. Most of these functions
10520 * are called in the context of establishing or processing consumer state;
10521 * exceptions are explicitly noted.
10522 */
10523
10524 /*
10525 * Note: called from cross call context. This function switches the two
10526 * buffers on a given CPU. The atomicity of this operation is assured by
10527 * disabling interrupts while the actual switch takes place; the disabling of
10528 * interrupts serializes the execution with any execution of dtrace_probe() on
10529 * the same CPU.
10530 */
10531 static void
10532 dtrace_buffer_switch(dtrace_buffer_t *buf)
10533 {
10534 caddr_t tomax = buf->dtb_tomax;
10535 caddr_t xamot = buf->dtb_xamot;
10536 dtrace_icookie_t cookie;
10537
10538 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
10539 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
10540
10541 cookie = dtrace_interrupt_disable();
10542 buf->dtb_tomax = xamot;
10543 buf->dtb_xamot = tomax;
10544 buf->dtb_xamot_drops = buf->dtb_drops;
10545 buf->dtb_xamot_offset = buf->dtb_offset;
10546 buf->dtb_xamot_errors = buf->dtb_errors;
10547 buf->dtb_xamot_flags = buf->dtb_flags;
10548 buf->dtb_offset = 0;
10549 buf->dtb_drops = 0;
10550 buf->dtb_errors = 0;
10551 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
10552 dtrace_interrupt_enable(cookie);
10553 }
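/*
 * A minimal sketch of how this is driven (the details live in the
 * consumer snapshot path, e.g. the BUFSNAP ioctl): the caller cross
 * calls onto the target CPU so that the swap happens on that CPU with
 * interrupts disabled, along the lines of:
 *
 *	dtrace_xcall(desc.dtbd_cpu,
 *	    (dtrace_xcall_t)dtrace_buffer_switch, buf);
 */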
10554
10555 /*
10556 * Note: called from cross call context. This function activates a buffer
10557 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
10558 * is guaranteed by the disabling of interrupts.
10559 */
10560 static void
10561 dtrace_buffer_activate(dtrace_state_t *state)
10562 {
10563 dtrace_buffer_t *buf;
10564 dtrace_icookie_t cookie = dtrace_interrupt_disable();
10565
10566 buf = &state->dts_buffer[CPU->cpu_id];
10567
10568 if (buf->dtb_tomax != NULL) {
10569 /*
10570 * We might like to assert that the buffer is marked inactive,
10571 * but this isn't necessarily true: the buffer for the CPU
10572 * that processes the BEGIN probe has its buffer activated
10573 * manually. In this case, we take the (harmless) action of
10574 * re-clearing the INACTIVE bit.
10575 */
10576 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
10577 }
10578
10579 dtrace_interrupt_enable(cookie);
10580 }
10581
10582 static int
10583 dtrace_buffer_canalloc(size_t size)
10584 {
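/*
 * The first test guards against size + dtrace_buffer_memory_inuse
 * wrapping around before the second test compares the sum against the
 * configured maximum.
 */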
10585 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
10586 return (B_FALSE);
10587 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
10588 return (B_FALSE);
10589
10590 return (B_TRUE);
10591 }
10592
10593 static int
10594 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
10595 processorid_t cpu)
10596 {
10597 dtrace_cpu_t *cp;
10598 dtrace_buffer_t *buf;
10599 size_t size_before_alloc = dtrace_buffer_memory_inuse;
10600
10601 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10602 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10603
10604 if (size > (size_t)dtrace_nonroot_maxsize &&
10605 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
10606 return (EFBIG);
10607
10608 cp = cpu_list;
10609
10610 do {
10611 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10612 continue;
10613
10614 buf = &bufs[cp->cpu_id];
10615
10616 /*
10617 * If there is already a buffer allocated for this CPU, it
10618 * is only possible that this is a DR event. In this case,
10619 * the buffer size must match our specified size.
10620 */
10621 if (buf->dtb_tomax != NULL) {
10622 ASSERT(buf->dtb_size == size);
10623 continue;
10624 }
10625
10626 ASSERT(buf->dtb_xamot == NULL);
10627
10628 /* DTrace, please do not eat all the memory. */
10629 if (dtrace_buffer_canalloc(size) == B_FALSE)
10630 goto err;
10631 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10632 goto err;
10633 dtrace_buffer_memory_inuse += size;
10634
10635 buf->dtb_size = size;
10636 buf->dtb_flags = flags;
10637 buf->dtb_offset = 0;
10638 buf->dtb_drops = 0;
10639
10640 if (flags & DTRACEBUF_NOSWITCH)
10641 continue;
10642
10643 /* DTrace, please do not eat all the memory. */
10644 if (dtrace_buffer_canalloc(size) == B_FALSE)
10645 goto err;
10646 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
10647 goto err;
10648 dtrace_buffer_memory_inuse += size;
10649 } while ((cp = cp->cpu_next) != cpu_list);
10650
10651 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
10652
10653 return (0);
10654
10655 err:
10656 cp = cpu_list;
10657
10658 do {
10659 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
10660 continue;
10661
10662 buf = &bufs[cp->cpu_id];
10663
10664 if (buf->dtb_xamot != NULL) {
10665 ASSERT(buf->dtb_tomax != NULL);
10666 ASSERT(buf->dtb_size == size);
10667 kmem_free(buf->dtb_xamot, size);
10668 }
10669
10670 if (buf->dtb_tomax != NULL) {
10671 ASSERT(buf->dtb_size == size);
10672 kmem_free(buf->dtb_tomax, size);
10673 }
10674
10675 buf->dtb_tomax = NULL;
10676 buf->dtb_xamot = NULL;
10677 buf->dtb_size = 0;
10678 } while ((cp = cp->cpu_next) != cpu_list);
10679
10680 /* Restore the size saved before allocating memory */
10681 dtrace_buffer_memory_inuse = size_before_alloc;
10682
10683 return (ENOMEM);
10684 }
10685
10686 /*
10687 * Note: called from probe context. This function just increments the drop
10688 * count on a buffer. It has been made a function to allow for the
10689 * possibility of understanding the source of mysterious drop counts. (A
10690 * problem for which one may be particularly disappointed that DTrace cannot
10691 * be used to understand DTrace.)
10692 */
10693 static void
10694 dtrace_buffer_drop(dtrace_buffer_t *buf)
10695 {
10696 buf->dtb_drops++;
10697 }
10698
10699 /*
10700 * Note: called from probe context. This function is called to reserve space
10701 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
10702 * mstate. Returns the new offset in the buffer, or a negative value if an
10703 * error has occurred.
10704 */
10705 static intptr_t
10706 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
10707 dtrace_state_t *state, dtrace_mstate_t *mstate)
10708 {
10709 intptr_t offs = buf->dtb_offset, soffs;
10710 intptr_t woffs;
10711 caddr_t tomax;
10712 size_t total_off;
10713
10714 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
10715 return (-1);
10716
10717 if ((tomax = buf->dtb_tomax) == NULL) {
10718 dtrace_buffer_drop(buf);
10719 return (-1);
10720 }
10721
10722 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
10723 while (offs & (align - 1)) {
10724 /*
10725 * Assert that our alignment is off by a number which
10726 * is itself sizeof (uint32_t) aligned.
10727 */
10728 ASSERT(!((align - (offs & (align - 1))) &
10729 (sizeof (uint32_t) - 1)));
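/*
 * Each 32-bit padding word is stamped with DTRACE_EPIDNONE so that
 * the consumer (and the ring-buffer reclamation loop below) can
 * recognize it as filler and skip over it.
 */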
10730 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10731 offs += sizeof (uint32_t);
10732 }
10733
10734 if ((uint64_t)(soffs = offs + needed) > buf->dtb_size) {
10735 dtrace_buffer_drop(buf);
10736 return (-1);
10737 }
10738
10739 if (mstate == NULL)
10740 return (offs);
10741
10742 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
10743 mstate->dtms_scratch_size = buf->dtb_size - soffs;
10744 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10745
10746 return (offs);
10747 }
10748
10749 if (buf->dtb_flags & DTRACEBUF_FILL) {
10750 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
10751 (buf->dtb_flags & DTRACEBUF_FULL))
10752 return (-1);
10753 goto out;
10754 }
10755
10756 total_off = needed + (offs & (align - 1));
10757
10758 /*
10759 * For a ring buffer, life is quite a bit more complicated. Before
10760 * we can store any padding, we need to adjust our wrapping offset.
10761 * (If we've never before wrapped or we're not about to, no adjustment
10762 * is required.)
10763 */
10764 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
10765 offs + total_off > buf->dtb_size) {
10766 woffs = buf->dtb_xamot_offset;
10767
10768 if (offs + total_off > buf->dtb_size) {
10769 /*
10770 * We can't fit in the end of the buffer. First, a
10771 * sanity check that we can fit in the buffer at all.
10772 */
10773 if (total_off > buf->dtb_size) {
10774 dtrace_buffer_drop(buf);
10775 return (-1);
10776 }
10777
10778 /*
10779 * We're going to be storing at the top of the buffer,
10780 * so now we need to deal with the wrapped offset. We
10781 * only reset our wrapped offset to 0 if it is
10782 * currently greater than the current offset. If it
10783 * is less than the current offset, it is because a
10784 * previous allocation induced a wrap -- but the
10785 * allocation didn't subsequently take the space due
10786 * to an error or false predicate evaluation. In this
10787 * case, we'll just leave the wrapped offset alone: if
10788 * the wrapped offset hasn't been advanced far enough
10789 * for this allocation, it will be adjusted in the
10790 * lower loop.
10791 */
10792 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
10793 if (woffs >= offs)
10794 woffs = 0;
10795 } else {
10796 woffs = 0;
10797 }
10798
10799 /*
10800 * Now we know that we're going to be storing to the
10801 * top of the buffer and that there is room for us
10802 * there. We need to clear the buffer from the current
10803 * offset to the end (there may be old gunk there).
10804 */
10805 while ((uint64_t)offs < buf->dtb_size)
10806 tomax[offs++] = 0;
10807
10808 /*
10809 * We need to set our offset to zero. And because we
10810 * are wrapping, we need to set the bit indicating as
10811 * much. We can also adjust our needed space back
10812 * down to the space required by the ECB -- we know
10813 * that the top of the buffer is aligned.
10814 */
10815 offs = 0;
10816 total_off = needed;
10817 buf->dtb_flags |= DTRACEBUF_WRAPPED;
10818 } else {
10819 /*
10820 * There is room for us in the buffer, so we simply
10821 * need to check the wrapped offset.
10822 */
10823 if (woffs < offs) {
10824 /*
10825 * The wrapped offset is less than the offset.
10826 * This can happen if we allocated buffer space
10827 * that induced a wrap, but then we didn't
10828 * subsequently take the space due to an error
10829 * or false predicate evaluation. This is
10830 * okay; we know that _this_ allocation isn't
10831 * going to induce a wrap. We still can't
10832 * reset the wrapped offset to be zero,
10833 * however: the space may have been trashed in
10834 * the previous failed probe attempt. But at
10835 * least the wrapped offset doesn't need to
10836 * be adjusted at all...
10837 */
10838 goto out;
10839 }
10840 }
10841
10842 while (offs + total_off > (size_t)woffs) {
10843 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
10844 size_t size;
10845
10846 if (epid == DTRACE_EPIDNONE) {
10847 size = sizeof (uint32_t);
10848 } else {
10849 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
10850 ASSERT(state->dts_ecbs[epid - 1] != NULL);
10851
10852 size = state->dts_ecbs[epid - 1]->dte_size;
10853 }
10854
10855 ASSERT(woffs + size <= buf->dtb_size);
10856 ASSERT(size != 0);
10857
10858 if (woffs + size == buf->dtb_size) {
10859 /*
10860 * We've reached the end of the buffer; we want
10861 * to set the wrapped offset to 0 and break
10862 * out. However, if the offs is 0, then we're
10863 * in a strange edge-condition: the amount of
10864 * space that we want to reserve plus the size
10865 * of the record that we're overwriting is
10866 * greater than the size of the buffer. This
10867 * is problematic because if we reserve the
10868 * space but subsequently don't consume it (due
10869 * to a failed predicate or error) the wrapped
10870 * offset will be 0 -- yet the EPID at offset 0
10871 * will not be committed. This situation is
10872 * relatively easy to deal with: if we're in
10873 * this case, the buffer is indistinguishable
10874 * from one that hasn't wrapped; we need only
10875 * finish the job by clearing the wrapped bit,
10876 * explicitly setting the offset to be 0, and
10877 * zero'ing out the old data in the buffer.
10878 */
10879 if (offs == 0) {
10880 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
10881 buf->dtb_offset = 0;
10882 woffs = total_off;
10883
10884 while ((uint64_t)woffs < buf->dtb_size)
10885 tomax[woffs++] = 0;
10886 }
10887
10888 woffs = 0;
10889 break;
10890 }
10891
10892 woffs += size;
10893 }
10894
10895 /*
10896 * We have a wrapped offset. It may be that the wrapped offset
10897 * has become zero -- that's okay.
10898 */
10899 buf->dtb_xamot_offset = woffs;
10900 }
10901
10902 out:
10903 /*
10904 * Now we can plow the buffer with any necessary padding.
10905 */
10906 while (offs & (align - 1)) {
10907 /*
10908 * Assert that our alignment is off by a number which
10909 * is itself sizeof (uint32_t) aligned.
10910 */
10911 ASSERT(!((align - (offs & (align - 1))) &
10912 (sizeof (uint32_t) - 1)));
10913 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
10914 offs += sizeof (uint32_t);
10915 }
10916
10917 if (buf->dtb_flags & DTRACEBUF_FILL) {
10918 if (offs + needed > buf->dtb_size - state->dts_reserve) {
10919 buf->dtb_flags |= DTRACEBUF_FULL;
10920 return (-1);
10921 }
10922 }
10923
10924 if (mstate == NULL)
10925 return (offs);
10926
10927 /*
10928 * For ring buffers and fill buffers, the scratch space is always
10929 * the inactive buffer.
10930 */
10931 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
10932 mstate->dtms_scratch_size = buf->dtb_size;
10933 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
10934
10935 return (offs);
10936 }
10937
10938 static void
10939 dtrace_buffer_polish(dtrace_buffer_t *buf)
10940 {
10941 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
10942 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10943
10944 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
10945 return;
10946
10947 /*
10948 * We need to polish the ring buffer. There are three cases:
10949 *
10950 * - The first (and presumably most common) is that there is no gap
10951 * between the buffer offset and the wrapped offset. In this case,
10952 * there is nothing in the buffer that isn't valid data; we can
10953 * mark the buffer as polished and return.
10954 *
10955 * - The second (less common than the first but still more common
10956 * than the third) is that there is a gap between the buffer offset
10957 * and the wrapped offset, and the wrapped offset is larger than the
10958 * buffer offset. This can happen because of an alignment issue, or
10959 * can happen because of a call to dtrace_buffer_reserve() that
10960 * didn't subsequently consume the buffer space. In this case,
10961 * we need to zero the data from the buffer offset to the wrapped
10962 * offset.
10963 *
10964 * - The third (and least common) is that there is a gap between the
10965 * buffer offset and the wrapped offset, but the wrapped offset is
10966 * _less_ than the buffer offset. This can only happen because a
10967 * call to dtrace_buffer_reserve() induced a wrap, but the space
10968 * was not subsequently consumed. In this case, we need to zero the
10969 * space from the offset to the end of the buffer _and_ from the
10970 * top of the buffer to the wrapped offset.
10971 */
10972 if (buf->dtb_offset < buf->dtb_xamot_offset) {
10973 bzero(buf->dtb_tomax + buf->dtb_offset,
10974 buf->dtb_xamot_offset - buf->dtb_offset);
10975 }
10976
10977 if (buf->dtb_offset > buf->dtb_xamot_offset) {
10978 bzero(buf->dtb_tomax + buf->dtb_offset,
10979 buf->dtb_size - buf->dtb_offset);
10980 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
10981 }
10982 }
10983
10984 static void
10985 dtrace_buffer_free(dtrace_buffer_t *bufs)
10986 {
10987 int i;
10988
10989 for (i = 0; i < (int)NCPU; i++) {
10990 dtrace_buffer_t *buf = &bufs[i];
10991
10992 if (buf->dtb_tomax == NULL) {
10993 ASSERT(buf->dtb_xamot == NULL);
10994 ASSERT(buf->dtb_size == 0);
10995 continue;
10996 }
10997
10998 if (buf->dtb_xamot != NULL) {
10999 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11000 kmem_free(buf->dtb_xamot, buf->dtb_size);
11001
11002 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11003 dtrace_buffer_memory_inuse -= buf->dtb_size;
11004 }
11005
11006 kmem_free(buf->dtb_tomax, buf->dtb_size);
11007 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11008 dtrace_buffer_memory_inuse -= buf->dtb_size;
11009
11010 buf->dtb_size = 0;
11011 buf->dtb_tomax = NULL;
11012 buf->dtb_xamot = NULL;
11013 }
11014 }
11015
11016 /*
11017 * DTrace Enabling Functions
11018 */
11019 static dtrace_enabling_t *
11020 dtrace_enabling_create(dtrace_vstate_t *vstate)
11021 {
11022 dtrace_enabling_t *enab;
11023
11024 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11025 enab->dten_vstate = vstate;
11026
11027 return (enab);
11028 }
11029
11030 static void
11031 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11032 {
11033 dtrace_ecbdesc_t **ndesc;
11034 size_t osize, nsize;
11035
11036 /*
11037 * We can't add to enablings after we've enabled them, or after we've
11038 * retained them.
11039 */
11040 ASSERT(enab->dten_probegen == 0);
11041 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11042
11043 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
11044 if (ecb == NULL) return;
11045
11046 if (enab->dten_ndesc < enab->dten_maxdesc) {
11047 enab->dten_desc[enab->dten_ndesc++] = ecb;
11048 return;
11049 }
11050
11051 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11052
11053 if (enab->dten_maxdesc == 0) {
11054 enab->dten_maxdesc = 1;
11055 } else {
11056 enab->dten_maxdesc <<= 1;
11057 }
11058
11059 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11060
11061 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11062 ndesc = kmem_zalloc(nsize, KM_SLEEP);
11063 bcopy(enab->dten_desc, ndesc, osize);
11064 kmem_free(enab->dten_desc, osize);
11065
11066 enab->dten_desc = ndesc;
11067 enab->dten_desc[enab->dten_ndesc++] = ecb;
11068 }
11069
11070 static void
11071 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11072 dtrace_probedesc_t *pd)
11073 {
11074 dtrace_ecbdesc_t *new;
11075 dtrace_predicate_t *pred;
11076 dtrace_actdesc_t *act;
11077
11078 /*
11079 * We're going to create a new ECB description that matches the
11080 * specified ECB in every way, but has the specified probe description.
11081 */
11082 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11083
11084 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11085 dtrace_predicate_hold(pred);
11086
11087 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11088 dtrace_actdesc_hold(act);
11089
11090 new->dted_action = ecb->dted_action;
11091 new->dted_pred = ecb->dted_pred;
11092 new->dted_probe = *pd;
11093 new->dted_uarg = ecb->dted_uarg;
11094
11095 dtrace_enabling_add(enab, new);
11096 }
11097
11098 static void
11099 dtrace_enabling_dump(dtrace_enabling_t *enab)
11100 {
11101 int i;
11102
11103 for (i = 0; i < enab->dten_ndesc; i++) {
11104 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11105
11106 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11107 desc->dtpd_provider, desc->dtpd_mod,
11108 desc->dtpd_func, desc->dtpd_name);
11109 }
11110 }
11111
11112 static void
11113 dtrace_enabling_destroy(dtrace_enabling_t *enab)
11114 {
11115 int i;
11116 dtrace_ecbdesc_t *ep;
11117 dtrace_vstate_t *vstate = enab->dten_vstate;
11118
11119 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11120
11121 for (i = 0; i < enab->dten_ndesc; i++) {
11122 dtrace_actdesc_t *act, *next;
11123 dtrace_predicate_t *pred;
11124
11125 ep = enab->dten_desc[i];
11126
11127 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11128 dtrace_predicate_release(pred, vstate);
11129
11130 for (act = ep->dted_action; act != NULL; act = next) {
11131 next = act->dtad_next;
11132 dtrace_actdesc_release(act, vstate);
11133 }
11134
11135 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11136 }
11137
11138 kmem_free(enab->dten_desc,
11139 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
11140
11141 /*
11142 * If this was a retained enabling, decrement the dts_nretained count
11143 * and take it off of the dtrace_retained list.
11144 */
11145 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11146 dtrace_retained == enab) {
11147 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11148 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11149 enab->dten_vstate->dtvs_state->dts_nretained--;
11150 dtrace_retained_gen++;
11151 }
11152
11153 if (enab->dten_prev == NULL) {
11154 if (dtrace_retained == enab) {
11155 dtrace_retained = enab->dten_next;
11156
11157 if (dtrace_retained != NULL)
11158 dtrace_retained->dten_prev = NULL;
11159 }
11160 } else {
11161 ASSERT(enab != dtrace_retained);
11162 ASSERT(dtrace_retained != NULL);
11163 enab->dten_prev->dten_next = enab->dten_next;
11164 }
11165
11166 if (enab->dten_next != NULL) {
11167 ASSERT(dtrace_retained != NULL);
11168 enab->dten_next->dten_prev = enab->dten_prev;
11169 }
11170
11171 kmem_free(enab, sizeof (dtrace_enabling_t));
11172 }
11173
11174 static int
11175 dtrace_enabling_retain(dtrace_enabling_t *enab)
11176 {
11177 dtrace_state_t *state;
11178
11179 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11180 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11181 ASSERT(enab->dten_vstate != NULL);
11182
11183 state = enab->dten_vstate->dtvs_state;
11184 ASSERT(state != NULL);
11185
11186 /*
11187 * We only allow each state to retain dtrace_retain_max enablings.
11188 */
11189 if (state->dts_nretained >= dtrace_retain_max)
11190 return (ENOSPC);
11191
11192 state->dts_nretained++;
11193 dtrace_retained_gen++;
11194
11195 if (dtrace_retained == NULL) {
11196 dtrace_retained = enab;
11197 return (0);
11198 }
11199
11200 enab->dten_next = dtrace_retained;
11201 dtrace_retained->dten_prev = enab;
11202 dtrace_retained = enab;
11203
11204 return (0);
11205 }
11206
11207 static int
11208 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11209 dtrace_probedesc_t *create)
11210 {
11211 dtrace_enabling_t *new, *enab;
11212 int found = 0, err = ENOENT;
11213
11214 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11215 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11216 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11217 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11218 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11219
11220 new = dtrace_enabling_create(&state->dts_vstate);
11221
11222 /*
11223 * Iterate over all retained enablings, looking for enablings that
11224 * match the specified state.
11225 */
11226 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11227 int i;
11228
11229 /*
11230 * dtvs_state can only be NULL for helper enablings -- and
11231 * helper enablings can't be retained.
11232 */
11233 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11234
11235 if (enab->dten_vstate->dtvs_state != state)
11236 continue;
11237
11238 /*
11239 * Now iterate over each probe description; we're looking for
11240 * an exact match to the specified probe description.
11241 */
11242 for (i = 0; i < enab->dten_ndesc; i++) {
11243 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11244 dtrace_probedesc_t *pd = &ep->dted_probe;
11245
11246 /* APPLE NOTE: Darwin employs size bounded string operation. */
11247 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
11248 continue;
11249
11250 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
11251 continue;
11252
11253 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
11254 continue;
11255
11256 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
11257 continue;
11258
11259 /*
11260 * We have a winning probe! Add it to our growing
11261 * enabling.
11262 */
11263 found = 1;
11264 dtrace_enabling_addlike(new, ep, create);
11265 }
11266 }
11267
11268 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11269 dtrace_enabling_destroy(new);
11270 return (err);
11271 }
11272
11273 return (0);
11274 }
11275
11276 static void
11277 dtrace_enabling_retract(dtrace_state_t *state)
11278 {
11279 dtrace_enabling_t *enab, *next;
11280
11281 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11282
11283 /*
11284 * Iterate over all retained enablings, destroying those retained
11285 * for the specified state.
11286 */
11287 for (enab = dtrace_retained; enab != NULL; enab = next) {
11288 next = enab->dten_next;
11289
11290 /*
11291 * dtvs_state can only be NULL for helper enablings -- and
11292 * helper enablings can't be retained.
11293 */
11294 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11295
11296 if (enab->dten_vstate->dtvs_state == state) {
11297 ASSERT(state->dts_nretained > 0);
11298 dtrace_enabling_destroy(enab);
11299 }
11300 }
11301
11302 ASSERT(state->dts_nretained == 0);
11303 }
11304
11305 static int
11306 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
11307 {
11308 int i = 0;
11309 int total_matched = 0, matched = 0;
11310
11311 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11312 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11313
11314 for (i = 0; i < enab->dten_ndesc; i++) {
11315 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11316
11317 enab->dten_current = ep;
11318 enab->dten_error = 0;
11319
11320 /*
11321 * If a provider failed to enable a probe then get out and
11322 * let the consumer know we failed.
11323 */
11324 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
11325 return (EBUSY);
11326
11327 total_matched += matched;
11328
11329 if (enab->dten_error != 0) {
11330 /*
11331 * If we get an error half-way through enabling the
11332 * probes, we kick out -- perhaps with some number of
11333 * them enabled. Leaving enabled probes enabled may
11334 * be slightly confusing for user-level, but we expect
11335 * that no one will attempt to actually drive on in
11336 * the face of such errors. If this is an anonymous
11337 * enabling (indicated with a NULL nmatched pointer),
11338 * we cmn_err() a message. We aren't expecting to
11339 * get such an error -- to the extent it can exist at all,
11340 * it would be a result of corrupted DOF in the driver
11341 * properties.
11342 */
11343 if (nmatched == NULL) {
11344 cmn_err(CE_WARN, "dtrace_enabling_match() "
11345 "error on %p: %d", (void *)ep,
11346 enab->dten_error);
11347 }
11348
11349 return (enab->dten_error);
11350 }
11351 }
11352
11353 enab->dten_probegen = dtrace_probegen;
11354 if (nmatched != NULL)
11355 *nmatched = total_matched;
11356
11357 return (0);
11358 }
11359
11360 static void
11361 dtrace_enabling_matchall(void)
11362 {
11363 dtrace_enabling_t *enab;
11364
11365 lck_mtx_lock(&cpu_lock);
11366 lck_mtx_lock(&dtrace_lock);
11367
11368 /*
11369 * Iterate over all retained enablings to see if any probes match
11370 * against them. We only perform this operation on enablings for which
11371 * we have sufficient permissions by virtue of being in the global zone
11372 * or in the same zone as the DTrace client. Because we can be called
11373 * after dtrace_detach() has been called, we cannot assert that there
11374 * are retained enablings. We can safely load from dtrace_retained,
11375 * however: the taskq_destroy() at the end of dtrace_detach() will
11376 * block pending our completion.
11377 */
11378
11379 /*
11380 * Darwin doesn't do zones.
11381 * Behave as if always in the "global" zone.
11382 */
11383 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11384 (void) dtrace_enabling_match(enab, NULL);
11385 }
11386
11387 lck_mtx_unlock(&dtrace_lock);
11388 lck_mtx_unlock(&cpu_lock);
11389 }
11390
11391 /*
11392 * If an enabling is to be enabled without having matched probes (that is, if
11393 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
11394 * enabling must be _primed_ by creating an ECB for every ECB description.
11395 * This must be done to assure that we know the number of speculations, the
11396 * number of aggregations, the minimum buffer size needed, etc. before we
11397 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
11398 * enabling any probes, we create ECBs for every ECB description, but with a
11399 * NULL probe -- which is exactly what this function does.
11400 */
11401 static void
11402 dtrace_enabling_prime(dtrace_state_t *state)
11403 {
11404 dtrace_enabling_t *enab;
11405 int i;
11406
11407 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11408 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11409
11410 if (enab->dten_vstate->dtvs_state != state)
11411 continue;
11412
11413 /*
11414 * We don't want to prime an enabling more than once, lest
11415 * we allow a malicious user to induce resource exhaustion.
11416 * (The ECBs that result from priming an enabling aren't
11417 * leaked -- but they also aren't deallocated until the
11418 * consumer state is destroyed.)
11419 */
11420 if (enab->dten_primed)
11421 continue;
11422
11423 for (i = 0; i < enab->dten_ndesc; i++) {
11424 enab->dten_current = enab->dten_desc[i];
11425 (void) dtrace_probe_enable(NULL, enab);
11426 }
11427
11428 enab->dten_primed = 1;
11429 }
11430 }
11431
11432 /*
11433 * Called to indicate that probes should be provided due to retained
11434 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
11435 * must take an initial lap through the enabling calling the dtps_provide()
11436 * entry point explicitly to allow for autocreated probes.
11437 */
11438 static void
11439 dtrace_enabling_provide(dtrace_provider_t *prv)
11440 {
11441 int i, all = 0;
11442 dtrace_probedesc_t desc;
11443 dtrace_genid_t gen;
11444
11445 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11446 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
11447
11448 if (prv == NULL) {
11449 all = 1;
11450 prv = dtrace_provider;
11451 }
11452
11453 do {
11454 dtrace_enabling_t *enab;
11455 void *parg = prv->dtpv_arg;
11456
11457 retry:
11458 gen = dtrace_retained_gen;
11459 for (enab = dtrace_retained; enab != NULL;
11460 enab = enab->dten_next) {
11461 for (i = 0; i < enab->dten_ndesc; i++) {
11462 desc = enab->dten_desc[i]->dted_probe;
11463 lck_mtx_unlock(&dtrace_lock);
11464 prv->dtpv_pops.dtps_provide(parg, &desc);
11465 lck_mtx_lock(&dtrace_lock);
11466 /*
11467 * Process the retained enablings again if
11468 * they have changed while we weren't holding
11469 * dtrace_lock.
11470 */
11471 if (gen != dtrace_retained_gen)
11472 goto retry;
11473 }
11474 }
11475 } while (all && (prv = prv->dtpv_next) != NULL);
11476
11477 lck_mtx_unlock(&dtrace_lock);
11478 dtrace_probe_provide(NULL, all ? NULL : prv);
11479 lck_mtx_lock(&dtrace_lock);
11480 }
11481
11482 /*
11483 * DTrace DOF Functions
11484 */
11485 /*ARGSUSED*/
11486 static void
11487 dtrace_dof_error(dof_hdr_t *dof, const char *str)
11488 {
11489 #pragma unused(dof) /* __APPLE__ */
11490 if (dtrace_err_verbose)
11491 cmn_err(CE_WARN, "failed to process DOF: %s", str);
11492
11493 #ifdef DTRACE_ERRDEBUG
11494 dtrace_errdebug(str);
11495 #endif
11496 }
11497
11498 /*
11499 * Create DOF out of a currently enabled state. Right now, we only create
11500 * DOF containing the run-time options -- but this could be expanded to create
11501 * complete DOF representing the enabled state.
11502 */
11503 static dof_hdr_t *
11504 dtrace_dof_create(dtrace_state_t *state)
11505 {
11506 dof_hdr_t *dof;
11507 dof_sec_t *sec;
11508 dof_optdesc_t *opt;
11509 int i, len = sizeof (dof_hdr_t) +
11510 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
11511 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11512
11513 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11514
11515 dof = dt_kmem_zalloc_aligned(len, 8, KM_SLEEP);
11516 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
11517 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
11518 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
11519 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
11520
11521 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
11522 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
11523 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
11524 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
11525 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
11526 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
11527
11528 dof->dofh_flags = 0;
11529 dof->dofh_hdrsize = sizeof (dof_hdr_t);
11530 dof->dofh_secsize = sizeof (dof_sec_t);
11531 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
11532 dof->dofh_secoff = sizeof (dof_hdr_t);
11533 dof->dofh_loadsz = len;
11534 dof->dofh_filesz = len;
11535 dof->dofh_pad = 0;
11536
11537 /*
11538 * Fill in the option section header...
11539 */
11540 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
11541 sec->dofs_type = DOF_SECT_OPTDESC;
11542 sec->dofs_align = sizeof (uint64_t);
11543 sec->dofs_flags = DOF_SECF_LOAD;
11544 sec->dofs_entsize = sizeof (dof_optdesc_t);
11545
11546 opt = (dof_optdesc_t *)((uintptr_t)sec +
11547 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
11548
11549 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
11550 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
11551
11552 for (i = 0; i < DTRACEOPT_MAX; i++) {
11553 opt[i].dofo_option = i;
11554 opt[i].dofo_strtab = DOF_SECIDX_NONE;
11555 opt[i].dofo_value = state->dts_options[i];
11556 }
11557
11558 return (dof);
11559 }
11560
11561 static dof_hdr_t *
11562 dtrace_dof_copyin(user_addr_t uarg, int *errp)
11563 {
11564 dof_hdr_t hdr, *dof;
11565
11566 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
11567
11568 /*
11569 * First, we're going to copyin() the sizeof (dof_hdr_t).
11570 */
11571 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
11572 dtrace_dof_error(NULL, "failed to copyin DOF header");
11573 *errp = EFAULT;
11574 return (NULL);
11575 }
11576
11577 /*
11578 * Now we'll allocate the entire DOF and copy it in -- provided
11579 * that the length isn't outrageous.
11580 */
11581 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
11582 dtrace_dof_error(&hdr, "load size exceeds maximum");
11583 *errp = E2BIG;
11584 return (NULL);
11585 }
11586
11587 if (hdr.dofh_loadsz < sizeof (hdr)) {
11588 dtrace_dof_error(&hdr, "invalid load size");
11589 *errp = EINVAL;
11590 return (NULL);
11591 }
11592
11593 dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
11594
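/*
 * Note that dofh_loadsz is checked again after the full copyin(): the
 * user buffer is read twice, so a racing change to the header between
 * the two copies could otherwise leave the in-kernel copy claiming a
 * size other than the one we allocated.
 */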
11595 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
11596 dof->dofh_loadsz != hdr.dofh_loadsz) {
11597 dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
11598 *errp = EFAULT;
11599 return (NULL);
11600 }
11601
11602 return (dof);
11603 }
11604
11605 static dof_hdr_t *
11606 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
11607 {
11608 dof_hdr_t hdr, *dof;
11609
11610 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
11611
11612 /*
11613 * First, we're going to copyin() the sizeof (dof_hdr_t).
11614 */
11615 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
11616 dtrace_dof_error(NULL, "failed to copyin DOF header");
11617 *errp = EFAULT;
11618 return (NULL);
11619 }
11620
11621 /*
11622 * Now we'll allocate the entire DOF and copy it in -- provided
11623 * that the length isn't outrageous.
11624 */
11625 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
11626 dtrace_dof_error(&hdr, "load size exceeds maximum");
11627 *errp = E2BIG;
11628 return (NULL);
11629 }
11630
11631 if (hdr.dofh_loadsz < sizeof (hdr)) {
11632 dtrace_dof_error(&hdr, "invalid load size");
11633 *errp = EINVAL;
11634 return (NULL);
11635 }
11636
11637 dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
11638
11639 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
11640 dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
11641 *errp = EFAULT;
11642 return (NULL);
11643 }
11644
11645 return (dof);
11646 }
11647
11648 static dof_hdr_t *
11649 dtrace_dof_property(const char *name)
11650 {
11651 uchar_t *buf;
11652 uint64_t loadsz;
11653 unsigned int len, i;
11654 dof_hdr_t *dof;
11655
11656 /*
11657 * Unfortunately, arrays of values in .conf files are always (and
11658 * only) interpreted to be integer arrays. We must read our DOF
11659 * as an integer array, and then squeeze it into a byte array.
11660 */
11661 if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
11662 name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
11663 return (NULL);
11664
11665 for (i = 0; i < len; i++)
11666 buf[i] = (uchar_t)(((int *)buf)[i]);
11667
11668 if (len < sizeof (dof_hdr_t)) {
11669 ddi_prop_free(buf);
11670 dtrace_dof_error(NULL, "truncated header");
11671 return (NULL);
11672 }
11673
11674 if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
11675 ddi_prop_free(buf);
11676 dtrace_dof_error(NULL, "truncated DOF");
11677 return (NULL);
11678 }
11679
11680 if (loadsz >= (uint64_t)dtrace_dof_maxsize) {
11681 ddi_prop_free(buf);
11682 dtrace_dof_error(NULL, "oversized DOF");
11683 return (NULL);
11684 }
11685
11686 dof = dt_kmem_alloc_aligned(loadsz, 8, KM_SLEEP);
11687 bcopy(buf, dof, loadsz);
11688 ddi_prop_free(buf);
11689
11690 return (dof);
11691 }
11692
11693 static void
11694 dtrace_dof_destroy(dof_hdr_t *dof)
11695 {
11696 dt_kmem_free_aligned(dof, dof->dofh_loadsz);
11697 }
11698
11699 /*
11700 * Return the dof_sec_t pointer corresponding to a given section index. If the
11701 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
11702 * a type other than DOF_SECT_NONE is specified, the header is checked against
11703 * this type and NULL is returned if the types do not match.
11704 */
11705 static dof_sec_t *
11706 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
11707 {
11708 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
11709 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
11710
11711 if (i >= dof->dofh_secnum) {
11712 dtrace_dof_error(dof, "referenced section index is invalid");
11713 return (NULL);
11714 }
11715
11716 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
11717 dtrace_dof_error(dof, "referenced section is not loadable");
11718 return (NULL);
11719 }
11720
11721 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
11722 dtrace_dof_error(dof, "referenced section is the wrong type");
11723 return (NULL);
11724 }
11725
11726 return (sec);
11727 }
11728
11729 static dtrace_probedesc_t *
11730 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
11731 {
11732 dof_probedesc_t *probe;
11733 dof_sec_t *strtab;
11734 uintptr_t daddr = (uintptr_t)dof;
11735 uintptr_t str;
11736 size_t size;
11737
11738 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
11739 dtrace_dof_error(dof, "invalid probe section");
11740 return (NULL);
11741 }
11742
11743 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11744 dtrace_dof_error(dof, "bad alignment in probe description");
11745 return (NULL);
11746 }
11747
11748 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
11749 dtrace_dof_error(dof, "truncated probe description");
11750 return (NULL);
11751 }
11752
11753 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
11754 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
11755
11756 if (strtab == NULL)
11757 return (NULL);
11758
11759 str = daddr + strtab->dofs_offset;
11760 size = strtab->dofs_size;
11761
11762 if (probe->dofp_provider >= strtab->dofs_size) {
11763 dtrace_dof_error(dof, "corrupt probe provider");
11764 return (NULL);
11765 }
11766
11767 (void) strncpy(desc->dtpd_provider,
11768 (char *)(str + probe->dofp_provider),
11769 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
11770
11771 /* APPLE NOTE: Darwin employs size bounded string operation. */
11772 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
11773
11774 if (probe->dofp_mod >= strtab->dofs_size) {
11775 dtrace_dof_error(dof, "corrupt probe module");
11776 return (NULL);
11777 }
11778
11779 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
11780 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
11781
11782 /* APPLE NOTE: Darwin employs size bounded string operation. */
11783 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
11784
11785 if (probe->dofp_func >= strtab->dofs_size) {
11786 dtrace_dof_error(dof, "corrupt probe function");
11787 return (NULL);
11788 }
11789
11790 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
11791 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
11792
11793 /* APPLE NOTE: Darwin employs size bounded string operation. */
11794 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
11795
11796 if (probe->dofp_name >= strtab->dofs_size) {
11797 dtrace_dof_error(dof, "corrupt probe name");
11798 return (NULL);
11799 }
11800
11801 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
11802 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
11803
11804 /* APPLE NOTE: Darwin employs size bounded string operation. */
11805 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
11806
11807 return (desc);
11808 }
11809
11810 static dtrace_difo_t *
11811 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11812 cred_t *cr)
11813 {
11814 dtrace_difo_t *dp;
11815 size_t ttl = 0;
11816 dof_difohdr_t *dofd;
11817 uintptr_t daddr = (uintptr_t)dof;
11818 size_t max_size = dtrace_difo_maxsize;
11819 uint_t i;
11820 int l, n;
11821
11822
11823 static const struct {
11824 int section;
11825 int bufoffs;
11826 int lenoffs;
11827 int entsize;
11828 int align;
11829 const char *msg;
11830 } difo[] = {
11831 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
11832 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
11833 sizeof (dif_instr_t), "multiple DIF sections" },
11834
11835 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
11836 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
11837 sizeof (uint64_t), "multiple integer tables" },
11838
11839 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
11840 offsetof(dtrace_difo_t, dtdo_strlen), 0,
11841 sizeof (char), "multiple string tables" },
11842
11843 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
11844 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
11845 sizeof (uint_t), "multiple variable tables" },
11846
11847 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
11848 };
11849
11850 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
11851 dtrace_dof_error(dof, "invalid DIFO header section");
11852 return (NULL);
11853 }
11854
11855 if (sec->dofs_align != sizeof (dof_secidx_t)) {
11856 dtrace_dof_error(dof, "bad alignment in DIFO header");
11857 return (NULL);
11858 }
11859
11860 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
11861 sec->dofs_size % sizeof (dof_secidx_t)) {
11862 dtrace_dof_error(dof, "bad size in DIFO header");
11863 return (NULL);
11864 }
11865
11866 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
11867 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
11868
11869 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
11870 dp->dtdo_rtype = dofd->dofd_rtype;
11871
11872 for (l = 0; l < n; l++) {
11873 dof_sec_t *subsec;
11874 void **bufp;
11875 uint32_t *lenp;
11876
11877 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
11878 dofd->dofd_links[l])) == NULL)
11879 goto err; /* invalid section link */
11880
11881 if (ttl + subsec->dofs_size > max_size) {
11882 dtrace_dof_error(dof, "exceeds maximum size");
11883 goto err;
11884 }
11885
11886 ttl += subsec->dofs_size;
11887
11888 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
11889
11890 if (subsec->dofs_type != (uint32_t)difo[i].section)
11891 continue;
11892
11893 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
11894 dtrace_dof_error(dof, "section not loaded");
11895 goto err;
11896 }
11897
11898 if (subsec->dofs_align != (uint32_t)difo[i].align) {
11899 dtrace_dof_error(dof, "bad alignment");
11900 goto err;
11901 }
11902
11903 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
11904 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
11905
11906 if (*bufp != NULL) {
11907 dtrace_dof_error(dof, difo[i].msg);
11908 goto err;
11909 }
11910
11911 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
11912 dtrace_dof_error(dof, "entry size mismatch");
11913 goto err;
11914 }
11915
11916 if (subsec->dofs_entsize != 0 &&
11917 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
11918 dtrace_dof_error(dof, "corrupt entry size");
11919 goto err;
11920 }
11921
11922 *lenp = subsec->dofs_size;
11923 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
11924 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
11925 *bufp, subsec->dofs_size);
11926
11927 if (subsec->dofs_entsize != 0)
11928 *lenp /= subsec->dofs_entsize;
11929
11930 break;
11931 }
11932
11933 /*
11934 * If we encounter a loadable DIFO sub-section that is not
11935 * known to us, assume this is a broken program and fail.
11936 */
11937 if (difo[i].section == DOF_SECT_NONE &&
11938 (subsec->dofs_flags & DOF_SECF_LOAD)) {
11939 dtrace_dof_error(dof, "unrecognized DIFO subsection");
11940 goto err;
11941 }
11942 }
11943
11944 if (dp->dtdo_buf == NULL) {
11945 /*
11946 * We can't have a DIF object without DIF text.
11947 */
11948 dtrace_dof_error(dof, "missing DIF text");
11949 goto err;
11950 }
11951
11952 /*
11953 * Before we validate the DIF object, run through the variable table
11954 * looking for the strings -- if any of their sizes are zero, we'll set
11955 * their size to be the system-wide default string size. Note that
11956 * this should _not_ happen if the "strsize" option has been set --
11957 * in this case, the compiler should have set the size to reflect the
11958 * setting of the option.
11959 */
11960 for (i = 0; i < dp->dtdo_varlen; i++) {
11961 dtrace_difv_t *v = &dp->dtdo_vartab[i];
11962 dtrace_diftype_t *t = &v->dtdv_type;
11963
11964 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
11965 continue;
11966
11967 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
11968 t->dtdt_size = dtrace_strsize_default;
11969 }
11970
11971 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
11972 goto err;
11973
11974 dtrace_difo_init(dp, vstate);
11975 return (dp);
11976
11977 err:
11978 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
11979 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
11980 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
11981 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
11982
11983 kmem_free(dp, sizeof (dtrace_difo_t));
11984 return (NULL);
11985 }
11986
11987 static dtrace_predicate_t *
11988 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
11989 cred_t *cr)
11990 {
11991 dtrace_difo_t *dp;
11992
11993 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
11994 return (NULL);
11995
11996 return (dtrace_predicate_create(dp));
11997 }
11998
11999 static dtrace_actdesc_t *
12000 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12001 cred_t *cr)
12002 {
12003 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12004 dof_actdesc_t *desc;
12005 dof_sec_t *difosec;
12006 size_t offs;
12007 uintptr_t daddr = (uintptr_t)dof;
12008 uint64_t arg;
12009 dtrace_actkind_t kind;
12010
12011 if (sec->dofs_type != DOF_SECT_ACTDESC) {
12012 dtrace_dof_error(dof, "invalid action section");
12013 return (NULL);
12014 }
12015
12016 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12017 dtrace_dof_error(dof, "truncated action description");
12018 return (NULL);
12019 }
12020
12021 if (sec->dofs_align != sizeof (uint64_t)) {
12022 dtrace_dof_error(dof, "bad alignment in action description");
12023 return (NULL);
12024 }
12025
12026 if (sec->dofs_size < sec->dofs_entsize) {
12027 dtrace_dof_error(dof, "section entry size exceeds total size");
12028 return (NULL);
12029 }
12030
12031 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12032 dtrace_dof_error(dof, "bad entry size in action description");
12033 return (NULL);
12034 }
12035
12036 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12037 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12038 return (NULL);
12039 }
12040
12041 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12042 desc = (dof_actdesc_t *)(daddr +
12043 (uintptr_t)sec->dofs_offset + offs);
12044 kind = (dtrace_actkind_t)desc->dofa_kind;
12045
12046 if (DTRACEACT_ISPRINTFLIKE(kind) &&
12047 (kind != DTRACEACT_PRINTA ||
12048 desc->dofa_strtab != DOF_SECIDX_NONE)) {
12049 dof_sec_t *strtab;
12050 char *str, *fmt;
12051 uint64_t i;
12052
12053 /*
12054 * printf()-like actions must have a format string.
12055 */
12056 if ((strtab = dtrace_dof_sect(dof,
12057 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12058 goto err;
12059
12060 str = (char *)((uintptr_t)dof +
12061 (uintptr_t)strtab->dofs_offset);
12062
12063 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12064 if (str[i] == '\0')
12065 break;
12066 }
12067
12068 if (i >= strtab->dofs_size) {
12069 dtrace_dof_error(dof, "bogus format string");
12070 goto err;
12071 }
12072
12073 if (i == desc->dofa_arg) {
12074 dtrace_dof_error(dof, "empty format string");
12075 goto err;
12076 }
12077
12078 i -= desc->dofa_arg;
12079 fmt = kmem_alloc(i + 1, KM_SLEEP);
12080 bcopy(&str[desc->dofa_arg], fmt, i + 1);
12081 arg = (uint64_t)(uintptr_t)fmt;
12082 } else {
12083 if (kind == DTRACEACT_PRINTA) {
12084 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12085 arg = 0;
12086 } else {
12087 arg = desc->dofa_arg;
12088 }
12089 }
12090
12091 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12092 desc->dofa_uarg, arg);
12093
12094 if (last != NULL) {
12095 last->dtad_next = act;
12096 } else {
12097 first = act;
12098 }
12099
12100 last = act;
12101
12102 if (desc->dofa_difo == DOF_SECIDX_NONE)
12103 continue;
12104
12105 if ((difosec = dtrace_dof_sect(dof,
12106 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12107 goto err;
12108
12109 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12110
12111 if (act->dtad_difo == NULL)
12112 goto err;
12113 }
12114
12115 ASSERT(first != NULL);
12116 return (first);
12117
12118 err:
12119 for (act = first; act != NULL; act = next) {
12120 next = act->dtad_next;
12121 dtrace_actdesc_release(act, vstate);
12122 }
12123
12124 return (NULL);
12125 }
12126
12127 static dtrace_ecbdesc_t *
12128 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12129 cred_t *cr)
12130 {
12131 dtrace_ecbdesc_t *ep;
12132 dof_ecbdesc_t *ecb;
12133 dtrace_probedesc_t *desc;
12134 dtrace_predicate_t *pred = NULL;
12135
12136 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12137 dtrace_dof_error(dof, "truncated ECB description");
12138 return (NULL);
12139 }
12140
12141 if (sec->dofs_align != sizeof (uint64_t)) {
12142 dtrace_dof_error(dof, "bad alignment in ECB description");
12143 return (NULL);
12144 }
12145
12146 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12147 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12148
12149 if (sec == NULL)
12150 return (NULL);
12151
12152 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12153 ep->dted_uarg = ecb->dofe_uarg;
12154 desc = &ep->dted_probe;
12155
12156 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12157 goto err;
12158
12159 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12160 if ((sec = dtrace_dof_sect(dof,
12161 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12162 goto err;
12163
12164 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12165 goto err;
12166
12167 ep->dted_pred.dtpdd_predicate = pred;
12168 }
12169
12170 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12171 if ((sec = dtrace_dof_sect(dof,
12172 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12173 goto err;
12174
12175 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12176
12177 if (ep->dted_action == NULL)
12178 goto err;
12179 }
12180
12181 return (ep);
12182
12183 err:
12184 if (pred != NULL)
12185 dtrace_predicate_release(pred, vstate);
12186 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12187 return (NULL);
12188 }
12189
12190 /*
12191 * APPLE NOTE: dyld handles dof relocation.
12192 * Darwin does not need dtrace_dof_relocate()
12193 */
12194
12195 /*
12196 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12197 * header: it should be at the front of a memory region that is at least
12198 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12199 * size. It need not be validated in any other way.
12200 */
12201 static int
12202 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12203 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12204 {
12205 #pragma unused(ubase) /* __APPLE__ */
12206 uint64_t len = dof->dofh_loadsz, seclen;
12207 uintptr_t daddr = (uintptr_t)dof;
12208 dtrace_ecbdesc_t *ep;
12209 dtrace_enabling_t *enab;
12210 uint_t i;
12211
12212 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12213 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12214
12215 /*
12216 * Check the DOF header identification bytes. In addition to checking
12217 * valid settings, we also verify that unused bits/bytes are zeroed so
12218 * we can use them later without fear of regressing existing binaries.
12219 */
12220 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12221 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12222 dtrace_dof_error(dof, "DOF magic string mismatch");
12223 return (-1);
12224 }
12225
12226 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12227 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12228 dtrace_dof_error(dof, "DOF has invalid data model");
12229 return (-1);
12230 }
12231
12232 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12233 dtrace_dof_error(dof, "DOF encoding mismatch");
12234 return (-1);
12235 }
12236
12237 /*
12238 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
12239 */
12240 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
12241 dtrace_dof_error(dof, "DOF version mismatch");
12242 return (-1);
12243 }
12244
12245 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12246 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12247 return (-1);
12248 }
12249
12250 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12251 dtrace_dof_error(dof, "DOF uses too many integer registers");
12252 return (-1);
12253 }
12254
12255 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12256 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12257 return (-1);
12258 }
12259
12260 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12261 if (dof->dofh_ident[i] != 0) {
12262 dtrace_dof_error(dof, "DOF has invalid ident byte set");
12263 return (-1);
12264 }
12265 }
12266
12267 if (dof->dofh_flags & ~DOF_FL_VALID) {
12268 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12269 return (-1);
12270 }
12271
12272 if (dof->dofh_secsize == 0) {
12273 dtrace_dof_error(dof, "zero section header size");
12274 return (-1);
12275 }
12276
12277 /*
12278 * Check that the section headers don't exceed the amount of DOF
12279 * data. Note that we cast the section size and number of sections
12280 * to uint64_t's to prevent possible overflow in the multiplication.
12281 */
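/*
 * Worked example (editorial, hypothetical values): had the product been
 * computed in 32 bits, dofh_secnum = 0x01000000 and dofh_secsize = 0x100
 * would wrap to 0 and slip past the bounds check; as 64-bit values the
 * product is 0x100000000, which the "seclen > len" test below rejects
 * for any loadsz that survived the earlier maximum-size checks.
 */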
12282 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12283
12284 if (dof->dofh_secoff > len || seclen > len ||
12285 dof->dofh_secoff + seclen > len) {
12286 dtrace_dof_error(dof, "truncated section headers");
12287 return (-1);
12288 }
12289
12290 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12291 dtrace_dof_error(dof, "misaligned section headers");
12292 return (-1);
12293 }
12294
12295 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12296 dtrace_dof_error(dof, "misaligned section size");
12297 return (-1);
12298 }
12299
12300 /*
12301 * Take an initial pass through the section headers to be sure that
12302 * the headers don't have stray offsets. If the 'noprobes' flag is
12303 * set, do not permit sections relating to providers, probes, or args.
12304 */
12305 for (i = 0; i < dof->dofh_secnum; i++) {
12306 dof_sec_t *sec = (dof_sec_t *)(daddr +
12307 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12308
12309 if (noprobes) {
12310 switch (sec->dofs_type) {
12311 case DOF_SECT_PROVIDER:
12312 case DOF_SECT_PROBES:
12313 case DOF_SECT_PRARGS:
12314 case DOF_SECT_PROFFS:
12315 dtrace_dof_error(dof, "illegal sections "
12316 "for enabling");
12317 return (-1);
12318 }
12319 }
12320
12321 if (!(sec->dofs_flags & DOF_SECF_LOAD))
12322 continue; /* just ignore non-loadable sections */
12323
12324 if (sec->dofs_align & (sec->dofs_align - 1)) {
12325 dtrace_dof_error(dof, "bad section alignment");
12326 return (-1);
12327 }
12328
12329 if (sec->dofs_offset & (sec->dofs_align - 1)) {
12330 dtrace_dof_error(dof, "misaligned section");
12331 return (-1);
12332 }
12333
12334 if (sec->dofs_offset > len || sec->dofs_size > len ||
12335 sec->dofs_offset + sec->dofs_size > len) {
12336 dtrace_dof_error(dof, "corrupt section header");
12337 return (-1);
12338 }
12339
12340 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
12341 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
12342 dtrace_dof_error(dof, "non-terminating string table");
12343 return (-1);
12344 }
12345 }
12346
12347 /*
12348 * APPLE NOTE: We have no further relocation to perform.
12349 * All dof values are relative offsets.
12350 */
12351
12352 if ((enab = *enabp) == NULL)
12353 enab = *enabp = dtrace_enabling_create(vstate);
12354
12355 for (i = 0; i < dof->dofh_secnum; i++) {
12356 dof_sec_t *sec = (dof_sec_t *)(daddr +
12357 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12358
12359 if (sec->dofs_type != DOF_SECT_ECBDESC)
12360 continue;
12361
12362 /*
12363 * APPLE NOTE: Defend against gcc 4.0 botch on x86.
12364 * Not all paths out of the inlined dtrace_dof_ecbdesc()
12365 * are checked for the NULL return value.
12366 * Check for NULL explicitly here.
12367 */
12368 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
12369 if (ep == NULL) {
12370 dtrace_enabling_destroy(enab);
12371 *enabp = NULL;
12372 return (-1);
12373 }
12374
12375 dtrace_enabling_add(enab, ep);
12376 }
12377
12378 return (0);
12379 }
12380
12381 /*
12382 * Process DOF for any options. This routine assumes that the DOF has been
12383 * at least processed by dtrace_dof_slurp().
12384 */
12385 static int
12386 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
12387 {
12388 uint_t i;
12389 int rval;
12390 uint32_t entsize;
12391 size_t offs;
12392 dof_optdesc_t *desc;
12393
12394 for (i = 0; i < dof->dofh_secnum; i++) {
12395 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
12396 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
12397
12398 if (sec->dofs_type != DOF_SECT_OPTDESC)
12399 continue;
12400
12401 if (sec->dofs_align != sizeof (uint64_t)) {
12402 dtrace_dof_error(dof, "bad alignment in "
12403 "option description");
12404 return (EINVAL);
12405 }
12406
12407 if ((entsize = sec->dofs_entsize) == 0) {
12408 dtrace_dof_error(dof, "zeroed option entry size");
12409 return (EINVAL);
12410 }
12411
12412 if (entsize < sizeof (dof_optdesc_t)) {
12413 dtrace_dof_error(dof, "bad option entry size");
12414 return (EINVAL);
12415 }
12416
12417 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
12418 desc = (dof_optdesc_t *)((uintptr_t)dof +
12419 (uintptr_t)sec->dofs_offset + offs);
12420
12421 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
12422 dtrace_dof_error(dof, "non-zero option string");
12423 return (EINVAL);
12424 }
12425
12426 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
12427 dtrace_dof_error(dof, "unset option");
12428 return (EINVAL);
12429 }
12430
12431 if ((rval = dtrace_state_option(state,
12432 desc->dofo_option, desc->dofo_value)) != 0) {
12433 dtrace_dof_error(dof, "rejected option");
12434 return (rval);
12435 }
12436 }
12437 }
12438
12439 return (0);
12440 }
12441
12442 /*
12443 * DTrace Consumer State Functions
12444 */
12445 static int
12446 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
12447 {
12448 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
12449 void *base;
12450 uintptr_t limit;
12451 dtrace_dynvar_t *dvar, *next, *start;
12452 size_t i;
12453
12454 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12455 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
12456
12457 bzero(dstate, sizeof (dtrace_dstate_t));
12458
12459 if ((dstate->dtds_chunksize = chunksize) == 0)
12460 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
12461
12462 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
12463 size = min_size;
12464
12465 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12466 return (ENOMEM);
12467
12468 dstate->dtds_size = size;
12469 dstate->dtds_base = base;
12470 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
12471 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
12472
12473 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
12474
12475 if (hashsize != 1 && (hashsize & 1))
12476 hashsize--;
12477
12478 dstate->dtds_hashsize = hashsize;
12479 dstate->dtds_hash = dstate->dtds_base;
12480
12481 /*
12482 * Set all of our hash buckets to point to the single sink, and (if
12483 * it hasn't already been set), set the sink's hash value to be the
12484 * sink sentinel value. The sink is needed for dynamic variable
12485 * lookups to know that they have iterated over an entire, valid hash
12486 * chain.
12487 */
12488 for (i = 0; i < hashsize; i++)
12489 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
12490
12491 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
12492 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
12493
12494 /*
12495 * Determine number of active CPUs. Divide free list evenly among
12496 * active CPUs.
12497 */
12498 start = (dtrace_dynvar_t *)
12499 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
12500 limit = (uintptr_t)base + size;
12501
12502 maxper = (limit - (uintptr_t)start) / (int)NCPU;
12503 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
12504
12505 for (i = 0; i < NCPU; i++) {
12506 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
12507
12508 /*
12509 * If we don't even have enough chunks to make it once through
12510 * NCPUs, we're just going to allocate everything to the first
12511 * CPU. And if we're on the last CPU, we're going to allocate
12512 * whatever is left over. In either case, we set the limit to
12513 * be the limit of the dynamic variable space.
12514 */
12515 if (maxper == 0 || i == NCPU - 1) {
12516 limit = (uintptr_t)base + size;
12517 start = NULL;
12518 } else {
12519 limit = (uintptr_t)start + maxper;
12520 start = (dtrace_dynvar_t *)limit;
12521 }
12522
12523 ASSERT(limit <= (uintptr_t)base + size);
12524
12525 for (;;) {
12526 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
12527 dstate->dtds_chunksize);
12528
12529 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
12530 break;
12531
12532 dvar->dtdv_next = next;
12533 dvar = next;
12534 }
12535
12536 if (maxper == 0)
12537 break;
12538 }
12539
12540 return (0);
12541 }
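/*
 * Illustrative sketch (editorial, not part of the original source): the
 * layout arithmetic performed by dtrace_dstate_init() above, worked with
 * small hypothetical numbers.  The 16-byte dtrace_dynhash_t, the 256-byte
 * chunk and the 4-CPU count are assumptions made purely for the example;
 * the real values come from the consumer's options and from NCPU.
 */
#if 0
static void
dtrace_dstate_layout_example(void)
{
size_t size = 65536; /* hypothetical dynamic variable space */
size_t chunksize = 256; /* hypothetical chunk size */
size_t hashent = 16; /* assumed sizeof (dtrace_dynhash_t) */
size_t ncpu = 4; /* hypothetical CPU count */

size_t hashsize = size / (chunksize + hashent); /* 240 buckets */
size_t dynspace = size - hashsize * hashent; /* 61696 bytes of chunks */
size_t maxper = ((dynspace / ncpu) / chunksize) * chunksize; /* 15360 */

/*
 * The hash table sits at the front of the region; each CPU's free list
 * is then carved from the remainder in maxper-byte slices of whole
 * chunks, with the last CPU absorbing whatever is left over -- the same
 * walk the loop in dtrace_dstate_init() performs.
 */
(void) maxper;
}
#endif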
12542
12543 static void
12544 dtrace_dstate_fini(dtrace_dstate_t *dstate)
12545 {
12546 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12547
12548 if (dstate->dtds_base == NULL)
12549 return;
12550
12551 kmem_free(dstate->dtds_base, dstate->dtds_size);
12552 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
12553 }
12554
12555 static void
12556 dtrace_vstate_fini(dtrace_vstate_t *vstate)
12557 {
12558 /*
12559 * Logical XOR, where are you?
12560 */
12561 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
12562
12563 if (vstate->dtvs_nglobals > 0) {
12564 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
12565 sizeof (dtrace_statvar_t *));
12566 }
12567
12568 if (vstate->dtvs_ntlocals > 0) {
12569 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
12570 sizeof (dtrace_difv_t));
12571 }
12572
12573 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
12574
12575 if (vstate->dtvs_nlocals > 0) {
12576 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
12577 sizeof (dtrace_statvar_t *));
12578 }
12579 }
12580
12581 static void
12582 dtrace_state_clean(dtrace_state_t *state)
12583 {
12584 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
12585 return;
12586
12587 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
12588 dtrace_speculation_clean(state);
12589 }
12590
12591 static void
12592 dtrace_state_deadman(dtrace_state_t *state)
12593 {
12594 hrtime_t now;
12595
12596 dtrace_sync();
12597
12598 now = dtrace_gethrtime();
12599
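/*
 * If a (non-anonymous) consumer has not reported status within
 * dtrace_deadman_user, skip the dts_alive refresh below; the stale
 * timestamp lets the deadman machinery elsewhere eventually conclude
 * that the consumer is gone.
 */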
12600 if (state != dtrace_anon.dta_state &&
12601 now - state->dts_laststatus >= dtrace_deadman_user)
12602 return;
12603
12604 /*
12605 * We must be sure that dts_alive never appears to be less than the
12606 * value upon entry to dtrace_state_deadman(), and because we lack a
12607 * dtrace_cas64(), we cannot store to it atomically. We thus instead
12608 * store INT64_MAX to it, followed by a memory barrier, followed by
12609 * the new value. This assures that dts_alive never appears to be
12610 * less than its true value, regardless of the order in which the
12611 * stores to the underlying storage are issued.
12612 */
12613 state->dts_alive = INT64_MAX;
12614 dtrace_membar_producer();
12615 state->dts_alive = now;
12616 }
12617
12618 static int
12619 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
12620 {
12621 minor_t minor;
12622 major_t major;
12623 char c[30];
12624 dtrace_state_t *state;
12625 dtrace_optval_t *opt;
12626 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
12627
12628 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12629 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12630
12631 /* Cause restart */
12632 *new_state = NULL;
12633
12634 /*
12635 * Darwin's DEVFS layer acquired the minor number for this "device" when it called
12636 * dtrace_devfs_clone_func(). At that time, dtrace_devfs_clone_func() proposed a minor number
12637 * (next unused according to vmem_alloc()) and then immediately put the number back in play
12638 * (by calling vmem_free()). Now that minor number is being used for an open, so we commit it
12639 * to use. The following vmem_alloc() must deliver that same minor number. FIXME.
12640 */
12641
12642 minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
12643 VM_BESTFIT | VM_SLEEP);
12644
12645 if (NULL != devp) {
12646 ASSERT(getminor(*devp) == minor);
12647 if (getminor(*devp) != minor) {
12648 printf("dtrace_open: couldn't re-acquire vended minor number %d. Instead got %d\n",
12649 getminor(*devp), minor);
12650 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12651 return (ERESTART); /* can't reacquire */
12652 }
12653 } else {
12654 /* NULL==devp iff "Anonymous state" (see dtrace_anon_property),
12655 * so just vend the minor device number here de novo since no "open" has occurred. */
12656 }
12657
12658 if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
12659 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
12660 return (EAGAIN); /* temporary resource shortage */
12661 }
12662
12663 state = ddi_get_soft_state(dtrace_softstate, minor);
12664 state->dts_epid = DTRACE_EPIDNONE + 1;
12665
12666 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
12667 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
12668 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
12669
12670 if (devp != NULL) {
12671 major = getemajor(*devp);
12672 } else {
12673 major = ddi_driver_major(dtrace_devi);
12674 }
12675
12676 state->dts_dev = makedevice(major, minor);
12677
12678 if (devp != NULL)
12679 *devp = state->dts_dev;
12680
12681 /*
12682 * We allocate NCPU buffers. On the one hand, this can be quite
12683 * a bit of memory per instance (nearly 36K on a Starcat). On the
12684 * other hand, it saves an additional memory reference in the probe
12685 * path.
12686 */
12687 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
12688 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
12689 state->dts_cleaner = CYCLIC_NONE;
12690 state->dts_deadman = CYCLIC_NONE;
12691 state->dts_vstate.dtvs_state = state;
12692
12693 for (i = 0; i < DTRACEOPT_MAX; i++)
12694 state->dts_options[i] = DTRACEOPT_UNSET;
12695
12696 /*
12697 * Set the default options.
12698 */
12699 opt = state->dts_options;
12700 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
12701 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
12702 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
12703 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
12704 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
12705 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
12706 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
12707 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
12708 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
12709 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
12710 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
12711 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
12712 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
12713 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
12714
12715 state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
12716
12717 /*
12718 * Depending on the user credentials, we set flag bits which alter probe
12719 * visibility or the amount of destructiveness allowed. In the case of
12720 * actual anonymous tracing, or the possession of all privileges, all of
12721 * the normal checks are bypassed.
12722 */
12723 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
12724 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
12725 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
12726 } else {
12727 /*
12728 * Set up the credentials for this instantiation. We take a
12729 * hold on the credential to prevent it from disappearing on
12730 * us; this in turn prevents the zone_t referenced by this
12731 * credential from disappearing. This means that we can
12732 * examine the credential and the zone from probe context.
12733 */
12734 crhold(cr);
12735 state->dts_cred.dcr_cred = cr;
12736
12737 /*
12738 * CRA_PROC means "we have *some* privilege for dtrace" and
12739 * unlocks the use of variables like pid, zonename, etc.
12740 */
12741 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
12742 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12743 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
12744 }
12745
12746 /*
12747 * dtrace_user allows use of syscall and profile providers.
12748 * If the user also has proc_owner and/or proc_zone, we
12749 * extend the scope to include additional visibility and
12750 * destructive power.
12751 */
12752 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
12753 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
12754 state->dts_cred.dcr_visible |=
12755 DTRACE_CRV_ALLPROC;
12756
12757 state->dts_cred.dcr_action |=
12758 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12759 }
12760
12761 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
12762 state->dts_cred.dcr_visible |=
12763 DTRACE_CRV_ALLZONE;
12764
12765 state->dts_cred.dcr_action |=
12766 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12767 }
12768
12769 /*
12770 * If we have all privs in whatever zone this is,
12771 * we can do destructive things to processes which
12772 * have altered credentials.
12773 *
12774 * APPLE NOTE: Darwin doesn't do zones.
12775 * Behave as if zone always has destructive privs.
12776 */
12777
12778 state->dts_cred.dcr_action |=
12779 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12780 }
12781
12782 /*
12783 * Holding the dtrace_kernel privilege also implies that
12784 * the user has the dtrace_user privilege from a visibility
12785 * perspective. But without further privileges, some
12786 * destructive actions are not available.
12787 */
12788 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
12789 /*
12790 * Make all probes in all zones visible. However,
12791 * this doesn't mean that all actions become available
12792 * to all zones.
12793 */
12794 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
12795 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
12796
12797 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
12798 DTRACE_CRA_PROC;
12799 /*
12800 * Holding proc_owner means that destructive actions
12801 * for *this* zone are allowed.
12802 */
12803 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12804 state->dts_cred.dcr_action |=
12805 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12806
12807 /*
12808 * Holding proc_zone means that destructive actions
12809 * for this user/group ID in all zones are allowed.
12810 */
12811 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12812 state->dts_cred.dcr_action |=
12813 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12814
12815 /*
12816 * If we have all privs in whatever zone this is,
12817 * we can do destructive things to processes which
12818 * have altered credentials.
12819 *
12820 * APPLE NOTE: Darwin doesn't do zones.
12821 * Behave as if zone always has destructive privs.
12822 */
12823 state->dts_cred.dcr_action |=
12824 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
12825 }
12826
12827 /*
12828 * Holding the dtrace_proc privilege gives control over fasttrap
12829 * and pid providers. We need to grant wider destructive
12830 * privileges in the event that the user has proc_owner and/or
12831 * proc_zone.
12832 */
12833 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
12834 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
12835 state->dts_cred.dcr_action |=
12836 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
12837
12838 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
12839 state->dts_cred.dcr_action |=
12840 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
12841 }
12842 }
12843
12844 *new_state = state;
12845 return(0); /* Success */
12846 }
12847
12848 static int
12849 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
12850 {
12851 dtrace_optval_t *opt = state->dts_options, size;
12852 processorid_t cpu = 0;
12853 int flags = 0, rval;
12854
12855 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12856 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12857 ASSERT(which < DTRACEOPT_MAX);
12858 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
12859 (state == dtrace_anon.dta_state &&
12860 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
12861
12862 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
12863 return (0);
12864
12865 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
12866 cpu = opt[DTRACEOPT_CPU];
12867
12868 if (which == DTRACEOPT_SPECSIZE)
12869 flags |= DTRACEBUF_NOSWITCH;
12870
12871 if (which == DTRACEOPT_BUFSIZE) {
12872 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
12873 flags |= DTRACEBUF_RING;
12874
12875 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
12876 flags |= DTRACEBUF_FILL;
12877
12878 if (state != dtrace_anon.dta_state ||
12879 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
12880 flags |= DTRACEBUF_INACTIVE;
12881 }
12882
12883 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
12884 /*
12885 * The size must be 8-byte aligned. If the size is not 8-byte
12886 * aligned, drop it down by the difference.
12887 */
12888 if (size & (sizeof (uint64_t) - 1))
12889 size -= size & (sizeof (uint64_t) - 1);
12890
12891 if (size < state->dts_reserve) {
12892 /*
12893 * Buffers must always be large enough to accommodate
12894 * their prereserved space. We return E2BIG instead
12895 * of ENOMEM in this case to allow user-level
12896 * software to differentiate the cases.
12897 */
12898 return (E2BIG);
12899 }
12900
12901 rval = dtrace_buffer_alloc(buf, size, flags, cpu);
12902
12903 if (rval != ENOMEM) {
12904 opt[which] = size;
12905 return (rval);
12906 }
12907
12908 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
12909 return (rval);
12910 }
12911
12912 return (ENOMEM);
12913 }
12914
12915 static int
12916 dtrace_state_buffers(dtrace_state_t *state)
12917 {
12918 dtrace_speculation_t *spec = state->dts_speculations;
12919 int rval, i;
12920
12921 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
12922 DTRACEOPT_BUFSIZE)) != 0)
12923 return (rval);
12924
12925 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
12926 DTRACEOPT_AGGSIZE)) != 0)
12927 return (rval);
12928
12929 for (i = 0; i < state->dts_nspeculations; i++) {
12930 if ((rval = dtrace_state_buffer(state,
12931 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
12932 return (rval);
12933 }
12934
12935 return (0);
12936 }
12937
12938 static void
12939 dtrace_state_prereserve(dtrace_state_t *state)
12940 {
12941 dtrace_ecb_t *ecb;
12942 dtrace_probe_t *probe;
12943
12944 state->dts_reserve = 0;
12945
12946 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
12947 return;
12948
12949 /*
12950 * If our buffer policy is a "fill" buffer policy, we need to set the
12951 * prereserved space to be the space required by the END probes.
12952 */
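/*
 * Probe identifiers are 1-based while the dtrace_probes array is
 * 0-indexed, hence the "- 1" when looking up the END probe below.
 */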
12953 probe = dtrace_probes[dtrace_probeid_end - 1];
12954 ASSERT(probe != NULL);
12955
12956 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
12957 if (ecb->dte_state != state)
12958 continue;
12959
12960 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
12961 }
12962 }
12963
12964 static int
12965 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
12966 {
12967 dtrace_optval_t *opt = state->dts_options, sz, nspec;
12968 dtrace_speculation_t *spec;
12969 dtrace_buffer_t *buf;
12970 cyc_handler_t hdlr;
12971 cyc_time_t when;
12972 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
12973 dtrace_icookie_t cookie;
12974
12975 lck_mtx_lock(&cpu_lock);
12976 lck_mtx_lock(&dtrace_lock);
12977
12978 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
12979 rval = EBUSY;
12980 goto out;
12981 }
12982
12983 /*
12984 * Before we can perform any checks, we must prime all of the
12985 * retained enablings that correspond to this state.
12986 */
12987 dtrace_enabling_prime(state);
12988
12989 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
12990 rval = EACCES;
12991 goto out;
12992 }
12993
12994 dtrace_state_prereserve(state);
12995
12996 /*
12997 * Now what we want to do is try to allocate our speculations.
12998 * We do not automatically resize the number of speculations; if
12999 * this fails, we will fail the operation.
13000 */
13001 nspec = opt[DTRACEOPT_NSPEC];
13002 ASSERT(nspec != DTRACEOPT_UNSET);
13003
13004 if (nspec > INT_MAX) {
13005 rval = ENOMEM;
13006 goto out;
13007 }
13008
13009 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
13010
13011 if (spec == NULL) {
13012 rval = ENOMEM;
13013 goto out;
13014 }
13015
13016 state->dts_speculations = spec;
13017 state->dts_nspeculations = (int)nspec;
13018
13019 for (i = 0; i < nspec; i++) {
13020 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
13021 rval = ENOMEM;
13022 goto err;
13023 }
13024
13025 spec[i].dtsp_buffer = buf;
13026 }
13027
13028 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13029 if (dtrace_anon.dta_state == NULL) {
13030 rval = ENOENT;
13031 goto out;
13032 }
13033
13034 if (state->dts_necbs != 0) {
13035 rval = EALREADY;
13036 goto out;
13037 }
13038
13039 state->dts_anon = dtrace_anon_grab();
13040 ASSERT(state->dts_anon != NULL);
13041 state = state->dts_anon;
13042
13043 /*
13044 * We want "grabanon" to be set in the grabbed state, so we'll
13045 * copy that option value from the grabbing state into the
13046 * grabbed state.
13047 */
13048 state->dts_options[DTRACEOPT_GRABANON] =
13049 opt[DTRACEOPT_GRABANON];
13050
13051 *cpu = dtrace_anon.dta_beganon;
13052
13053 /*
13054 * If the anonymous state is active (as it almost certainly
13055 * is if the anonymous enabling ultimately matched anything),
13056 * we don't allow any further option processing -- but we
13057 * don't return failure.
13058 */
13059 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13060 goto out;
13061 }
13062
13063 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13064 opt[DTRACEOPT_AGGSIZE] != 0) {
13065 if (state->dts_aggregations == NULL) {
13066 /*
13067 * We're not going to create an aggregation buffer
13068 * because we don't have any ECBs that contain
13069 * aggregations -- set this option to 0.
13070 */
13071 opt[DTRACEOPT_AGGSIZE] = 0;
13072 } else {
13073 /*
13074 * If we have an aggregation buffer, we must also have
13075 * a buffer to use as scratch.
13076 */
13077 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13078 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13079 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13080 }
13081 }
13082 }
13083
13084 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13085 opt[DTRACEOPT_SPECSIZE] != 0) {
13086 if (!state->dts_speculates) {
13087 /*
13088 * We're not going to create speculation buffers
13089 * because we don't have any ECBs that actually
13090 * speculate -- set the speculation size to 0.
13091 */
13092 opt[DTRACEOPT_SPECSIZE] = 0;
13093 }
13094 }
13095
13096 /*
13097 * The bare minimum size for any buffer that we're actually going to
13098 * do anything to is sizeof (uint64_t).
13099 */
13100 sz = sizeof (uint64_t);
13101
13102 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13103 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13104 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13105 /*
13106 * A buffer size has been explicitly set to 0 (or to a size
13107 * that will be adjusted to 0) and we need the space -- we
13108 * need to return failure. We return ENOSPC to differentiate
13109 * it from failing to allocate a buffer due to failure to meet
13110 * the reserve (for which we return E2BIG).
13111 */
13112 rval = ENOSPC;
13113 goto out;
13114 }
13115
13116 if ((rval = dtrace_state_buffers(state)) != 0)
13117 goto err;
13118
13119 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13120 sz = dtrace_dstate_defsize;
13121
13122 do {
13123 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13124
13125 if (rval == 0)
13126 break;
13127
13128 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13129 goto err;
13130 } while (sz >>= 1);
13131
13132 opt[DTRACEOPT_DYNVARSIZE] = sz;
13133
13134 if (rval != 0)
13135 goto err;
13136
13137 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13138 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13139
13140 if (opt[DTRACEOPT_CLEANRATE] == 0)
13141 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13142
13143 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13144 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13145
13146 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13147 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13148
13149 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13150 hdlr.cyh_arg = state;
13151 hdlr.cyh_level = CY_LOW_LEVEL;
13152
13153 when.cyt_when = 0;
13154 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13155
13156 state->dts_cleaner = cyclic_add(&hdlr, &when);
13157
13158 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13159 hdlr.cyh_arg = state;
13160 hdlr.cyh_level = CY_LOW_LEVEL;
13161
13162 when.cyt_when = 0;
13163 when.cyt_interval = dtrace_deadman_interval;
13164
13165 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13166 state->dts_deadman = cyclic_add(&hdlr, &when);
13167
13168 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13169
13170 /*
13171 * Now it's time to actually fire the BEGIN probe. We need to disable
13172 * interrupts here both to record the CPU on which we fired the BEGIN
13173 * probe (the data from this CPU will be processed first at user
13174 * level) and to manually activate the buffer for this CPU.
13175 */
13176 cookie = dtrace_interrupt_disable();
13177 *cpu = CPU->cpu_id;
13178 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13179 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13180
13181 dtrace_probe(dtrace_probeid_begin,
13182 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13183 dtrace_interrupt_enable(cookie);
13184 /*
13185 * We may have had an exit action from a BEGIN probe; only change our
13186 * state to ACTIVE if we're still in WARMUP.
13187 */
13188 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13189 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13190
13191 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13192 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13193
13194 /*
13195 * Regardless of whether we're now in ACTIVE or DRAINING, we
13196 * want each CPU to transition its principal buffer out of the
13197 * INACTIVE state. Doing this assures that no CPU will suddenly begin
13198 * processing an ECB halfway down a probe's ECB chain; all CPUs will
13199 * atomically transition from processing none of a state's ECBs to
13200 * processing all of them.
13201 */
13202 dtrace_xcall(DTRACE_CPUALL,
13203 (dtrace_xcall_t)dtrace_buffer_activate, state);
13204 goto out;
13205
13206 err:
13207 dtrace_buffer_free(state->dts_buffer);
13208 dtrace_buffer_free(state->dts_aggbuffer);
13209
13210 if ((nspec = state->dts_nspeculations) == 0) {
13211 ASSERT(state->dts_speculations == NULL);
13212 goto out;
13213 }
13214
13215 spec = state->dts_speculations;
13216 ASSERT(spec != NULL);
13217
13218 for (i = 0; i < state->dts_nspeculations; i++) {
13219 if ((buf = spec[i].dtsp_buffer) == NULL)
13220 break;
13221
13222 dtrace_buffer_free(buf);
13223 kmem_free(buf, bufsize);
13224 }
13225
13226 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13227 state->dts_nspeculations = 0;
13228 state->dts_speculations = NULL;
13229
13230 out:
13231 lck_mtx_unlock(&dtrace_lock);
13232 lck_mtx_unlock(&cpu_lock);
13233
13234 return (rval);
13235 }
13236
13237 static int
13238 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13239 {
13240 dtrace_icookie_t cookie;
13241
13242 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13243
13244 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13245 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13246 return (EINVAL);
13247
13248 /*
13249 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13250 * to be sure that every CPU has seen it. See below for the details
13251 * on why this is done.
13252 */
13253 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13254 dtrace_sync();
13255
13256 /*
13257 * By this point, it is impossible for any CPU to be still processing
13258 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
13259 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13260 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
13261 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13262 * iff we're in the END probe.
13263 */
13264 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13265 dtrace_sync();
13266 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13267
13268 /*
13269 * Finally, we can release the reserve and call the END probe. We
13270 * disable interrupts across calling the END probe to allow us to
13271 * return the CPU on which we actually called the END probe. This
13272 * allows user-land to be sure that this CPU's principal buffer is
13273 * processed last.
13274 */
13275 state->dts_reserve = 0;
13276
13277 cookie = dtrace_interrupt_disable();
13278 *cpu = CPU->cpu_id;
13279 dtrace_probe(dtrace_probeid_end,
13280 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13281 dtrace_interrupt_enable(cookie);
13282
13283 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
13284 dtrace_sync();
13285
13286 return (0);
13287 }
13288
13289 static int
13290 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
13291 dtrace_optval_t val)
13292 {
13293 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13294
13295 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13296 return (EBUSY);
13297
13298 if (option >= DTRACEOPT_MAX)
13299 return (EINVAL);
13300
13301 if (option != DTRACEOPT_CPU && val < 0)
13302 return (EINVAL);
13303
13304 switch (option) {
13305 case DTRACEOPT_DESTRUCTIVE:
13306 /*
13307 * Prevent consumers from enabling destructive actions if DTrace
13308 * is running in a restricted environment, or if actions are
13309 * disallowed.
13310 */
13311 if (dtrace_is_restricted() || dtrace_destructive_disallow)
13312 return (EACCES);
13313
13314 state->dts_cred.dcr_destructive = 1;
13315 break;
13316
13317 case DTRACEOPT_BUFSIZE:
13318 case DTRACEOPT_DYNVARSIZE:
13319 case DTRACEOPT_AGGSIZE:
13320 case DTRACEOPT_SPECSIZE:
13321 case DTRACEOPT_STRSIZE:
13322 if (val < 0)
13323 return (EINVAL);
13324
13325 if (val >= LONG_MAX) {
13326 /*
13327 * If this is an otherwise negative value, set it to
13328 * the highest multiple of 128m less than LONG_MAX.
13329 * Technically, we're adjusting the size without
13330 * regard to the buffer resizing policy, but in fact,
13331 * this has no effect -- if we set the buffer size to
13332 * ~LONG_MAX and the buffer policy is ultimately set to
13333 * be "manual", the buffer allocation is guaranteed to
13334 * fail, if only because the allocation requires two
13335 * buffers. (We set the size to the highest
13336 * multiple of 128m because it ensures that the size
13337 * will remain a multiple of a megabyte when
13338 * repeatedly halved -- all the way down to 15m.)
13339 */
13340 val = LONG_MAX - (1 << 27) + 1;
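/*
 * Worked example (editorial): on an LP64 kernel LONG_MAX is 2^63 - 1,
 * so the assignment above yields 2^63 - 2^27 (0x7FFFFFFFF8000000),
 * the highest multiple of 128m below LONG_MAX.
 */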
13341 }
13342 }
13343
13344 state->dts_options[option] = val;
13345
13346 return (0);
13347 }
13348
13349 static void
13350 dtrace_state_destroy(dtrace_state_t *state)
13351 {
13352 dtrace_ecb_t *ecb;
13353 dtrace_vstate_t *vstate = &state->dts_vstate;
13354 minor_t minor = getminor(state->dts_dev);
13355 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
13356 dtrace_speculation_t *spec = state->dts_speculations;
13357 int nspec = state->dts_nspeculations;
13358 uint32_t match;
13359
13360 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13361 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13362
13363 /*
13364 * First, retract any retained enablings for this state.
13365 */
13366 dtrace_enabling_retract(state);
13367 ASSERT(state->dts_nretained == 0);
13368
13369 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
13370 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
13371 /*
13372 * We have managed to come into dtrace_state_destroy() on a
13373 * hot enabling -- almost certainly because of a disorderly
13374 * shutdown of a consumer. (That is, a consumer that is
13375 * exiting without having called dtrace_stop().) In this case,
13376 * we're going to set our activity to be KILLED, and then
13377 * issue a sync to be sure that everyone is out of probe
13378 * context before we start blowing away ECBs.
13379 */
13380 state->dts_activity = DTRACE_ACTIVITY_KILLED;
13381 dtrace_sync();
13382 }
13383
13384 /*
13385 * Release the credential hold we took in dtrace_state_create().
13386 */
13387 if (state->dts_cred.dcr_cred != NULL)
13388 crfree(state->dts_cred.dcr_cred);
13389
13390 /*
13391 * Now we can safely disable and destroy any enabled probes. Because
13392 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
13393 * (especially if they're all enabled), we take two passes through the
13394 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
13395 * in the second we disable whatever is left over.
13396 */
13397 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
13398 for (i = 0; i < state->dts_necbs; i++) {
13399 if ((ecb = state->dts_ecbs[i]) == NULL)
13400 continue;
13401
13402 if (match && ecb->dte_probe != NULL) {
13403 dtrace_probe_t *probe = ecb->dte_probe;
13404 dtrace_provider_t *prov = probe->dtpr_provider;
13405
13406 if (!(prov->dtpv_priv.dtpp_flags & match))
13407 continue;
13408 }
13409
13410 dtrace_ecb_disable(ecb);
13411 dtrace_ecb_destroy(ecb);
13412 }
13413
13414 if (!match)
13415 break;
13416 }
13417
13418 /*
13419 * Before we free the buffers, perform one more sync to assure that
13420 * every CPU is out of probe context.
13421 */
13422 dtrace_sync();
13423
13424 dtrace_buffer_free(state->dts_buffer);
13425 dtrace_buffer_free(state->dts_aggbuffer);
13426
13427 for (i = 0; i < nspec; i++)
13428 dtrace_buffer_free(spec[i].dtsp_buffer);
13429
13430 if (state->dts_cleaner != CYCLIC_NONE)
13431 cyclic_remove(state->dts_cleaner);
13432
13433 if (state->dts_deadman != CYCLIC_NONE)
13434 cyclic_remove(state->dts_deadman);
13435
13436 dtrace_dstate_fini(&vstate->dtvs_dynvars);
13437 dtrace_vstate_fini(vstate);
13438 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
13439
13440 if (state->dts_aggregations != NULL) {
13441 #if DEBUG
13442 for (i = 0; i < state->dts_naggregations; i++)
13443 ASSERT(state->dts_aggregations[i] == NULL);
13444 #endif
13445 ASSERT(state->dts_naggregations > 0);
13446 kmem_free(state->dts_aggregations,
13447 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
13448 }
13449
13450 kmem_free(state->dts_buffer, bufsize);
13451 kmem_free(state->dts_aggbuffer, bufsize);
13452
13453 for (i = 0; i < nspec; i++)
13454 kmem_free(spec[i].dtsp_buffer, bufsize);
13455
13456 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13457
13458 dtrace_format_destroy(state);
13459
13460 vmem_destroy(state->dts_aggid_arena);
13461 ddi_soft_state_free(dtrace_softstate, minor);
13462 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13463 }
13464
13465 /*
13466 * DTrace Anonymous Enabling Functions
13467 */
13468 static dtrace_state_t *
13469 dtrace_anon_grab(void)
13470 {
13471 dtrace_state_t *state;
13472
13473 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13474
13475 if ((state = dtrace_anon.dta_state) == NULL) {
13476 ASSERT(dtrace_anon.dta_enabling == NULL);
13477 return (NULL);
13478 }
13479
13480 ASSERT(dtrace_anon.dta_enabling != NULL);
13481 ASSERT(dtrace_retained != NULL);
13482
13483 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
13484 dtrace_anon.dta_enabling = NULL;
13485 dtrace_anon.dta_state = NULL;
13486
13487 return (state);
13488 }
13489
13490 static void
13491 dtrace_anon_property(void)
13492 {
13493 int i, rv;
13494 dtrace_state_t *state;
13495 dof_hdr_t *dof;
13496 char c[32]; /* enough for "dof-data-" + digits */
13497
13498 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13499 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13500
13501 for (i = 0; ; i++) {
13502 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
13503
13504 dtrace_err_verbose = 1;
13505
13506 if ((dof = dtrace_dof_property(c)) == NULL) {
13507 dtrace_err_verbose = 0;
13508 break;
13509 }
13510
13511 /*
13512 * We want to create anonymous state, so we need to transition
13513 * the kernel debugger to indicate that DTrace is active. If
13514 * this fails (e.g. because the debugger has modified text in
13515 * some way), we won't continue with the processing.
13516 */
13517 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
13518 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
13519 "enabling ignored.");
13520 dtrace_dof_destroy(dof);
13521 break;
13522 }
13523
13524 /*
13525 * If we haven't allocated an anonymous state, we'll do so now.
13526 */
13527 if ((state = dtrace_anon.dta_state) == NULL) {
13528 rv = dtrace_state_create(NULL, NULL, &state);
13529 dtrace_anon.dta_state = state;
13530 if (rv != 0 || state == NULL) {
13531 /*
13532 * This basically shouldn't happen: the only
13533 * failure mode from dtrace_state_create() is a
13534 * failure of ddi_soft_state_zalloc() that
13535 * itself should never happen. Still, the
13536 * interface allows for a failure mode, and
13537 * we want to fail as gracefully as possible:
13538 * we'll emit an error message and cease
13539 * processing anonymous state in this case.
13540 */
13541 cmn_err(CE_WARN, "failed to create "
13542 "anonymous state");
13543 dtrace_dof_destroy(dof);
13544 break;
13545 }
13546 }
13547
13548 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
13549 &dtrace_anon.dta_enabling, 0, B_TRUE);
13550
13551 if (rv == 0)
13552 rv = dtrace_dof_options(dof, state);
13553
13554 dtrace_err_verbose = 0;
13555 dtrace_dof_destroy(dof);
13556
13557 if (rv != 0) {
13558 /*
13559 * This is malformed DOF; chuck any anonymous state
13560 * that we created.
13561 */
13562 ASSERT(dtrace_anon.dta_enabling == NULL);
13563 dtrace_state_destroy(state);
13564 dtrace_anon.dta_state = NULL;
13565 break;
13566 }
13567
13568 ASSERT(dtrace_anon.dta_enabling != NULL);
13569 }
13570
13571 if (dtrace_anon.dta_enabling != NULL) {
13572 int rval;
13573
13574 /*
13575 * dtrace_enabling_retain() can only fail because we are
13576 * trying to retain more enablings than are allowed -- but
13577 * we only have one anonymous enabling, and we are guaranteed
13578 * to be allowed at least one retained enabling; we assert
13579 * that dtrace_enabling_retain() returns success.
13580 */
13581 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
13582 ASSERT(rval == 0);
13583
13584 dtrace_enabling_dump(dtrace_anon.dta_enabling);
13585 }
13586 }
13587
13588 /*
13589 * DTrace Helper Functions
13590 */
13591 static void
13592 dtrace_helper_trace(dtrace_helper_action_t *helper,
13593 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
13594 {
13595 uint32_t size, next, nnext;
13596 int i;
13597 dtrace_helptrace_t *ent;
13598 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13599
13600 if (!dtrace_helptrace_enabled)
13601 return;
13602
13603 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
13604
13605 /*
13606 * What would a tracing framework be without its own tracing
13607 * framework? (Well, a hell of a lot simpler, for starters...)
13608 */
13609 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
13610 sizeof (uint64_t) - sizeof (uint64_t);
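	/*
	 * (The subtraction of one uint64_t above accounts for the single
	 * local slot already included in the dtrace_helptrace_t structure's
	 * dtht_locals array, which is filled in below.)
	 */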
13611
13612 /*
13613 * Iterate until we can allocate a slot in the trace buffer.
13614 */
13615 do {
13616 next = dtrace_helptrace_next;
13617
13618 if (next + size < dtrace_helptrace_bufsize) {
13619 nnext = next + size;
13620 } else {
13621 nnext = size;
13622 }
13623 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
13624
13625 /*
13626 * We have our slot; fill it in.
13627 */
13628 if (nnext == size)
13629 next = 0;
13630
13631 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
13632 ent->dtht_helper = helper;
13633 ent->dtht_where = where;
13634 ent->dtht_nlocals = vstate->dtvs_nlocals;
13635
13636 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
13637 mstate->dtms_fltoffs : -1;
13638 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
13639 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
13640
13641 for (i = 0; i < vstate->dtvs_nlocals; i++) {
13642 dtrace_statvar_t *svar;
13643
13644 if ((svar = vstate->dtvs_locals[i]) == NULL)
13645 continue;
13646
13647 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
13648 ent->dtht_locals[i] =
13649 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
13650 }
13651 }
13652
13653 static uint64_t
13654 dtrace_helper(int which, dtrace_mstate_t *mstate,
13655 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
13656 {
13657 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
13658 uint64_t sarg0 = mstate->dtms_arg[0];
13659 uint64_t sarg1 = mstate->dtms_arg[1];
13660 uint64_t rval = 0;
13661 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
13662 dtrace_helper_action_t *helper;
13663 dtrace_vstate_t *vstate;
13664 dtrace_difo_t *pred;
13665 int i, trace = dtrace_helptrace_enabled;
13666
13667 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
13668
13669 if (helpers == NULL)
13670 return (0);
13671
13672 if ((helper = helpers->dthps_actions[which]) == NULL)
13673 return (0);
13674
13675 vstate = &helpers->dthps_vstate;
13676 mstate->dtms_arg[0] = arg0;
13677 mstate->dtms_arg[1] = arg1;
13678
13679 /*
13680 * Now iterate over each helper. If its predicate evaluates to 'true',
13681 * we'll call the corresponding actions. Note that the below calls
13682 * to dtrace_dif_emulate() may set faults in machine state. This is
13683 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
13684 * the stored DIF offset with its own (which is the desired behavior).
13685 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
13686 * from machine state; this is okay, too.
13687 */
13688 for (; helper != NULL; helper = helper->dtha_next) {
13689 if ((pred = helper->dtha_predicate) != NULL) {
13690 if (trace)
13691 dtrace_helper_trace(helper, mstate, vstate, 0);
13692
13693 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
13694 goto next;
13695
13696 if (*flags & CPU_DTRACE_FAULT)
13697 goto err;
13698 }
13699
13700 for (i = 0; i < helper->dtha_nactions; i++) {
13701 if (trace)
13702 dtrace_helper_trace(helper,
13703 mstate, vstate, i + 1);
13704
13705 rval = dtrace_dif_emulate(helper->dtha_actions[i],
13706 mstate, vstate, state);
13707
13708 if (*flags & CPU_DTRACE_FAULT)
13709 goto err;
13710 }
13711
13712 next:
13713 if (trace)
13714 dtrace_helper_trace(helper, mstate, vstate,
13715 DTRACE_HELPTRACE_NEXT);
13716 }
13717
13718 if (trace)
13719 dtrace_helper_trace(helper, mstate, vstate,
13720 DTRACE_HELPTRACE_DONE);
13721
13722 /*
13723 * Restore the arg0 that we saved upon entry.
13724 */
13725 mstate->dtms_arg[0] = sarg0;
13726 mstate->dtms_arg[1] = sarg1;
13727
13728 return (rval);
13729
13730 err:
13731 if (trace)
13732 dtrace_helper_trace(helper, mstate, vstate,
13733 DTRACE_HELPTRACE_ERR);
13734
13735 /*
13736 * Restore the arg0 that we saved upon entry.
13737 */
13738 mstate->dtms_arg[0] = sarg0;
13739 mstate->dtms_arg[1] = sarg1;
13740
13741 return (0);
13742 }
13743
13744 static void
13745 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
13746 dtrace_vstate_t *vstate)
13747 {
13748 int i;
13749
13750 if (helper->dtha_predicate != NULL)
13751 dtrace_difo_release(helper->dtha_predicate, vstate);
13752
13753 for (i = 0; i < helper->dtha_nactions; i++) {
13754 ASSERT(helper->dtha_actions[i] != NULL);
13755 dtrace_difo_release(helper->dtha_actions[i], vstate);
13756 }
13757
13758 kmem_free(helper->dtha_actions,
13759 helper->dtha_nactions * sizeof (dtrace_difo_t *));
13760 kmem_free(helper, sizeof (dtrace_helper_action_t));
13761 }
13762
13763 static int
13764 dtrace_helper_destroygen(proc_t* p, int gen)
13765 {
13766 dtrace_helpers_t *help = p->p_dtrace_helpers;
13767 dtrace_vstate_t *vstate;
13768 uint_t i;
13769
13770 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13771
13772 if (help == NULL || gen > help->dthps_generation)
13773 return (EINVAL);
13774
13775 vstate = &help->dthps_vstate;
13776
13777 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
13778 dtrace_helper_action_t *last = NULL, *h, *next;
13779
13780 for (h = help->dthps_actions[i]; h != NULL; h = next) {
13781 next = h->dtha_next;
13782
13783 if (h->dtha_generation == gen) {
13784 if (last != NULL) {
13785 last->dtha_next = next;
13786 } else {
13787 help->dthps_actions[i] = next;
13788 }
13789
13790 dtrace_helper_action_destroy(h, vstate);
13791 } else {
13792 last = h;
13793 }
13794 }
13795 }
13796
13797 /*
13798 * Iterate until we've cleared out all helper providers with the
13799 * given generation number.
13800 */
13801 for (;;) {
13802 dtrace_helper_provider_t *prov = NULL;
13803
13804 /*
13805 * Look for a helper provider with the right generation. We
13806 * have to start back at the beginning of the list each time
13807 * because we drop dtrace_lock. It's unlikely that we'll make
13808 * more than two passes.
13809 */
13810 for (i = 0; i < help->dthps_nprovs; i++) {
13811 prov = help->dthps_provs[i];
13812
13813 if (prov->dthp_generation == gen)
13814 break;
13815 }
13816
13817 /*
13818 * If there were no matches, we're done.
13819 */
13820 if (i == help->dthps_nprovs)
13821 break;
13822
13823 /*
13824 * Move the last helper provider into this slot.
13825 */
13826 help->dthps_nprovs--;
13827 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
13828 help->dthps_provs[help->dthps_nprovs] = NULL;
13829
13830 lck_mtx_unlock(&dtrace_lock);
13831
13832 /*
13833 * If we have a meta provider, remove this helper provider.
13834 */
13835 lck_mtx_lock(&dtrace_meta_lock);
13836 if (dtrace_meta_pid != NULL) {
13837 ASSERT(dtrace_deferred_pid == NULL);
13838 dtrace_helper_provider_remove(&prov->dthp_prov,
13839 p->p_pid);
13840 }
13841 lck_mtx_unlock(&dtrace_meta_lock);
13842
13843 dtrace_helper_provider_destroy(prov);
13844
13845 lck_mtx_lock(&dtrace_lock);
13846 }
13847
13848 return (0);
13849 }
13850
13851 static int
13852 dtrace_helper_validate(dtrace_helper_action_t *helper)
13853 {
13854 int err = 0, i;
13855 dtrace_difo_t *dp;
13856
13857 if ((dp = helper->dtha_predicate) != NULL)
13858 err += dtrace_difo_validate_helper(dp);
13859
13860 for (i = 0; i < helper->dtha_nactions; i++)
13861 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
13862
13863 return (err == 0);
13864 }
13865
13866 static int
13867 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
13868 {
13869 dtrace_helpers_t *help;
13870 dtrace_helper_action_t *helper, *last;
13871 dtrace_actdesc_t *act;
13872 dtrace_vstate_t *vstate;
13873 dtrace_predicate_t *pred;
13874 int count = 0, nactions = 0, i;
13875
13876 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
13877 return (EINVAL);
13878
13879 help = p->p_dtrace_helpers;
13880 last = help->dthps_actions[which];
13881 vstate = &help->dthps_vstate;
13882
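	/*
	 * Walk the existing chain of helper actions of this kind, counting
	 * them and remembering the tail so the new action can be appended.
	 */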
13883 for (count = 0; last != NULL; last = last->dtha_next) {
13884 count++;
13885 if (last->dtha_next == NULL)
13886 break;
13887 }
13888
13889 /*
13890 * If we already have dtrace_helper_actions_max helper actions for this
13891 * helper action type, we'll refuse to add a new one.
13892 */
13893 if (count >= dtrace_helper_actions_max)
13894 return (ENOSPC);
13895
13896 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
13897 helper->dtha_generation = help->dthps_generation;
13898
13899 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
13900 ASSERT(pred->dtp_difo != NULL);
13901 dtrace_difo_hold(pred->dtp_difo);
13902 helper->dtha_predicate = pred->dtp_difo;
13903 }
13904
13905 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
13906 if (act->dtad_kind != DTRACEACT_DIFEXPR)
13907 goto err;
13908
13909 if (act->dtad_difo == NULL)
13910 goto err;
13911
13912 nactions++;
13913 }
13914
13915 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
13916 (helper->dtha_nactions = nactions), KM_SLEEP);
13917
13918 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
13919 dtrace_difo_hold(act->dtad_difo);
13920 helper->dtha_actions[i++] = act->dtad_difo;
13921 }
13922
13923 if (!dtrace_helper_validate(helper))
13924 goto err;
13925
13926 if (last == NULL) {
13927 help->dthps_actions[which] = helper;
13928 } else {
13929 last->dtha_next = helper;
13930 }
13931
13932 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
13933 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
13934 dtrace_helptrace_next = 0;
13935 }
13936
13937 return (0);
13938 err:
13939 dtrace_helper_action_destroy(helper, vstate);
13940 return (EINVAL);
13941 }
13942
13943 static void
13944 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
13945 dof_helper_t *dofhp)
13946 {
13947 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13948
13949 lck_mtx_lock(&dtrace_meta_lock);
13950 lck_mtx_lock(&dtrace_lock);
13951
13952 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
13953 /*
13954 * If the dtrace module is loaded but not attached, or if
13955 * there isn't a meta provider registered to deal with
13956 * these provider descriptions, we need to postpone creating
13957 * the actual providers until later.
13958 */
13959
13960 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
13961 dtrace_deferred_pid != help) {
13962 help->dthps_deferred = 1;
13963 help->dthps_pid = p->p_pid;
13964 help->dthps_next = dtrace_deferred_pid;
13965 help->dthps_prev = NULL;
13966 if (dtrace_deferred_pid != NULL)
13967 dtrace_deferred_pid->dthps_prev = help;
13968 dtrace_deferred_pid = help;
13969 }
13970
13971 lck_mtx_unlock(&dtrace_lock);
13972
13973 } else if (dofhp != NULL) {
13974 /*
13975 * If the dtrace module is loaded and we have a particular
13976 * helper provider description, pass that off to the
13977 * meta provider.
13978 */
13979
13980 lck_mtx_unlock(&dtrace_lock);
13981
13982 dtrace_helper_provide(dofhp, p->p_pid);
13983
13984 } else {
13985 /*
13986 * Otherwise, just pass all the helper provider descriptions
13987 * off to the meta provider.
13988 */
13989
13990 uint_t i;
13991 lck_mtx_unlock(&dtrace_lock);
13992
13993 for (i = 0; i < help->dthps_nprovs; i++) {
13994 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
13995 p->p_pid);
13996 }
13997 }
13998
13999 lck_mtx_unlock(&dtrace_meta_lock);
14000 }
14001
14002 static int
14003 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
14004 {
14005 dtrace_helpers_t *help;
14006 dtrace_helper_provider_t *hprov, **tmp_provs;
14007 uint_t tmp_maxprovs, i;
14008
14009 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14010 help = p->p_dtrace_helpers;
14011 ASSERT(help != NULL);
14012
14013 /*
14014 * If we already have dtrace_helper_providers_max helper providers,
14015 * we'll refuse to add a new one.
14016 */
14017 if (help->dthps_nprovs >= dtrace_helper_providers_max)
14018 return (ENOSPC);
14019
14020 /*
14021 * Check to make sure this isn't a duplicate.
14022 */
14023 for (i = 0; i < help->dthps_nprovs; i++) {
14024 if (dofhp->dofhp_addr ==
14025 help->dthps_provs[i]->dthp_prov.dofhp_addr)
14026 return (EALREADY);
14027 }
14028
14029 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14030 hprov->dthp_prov = *dofhp;
14031 hprov->dthp_ref = 1;
14032 hprov->dthp_generation = gen;
14033
14034 /*
14035 * Allocate a bigger table for helper providers if it's already full.
14036 */
14037 if (help->dthps_maxprovs == help->dthps_nprovs) {
14038 tmp_maxprovs = help->dthps_maxprovs;
14039 tmp_provs = help->dthps_provs;
14040
14041 if (help->dthps_maxprovs == 0)
14042 help->dthps_maxprovs = 2;
14043 else
14044 help->dthps_maxprovs *= 2;
14045 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14046 help->dthps_maxprovs = dtrace_helper_providers_max;
14047
14048 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14049
14050 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14051 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14052
14053 if (tmp_provs != NULL) {
14054 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14055 sizeof (dtrace_helper_provider_t *));
14056 kmem_free(tmp_provs, tmp_maxprovs *
14057 sizeof (dtrace_helper_provider_t *));
14058 }
14059 }
14060
14061 help->dthps_provs[help->dthps_nprovs] = hprov;
14062 help->dthps_nprovs++;
14063
14064 return (0);
14065 }
14066
14067 static void
14068 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14069 {
14070 lck_mtx_lock(&dtrace_lock);
14071
14072 if (--hprov->dthp_ref == 0) {
14073 dof_hdr_t *dof;
14074 lck_mtx_unlock(&dtrace_lock);
14075 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14076 dtrace_dof_destroy(dof);
14077 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14078 } else {
14079 lck_mtx_unlock(&dtrace_lock);
14080 }
14081 }
14082
14083 static int
14084 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14085 {
14086 uintptr_t daddr = (uintptr_t)dof;
14087 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14088 dof_provider_t *provider;
14089 dof_probe_t *probe;
14090 uint8_t *arg;
14091 char *strtab, *typestr;
14092 dof_stridx_t typeidx;
14093 size_t typesz;
14094 uint_t nprobes, j, k;
14095
14096 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14097
14098 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14099 dtrace_dof_error(dof, "misaligned section offset");
14100 return (-1);
14101 }
14102
14103 /*
14104 * The section needs to be large enough to contain the DOF provider
14105 * structure appropriate for the given version.
14106 */
14107 if (sec->dofs_size <
14108 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14109 offsetof(dof_provider_t, dofpv_prenoffs) :
14110 sizeof (dof_provider_t))) {
14111 dtrace_dof_error(dof, "provider section too small");
14112 return (-1);
14113 }
14114
14115 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14116 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14117 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14118 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14119 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14120
14121 if (str_sec == NULL || prb_sec == NULL ||
14122 arg_sec == NULL || off_sec == NULL)
14123 return (-1);
14124
14125 enoff_sec = NULL;
14126
14127 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14128 provider->dofpv_prenoffs != DOF_SECT_NONE &&
14129 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14130 provider->dofpv_prenoffs)) == NULL)
14131 return (-1);
14132
14133 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14134
14135 if (provider->dofpv_name >= str_sec->dofs_size ||
14136 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14137 dtrace_dof_error(dof, "invalid provider name");
14138 return (-1);
14139 }
14140
14141 if (prb_sec->dofs_entsize == 0 ||
14142 prb_sec->dofs_entsize > prb_sec->dofs_size) {
14143 dtrace_dof_error(dof, "invalid entry size");
14144 return (-1);
14145 }
14146
14147 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14148 dtrace_dof_error(dof, "misaligned entry size");
14149 return (-1);
14150 }
14151
14152 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14153 dtrace_dof_error(dof, "invalid entry size");
14154 return (-1);
14155 }
14156
14157 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14158 dtrace_dof_error(dof, "misaligned section offset");
14159 return (-1);
14160 }
14161
14162 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14163 dtrace_dof_error(dof, "invalid entry size");
14164 return (-1);
14165 }
14166
14167 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14168
14169 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14170
14171 /*
14172 * Take a pass through the probes to check for errors.
14173 */
14174 for (j = 0; j < nprobes; j++) {
14175 probe = (dof_probe_t *)(uintptr_t)(daddr +
14176 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14177
14178 if (probe->dofpr_func >= str_sec->dofs_size) {
14179 dtrace_dof_error(dof, "invalid function name");
14180 return (-1);
14181 }
14182
14183 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14184 dtrace_dof_error(dof, "function name too long");
14185 return (-1);
14186 }
14187
14188 if (probe->dofpr_name >= str_sec->dofs_size ||
14189 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14190 dtrace_dof_error(dof, "invalid probe name");
14191 return (-1);
14192 }
14193
14194 /*
14195 * The offset count must not wrap the index, and the offsets
14196 * must also not overflow the section's data.
14197 */
14198 if (probe->dofpr_offidx + probe->dofpr_noffs <
14199 probe->dofpr_offidx ||
14200 (probe->dofpr_offidx + probe->dofpr_noffs) *
14201 off_sec->dofs_entsize > off_sec->dofs_size) {
14202 dtrace_dof_error(dof, "invalid probe offset");
14203 return (-1);
14204 }
14205
14206 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14207 /*
14208 * If there's no is-enabled offset section, make sure
14209 * there aren't any is-enabled offsets. Otherwise
14210 * perform the same checks as for probe offsets
14211 * (immediately above).
14212 */
14213 if (enoff_sec == NULL) {
14214 if (probe->dofpr_enoffidx != 0 ||
14215 probe->dofpr_nenoffs != 0) {
14216 dtrace_dof_error(dof, "is-enabled "
14217 "offsets with null section");
14218 return (-1);
14219 }
14220 } else if (probe->dofpr_enoffidx +
14221 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14222 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14223 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14224 dtrace_dof_error(dof, "invalid is-enabled "
14225 "offset");
14226 return (-1);
14227 }
14228
14229 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14230 dtrace_dof_error(dof, "zero probe and "
14231 "is-enabled offsets");
14232 return (-1);
14233 }
14234 } else if (probe->dofpr_noffs == 0) {
14235 dtrace_dof_error(dof, "zero probe offsets");
14236 return (-1);
14237 }
14238
14239 if (probe->dofpr_argidx + probe->dofpr_xargc <
14240 probe->dofpr_argidx ||
14241 (probe->dofpr_argidx + probe->dofpr_xargc) *
14242 arg_sec->dofs_entsize > arg_sec->dofs_size) {
14243 dtrace_dof_error(dof, "invalid args");
14244 return (-1);
14245 }
14246
14247 typeidx = probe->dofpr_nargv;
14248 typestr = strtab + probe->dofpr_nargv;
14249 for (k = 0; k < probe->dofpr_nargc; k++) {
14250 if (typeidx >= str_sec->dofs_size) {
14251 dtrace_dof_error(dof, "bad "
14252 "native argument type");
14253 return (-1);
14254 }
14255
14256 typesz = strlen(typestr) + 1;
14257 if (typesz > DTRACE_ARGTYPELEN) {
14258 dtrace_dof_error(dof, "native "
14259 "argument type too long");
14260 return (-1);
14261 }
14262 typeidx += typesz;
14263 typestr += typesz;
14264 }
14265
14266 typeidx = probe->dofpr_xargv;
14267 typestr = strtab + probe->dofpr_xargv;
14268 for (k = 0; k < probe->dofpr_xargc; k++) {
14269 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
14270 dtrace_dof_error(dof, "bad "
14271 "native argument index");
14272 return (-1);
14273 }
14274
14275 if (typeidx >= str_sec->dofs_size) {
14276 dtrace_dof_error(dof, "bad "
14277 "translated argument type");
14278 return (-1);
14279 }
14280
14281 typesz = strlen(typestr) + 1;
14282 if (typesz > DTRACE_ARGTYPELEN) {
14283 dtrace_dof_error(dof, "translated argument "
14284 "type too long");
14285 return (-1);
14286 }
14287
14288 typeidx += typesz;
14289 typestr += typesz;
14290 }
14291 }
14292
14293 return (0);
14294 }
14295
14296 static int
14297 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
14298 {
14299 dtrace_helpers_t *help;
14300 dtrace_vstate_t *vstate;
14301 dtrace_enabling_t *enab = NULL;
14302 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
14303 uintptr_t daddr = (uintptr_t)dof;
14304
14305 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14306
14307 if ((help = p->p_dtrace_helpers) == NULL)
14308 help = dtrace_helpers_create(p);
14309
14310 vstate = &help->dthps_vstate;
14311
14312 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
14313 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
14314 dtrace_dof_destroy(dof);
14315 return (rv);
14316 }
14317
14318 /*
14319 * Look for helper providers and validate their descriptions.
14320 */
14321 if (dhp != NULL) {
14322 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
14323 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
14324 dof->dofh_secoff + i * dof->dofh_secsize);
14325
14326 if (sec->dofs_type != DOF_SECT_PROVIDER)
14327 continue;
14328
14329 if (dtrace_helper_provider_validate(dof, sec) != 0) {
14330 dtrace_enabling_destroy(enab);
14331 dtrace_dof_destroy(dof);
14332 return (-1);
14333 }
14334
14335 nprovs++;
14336 }
14337 }
14338
14339 /*
14340 * Now we need to walk through the ECB descriptions in the enabling.
14341 */
14342 for (i = 0; i < enab->dten_ndesc; i++) {
14343 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
14344 dtrace_probedesc_t *desc = &ep->dted_probe;
14345
14346 /* APPLE NOTE: Darwin employs size-bounded string operations. */
14347 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
14348 continue;
14349
14350 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
14351 continue;
14352
14353 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
14354 continue;
14355
14356 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
14357 ep)) != 0) {
14358 /*
14359 * Adding this helper action failed -- we are now going
14360 * to rip out the entire generation and return failure.
14361 */
14362 (void) dtrace_helper_destroygen(p, help->dthps_generation);
14363 dtrace_enabling_destroy(enab);
14364 dtrace_dof_destroy(dof);
14365 return (-1);
14366 }
14367
14368 nhelpers++;
14369 }
14370
14371 if (nhelpers < enab->dten_ndesc)
14372 dtrace_dof_error(dof, "unmatched helpers");
14373
14374 gen = help->dthps_generation++;
14375 dtrace_enabling_destroy(enab);
14376
14377 if (dhp != NULL && nprovs > 0) {
14378 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
14379 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
14380 lck_mtx_unlock(&dtrace_lock);
14381 dtrace_helper_provider_register(p, help, dhp);
14382 lck_mtx_lock(&dtrace_lock);
14383
14384 destroy = 0;
14385 }
14386 }
14387
14388 if (destroy)
14389 dtrace_dof_destroy(dof);
14390
14391 return (gen);
14392 }
14393
14394 /*
14395 * APPLE NOTE: DTrace lazy dof implementation
14396 *
14397 * DTrace user static probes (USDT probes) and helper actions are loaded
14398 * in a process by processing dof sections. The dof sections are passed
14399 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
14400 * expensive to process dof for a process that will never use it. There
14401 * is a memory cost (allocating the providers/probes), and a cpu cost
14402 * (creating the providers/probes).
14403 *
14404 * To reduce this cost, we use "lazy dof". The normal procedure for
14405 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
14406 * block, and invoke dof_slurp_helper() on them. When "lazy dof" is
14407 * used, each process retains the dof_ioctl_data_t block, instead of
14408 * copying in the data it points to.
14409 *
14410 * The dof_ioctl_data_t blocks are managed as if they were the actual
14411 * processed dof; on fork the block is copied to the child, on exec and
14412 * exit the block is freed.
14413 *
14414 * If the process loads a library (or libraries) containing additional dof, the
14415 * new dof_ioctl_data_t is merged with the existing block.
14416 *
14417 * There are a few catches that make this slightly more difficult.
14418 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
14419 * identifier value for each dof in the block. In non-lazy dof terms,
14420 * this is the generation that dof was loaded in. If we hand back
14421 * a UID for a lazy dof, that same UID must be able to unload the
14422 * dof once it has become non-lazy. To meet this requirement, the
14423 * code that loads lazy dof requires that the UIDs for dof(s) in
14424 * the lazy dof be sorted in ascending order. It is okay to skip
14425 * UIDs; e.g., 1 -> 5 -> 6 is legal.
14426 *
14427 * Once a process has become non-lazy, it will stay non-lazy. All
14428 * future dof operations for that process will be non-lazy, even
14429 * if the dof mode transitions back to lazy.
14430 *
14431 * Always do lazy dof checks before non-lazy (i.e., in fork, exit, and exec).
14432 * That way if the lazy check fails due to transitioning to non-lazy, the
14433 * right thing is done with the newly faulted in dof.
14434 */
14435
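/*
 * To summarize how the generation travels (see dtrace_lazy_dofs_add() and
 * dtrace_lazy_dofs_proc_iterate_doit() below): while a dof_helper_t sits on
 * a process's lazy list, its dofhp_dof field is overloaded to carry the
 * generation; the original value is recovered from dofhp_addr when the dof
 * is finally faulted in:
 *
 *	lazy add:  ASSERT(dofhp_dof == dofhp_addr); dofhp_dof = generation;
 *	fault in:  generation = dofhp_dof; dofhp_dof = dofhp_addr;
 */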
14436 /*
14437 * This method is a bit squicky. It must handle:
14438 *
14439 * dof should not be lazy.
14440 * dof should have been handled lazily, but there was an error
14441 * dof was handled lazily, and needs to be freed.
14442 * dof was handled lazily, and must not be freed.
14443 *
14444 *
14445 * Returns EACCES if dof should be handled non-lazily.
14446 *
14447 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
14448 *
14449 * If the dofs data is claimed by this method, dofs_claimed will be set.
14450 * Callers should not free claimed dofs.
14451 */
14452 static int
14453 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
14454 {
14455 ASSERT(p);
14456 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
14457
14458 int rval = 0;
14459 *dofs_claimed = 0;
14460
14461 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14462
14463 /*
14464 * If we have lazy dof, dof mode better be LAZY_ON.
14465 */
14466 ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
14467 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14468 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
14469
14470 /*
14471 * Any existing helpers force non-lazy behavior.
14472 */
14473 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
14474 lck_mtx_lock(&p->p_dtrace_sprlock);
14475
14476 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
14477 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
14478 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
14479
14480 /*
14481 * Range check...
14482 */
14483 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
14484 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
14485 rval = EINVAL;
14486 goto unlock;
14487 }
14488
14489 /*
14490 * Each dof being added must be assigned a unique generation.
14491 */
14492 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
14493 for (i=0; i<incoming_dofs->dofiod_count; i++) {
14494 /*
14495 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
14496 */
14497 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
14498 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
14499 }
14500
14501
14502 if (existing_dofs) {
14503 /*
14504 * Merge the existing and incoming dofs
14505 */
14506 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
14507 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
14508
14509 bcopy(&existing_dofs->dofiod_helpers[0],
14510 &merged_dofs->dofiod_helpers[0],
14511 sizeof(dof_helper_t) * existing_dofs_count);
14512 bcopy(&incoming_dofs->dofiod_helpers[0],
14513 &merged_dofs->dofiod_helpers[existing_dofs_count],
14514 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
14515
14516 merged_dofs->dofiod_count = merged_dofs_count;
14517
14518 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
14519
14520 p->p_dtrace_lazy_dofs = merged_dofs;
14521 } else {
14522 /*
14523 * Claim the incoming dofs
14524 */
14525 *dofs_claimed = 1;
14526 p->p_dtrace_lazy_dofs = incoming_dofs;
14527 }
14528
14529 #if DEBUG
14530 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
14531 for (i=0; i<all_dofs->dofiod_count-1; i++) {
14532 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
14533 }
14534 #endif /* DEBUG */
14535
14536 unlock:
14537 lck_mtx_unlock(&p->p_dtrace_sprlock);
14538 } else {
14539 rval = EACCES;
14540 }
14541
14542 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14543
14544 return rval;
14545 }
14546
14547 /*
14548 * Returns:
14549 *
14550 * EINVAL: lazy dof is enabled, but the requested generation was not found.
14551 * EACCES: This removal needs to be handled non-lazily.
14552 */
14553 static int
14554 dtrace_lazy_dofs_remove(proc_t *p, int generation)
14555 {
14556 int rval = EINVAL;
14557
14558 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14559
14560 /*
14561 * If we have lazy dof, dof mode better be LAZY_ON.
14562 */
14563 ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
14564 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14565 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
14566
14567 /*
14568 * Any existing helpers force non-lazy behavior.
14569 */
14570 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
14571 lck_mtx_lock(&p->p_dtrace_sprlock);
14572
14573 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
14574
14575 if (existing_dofs) {
14576 int index, existing_dofs_count = existing_dofs->dofiod_count;
14577 for (index=0; index<existing_dofs_count; index++) {
14578 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
14579 dof_ioctl_data_t* removed_dofs = NULL;
14580
14581 /*
14582 * If there is only 1 dof, we'll delete it and swap in NULL.
14583 */
14584 if (existing_dofs_count > 1) {
14585 int removed_dofs_count = existing_dofs_count - 1;
14586 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
14587
14588 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
14589 removed_dofs->dofiod_count = removed_dofs_count;
14590
14591 /*
14592 * copy the remaining data.
14593 */
14594 if (index > 0) {
14595 bcopy(&existing_dofs->dofiod_helpers[0],
14596 &removed_dofs->dofiod_helpers[0],
14597 index * sizeof(dof_helper_t));
14598 }
14599
14600 if (index < existing_dofs_count-1) {
14601 bcopy(&existing_dofs->dofiod_helpers[index+1],
14602 &removed_dofs->dofiod_helpers[index],
14603 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
14604 }
14605 }
14606
14607 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
14608
14609 p->p_dtrace_lazy_dofs = removed_dofs;
14610
14611 rval = KERN_SUCCESS;
14612
14613 break;
14614 }
14615 }
14616
14617 #if DEBUG
14618 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
14619 if (all_dofs) {
14620 unsigned int i;
14621 for (i=0; i<all_dofs->dofiod_count-1; i++) {
14622 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
14623 }
14624 }
14625 #endif
14626
14627 }
14628
14629 lck_mtx_unlock(&p->p_dtrace_sprlock);
14630 } else {
14631 rval = EACCES;
14632 }
14633
14634 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14635
14636 return rval;
14637 }
14638
14639 void
14640 dtrace_lazy_dofs_destroy(proc_t *p)
14641 {
14642 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14643 lck_mtx_lock(&p->p_dtrace_sprlock);
14644
14645 /*
14646 * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
14647 * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
14648 * kern_exit.c and kern_exec.c.
14649 */
14650 ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON || p->p_lflag & P_LEXIT);
14651 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14652
14653 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
14654 p->p_dtrace_lazy_dofs = NULL;
14655
14656 lck_mtx_unlock(&p->p_dtrace_sprlock);
14657 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14658
14659 if (lazy_dofs) {
14660 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
14661 }
14662 }
14663
14664 void
14665 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
14666 {
14667 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
14668 lck_mtx_assert(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
14669 lck_mtx_assert(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
14670
14671 lck_rw_lock_shared(&dtrace_dof_mode_lock);
14672 lck_mtx_lock(&parent->p_dtrace_sprlock);
14673
14674 /*
14675 * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
14676 * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
14677 * kern_fork.c
14678 */
14679 ASSERT(parent->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
14680 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
14681 /*
14682 * In theory we should hold the child sprlock, but this is safe...
14683 */
14684 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
14685
14686 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
14687 dof_ioctl_data_t* child_dofs = NULL;
14688 if (parent_dofs) {
14689 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
14690 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
14691 bcopy(parent_dofs, child_dofs, parent_dofs_size);
14692 }
14693
14694 lck_mtx_unlock(&parent->p_dtrace_sprlock);
14695
14696 if (child_dofs) {
14697 lck_mtx_lock(&child->p_dtrace_sprlock);
14698 child->p_dtrace_lazy_dofs = child_dofs;
14699 lck_mtx_unlock(&child->p_dtrace_sprlock);
14700 }
14701
14702 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
14703 }
14704
14705 static int
14706 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
14707 {
14708 #pragma unused(ignored)
14709 /*
14710 * Okay to NULL test without taking the sprlock.
14711 */
14712 return p->p_dtrace_lazy_dofs != NULL;
14713 }
14714
14715 static int
14716 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
14717 {
14718 #pragma unused(ignored)
14719 /*
14720 * It is possible this process may exit during our attempt to
14721 * fault in the dof. We could fix this by holding locks longer,
14722 * but the errors are benign.
14723 */
14724 lck_mtx_lock(&p->p_dtrace_sprlock);
14725
14726 /*
14727 * In this case only, it is okay to have lazy dof when dof mode is DTRACE_DOF_MODE_LAZY_OFF
14728 */
14729 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
14730 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
14731
14732
14733 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
14734 p->p_dtrace_lazy_dofs = NULL;
14735
14736 lck_mtx_unlock(&p->p_dtrace_sprlock);
14737
14738 /*
14739 * Process each dof_helper_t
14740 */
14741 if (lazy_dofs != NULL) {
14742 unsigned int i;
14743 int rval;
14744
14745 for (i=0; i<lazy_dofs->dofiod_count; i++) {
14746 /*
14747 * When loading lazy dof, we depend on the generations being sorted in ascending order.
14748 */
14749 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
14750
14751 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
14752
14753 /*
14754 * We stored the generation in dofhp_dof. Save it, and restore the original value.
14755 */
14756 int generation = dhp->dofhp_dof;
14757 dhp->dofhp_dof = dhp->dofhp_addr;
14758
14759 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
14760
14761 if (dof != NULL) {
14762 dtrace_helpers_t *help;
14763
14764 lck_mtx_lock(&dtrace_lock);
14765
14766 /*
14767 * This must be done with the dtrace_lock held
14768 */
14769 if ((help = p->p_dtrace_helpers) == NULL)
14770 help = dtrace_helpers_create(p);
14771
14772 /*
14773 * If the generation value has been bumped, someone snuck in
14774 * when we released the dtrace lock. We have to dump this generation,
14775 * when we released the dtrace lock. We have to dump this generation;
14776 * there is no safe way to load it.
14777 if (help->dthps_generation <= generation) {
14778 help->dthps_generation = generation;
14779
14780 /*
14781 * dtrace_helper_slurp() takes responsibility for the dof --
14782 * it may free it now or it may save it and free it later.
14783 */
14784 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
14785 dtrace_dof_error(NULL, "returned value did not match expected generation");
14786 }
14787 }
14788
14789 lck_mtx_unlock(&dtrace_lock);
14790 }
14791 }
14792
14793 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
14794 }
14795
14796 return PROC_RETURNED;
14797 }
14798
14799 static dtrace_helpers_t *
14800 dtrace_helpers_create(proc_t *p)
14801 {
14802 dtrace_helpers_t *help;
14803
14804 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14805 ASSERT(p->p_dtrace_helpers == NULL);
14806
14807 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
14808 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
14809 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
14810
14811 p->p_dtrace_helpers = help;
14812 dtrace_helpers++;
14813
14814 return (help);
14815 }
14816
14817 static void
14818 dtrace_helpers_destroy(proc_t* p)
14819 {
14820 dtrace_helpers_t *help;
14821 dtrace_vstate_t *vstate;
14822 uint_t i;
14823
14824 lck_mtx_lock(&dtrace_lock);
14825
14826 ASSERT(p->p_dtrace_helpers != NULL);
14827 ASSERT(dtrace_helpers > 0);
14828
14829 help = p->p_dtrace_helpers;
14830 vstate = &help->dthps_vstate;
14831
14832 /*
14833 * We're now going to lose the help from this process.
14834 */
14835 p->p_dtrace_helpers = NULL;
14836 dtrace_sync();
14837
14838 /*
14839 * Destroy the helper actions.
14840 */
14841 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14842 dtrace_helper_action_t *h, *next;
14843
14844 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14845 next = h->dtha_next;
14846 dtrace_helper_action_destroy(h, vstate);
14847 h = next;
14848 }
14849 }
14850
14851 lck_mtx_unlock(&dtrace_lock);
14852
14853 /*
14854 * Destroy the helper providers.
14855 */
14856 if (help->dthps_maxprovs > 0) {
14857 lck_mtx_lock(&dtrace_meta_lock);
14858 if (dtrace_meta_pid != NULL) {
14859 ASSERT(dtrace_deferred_pid == NULL);
14860
14861 for (i = 0; i < help->dthps_nprovs; i++) {
14862 dtrace_helper_provider_remove(
14863 &help->dthps_provs[i]->dthp_prov, p->p_pid);
14864 }
14865 } else {
14866 lck_mtx_lock(&dtrace_lock);
14867 ASSERT(help->dthps_deferred == 0 ||
14868 help->dthps_next != NULL ||
14869 help->dthps_prev != NULL ||
14870 help == dtrace_deferred_pid);
14871
14872 /*
14873 * Remove the helper from the deferred list.
14874 */
14875 if (help->dthps_next != NULL)
14876 help->dthps_next->dthps_prev = help->dthps_prev;
14877 if (help->dthps_prev != NULL)
14878 help->dthps_prev->dthps_next = help->dthps_next;
14879 if (dtrace_deferred_pid == help) {
14880 dtrace_deferred_pid = help->dthps_next;
14881 ASSERT(help->dthps_prev == NULL);
14882 }
14883
14884 lck_mtx_unlock(&dtrace_lock);
14885 }
14886
14887 lck_mtx_unlock(&dtrace_meta_lock);
14888
14889 for (i = 0; i < help->dthps_nprovs; i++) {
14890 dtrace_helper_provider_destroy(help->dthps_provs[i]);
14891 }
14892
14893 kmem_free(help->dthps_provs, help->dthps_maxprovs *
14894 sizeof (dtrace_helper_provider_t *));
14895 }
14896
14897 lck_mtx_lock(&dtrace_lock);
14898
14899 dtrace_vstate_fini(&help->dthps_vstate);
14900 kmem_free(help->dthps_actions,
14901 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
14902 kmem_free(help, sizeof (dtrace_helpers_t));
14903
14904 --dtrace_helpers;
14905 lck_mtx_unlock(&dtrace_lock);
14906 }
14907
14908 static void
14909 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
14910 {
14911 dtrace_helpers_t *help, *newhelp;
14912 dtrace_helper_action_t *helper, *new, *last;
14913 dtrace_difo_t *dp;
14914 dtrace_vstate_t *vstate;
14915 uint_t i;
14916 int j, sz, hasprovs = 0;
14917
14918 lck_mtx_lock(&dtrace_lock);
14919 ASSERT(from->p_dtrace_helpers != NULL);
14920 ASSERT(dtrace_helpers > 0);
14921
14922 help = from->p_dtrace_helpers;
14923 newhelp = dtrace_helpers_create(to);
14924 ASSERT(to->p_dtrace_helpers != NULL);
14925
14926 newhelp->dthps_generation = help->dthps_generation;
14927 vstate = &newhelp->dthps_vstate;
14928
14929 /*
14930 * Duplicate the helper actions.
14931 */
14932 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14933 if ((helper = help->dthps_actions[i]) == NULL)
14934 continue;
14935
14936 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
14937 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
14938 KM_SLEEP);
14939 new->dtha_generation = helper->dtha_generation;
14940
14941 if ((dp = helper->dtha_predicate) != NULL) {
14942 dp = dtrace_difo_duplicate(dp, vstate);
14943 new->dtha_predicate = dp;
14944 }
14945
14946 new->dtha_nactions = helper->dtha_nactions;
14947 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
14948 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
14949
14950 for (j = 0; j < new->dtha_nactions; j++) {
14951 dtrace_difo_t *dpj = helper->dtha_actions[j];
14952
14953 ASSERT(dpj != NULL);
14954 dpj = dtrace_difo_duplicate(dpj, vstate);
14955 new->dtha_actions[j] = dpj;
14956 }
14957
14958 if (last != NULL) {
14959 last->dtha_next = new;
14960 } else {
14961 newhelp->dthps_actions[i] = new;
14962 }
14963
14964 last = new;
14965 }
14966 }
14967
14968 /*
14969 * Duplicate the helper providers and register them with the
14970 * DTrace framework.
14971 */
14972 if (help->dthps_nprovs > 0) {
14973 newhelp->dthps_nprovs = help->dthps_nprovs;
14974 newhelp->dthps_maxprovs = help->dthps_nprovs;
14975 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
14976 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14977 for (i = 0; i < newhelp->dthps_nprovs; i++) {
14978 newhelp->dthps_provs[i] = help->dthps_provs[i];
14979 newhelp->dthps_provs[i]->dthp_ref++;
14980 }
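		/*
		 * Note that, unlike the helper actions deep-copied via
		 * dtrace_difo_duplicate() above, provider descriptions are
		 * shared between parent and child by reference count.
		 */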
14981
14982 hasprovs = 1;
14983 }
14984
14985 lck_mtx_unlock(&dtrace_lock);
14986
14987 if (hasprovs)
14988 dtrace_helper_provider_register(to, newhelp, NULL);
14989 }
14990
14991 /*
14992 * DTrace Hook Functions
14993 */
14994
14995 /*
14996 * APPLE NOTE: dtrace_modctl_* routines for kext support.
14997 * Used to manipulate the modctl list within dtrace xnu.
14998 */
14999
15000 modctl_t *dtrace_modctl_list;
15001
15002 static void
15003 dtrace_modctl_add(struct modctl * newctl)
15004 {
15005 struct modctl *nextp, *prevp;
15006
15007 ASSERT(newctl != NULL);
15008 lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
15009
15010 // Insert new module at the front of the list.
15011
15012 newctl->mod_next = dtrace_modctl_list;
15013 dtrace_modctl_list = newctl;
15014
15015 /*
15016 * If a module exists with the same name, then that module
15017 * must have been unloaded with enabled probes. We will move
15018 * the unloaded module to the new module's stale chain and
15019 * then stop traversing the list.
15020 */
15021
15022 prevp = newctl;
15023 nextp = newctl->mod_next;
15024
15025 while (nextp != NULL) {
15026 if (nextp->mod_loaded) {
15027 /* This is a loaded module. Keep traversing. */
15028 prevp = nextp;
15029 nextp = nextp->mod_next;
15030 continue;
15031 }
15032 else {
15033 /* Found an unloaded module */
15034 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
15035 /* Names don't match. Keep traversing. */
15036 prevp = nextp;
15037 nextp = nextp->mod_next;
15038 continue;
15039 }
15040 else {
15041 /* We found a stale entry, move it. We're done. */
15042 prevp->mod_next = nextp->mod_next;
15043 newctl->mod_stale = nextp;
15044 nextp->mod_next = NULL;
15045 break;
15046 }
15047 }
15048 }
15049 }
15050
15051 static modctl_t *
15052 dtrace_modctl_lookup(struct kmod_info * kmod)
15053 {
15054 lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
15055
15056 struct modctl * ctl;
15057
15058 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
15059 if (ctl->mod_id == kmod->id)
15060 return(ctl);
15061 }
15062 return (NULL);
15063 }
15064
15065 /*
15066 * This routine is called from dtrace_module_unloaded().
15067 * It removes a modctl structure and its stale chain
15068 * from the kext shadow list.
15069 */
15070 static void
15071 dtrace_modctl_remove(struct modctl * ctl)
15072 {
15073 ASSERT(ctl != NULL);
15074 lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
15075 modctl_t *prevp, *nextp, *curp;
15076
15077 // Remove stale chain first
15078 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
15079 nextp = curp->mod_stale;
15080 /* There should NEVER be user symbols allocated at this point */
15081 ASSERT(curp->mod_user_symbols == NULL);
15082 kmem_free(curp, sizeof(modctl_t));
15083 }
15084
15085 prevp = NULL;
15086 curp = dtrace_modctl_list;
15087
15088 while (curp != ctl) {
15089 prevp = curp;
15090 curp = curp->mod_next;
15091 }
15092
15093 if (prevp != NULL) {
15094 prevp->mod_next = ctl->mod_next;
15095 }
15096 else {
15097 dtrace_modctl_list = ctl->mod_next;
15098 }
15099
15100 /* There should NEVER be user symbols allocated at this point */
15101 ASSERT(ctl->mod_user_symbols == NULL);
15102
15103 kmem_free (ctl, sizeof(modctl_t));
15104 }
15105
15106 /*
15107 * APPLE NOTE: The kext loader will call dtrace_module_loaded
15108 * when the kext is loaded in memory, but before calling the
15109 * kext's start routine.
15110 *
15111 * Return 0 on success
15112 * Return -1 on failure
15113 */
15114
15115 static int
15116 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
15117 {
15118 dtrace_provider_t *prv;
15119
15120 /*
15121 * If kernel symbols have been disabled, return immediately.
15122 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, so it is safe to test without holding locks.
15123 */
15124 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
15125 return 0;
15126
15127 struct modctl *ctl = NULL;
15128 if (!kmod || kmod->address == 0 || kmod->size == 0)
15129 return(-1);
15130
15131 lck_mtx_lock(&dtrace_provider_lock);
15132 lck_mtx_lock(&mod_lock);
15133
15134 /*
15135 * Have we seen this kext before?
15136 */
15137
15138 ctl = dtrace_modctl_lookup(kmod);
15139
15140 if (ctl != NULL) {
15141 /* bail... we already have this kext in the modctl list */
15142 lck_mtx_unlock(&mod_lock);
15143 lck_mtx_unlock(&dtrace_provider_lock);
15144 if (dtrace_err_verbose)
15145 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
15146 return(-1);
15147 }
15148 else {
15149 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
15150 if (ctl == NULL) {
15151 if (dtrace_err_verbose)
15152 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
15153 lck_mtx_unlock(&mod_lock);
15154 lck_mtx_unlock(&dtrace_provider_lock);
15155 return (-1);
15156 }
15157 ctl->mod_next = NULL;
15158 ctl->mod_stale = NULL;
15159 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
15160 ctl->mod_loadcnt = kmod->id;
15161 ctl->mod_nenabled = 0;
15162 ctl->mod_address = kmod->address;
15163 ctl->mod_size = kmod->size;
15164 ctl->mod_id = kmod->id;
15165 ctl->mod_loaded = 1;
15166 ctl->mod_flags = 0;
15167 ctl->mod_user_symbols = NULL;
15168
15169 /*
15170 * Find the UUID for this module, if it has one
15171 */
15172 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
15173 struct load_command* load_cmd = (struct load_command *)&header[1];
15174 uint32_t i;
15175 for (i = 0; i < header->ncmds; i++) {
15176 if (load_cmd->cmd == LC_UUID) {
15177 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
15178 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
15179 ctl->mod_flags |= MODCTL_HAS_UUID;
15180 break;
15181 }
15182 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
15183 }
15184
15185 if (ctl->mod_address == g_kernel_kmod_info.address) {
15186 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
15187 }
15188 }
15189 dtrace_modctl_add(ctl);
15190
15191 /*
15192 * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
15193 */
15194 lck_mtx_lock(&dtrace_lock);
15195
15196 /*
15197 * DTrace must decide if it will instrument modules lazily via
15198 * userspace symbols (default mode), or instrument immediately via
15199 * kernel symbols (non-default mode)
15200 * kernel symbols (non-default mode).
15201 * When in default/lazy mode, DTrace will only support modules
15202 * built with a valid UUID.
15203 *
15204 * Overriding the default can be done explicitly in one of
15205 * the following two ways.
15206 *
15207 * A module can force symbols from kernel space using the plist key,
15208 * OSBundleForceDTraceInit (see kmod.h). If this per-kext state is set,
15209 * we fall through and instrument this module now.
15210 *
15211 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
15212 * from kernel space (see dtrace_impl.h). If this system state is set
15213 * to a non-userspace mode, we fall through and instrument the module now.
15214 */
15215
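	/*
	 * That is: lazy (userspace-symbol) instrumentation is used only when
	 * the global mode is DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE and this
	 * kext has not set KMOD_DTRACE_FORCE_INIT.
	 */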
15216 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
15217 (!(flag & KMOD_DTRACE_FORCE_INIT)))
15218 {
15219 /* We will instrument the module lazily -- this is the default */
15220 lck_mtx_unlock(&dtrace_lock);
15221 lck_mtx_unlock(&mod_lock);
15222 lck_mtx_unlock(&dtrace_provider_lock);
15223 return 0;
15224 }
15225
15226 /* We will instrument the module immediately using kernel symbols */
15227 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
15228
15229 lck_mtx_unlock(&dtrace_lock);
15230
15231 /*
15232 * We're going to call each provider's per-module provide operation
15233 * specifying only this module.
15234 */
15235 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
15236 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
15237
15238 /*
15239 * APPLE NOTE: The contract with the kext loader is that once this function
15240 * has completed, it may delete kernel symbols at will.
15241 * We must set this while still holding the mod_lock.
15242 */
15243 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
15244
15245 lck_mtx_unlock(&mod_lock);
15246 lck_mtx_unlock(&dtrace_provider_lock);
15247
15248 /*
15249 * If we have any retained enablings, we need to match against them.
15250 * Enabling probes requires that cpu_lock be held, and we cannot hold
15251 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
15252 * module. (In particular, this happens when loading scheduling
15253 * classes.) So if we have any retained enablings, we need to dispatch
15254 * our task queue to do the match for us.
15255 */
15256 lck_mtx_lock(&dtrace_lock);
15257
15258 if (dtrace_retained == NULL) {
15259 lck_mtx_unlock(&dtrace_lock);
15260 return 0;
15261 }
15262
15263 /* APPLE NOTE!
15264 *
15265 * The cpu_lock mentioned above is only held by dtrace code; the rest of Apple's xnu never
15266 * actually holds it. Thus the comment above is invalid; we can directly invoke
15267 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
15268 * the delay call as well.
15269 */
15270 lck_mtx_unlock(&dtrace_lock);
15271
15272 dtrace_enabling_matchall();
15273
15274 return 0;
15275 }
15276
15277 /*
15278 * Return 0 on success
15279 * Return -1 on failure
15280 */
15281 static int
15282 dtrace_module_unloaded(struct kmod_info *kmod)
15283 {
15284 dtrace_probe_t template, *probe, *first, *next;
15285 dtrace_provider_t *prov;
15286 struct modctl *ctl = NULL;
15287 struct modctl *syncctl = NULL;
15288 struct modctl *nextsyncctl = NULL;
15289 int syncmode = 0;
15290
15291 lck_mtx_lock(&dtrace_provider_lock);
15292 lck_mtx_lock(&mod_lock);
15293 lck_mtx_lock(&dtrace_lock);
15294
15295 if (kmod == NULL) {
15296 syncmode = 1;
15297 }
15298 else {
15299 ctl = dtrace_modctl_lookup(kmod);
15300 if (ctl == NULL)
15301 {
15302 lck_mtx_unlock(&dtrace_lock);
15303 lck_mtx_unlock(&mod_lock);
15304 lck_mtx_unlock(&dtrace_provider_lock);
15305 return (-1);
15306 }
15307 ctl->mod_loaded = 0;
15308 ctl->mod_address = 0;
15309 ctl->mod_size = 0;
15310 }
15311
15312 if (dtrace_bymod == NULL) {
15313 /*
15314 * The DTrace module is loaded (obviously) but not attached;
15315 * we don't have any work to do.
15316 */
15317 if (ctl != NULL)
15318 (void)dtrace_modctl_remove(ctl);
15319 lck_mtx_unlock(&dtrace_lock);
15320 lck_mtx_unlock(&mod_lock);
15321 lck_mtx_unlock(&dtrace_provider_lock);
15322 return(0);
15323 }
15324
15325 /* Syncmode set means we target and traverse the entire modctl list. */
15326 if (syncmode)
15327 nextsyncctl = dtrace_modctl_list;
15328
15329 syncloop:
15330 if (syncmode)
15331 {
15332 /* find a stale modctl struct */
15333 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
15334 if (syncctl->mod_address == 0)
15335 break;
15336 }
15337 if (syncctl==NULL)
15338 {
15339 /* We have no more work to do */
15340 lck_mtx_unlock(&dtrace_lock);
15341 lck_mtx_unlock(&mod_lock);
15342 lck_mtx_unlock(&dtrace_provider_lock);
15343 return(0);
15344 }
15345 else {
15346 /* keep track of next syncctl in case this one is removed */
15347 nextsyncctl = syncctl->mod_next;
15348 ctl = syncctl;
15349 }
15350 }
15351
15352 template.dtpr_mod = ctl->mod_modname;
15353
15354 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
15355 probe != NULL; probe = probe->dtpr_nextmod) {
15356 if (probe->dtpr_ecb != NULL) {
15357 /*
15358 * This shouldn't _actually_ be possible -- we're
15359 * unloading a module that has an enabled probe in it.
15360 * (It's normally up to the provider to make sure that
15361 * this can't happen.) However, because dtps_enable()
15362 * doesn't have a failure mode, there can be an
15363 * enable/unload race. Upshot: we don't want to
15364 * assert, but we're not going to disable the
15365 * probe, either.
15366 */
15367
15368
15369 if (syncmode) {
15370 /* We're syncing, let's look at next in list */
15371 goto syncloop;
15372 }
15373
15374 lck_mtx_unlock(&dtrace_lock);
15375 lck_mtx_unlock(&mod_lock);
15376 lck_mtx_unlock(&dtrace_provider_lock);
15377
15378 if (dtrace_err_verbose) {
15379 cmn_err(CE_WARN, "unloaded module '%s' had "
15380 "enabled probes", ctl->mod_modname);
15381 }
15382 return(-1);
15383 }
15384 }
15385
15386 probe = first;
15387
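	/*
	 * Unhook each of the module's probes from the global hashes and the
	 * probe array, threading them onto a private list (headed by 'first')
	 * so they can be destroyed once dtrace_sync() completes below.
	 */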
15388 for (first = NULL; probe != NULL; probe = next) {
15389 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
15390
15391 dtrace_probes[probe->dtpr_id - 1] = NULL;
15392 probe->dtpr_provider->dtpv_probe_count--;
15393
15394 next = probe->dtpr_nextmod;
15395 dtrace_hash_remove(dtrace_bymod, probe);
15396 dtrace_hash_remove(dtrace_byfunc, probe);
15397 dtrace_hash_remove(dtrace_byname, probe);
15398
15399 if (first == NULL) {
15400 first = probe;
15401 probe->dtpr_nextmod = NULL;
15402 } else {
15403 probe->dtpr_nextmod = first;
15404 first = probe;
15405 }
15406 }
15407
15408 /*
15409 * We've removed all of the module's probes from the hash chains and
15410 * from the probe array. Now issue a dtrace_sync() to be sure that
15411 * everyone has cleared out from any probe array processing.
15412 */
15413 dtrace_sync();
15414
15415 for (probe = first; probe != NULL; probe = first) {
15416 first = probe->dtpr_nextmod;
15417 prov = probe->dtpr_provider;
15418 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
15419 probe->dtpr_arg);
15420 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
15421 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
15422 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
15423 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
15424
15425 zfree(dtrace_probe_t_zone, probe);
15426 }
15427
15428 dtrace_modctl_remove(ctl);
15429
15430 if (syncmode)
15431 goto syncloop;
15432
15433 lck_mtx_unlock(&dtrace_lock);
15434 lck_mtx_unlock(&mod_lock);
15435 lck_mtx_unlock(&dtrace_provider_lock);
15436
15437 return(0);
15438 }
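/*
 * A minimal sketch (hypothetical names, not part of this file) of the
 * unlink / dtrace_sync() / destroy discipline used above: first make the
 * object unreachable from every lookup structure, then wait for all CPUs
 * to drain out of probe context, and only then tear it down.
 */
#if 0	/* illustrative only */
static void
example_retire_object(example_t *obj)
{
	/* 1. Unpublish: after this, no new lookup can find obj. */
	example_hash_remove(&example_hash, obj);

	/* 2. Wait for any in-flight probe-context users to drain. */
	dtrace_sync();

	/* 3. Now it is safe to free the object. */
	kmem_free(obj, sizeof (*obj));
}
#endif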
15439
15440 void
15441 dtrace_suspend(void)
15442 {
15443 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
15444 }
15445
15446 void
15447 dtrace_resume(void)
15448 {
15449 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
15450 }
15451
15452 static int
15453 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
15454 {
15455 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15456 lck_mtx_lock(&dtrace_lock);
15457
15458 switch (what) {
15459 case CPU_CONFIG: {
15460 dtrace_state_t *state;
15461 dtrace_optval_t *opt, rs, c;
15462
15463 /*
15464 * For now, we only allocate a new buffer for anonymous state.
15465 */
15466 if ((state = dtrace_anon.dta_state) == NULL)
15467 break;
15468
15469 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
15470 break;
15471
15472 opt = state->dts_options;
15473 c = opt[DTRACEOPT_CPU];
15474
15475 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
15476 break;
15477
15478 /*
15479 * Regardless of what the actual policy is, we're going to
15480 * temporarily set our resize policy to be manual. We're
15481 * also going to temporarily set our CPU option to denote
15482 * the newly configured CPU.
15483 */
15484 rs = opt[DTRACEOPT_BUFRESIZE];
15485 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
15486 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
15487
15488 (void) dtrace_state_buffers(state);
15489
15490 opt[DTRACEOPT_BUFRESIZE] = rs;
15491 opt[DTRACEOPT_CPU] = c;
15492
15493 break;
15494 }
15495
15496 case CPU_UNCONFIG:
15497 /*
15498 * We don't free the buffer in the CPU_UNCONFIG case. (The
15499 * buffer will be freed when the consumer exits.)
15500 */
15501 break;
15502
15503 default:
15504 break;
15505 }
15506
15507 lck_mtx_unlock(&dtrace_lock);
15508 return (0);
15509 }
15510
15511 static void
15512 dtrace_cpu_setup_initial(processorid_t cpu)
15513 {
15514 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
15515 }
15516
15517 static void
15518 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
15519 {
15520 if (dtrace_toxranges >= dtrace_toxranges_max) {
15521 int osize, nsize;
15522 dtrace_toxrange_t *range;
15523
15524 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15525
15526 if (osize == 0) {
15527 ASSERT(dtrace_toxrange == NULL);
15528 ASSERT(dtrace_toxranges_max == 0);
15529 dtrace_toxranges_max = 1;
15530 } else {
15531 dtrace_toxranges_max <<= 1;
15532 }
15533
15534 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
15535 range = kmem_zalloc(nsize, KM_SLEEP);
15536
15537 if (dtrace_toxrange != NULL) {
15538 ASSERT(osize != 0);
15539 bcopy(dtrace_toxrange, range, osize);
15540 kmem_free(dtrace_toxrange, osize);
15541 }
15542
15543 dtrace_toxrange = range;
15544 }
15545
15546 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
15547 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
15548
15549 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
15550 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
15551 dtrace_toxranges++;
15552 }
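/*
 * dtrace_toxrange_add() is handed to dtrace_toxic_ranges() in dtrace_attach()
 * below. A minimal sketch (hypothetical routine and addresses, not part of
 * this file) of how a platform layer might feed ranges into that callback:
 */
#if 0	/* illustrative only */
static void
example_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
{
	/* A hypothetical device aperture that DIF loads must never touch. */
	func((uintptr_t)0xffffff8000000000ULL, (uintptr_t)0xffffff8010000000ULL);
}
#endif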
15553
15554 /*
15555 * DTrace Driver Cookbook Functions
15556 */
15557 /*ARGSUSED*/
15558 static int
15559 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
15560 {
15561 #pragma unused(cmd) /* __APPLE__ */
15562 dtrace_provider_id_t id;
15563 dtrace_state_t *state = NULL;
15564 dtrace_enabling_t *enab;
15565
15566 lck_mtx_lock(&cpu_lock);
15567 lck_mtx_lock(&dtrace_provider_lock);
15568 lck_mtx_lock(&dtrace_lock);
15569
15570 if (ddi_soft_state_init(&dtrace_softstate,
15571 sizeof (dtrace_state_t), 0) != 0) {
15572 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
15573 lck_mtx_unlock(&dtrace_lock);
15574 lck_mtx_unlock(&dtrace_provider_lock);
15575 lck_mtx_unlock(&cpu_lock);
15576 return (DDI_FAILURE);
15577 }
15578
15579 /* Darwin uses the BSD cloning device driver to automagically obtain the minor device number. */
15580
15581 ddi_report_dev(devi);
15582 dtrace_devi = devi;
15583
15584 dtrace_modload = dtrace_module_loaded;
15585 dtrace_modunload = dtrace_module_unloaded;
15586 dtrace_cpu_init = dtrace_cpu_setup_initial;
15587 dtrace_helpers_cleanup = dtrace_helpers_destroy;
15588 dtrace_helpers_fork = dtrace_helpers_duplicate;
15589 dtrace_cpustart_init = dtrace_suspend;
15590 dtrace_cpustart_fini = dtrace_resume;
15591 dtrace_debugger_init = dtrace_suspend;
15592 dtrace_debugger_fini = dtrace_resume;
15593
15594 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
15595
15596 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15597
15598 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
15599 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
15600 dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
15601 UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
15602 VM_SLEEP | VMC_IDENTIFIER);
15603 dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
15604 1, INT_MAX, 0);
15605
15606 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
15607 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
15608 NULL, NULL, NULL, NULL, NULL, 0);
15609
15610 lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15611 dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
15612 offsetof(dtrace_probe_t, dtpr_nextmod),
15613 offsetof(dtrace_probe_t, dtpr_prevmod));
15614
15615 dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
15616 offsetof(dtrace_probe_t, dtpr_nextfunc),
15617 offsetof(dtrace_probe_t, dtpr_prevfunc));
15618
15619 dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
15620 offsetof(dtrace_probe_t, dtpr_nextname),
15621 offsetof(dtrace_probe_t, dtpr_prevname));
15622
15623 if (dtrace_retain_max < 1) {
15624 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
15625 "setting to 1", dtrace_retain_max);
15626 dtrace_retain_max = 1;
15627 }
15628
15629 /*
15630 * Now discover our toxic ranges.
15631 */
15632 dtrace_toxic_ranges(dtrace_toxrange_add);
15633
15634 /*
15635 * Before we register ourselves as a provider to our own framework,
15636 * we would like to assert that dtrace_provider is NULL -- but that's
15637 * not true if we were loaded as a dependency of a DTrace provider.
15638 * Once we've registered, we can assert that dtrace_provider is our
15639 * pseudo provider.
15640 */
15641 (void) dtrace_register("dtrace", &dtrace_provider_attr,
15642 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
15643
15644 ASSERT(dtrace_provider != NULL);
15645 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
15646
15647 #if defined (__x86_64__)
15648 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
15649 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
15650 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
15651 dtrace_provider, NULL, NULL, "END", 0, NULL);
15652 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
15653 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
15654 #else
15655 #error Unknown Architecture
15656 #endif
15657
15658 dtrace_anon_property();
15659 lck_mtx_unlock(&cpu_lock);
15660
15661 /*
15662 * If DTrace helper tracing is enabled, we need to allocate the
15663 * trace buffer and initialize the values.
15664 */
15665 if (dtrace_helptrace_enabled) {
15666 ASSERT(dtrace_helptrace_buffer == NULL);
15667 dtrace_helptrace_buffer =
15668 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
15669 dtrace_helptrace_next = 0;
15670 }
15671
15672 /*
15673 * If there are already providers, we must ask them to provide their
15674 * probes, and then match any anonymous enabling against them. Note
15675 * that there should be no other retained enablings at this point;
15676 * the only retained enabling should be the anonymous
15677 * enabling.
15678 */
15679 if (dtrace_anon.dta_enabling != NULL) {
15680 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
15681
15682 /*
15683 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
15684 */
15685 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
15686 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
15687 }
15688
15689 dtrace_enabling_provide(NULL);
15690 state = dtrace_anon.dta_state;
15691
15692 /*
15693 * We couldn't hold cpu_lock across the above call to
15694 * dtrace_enabling_provide(), but we must hold it to actually
15695 * enable the probes. We have to drop all of our locks, pick
15696 * up cpu_lock, and regain our locks before matching the
15697 * retained anonymous enabling.
15698 */
15699 lck_mtx_unlock(&dtrace_lock);
15700 lck_mtx_unlock(&dtrace_provider_lock);
15701
15702 lck_mtx_lock(&cpu_lock);
15703 lck_mtx_lock(&dtrace_provider_lock);
15704 lck_mtx_lock(&dtrace_lock);
15705
15706 if ((enab = dtrace_anon.dta_enabling) != NULL)
15707 (void) dtrace_enabling_match(enab, NULL);
15708
15709 lck_mtx_unlock(&cpu_lock);
15710 }
15711
15712 lck_mtx_unlock(&dtrace_lock);
15713 lck_mtx_unlock(&dtrace_provider_lock);
15714
15715 if (state != NULL) {
15716 /*
15717 * If we created any anonymous state, set it going now.
15718 */
15719 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
15720 }
15721
15722 return (DDI_SUCCESS);
15723 }
15724
15725 /*ARGSUSED*/
15726 static int
15727 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
15728 {
15729 #pragma unused(flag, otyp)
15730 dtrace_state_t *state;
15731 uint32_t priv;
15732 uid_t uid;
15733 zoneid_t zoneid;
15734 int rv;
15735
15736 /* APPLE: Darwin puts Helper on its own major device. */
15737
15738 /*
15739 * If no DTRACE_PRIV_* bits are set in the credential, then the
15740 * caller lacks sufficient permission to do anything with DTrace.
15741 */
15742 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
15743 if (priv == DTRACE_PRIV_NONE)
15744 return (EACCES);
15745
15746 /*
15747 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
15748 * It certainly can't be later than now!
15749 */
15750 fasttrap_init();
15751
15752 /*
15753 * Ask all providers to provide all their probes.
15754 */
15755 lck_mtx_lock(&dtrace_provider_lock);
15756 dtrace_probe_provide(NULL, NULL);
15757 lck_mtx_unlock(&dtrace_provider_lock);
15758
15759 lck_mtx_lock(&cpu_lock);
15760 lck_mtx_lock(&dtrace_lock);
15761 dtrace_opens++;
15762 dtrace_membar_producer();
15763
15764 /*
15765 * If the kernel debugger is active (that is, if the kernel debugger
15766 * modified text in some way), we won't allow the open.
15767 */
15768 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15769 dtrace_opens--;
15770 lck_mtx_unlock(&dtrace_lock);
15771 lck_mtx_unlock(&cpu_lock);
15772 return (EBUSY);
15773 }
15774
15775 rv = dtrace_state_create(devp, cred_p, &state);
15776 lck_mtx_unlock(&cpu_lock);
15777
15778 if (rv != 0 || state == NULL) {
15779 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15780 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15781 lck_mtx_unlock(&dtrace_lock);
15782 /* propagate EAGAIN or ERESTART */
15783 return (rv);
15784 }
15785
15786 lck_mtx_unlock(&dtrace_lock);
15787
15788 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
15789
15790 /*
15791 * If we are currently lazy, transition states.
15792 *
15793 * Unlike dtrace_close, we do not need to check the
15794 * value of dtrace_opens, as any positive value (and
15795 * we count as 1) means we transition states.
15796 */
15797 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
15798 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
15799
15800 /*
15801 * Iterate all existing processes and load lazy dofs.
15802 */
15803 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
15804 dtrace_lazy_dofs_proc_iterate_doit,
15805 NULL,
15806 dtrace_lazy_dofs_proc_iterate_filter,
15807 NULL);
15808 }
15809
15810 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
15811
15812 /*
15813 * Update kernel symbol state.
15814 *
15815 * We must own the provider and dtrace locks.
15816 *
15817 * NOTE! It may appear that setting this value this late, after
15818 * dtrace_probe_provide, opens a race. However, any kext loaded after the
15819 * call to probe provide and before we set LAZY_OFF will be marked as
15820 * eligible for symbols from userspace. The same dtrace that is currently
15821 * calling dtrace_open() (this call!) will get a list of kexts needing
15822 * symbols and fill them in, thus closing the race window.
15823 *
15824 * We want to set this value only after it is certain it will succeed, as
15825 * this significantly reduces the complexity of error exits.
15826 */
15827 lck_mtx_lock(&dtrace_lock);
15828 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
15829 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
15830 }
15831 lck_mtx_unlock(&dtrace_lock);
15832
15833 return (0);
15834 }
15835
15836 /*ARGSUSED*/
15837 static int
15838 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
15839 {
15840 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
15841 minor_t minor = getminor(dev);
15842 dtrace_state_t *state;
15843
15844 /* APPLE NOTE: Darwin puts Helper on its own major device. */
15845
15846 state = ddi_get_soft_state(dtrace_softstate, minor);
15847
15848 lck_mtx_lock(&cpu_lock);
15849 lck_mtx_lock(&dtrace_lock);
15850
15851 if (state->dts_anon) {
15852 /*
15853 * There is anonymous state. Destroy that first.
15854 */
15855 ASSERT(dtrace_anon.dta_state == NULL);
15856 dtrace_state_destroy(state->dts_anon);
15857 }
15858
15859 dtrace_state_destroy(state);
15860 ASSERT(dtrace_opens > 0);
15861
15862 /*
15863 * Only relinquish control of the kernel debugger interface when there
15864 * are no consumers and no anonymous enablings.
15865 */
15866 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
15867 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
15868
15869 lck_mtx_unlock(&dtrace_lock);
15870 lck_mtx_unlock(&cpu_lock);
15871
15872 /*
15873 * Lock ordering requires the dof mode lock be taken before
15874 * the dtrace_lock.
15875 */
15876 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
15877 lck_mtx_lock(&dtrace_lock);
15878
15879 if (dtrace_opens == 0) {
15880 /*
15881 * If we are currently lazy-off, and this is the last close, transition to
15882 * lazy state.
15883 */
15884 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
15885 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
15886 }
15887
15888 /*
15889 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
15890 */
15891 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
15892 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
15893 }
15894 }
15895
15896 lck_mtx_unlock(&dtrace_lock);
15897 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
15898
15899 /*
15900 * Kext probes may be retained past the end of the kext's lifespan. The
15901 * probes are kept until the last reference to them has been removed.
15902 * Since closing an active dtrace context is likely to drop that last reference,
15903 * let's take a shot at cleaning out the orphaned probes now.
15904 */
15905 dtrace_module_unloaded(NULL);
15906
15907 return (0);
15908 }
15909
15910 /*ARGSUSED*/
15911 static int
15912 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
15913 {
15914 #pragma unused(rv)
15915 /*
15916 * Safe to check this outside the dof mode lock
15917 */
15918 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
15919 return KERN_SUCCESS;
15920
15921 switch (cmd) {
15922 case DTRACEHIOC_ADDDOF:
15923 {
15924 dof_helper_t *dhp = NULL;
15925 size_t dof_ioctl_data_size;
15926 dof_ioctl_data_t* multi_dof;
15927 unsigned int i;
15928 int rval = 0;
15929 user_addr_t user_address = *(user_addr_t*)arg;
15930 uint64_t dof_count;
15931 int multi_dof_claimed = 0;
15932 proc_t* p = current_proc();
15933
15934 /*
15935 * Read the number of DOF sections being passed in.
15936 */
15937 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
15938 &dof_count,
15939 sizeof(dof_count))) {
15940 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
15941 return (EFAULT);
15942 }
15943
15944 /*
15945 * Range check the count.
15946 */
15947 if (dof_count == 0 || dof_count > 1024) {
15948 dtrace_dof_error(NULL, "dofiod_count is not valid");
15949 return (EINVAL);
15950 }
15951
15952 /*
15953 * Allocate a correctly sized structure and copyin the data.
15954 */
15955 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
15956 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
15957 return (ENOMEM);
15958
15959 /* NOTE! We can no longer exit this method via return */
15960 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
15961 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
15962 rval = EFAULT;
15963 goto cleanup;
15964 }
15965
15966 /*
15967 * Check that the count didn't change between the first copyin and the second.
15968 */
15969 if (multi_dof->dofiod_count != dof_count) {
15970 rval = EINVAL;
15971 goto cleanup;
15972 }
15973
15974 /*
15975 * Try to process lazily first.
15976 */
15977 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
15978
15979 /*
15980 * If rval is EACCES, we must be non-lazy.
15981 */
15982 if (rval == EACCES) {
15983 rval = 0;
15984 /*
15985 * Process each dof_helper_t
15986 */
15987 i = 0;
15988 do {
15989 dhp = &multi_dof->dofiod_helpers[i];
15990
15991 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
15992
15993 if (dof != NULL) {
15994 lck_mtx_lock(&dtrace_lock);
15995
15996 /*
15997 * dtrace_helper_slurp() takes responsibility for the dof --
15998 * it may free it now or it may save it and free it later.
15999 */
16000 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
16001 rval = EINVAL;
16002 }
16003
16004 lck_mtx_unlock(&dtrace_lock);
16005 }
16006 } while (++i < multi_dof->dofiod_count && rval == 0);
16007 }
16008
16009 /*
16010 * We need to copyout the multi_dof struct, because it contains
16011 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
16012 *
16013 * This could certainly be better optimized.
16014 */
16015 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
16016 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
16017 /* Don't overwrite pre-existing error code */
16018 if (rval == 0) rval = EFAULT;
16019 }
16020
16021 cleanup:
16022 /*
16023 * If we had to allocate struct memory, free it.
16024 */
16025 if (multi_dof != NULL && !multi_dof_claimed) {
16026 kmem_free(multi_dof, dof_ioctl_data_size);
16027 }
16028
16029 return rval;
16030 }
16031
16032 case DTRACEHIOC_REMOVE: {
16033 int generation = *(int*)arg;
16034 proc_t* p = current_proc();
16035
16036 /*
16037 * Try lazy first.
16038 */
16039 int rval = dtrace_lazy_dofs_remove(p, generation);
16040
16041 /*
16042 * EACCES means non-lazy
16043 */
16044 if (rval == EACCES) {
16045 lck_mtx_lock(&dtrace_lock);
16046 rval = dtrace_helper_destroygen(p, generation);
16047 lck_mtx_unlock(&dtrace_lock);
16048 }
16049
16050 return (rval);
16051 }
16052
16053 default:
16054 break;
16055 }
16056
16057 return ENOTTY;
16058 }
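/*
 * DTRACEHIOC_ADDDOF above (like DTRACEIOC_MODUUIDSLIST and
 * DTRACEIOC_PROVMODSYMS below) follows a double-fetch discipline: copy in
 * just the count, size the allocation from it, copy in the full structure,
 * then verify that the embedded count still matches the first fetch so a
 * user thread racing with the ioctl cannot induce an out-of-bounds walk.
 * A minimal sketch with hypothetical names (not part of this file):
 */
#if 0	/* illustrative only */
static int
example_copyin_counted(user_addr_t uaddr, example_list_t **listp)
{
	uint64_t count;
	size_t size;
	example_list_t *list;

	/* First fetch: just the count. */
	if (copyin(uaddr + offsetof(example_list_t, el_count),
	    &count, sizeof (count)))
		return (EFAULT);
	if (count == 0 || count > EXAMPLE_MAX_COUNT)
		return (EINVAL);

	size = EXAMPLE_LIST_SIZE(count);
	if ((list = kmem_alloc(size, KM_SLEEP)) == NULL)
		return (ENOMEM);

	/* Second fetch: the whole structure. */
	if (copyin(uaddr, list, size) != 0) {
		kmem_free(list, size);
		return (EFAULT);
	}

	/* Reject if userspace changed the count between the two fetches. */
	if (list->el_count != count) {
		kmem_free(list, size);
		return (EINVAL);
	}

	*listp = list;
	return (0);
}
#endif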
16059
16060 /*ARGSUSED*/
16061 static int
16062 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
16063 {
16064 #pragma unused(md)
16065 minor_t minor = getminor(dev);
16066 dtrace_state_t *state;
16067 int rval;
16068
16069 /* Darwin puts Helper on its own major device. */
16070
16071 state = ddi_get_soft_state(dtrace_softstate, minor);
16072
16073 if (state->dts_anon) {
16074 ASSERT(dtrace_anon.dta_state == NULL);
16075 state = state->dts_anon;
16076 }
16077
16078 switch (cmd) {
16079 case DTRACEIOC_PROVIDER: {
16080 dtrace_providerdesc_t pvd;
16081 dtrace_provider_t *pvp;
16082
16083 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
16084 return (EFAULT);
16085
16086 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
16087 lck_mtx_lock(&dtrace_provider_lock);
16088
16089 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
16090 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
16091 break;
16092 }
16093
16094 lck_mtx_unlock(&dtrace_provider_lock);
16095
16096 if (pvp == NULL)
16097 return (ESRCH);
16098
16099 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
16100 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
16101 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
16102 return (EFAULT);
16103
16104 return (0);
16105 }
16106
16107 case DTRACEIOC_EPROBE: {
16108 dtrace_eprobedesc_t epdesc;
16109 dtrace_ecb_t *ecb;
16110 dtrace_action_t *act;
16111 void *buf;
16112 size_t size;
16113 uintptr_t dest;
16114 int nrecs;
16115
16116 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
16117 return (EFAULT);
16118
16119 lck_mtx_lock(&dtrace_lock);
16120
16121 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
16122 lck_mtx_unlock(&dtrace_lock);
16123 return (EINVAL);
16124 }
16125
16126 if (ecb->dte_probe == NULL) {
16127 lck_mtx_unlock(&dtrace_lock);
16128 return (EINVAL);
16129 }
16130
16131 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
16132 epdesc.dtepd_uarg = ecb->dte_uarg;
16133 epdesc.dtepd_size = ecb->dte_size;
16134
16135 nrecs = epdesc.dtepd_nrecs;
16136 epdesc.dtepd_nrecs = 0;
16137 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16138 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16139 continue;
16140
16141 epdesc.dtepd_nrecs++;
16142 }
16143
16144 /*
16145 * Now that we have the size, we need to allocate a temporary
16146 * buffer in which to store the complete description. We need
16147 * the temporary buffer to be able to drop dtrace_lock()
16148 * across the copyout(), below.
16149 */
16150 size = sizeof (dtrace_eprobedesc_t) +
16151 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
16152
16153 buf = kmem_alloc(size, KM_SLEEP);
16154 dest = (uintptr_t)buf;
16155
16156 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
16157 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
16158
16159 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
16160 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
16161 continue;
16162
16163 if (nrecs-- == 0)
16164 break;
16165
16166 bcopy(&act->dta_rec, (void *)dest,
16167 sizeof (dtrace_recdesc_t));
16168 dest += sizeof (dtrace_recdesc_t);
16169 }
16170
16171 lck_mtx_unlock(&dtrace_lock);
16172
16173 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
16174 kmem_free(buf, size);
16175 return (EFAULT);
16176 }
16177
16178 kmem_free(buf, size);
16179 return (0);
16180 }
16181
16182 case DTRACEIOC_AGGDESC: {
16183 dtrace_aggdesc_t aggdesc;
16184 dtrace_action_t *act;
16185 dtrace_aggregation_t *agg;
16186 int nrecs;
16187 uint32_t offs;
16188 dtrace_recdesc_t *lrec;
16189 void *buf;
16190 size_t size;
16191 uintptr_t dest;
16192
16193 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
16194 return (EFAULT);
16195
16196 lck_mtx_lock(&dtrace_lock);
16197
16198 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
16199 lck_mtx_unlock(&dtrace_lock);
16200 return (EINVAL);
16201 }
16202
16203 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
16204
16205 nrecs = aggdesc.dtagd_nrecs;
16206 aggdesc.dtagd_nrecs = 0;
16207
16208 offs = agg->dtag_base;
16209 lrec = &agg->dtag_action.dta_rec;
16210 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
16211
16212 for (act = agg->dtag_first; ; act = act->dta_next) {
16213 ASSERT(act->dta_intuple ||
16214 DTRACEACT_ISAGG(act->dta_kind));
16215
16216 /*
16217 * If this action has a record size of zero, it
16218 * denotes an argument to the aggregating action.
16219 * Because the presence of this record doesn't (or
16220 * shouldn't) affect the way the data is interpreted,
16221 * we don't copy it out to save user-level the
16222 * confusion of dealing with a zero-length record.
16223 */
16224 if (act->dta_rec.dtrd_size == 0) {
16225 ASSERT(agg->dtag_hasarg);
16226 continue;
16227 }
16228
16229 aggdesc.dtagd_nrecs++;
16230
16231 if (act == &agg->dtag_action)
16232 break;
16233 }
16234
16235 /*
16236 * Now that we have the size, we need to allocate a temporary
16237 * buffer in which to store the complete description. We need
16238 * the temporary buffer to be able to drop dtrace_lock()
16239 * across the copyout(), below.
16240 */
16241 size = sizeof (dtrace_aggdesc_t) +
16242 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
16243
16244 buf = kmem_alloc(size, KM_SLEEP);
16245 dest = (uintptr_t)buf;
16246
16247 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
16248 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
16249
16250 for (act = agg->dtag_first; ; act = act->dta_next) {
16251 dtrace_recdesc_t rec = act->dta_rec;
16252
16253 /*
16254 * See the comment in the above loop for why we pass
16255 * over zero-length records.
16256 */
16257 if (rec.dtrd_size == 0) {
16258 ASSERT(agg->dtag_hasarg);
16259 continue;
16260 }
16261
16262 if (nrecs-- == 0)
16263 break;
16264
16265 rec.dtrd_offset -= offs;
16266 bcopy(&rec, (void *)dest, sizeof (rec));
16267 dest += sizeof (dtrace_recdesc_t);
16268
16269 if (act == &agg->dtag_action)
16270 break;
16271 }
16272
16273 lck_mtx_unlock(&dtrace_lock);
16274
16275 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
16276 kmem_free(buf, size);
16277 return (EFAULT);
16278 }
16279
16280 kmem_free(buf, size);
16281 return (0);
16282 }
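/*
 * Both the EPROBE and AGGDESC cases above use the same pattern to avoid
 * calling copyout() while holding dtrace_lock: the description is built
 * into a temporary kernel buffer under the lock, the lock is dropped, and
 * only then is the buffer copied out and freed. A minimal sketch with
 * hypothetical names (not part of this file):
 */
#if 0	/* illustrative only */
static int
example_copyout_snapshot(user_addr_t uaddr)
{
	void *buf;
	size_t size;

	lck_mtx_lock(&dtrace_lock);
	size = example_describe_size();		/* stable while the lock is held */
	buf = kmem_alloc(size, KM_SLEEP);
	example_describe_fill(buf, size);	/* snapshot state into buf */
	lck_mtx_unlock(&dtrace_lock);

	/* The (possibly faulting) copyout happens with no locks held. */
	if (copyout(buf, uaddr, size) != 0) {
		kmem_free(buf, size);
		return (EFAULT);
	}

	kmem_free(buf, size);
	return (0);
}
#endif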
16283
16284 case DTRACEIOC_ENABLE: {
16285 dof_hdr_t *dof;
16286 dtrace_enabling_t *enab = NULL;
16287 dtrace_vstate_t *vstate;
16288 int err = 0;
16289
16290 *rv = 0;
16291
16292 /*
16293 * If a NULL argument has been passed, we take this as our
16294 * cue to reevaluate our enablings.
16295 */
16296 if (arg == 0) {
16297 dtrace_enabling_matchall();
16298
16299 return (0);
16300 }
16301
16302 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
16303 return (rval);
16304
16305 lck_mtx_lock(&cpu_lock);
16306 lck_mtx_lock(&dtrace_lock);
16307 vstate = &state->dts_vstate;
16308
16309 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
16310 lck_mtx_unlock(&dtrace_lock);
16311 lck_mtx_unlock(&cpu_lock);
16312 dtrace_dof_destroy(dof);
16313 return (EBUSY);
16314 }
16315
16316 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
16317 lck_mtx_unlock(&dtrace_lock);
16318 lck_mtx_unlock(&cpu_lock);
16319 dtrace_dof_destroy(dof);
16320 return (EINVAL);
16321 }
16322
16323 if ((rval = dtrace_dof_options(dof, state)) != 0) {
16324 dtrace_enabling_destroy(enab);
16325 lck_mtx_unlock(&dtrace_lock);
16326 lck_mtx_unlock(&cpu_lock);
16327 dtrace_dof_destroy(dof);
16328 return (rval);
16329 }
16330
16331 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
16332 err = dtrace_enabling_retain(enab);
16333 } else {
16334 dtrace_enabling_destroy(enab);
16335 }
16336
16337 lck_mtx_unlock(&dtrace_lock);
16338 lck_mtx_unlock(&cpu_lock);
16339 dtrace_dof_destroy(dof);
16340
16341 return (err);
16342 }
16343
16344 case DTRACEIOC_REPLICATE: {
16345 dtrace_repldesc_t desc;
16346 dtrace_probedesc_t *match = &desc.dtrpd_match;
16347 dtrace_probedesc_t *create = &desc.dtrpd_create;
16348 int err;
16349
16350 if (copyin(arg, &desc, sizeof (desc)) != 0)
16351 return (EFAULT);
16352
16353 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16354 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16355 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16356 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16357
16358 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16359 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16360 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16361 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16362
16363 lck_mtx_lock(&dtrace_lock);
16364 err = dtrace_enabling_replicate(state, match, create);
16365 lck_mtx_unlock(&dtrace_lock);
16366
16367 return (err);
16368 }
16369
16370 case DTRACEIOC_PROBEMATCH:
16371 case DTRACEIOC_PROBES: {
16372 dtrace_probe_t *probe = NULL;
16373 dtrace_probedesc_t desc;
16374 dtrace_probekey_t pkey;
16375 dtrace_id_t i;
16376 int m = 0;
16377 uint32_t priv;
16378 uid_t uid;
16379 zoneid_t zoneid;
16380
16381 if (copyin(arg, &desc, sizeof (desc)) != 0)
16382 return (EFAULT);
16383
16384 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
16385 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
16386 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
16387 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
16388
16389 /*
16390 * Before we attempt to match this probe, we want to give
16391 * all providers the opportunity to provide it.
16392 */
16393 if (desc.dtpd_id == DTRACE_IDNONE) {
16394 lck_mtx_lock(&dtrace_provider_lock);
16395 dtrace_probe_provide(&desc, NULL);
16396 lck_mtx_unlock(&dtrace_provider_lock);
16397 desc.dtpd_id++;
16398 }
16399
16400 if (cmd == DTRACEIOC_PROBEMATCH) {
16401 dtrace_probekey(&desc, &pkey);
16402 pkey.dtpk_id = DTRACE_IDNONE;
16403 }
16404
16405 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
16406
16407 lck_mtx_lock(&dtrace_lock);
16408
16409 if (cmd == DTRACEIOC_PROBEMATCH) {
16410 /* Quiet compiler warning */
16411 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
16412 if ((probe = dtrace_probes[i - 1]) != NULL &&
16413 (m = dtrace_match_probe(probe, &pkey,
16414 priv, uid, zoneid)) != 0)
16415 break;
16416 }
16417
16418 if (m < 0) {
16419 lck_mtx_unlock(&dtrace_lock);
16420 return (EINVAL);
16421 }
16422
16423 } else {
16424 /* Quiet compiler warning */
16425 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
16426 if ((probe = dtrace_probes[i - 1]) != NULL &&
16427 dtrace_match_priv(probe, priv, uid, zoneid))
16428 break;
16429 }
16430 }
16431
16432 if (probe == NULL) {
16433 lck_mtx_unlock(&dtrace_lock);
16434 return (ESRCH);
16435 }
16436
16437 dtrace_probe_description(probe, &desc);
16438 lck_mtx_unlock(&dtrace_lock);
16439
16440 if (copyout(&desc, arg, sizeof (desc)) != 0)
16441 return (EFAULT);
16442
16443 return (0);
16444 }
16445
16446 case DTRACEIOC_PROBEARG: {
16447 dtrace_argdesc_t desc;
16448 dtrace_probe_t *probe;
16449 dtrace_provider_t *prov;
16450
16451 if (copyin(arg, &desc, sizeof (desc)) != 0)
16452 return (EFAULT);
16453
16454 if (desc.dtargd_id == DTRACE_IDNONE)
16455 return (EINVAL);
16456
16457 if (desc.dtargd_ndx == DTRACE_ARGNONE)
16458 return (EINVAL);
16459
16460 lck_mtx_lock(&dtrace_provider_lock);
16461 lck_mtx_lock(&mod_lock);
16462 lck_mtx_lock(&dtrace_lock);
16463
16464 /* Quiet compiler warning */
16465 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
16466 lck_mtx_unlock(&dtrace_lock);
16467 lck_mtx_unlock(&mod_lock);
16468 lck_mtx_unlock(&dtrace_provider_lock);
16469 return (EINVAL);
16470 }
16471
16472 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
16473 lck_mtx_unlock(&dtrace_lock);
16474 lck_mtx_unlock(&mod_lock);
16475 lck_mtx_unlock(&dtrace_provider_lock);
16476 return (EINVAL);
16477 }
16478
16479 lck_mtx_unlock(&dtrace_lock);
16480
16481 prov = probe->dtpr_provider;
16482
16483 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
16484 /*
16485 * There isn't any typed information for this probe.
16486 * Set the argument number to DTRACE_ARGNONE.
16487 */
16488 desc.dtargd_ndx = DTRACE_ARGNONE;
16489 } else {
16490 desc.dtargd_native[0] = '\0';
16491 desc.dtargd_xlate[0] = '\0';
16492 desc.dtargd_mapping = desc.dtargd_ndx;
16493
16494 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
16495 probe->dtpr_id, probe->dtpr_arg, &desc);
16496 }
16497
16498 lck_mtx_unlock(&mod_lock);
16499 lck_mtx_unlock(&dtrace_provider_lock);
16500
16501 if (copyout(&desc, arg, sizeof (desc)) != 0)
16502 return (EFAULT);
16503
16504 return (0);
16505 }
16506
16507 case DTRACEIOC_GO: {
16508 processorid_t cpuid;
16509 rval = dtrace_state_go(state, &cpuid);
16510
16511 if (rval != 0)
16512 return (rval);
16513
16514 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
16515 return (EFAULT);
16516
16517 return (0);
16518 }
16519
16520 case DTRACEIOC_STOP: {
16521 processorid_t cpuid;
16522
16523 lck_mtx_lock(&dtrace_lock);
16524 rval = dtrace_state_stop(state, &cpuid);
16525 lck_mtx_unlock(&dtrace_lock);
16526
16527 if (rval != 0)
16528 return (rval);
16529
16530 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
16531 return (EFAULT);
16532
16533 return (0);
16534 }
16535
16536 case DTRACEIOC_DOFGET: {
16537 dof_hdr_t hdr, *dof;
16538 uint64_t len;
16539
16540 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
16541 return (EFAULT);
16542
16543 lck_mtx_lock(&dtrace_lock);
16544 dof = dtrace_dof_create(state);
16545 lck_mtx_unlock(&dtrace_lock);
16546
16547 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
16548 rval = copyout(dof, arg, len);
16549 dtrace_dof_destroy(dof);
16550
16551 return (rval == 0 ? 0 : EFAULT);
16552 }
16553
16554 case DTRACEIOC_AGGSNAP:
16555 case DTRACEIOC_BUFSNAP: {
16556 dtrace_bufdesc_t desc;
16557 caddr_t cached;
16558 dtrace_buffer_t *buf;
16559
16560 if (copyin(arg, &desc, sizeof (desc)) != 0)
16561 return (EFAULT);
16562
16563 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
16564 return (EINVAL);
16565
16566 lck_mtx_lock(&dtrace_lock);
16567
16568 if (cmd == DTRACEIOC_BUFSNAP) {
16569 buf = &state->dts_buffer[desc.dtbd_cpu];
16570 } else {
16571 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
16572 }
16573
16574 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
16575 size_t sz = buf->dtb_offset;
16576
16577 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
16578 lck_mtx_unlock(&dtrace_lock);
16579 return (EBUSY);
16580 }
16581
16582 /*
16583 * If this buffer has already been consumed, we're
16584 * going to indicate that there's nothing left here
16585 * to consume.
16586 */
16587 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
16588 lck_mtx_unlock(&dtrace_lock);
16589
16590 desc.dtbd_size = 0;
16591 desc.dtbd_drops = 0;
16592 desc.dtbd_errors = 0;
16593 desc.dtbd_oldest = 0;
16594 sz = sizeof (desc);
16595
16596 if (copyout(&desc, arg, sz) != 0)
16597 return (EFAULT);
16598
16599 return (0);
16600 }
16601
16602 /*
16603 * If this is a ring buffer that has wrapped, we want
16604 * to copy the whole thing out.
16605 */
16606 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
16607 dtrace_buffer_polish(buf);
16608 sz = buf->dtb_size;
16609 }
16610
16611 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
16612 lck_mtx_unlock(&dtrace_lock);
16613 return (EFAULT);
16614 }
16615
16616 desc.dtbd_size = sz;
16617 desc.dtbd_drops = buf->dtb_drops;
16618 desc.dtbd_errors = buf->dtb_errors;
16619 desc.dtbd_oldest = buf->dtb_xamot_offset;
16620
16621 lck_mtx_unlock(&dtrace_lock);
16622
16623 if (copyout(&desc, arg, sizeof (desc)) != 0)
16624 return (EFAULT);
16625
16626 buf->dtb_flags |= DTRACEBUF_CONSUMED;
16627
16628 return (0);
16629 }
16630
16631 if (buf->dtb_tomax == NULL) {
16632 ASSERT(buf->dtb_xamot == NULL);
16633 lck_mtx_unlock(&dtrace_lock);
16634 return (ENOENT);
16635 }
16636
16637 cached = buf->dtb_tomax;
16638 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
16639
16640 dtrace_xcall(desc.dtbd_cpu,
16641 (dtrace_xcall_t)dtrace_buffer_switch, buf);
16642
16643 state->dts_errors += buf->dtb_xamot_errors;
16644
16645 /*
16646 * If the buffers did not actually switch, then the cross call
16647 * did not take place -- presumably because the given CPU is
16648 * not in the ready set. If this is the case, we'll return
16649 * ENOENT.
16650 */
16651 if (buf->dtb_tomax == cached) {
16652 ASSERT(buf->dtb_xamot != cached);
16653 lck_mtx_unlock(&dtrace_lock);
16654 return (ENOENT);
16655 }
16656
16657 ASSERT(cached == buf->dtb_xamot);
16658
16659 /*
16660 * We have our snapshot; now copy it out.
16661 */
16662 if (copyout(buf->dtb_xamot, (user_addr_t)desc.dtbd_data,
16663 buf->dtb_xamot_offset) != 0) {
16664 lck_mtx_unlock(&dtrace_lock);
16665 return (EFAULT);
16666 }
16667
16668 desc.dtbd_size = buf->dtb_xamot_offset;
16669 desc.dtbd_drops = buf->dtb_xamot_drops;
16670 desc.dtbd_errors = buf->dtb_xamot_errors;
16671 desc.dtbd_oldest = 0;
16672
16673 lck_mtx_unlock(&dtrace_lock);
16674
16675 /*
16676 * Finally, copy out the buffer description.
16677 */
16678 if (copyout(&desc, arg, sizeof (desc)) != 0)
16679 return (EFAULT);
16680
16681 return (0);
16682 }
16683
16684 case DTRACEIOC_CONF: {
16685 dtrace_conf_t conf;
16686
16687 bzero(&conf, sizeof (conf));
16688 conf.dtc_difversion = DIF_VERSION;
16689 conf.dtc_difintregs = DIF_DIR_NREGS;
16690 conf.dtc_diftupregs = DIF_DTR_NREGS;
16691 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
16692
16693 if (copyout(&conf, arg, sizeof (conf)) != 0)
16694 return (EFAULT);
16695
16696 return (0);
16697 }
16698
16699 case DTRACEIOC_STATUS: {
16700 dtrace_status_t stat;
16701 dtrace_dstate_t *dstate;
16702 int i, j;
16703 uint64_t nerrs;
16704
16705 /*
16706 * See the comment in dtrace_state_deadman() for the reason
16707 * for setting dts_laststatus to INT64_MAX before setting
16708 * it to the correct value.
16709 */
16710 state->dts_laststatus = INT64_MAX;
16711 dtrace_membar_producer();
16712 state->dts_laststatus = dtrace_gethrtime();
16713
16714 bzero(&stat, sizeof (stat));
16715
16716 lck_mtx_lock(&dtrace_lock);
16717
16718 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
16719 lck_mtx_unlock(&dtrace_lock);
16720 return (ENOENT);
16721 }
16722
16723 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
16724 stat.dtst_exiting = 1;
16725
16726 nerrs = state->dts_errors;
16727 dstate = &state->dts_vstate.dtvs_dynvars;
16728
16729 for (i = 0; i < (int)NCPU; i++) {
16730 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
16731
16732 stat.dtst_dyndrops += dcpu->dtdsc_drops;
16733 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
16734 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
16735
16736 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
16737 stat.dtst_filled++;
16738
16739 nerrs += state->dts_buffer[i].dtb_errors;
16740
16741 for (j = 0; j < state->dts_nspeculations; j++) {
16742 dtrace_speculation_t *spec;
16743 dtrace_buffer_t *buf;
16744
16745 spec = &state->dts_speculations[j];
16746 buf = &spec->dtsp_buffer[i];
16747 stat.dtst_specdrops += buf->dtb_xamot_drops;
16748 }
16749 }
16750
16751 stat.dtst_specdrops_busy = state->dts_speculations_busy;
16752 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
16753 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
16754 stat.dtst_dblerrors = state->dts_dblerrors;
16755 stat.dtst_killed =
16756 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
16757 stat.dtst_errors = nerrs;
16758
16759 lck_mtx_unlock(&dtrace_lock);
16760
16761 if (copyout(&stat, arg, sizeof (stat)) != 0)
16762 return (EFAULT);
16763
16764 return (0);
16765 }
16766
16767 case DTRACEIOC_FORMAT: {
16768 dtrace_fmtdesc_t fmt;
16769 char *str;
16770 int len;
16771
16772 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
16773 return (EFAULT);
16774
16775 lck_mtx_lock(&dtrace_lock);
16776
16777 if (fmt.dtfd_format == 0 ||
16778 fmt.dtfd_format > state->dts_nformats) {
16779 lck_mtx_unlock(&dtrace_lock);
16780 return (EINVAL);
16781 }
16782
16783 /*
16784 * Format strings are allocated contiguously and they are
16785 * never freed; if a format index is less than the number
16786 * of formats, we can assert that the format map is non-NULL
16787 * and that the format for the specified index is non-NULL.
16788 */
16789 ASSERT(state->dts_formats != NULL);
16790 str = state->dts_formats[fmt.dtfd_format - 1];
16791 ASSERT(str != NULL);
16792
16793 len = strlen(str) + 1;
16794
16795 if (len > fmt.dtfd_length) {
16796 fmt.dtfd_length = len;
16797
16798 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
16799 lck_mtx_unlock(&dtrace_lock);
16800 return (EINVAL);
16801 }
16802 } else {
16803 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
16804 lck_mtx_unlock(&dtrace_lock);
16805 return (EINVAL);
16806 }
16807 }
16808
16809 lck_mtx_unlock(&dtrace_lock);
16810 return (0);
16811 }
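/*
 * The format case above implements a two-call sizing protocol: if the
 * caller's dtfd_length is too small, the kernel writes the required length
 * back into the descriptor and the caller retries with a larger buffer.
 * A minimal userspace-side sketch of that protocol; the descriptor type and
 * example_fmt_ioctl() are hypothetical stand-ins for however the consumer
 * actually marshals DTRACEIOC_FORMAT (not part of this file):
 */
#if 0	/* illustrative only */
typedef struct example_fmtdesc {
	uint16_t	efd_format;	/* format index to fetch */
	uint64_t	efd_length;	/* in: buffer size, out: required size */
	char		*efd_string;	/* buffer receiving the format string */
} example_fmtdesc_t;

extern int example_fmt_ioctl(example_fmtdesc_t *);	/* issues DTRACEIOC_FORMAT */

static char *
example_fetch_format(uint16_t format)
{
	example_fmtdesc_t fmt = { .efd_format = format };

	/* First call: efd_length is 0, so only the needed size comes back. */
	if (example_fmt_ioctl(&fmt) != 0)
		return (NULL);

	if ((fmt.efd_string = malloc(fmt.efd_length)) == NULL)
		return (NULL);

	/* Second call: the buffer is large enough, the string is copied out. */
	if (example_fmt_ioctl(&fmt) != 0) {
		free(fmt.efd_string);
		return (NULL);
	}

	return (fmt.efd_string);
}
#endif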
16812
16813 case DTRACEIOC_MODUUIDSLIST: {
16814 size_t module_uuids_list_size;
16815 dtrace_module_uuids_list_t* uuids_list;
16816 uint64_t dtmul_count;
16817
16818 /*
16819 * Security restrictions make this operation illegal; if they are in effect,
16820 * DTrace must refuse to provide any fbt probes.
16821 */
16822 if (dtrace_is_restricted()) {
16823 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
16824 return (EPERM);
16825 }
16826
16827 /*
16828 * Fail if the kernel symbol mode makes this operation illegal.
16829 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
16830 * for them without holding the dtrace_lock.
16831 */
16832 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
16833 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
16834 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
16835 return (EPERM);
16836 }
16837
16838 /*
16839 * Read the number of module UUIDs being passed in.
16840 */
16841 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
16842 &dtmul_count,
16843 sizeof(dtmul_count))) {
16844 cmn_err(CE_WARN, "failed to copyin dtmul_count");
16845 return (EFAULT);
16846 }
16847
16848 /*
16849 * Range check the count. More than 2k kexts is probably an error.
16850 */
16851 if (dtmul_count > 2048) {
16852 cmn_err(CE_WARN, "dtmul_count is not valid");
16853 return (EINVAL);
16854 }
16855
16856 /*
16857 * For all queries, we return EINVAL when the user-specified
16858 * count does not match the actual number of modules we find
16859 * available.
16860 *
16861 * If the user-specified count is zero, then this serves as a
16862 * simple query to count the available modules in need of symbols.
16863 */
16864
16865 rval = 0;
16866
16867 if (dtmul_count == 0)
16868 {
16869 lck_mtx_lock(&mod_lock);
16870 struct modctl* ctl = dtrace_modctl_list;
16871 while (ctl) {
16872 /* Update the private probes bit */
16873 if (dtrace_provide_private_probes)
16874 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
16875
16876 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
16877 if (!MOD_SYMBOLS_DONE(ctl)) {
16878 dtmul_count++;
16879 rval = EINVAL;
16880 }
16881 ctl = ctl->mod_next;
16882 }
16883 lck_mtx_unlock(&mod_lock);
16884
16885 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
16886 return (EFAULT);
16887 else
16888 return (rval);
16889 }
16890
16891 /*
16892 * If we reach this point, then we have a request for full list data.
16893 * Allocate a correctly sized structure and copyin the data.
16894 */
16895 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
16896 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
16897 return (ENOMEM);
16898
16899 /* NOTE! We can no longer exit this method via return */
16900 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
16901 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
16902 rval = EFAULT;
16903 goto moduuidslist_cleanup;
16904 }
16905
16906 /*
16907 * Check that the count didn't change between the first copyin and the second.
16908 */
16909 if (uuids_list->dtmul_count != dtmul_count) {
16910 rval = EINVAL;
16911 goto moduuidslist_cleanup;
16912 }
16913
16914 /*
16915 * Build the list of UUIDs that need symbols.
16916 */
16917 lck_mtx_lock(&mod_lock);
16918
16919 dtmul_count = 0;
16920
16921 struct modctl* ctl = dtrace_modctl_list;
16922 while (ctl) {
16923 /* Update the private probes bit */
16924 if (dtrace_provide_private_probes)
16925 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
16926
16927 /*
16928 * We assume that userspace symbols will be "better" than kernel level symbols,
16929 * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
16930 * are available, add user syms if the module might use them.
16931 */
16932 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
16933 if (!MOD_SYMBOLS_DONE(ctl)) {
16934 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
16935 if (dtmul_count++ < uuids_list->dtmul_count) {
16936 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
16937 }
16938 }
16939 ctl = ctl->mod_next;
16940 }
16941
16942 lck_mtx_unlock(&mod_lock);
16943
16944 if (uuids_list->dtmul_count < dtmul_count)
16945 rval = EINVAL;
16946
16947 uuids_list->dtmul_count = dtmul_count;
16948
16949 /*
16950 * Copyout the symbols list (or at least the count!)
16951 */
16952 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
16953 cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
16954 rval = EFAULT;
16955 }
16956
16957 moduuidslist_cleanup:
16958 /*
16959 * If we had to allocate struct memory, free it.
16960 */
16961 if (uuids_list != NULL) {
16962 kmem_free(uuids_list, module_uuids_list_size);
16963 }
16964
16965 return rval;
16966 }
16967
16968 case DTRACEIOC_PROVMODSYMS: {
16969 size_t module_symbols_size;
16970 dtrace_module_symbols_t* module_symbols;
16971 uint64_t dtmodsyms_count;
16972
16973 /*
16974 * Security restrictions make this operation illegal; if they are in effect,
16975 * DTrace must refuse to provide any fbt probes.
16976 */
16977 if (dtrace_is_restricted()) {
16978 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
16979 return (EPERM);
16980 }
16981
16982 /*
16983 * Fail if the kernel symbol mode makes this operation illegal.
16984 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
16985 * for them without holding the dtrace_lock.
16986 */
16987 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
16988 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
16989 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
16990 return (EPERM);
16991 }
16992
16993 /*
16994 * Read the number of module symbols structs being passed in.
16995 */
16996 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
16997 &dtmodsyms_count,
16998 sizeof(dtmodsyms_count))) {
16999 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
17000 return (EFAULT);
17001 }
17002
17003 /*
17004 * Range check the count. How much data can we pass around?
17005 * FIX ME!
17006 */
17007 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
17008 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
17009 return (EINVAL);
17010 }
17011
17012 /*
17013 * Allocate a correctly sized structure and copyin the data.
17014 */
17015 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
17016 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
17017 return (ENOMEM);
17018
17019 rval = 0;
17020
17021 /* NOTE! We can no longer exit this method via return */
17022 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
17023 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count);
17024 rval = EFAULT;
17025 goto module_symbols_cleanup;
17026 }
17027
17028 /*
17029 * Check that the count didn't change between the first copyin and the second.
17030 */
17031 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
17032 rval = EINVAL;
17033 goto module_symbols_cleanup;
17034 }
17035
17036 /*
17037 * Find the modctl to add symbols to.
17038 */
17039 lck_mtx_lock(&dtrace_provider_lock);
17040 lck_mtx_lock(&mod_lock);
17041
17042 struct modctl* ctl = dtrace_modctl_list;
17043 while (ctl) {
17044 /* Update the private probes bit */
17045 if (dtrace_provide_private_probes)
17046 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17047
17048 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17049 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) {
17050 if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
17051 /* BINGO! */
17052 ctl->mod_user_symbols = module_symbols;
17053 break;
17054 }
17055 }
17056 ctl = ctl->mod_next;
17057 }
17058
17059 if (ctl) {
17060 dtrace_provider_t *prv;
17061
17062 /*
17063 * We're going to call each provider's per-module provide operation,
17064 * specifying only this module.
17065 */
17066 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
17067 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
17068
17069 /*
17070 * We gave every provider a chance to provide with the user syms; go ahead and clear them
17071 */
17072 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
17073 }
17074
17075 lck_mtx_unlock(&mod_lock);
17076 lck_mtx_unlock(&dtrace_provider_lock);
17077
17078 module_symbols_cleanup:
17079 /*
17080 * If we had to allocate struct memory, free it.
17081 */
17082 if (module_symbols != NULL) {
17083 kmem_free(module_symbols, module_symbols_size);
17084 }
17085
17086 return rval;
17087 }
17088
17089 case DTRACEIOC_PROCWAITFOR: {
17090 dtrace_procdesc_t pdesc = {
17091 .p_comm = {0},
17092 .p_pid = -1
17093 };
17094
17095 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
17096 goto proc_waitfor_error;
17097
17098 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
17099 goto proc_waitfor_error;
17100
17101 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
17102 goto proc_waitfor_error;
17103
17104 return 0;
17105
17106 proc_waitfor_error:
17107 /* The process was suspended; revert this since the client will not do it. */
17108 if (pdesc.p_pid != -1) {
17109 proc_t *proc = proc_find(pdesc.p_pid);
17110 if (proc != PROC_NULL) {
17111 task_pidresume(proc->task);
17112 proc_rele(proc);
17113 }
17114 }
17115
17116 return rval;
17117 }
17118
17119 default:
17120 break;
17121 }
17122
17123 return (ENOTTY);
17124 }
17125
17126 /*
17127 * APPLE NOTE: dtrace_detach not implemented
17128 */
17129 #if !defined(__APPLE__)
17130 /*ARGSUSED*/
17131 static int
17132 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
17133 {
17134 dtrace_state_t *state;
17135
17136 switch (cmd) {
17137 case DDI_DETACH:
17138 break;
17139
17140 case DDI_SUSPEND:
17141 return (DDI_SUCCESS);
17142
17143 default:
17144 return (DDI_FAILURE);
17145 }
17146
17147 lck_mtx_lock(&cpu_lock);
17148 lck_mtx_lock(&dtrace_provider_lock);
17149 lck_mtx_lock(&dtrace_lock);
17150
17151 ASSERT(dtrace_opens == 0);
17152
17153 if (dtrace_helpers > 0) {
17154 lck_mtx_unlock(&dtrace_lock);
17155 lck_mtx_unlock(&dtrace_provider_lock);
17156 lck_mtx_unlock(&cpu_lock);
17157 return (DDI_FAILURE);
17158 }
17159
17160 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
17161 lck_mtx_unlock(&dtrace_lock);
17162 lck_mtx_unlock(&dtrace_provider_lock);
17163 lck_mtx_unlock(&cpu_lock);
17164 return (DDI_FAILURE);
17165 }
17166
17167 dtrace_provider = NULL;
17168
17169 if ((state = dtrace_anon_grab()) != NULL) {
17170 /*
17171 * If there were ECBs on this state, the provider should
17172 * have not been allowed to detach; assert that there is
17173 * none.
17174 */
17175 ASSERT(state->dts_necbs == 0);
17176 dtrace_state_destroy(state);
17177
17178 /*
17179 * If we're being detached with anonymous state, we need to
17180 * indicate to the kernel debugger that DTrace is now inactive.
17181 */
17182 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17183 }
17184
17185 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
17186 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17187 dtrace_cpu_init = NULL;
17188 dtrace_helpers_cleanup = NULL;
17189 dtrace_helpers_fork = NULL;
17190 dtrace_cpustart_init = NULL;
17191 dtrace_cpustart_fini = NULL;
17192 dtrace_debugger_init = NULL;
17193 dtrace_debugger_fini = NULL;
17194 dtrace_kreloc_init = NULL;
17195 dtrace_kreloc_fini = NULL;
17196 dtrace_modload = NULL;
17197 dtrace_modunload = NULL;
17198
17199 lck_mtx_unlock(&cpu_lock);
17200
17201 if (dtrace_helptrace_enabled) {
17202 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
17203 dtrace_helptrace_buffer = NULL;
17204 }
17205
17206 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
17207 dtrace_probes = NULL;
17208 dtrace_nprobes = 0;
17209
17210 dtrace_hash_destroy(dtrace_bymod);
17211 dtrace_hash_destroy(dtrace_byfunc);
17212 dtrace_hash_destroy(dtrace_byname);
17213 dtrace_bymod = NULL;
17214 dtrace_byfunc = NULL;
17215 dtrace_byname = NULL;
17216
17217 kmem_cache_destroy(dtrace_state_cache);
17218 vmem_destroy(dtrace_minor);
17219 vmem_destroy(dtrace_arena);
17220
17221 if (dtrace_toxrange != NULL) {
17222 kmem_free(dtrace_toxrange,
17223 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
17224 dtrace_toxrange = NULL;
17225 dtrace_toxranges = 0;
17226 dtrace_toxranges_max = 0;
17227 }
17228
17229 ddi_remove_minor_node(dtrace_devi, NULL);
17230 dtrace_devi = NULL;
17231
17232 ddi_soft_state_fini(&dtrace_softstate);
17233
17234 ASSERT(dtrace_vtime_references == 0);
17235 ASSERT(dtrace_opens == 0);
17236 ASSERT(dtrace_retained == NULL);
17237
17238 lck_mtx_unlock(&dtrace_lock);
17239 lck_mtx_unlock(&dtrace_provider_lock);
17240
17241 /*
17242 * We don't destroy the task queue until after we have dropped our
17243 * locks (taskq_destroy() may block on running tasks). To prevent
17244 * attempting to do work after we have effectively detached but before
17245 * the task queue has been destroyed, all tasks dispatched via the
17246 * task queue must check that DTrace is still attached before
17247 * performing any operation.
17248 */
17249 taskq_destroy(dtrace_taskq);
17250 dtrace_taskq = NULL;
17251
17252 return (DDI_SUCCESS);
17253 }
17254 #endif /* __APPLE__ */
17255
17256 d_open_t _dtrace_open, helper_open;
17257 d_close_t _dtrace_close, helper_close;
17258 d_ioctl_t _dtrace_ioctl, helper_ioctl;
17259
17260 int
17261 _dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
17262 {
17263 #pragma unused(p)
17264 dev_t locdev = dev;
17265
17266 return dtrace_open( &locdev, flags, devtype, CRED());
17267 }
17268
17269 int
17270 helper_open(dev_t dev, int flags, int devtype, struct proc *p)
17271 {
17272 #pragma unused(dev,flags,devtype,p)
17273 return 0;
17274 }
17275
17276 int
17277 _dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
17278 {
17279 #pragma unused(p)
17280 return dtrace_close( dev, flags, devtype, CRED());
17281 }
17282
17283 int
17284 helper_close(dev_t dev, int flags, int devtype, struct proc *p)
17285 {
17286 #pragma unused(dev,flags,devtype,p)
17287 return 0;
17288 }
17289
17290 int
17291 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
17292 {
17293 #pragma unused(p)
17294 int err, rv = 0;
17295 user_addr_t uaddrp;
17296
17297 if (proc_is64bit(p))
17298 uaddrp = *(user_addr_t *)data;
17299 else
17300 uaddrp = (user_addr_t) *(uint32_t *)data;
17301
17302 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
17303
17304 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
17305 if (err != 0) {
17306 ASSERT( (err & 0xfffff000) == 0 );
17307 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
17308 } else if (rv != 0) {
17309 ASSERT( (rv & 0xfff00000) == 0 );
17310 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
17311 } else
17312 return 0;
17313 }
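/*
 * Both _dtrace_ioctl() above and helper_ioctl() below fold the Solaris-style
 * (err, rv) pair into the single BSD errno: a driver error lands in the low
 * 12 bits (values below 4096), while a non-zero rv is shifted up by 12 bits
 * (values of 4096 and above). A minimal consumer-side decode sketch
 * (hypothetical helper, not part of this file):
 */
#if 0	/* illustrative only */
static void
example_decode_dtrace_errno(int e, int *errp, int *rvp)
{
	if (e < 4096) {
		*errp = e;		/* genuine error code from the driver */
		*rvp = 0;
	} else {
		*errp = 0;
		*rvp = e >> 12;		/* the ioctl's "rv" out-parameter */
	}
}
#endif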
17314
17315 int
17316 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
17317 {
17318 #pragma unused(dev,fflag,p)
17319 int err, rv = 0;
17320
17321 err = dtrace_ioctl_helper(cmd, data, &rv);
17322 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
17323 if (err != 0) {
17324 ASSERT( (err & 0xfffff000) == 0 );
17325 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
17326 } else if (rv != 0) {
17327 ASSERT( (rv & 0xfff00000) == 0 );
17328 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
17329 } else
17330 return 0;
17331 }
17332
17333 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
17334
17335 /*
17336  * The character device switch table: the entry points the kernel invokes
17337  * for each operation on the helper device node.
17338  */
17339 static struct cdevsw helper_cdevsw =
17340 {
17341 helper_open, /* open */
17342 helper_close, /* close */
17343 eno_rdwrt, /* read */
17344 eno_rdwrt, /* write */
17345 helper_ioctl, /* ioctl */
17346 (stop_fcn_t *)nulldev, /* stop */
17347 (reset_fcn_t *)nulldev, /* reset */
17348 NULL, /* tty's */
17349 eno_select, /* select */
17350 eno_mmap, /* mmap */
17351 eno_strat, /* strategy */
17352 eno_getc, /* getc */
17353 eno_putc, /* putc */
17354 0 /* type */
17355 };
17356
17357 static int helper_majdevno = 0;
17358
17359 static int gDTraceInited = 0;
17360
17361 void
17362 helper_init( void )
17363 {
17364 /*
17365 * Once the "helper" is initialized, it can take ioctl calls that use locks
17366 * and zones initialized in dtrace_init. Make certain dtrace_init was called
17367 * before us.
17368 */
17369
17370 if (!gDTraceInited) {
17371 panic("helper_init before dtrace_init\n");
17372 }
17373
17374 if (0 >= helper_majdevno)
17375 {
17376 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
17377
17378 if (helper_majdevno < 0) {
17379 printf("helper_init: failed to allocate a major number!\n");
17380 return;
17381 }
17382
17383 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
17384 DTRACEMNR_HELPER, 0 )) {
17385 			printf("helper_init: failed to devfs_make_node for helper!\n");
17386 return;
17387 }
17388 } else
17389 panic("helper_init: called twice!\n");
17390 }
17391
17392 #undef HELPER_MAJOR
17393
17394 /*
17395 * Called with DEVFS_LOCK held, so vmem_alloc's underlying blist structures are protected.
17396 */
17397 static int
17398 dtrace_clone_func(dev_t dev, int action)
17399 {
17400 #pragma unused(dev)
17401
17402 if (action == DEVFS_CLONE_ALLOC) {
17403 if (NULL == dtrace_minor) /* Arena not created yet!?! */
17404 return 0;
17405 else {
17406 			/*
17407 			 * Propose a minor number, namely the next vmem_alloc() result, and put it
17408 			 * right back via vmem_free(). FIXME: racy; another caller could claim it first.
17409 			 */
17410 int ret = (int)(uintptr_t)vmem_alloc(dtrace_minor, 1, VM_BESTFIT | VM_SLEEP);
17411
17412 vmem_free(dtrace_minor, (void *)(uintptr_t)ret, 1);
17413
17414 return ret;
17415 }
17416 }
17417 else if (action == DEVFS_CLONE_FREE) {
17418 return 0;
17419 }
17420 else return -1;
17421 }
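/*
 * Illustrative sketch (editorial addition, not xnu code): the effect of the
 * clone hook above as seen from userspace. Every open of /dev/dtrace is
 * assigned its own minor number, so two descriptors behave as two
 * independent DTrace consumers (the function name here is hypothetical).
 */
#include <fcntl.h>
#include <unistd.h>

static int
open_two_consumers(void)
{
	int a = open("/dev/dtrace", O_RDONLY);	/* first clone, first minor */
	int b = open("/dev/dtrace", O_RDONLY);	/* second clone, distinct minor */
	int ok = (a >= 0 && b >= 0);

	if (a >= 0)
		close(a);
	if (b >= 0)
		close(b);
	return ok ? 0 : -1;
}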
17422
17423 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
17424
17425 static struct cdevsw dtrace_cdevsw =
17426 {
17427 _dtrace_open, /* open */
17428 _dtrace_close, /* close */
17429 eno_rdwrt, /* read */
17430 eno_rdwrt, /* write */
17431 _dtrace_ioctl, /* ioctl */
17432 (stop_fcn_t *)nulldev, /* stop */
17433 (reset_fcn_t *)nulldev, /* reset */
17434 NULL, /* tty's */
17435 eno_select, /* select */
17436 eno_mmap, /* mmap */
17437 eno_strat, /* strategy */
17438 eno_getc, /* getc */
17439 eno_putc, /* putc */
17440 0 /* type */
17441 };
17442
17443 lck_attr_t* dtrace_lck_attr;
17444 lck_grp_attr_t* dtrace_lck_grp_attr;
17445 lck_grp_t* dtrace_lck_grp;
17446
17447 static int gMajDevNo;
17448
17449 void
17450 dtrace_init( void )
17451 {
17452 if (0 == gDTraceInited) {
17453 int i, ncpu;
17454 size_t size = sizeof(dtrace_buffer_memory_maxsize);
17455
17456 /*
17457 * DTrace allocates buffers based on the maximum number
17458 * of enabled cpus. This call avoids any race when finding
17459 * that count.
17460 */
17461 ASSERT(dtrace_max_cpus == 0);
17462 ncpu = dtrace_max_cpus = ml_get_max_cpus();
17463
17464 		/*
17465 		 * Retrieve the size of physical memory in order to define the
17466 		 * maximum amount of memory the state buffers may consume. If the
17467 		 * value cannot be retrieved, assume 1 GB of memory per CPU; that
17468 		 * is still better than panicking the kernel.
17469 		 */
17470 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
17471 &size, NULL, 0))
17472 {
17473 			dtrace_buffer_memory_maxsize = ((uint64_t)ncpu) * 1024 * 1024 * 1024; /* widen before multiplying to avoid 32-bit overflow */
17474 			printf("dtrace_init: failed to retrieve hw.memsize, defaulting to %lld bytes\n",
17475 dtrace_buffer_memory_maxsize);
17476 }
17477
17478 /*
17479 * Finally, divide by three to prevent DTrace from eating too
17480 * much memory.
17481 */
17482 dtrace_buffer_memory_maxsize /= 3;
17483 ASSERT(dtrace_buffer_memory_maxsize > 0);
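		/*
		 * Worked example (editorial addition): on a machine with 16 GB of RAM,
		 * hw.memsize reports 17179869184, so dtrace_buffer_memory_maxsize ends
		 * up at roughly 5.3 GB; on the fallback path with 8 CPUs it would be
		 * 8 GB / 3, roughly 2.6 GB.
		 */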
17484
17485 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
17486
17487 if (gMajDevNo < 0) {
17488 printf("dtrace_init: failed to allocate a major number!\n");
17489 gDTraceInited = 0;
17490 return;
17491 }
17492
17493 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
17494 dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
17495 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
17496 gDTraceInited = 0;
17497 return;
17498 }
17499
17500 #if defined(DTRACE_MEMORY_ZONES)
17501 /*
17502 * Initialize the dtrace kalloc-emulation zones.
17503 */
17504 dtrace_alloc_init();
17505 #endif /* DTRACE_MEMORY_ZONES */
17506
17507 /*
17508 * Allocate the dtrace_probe_t zone
17509 */
17510 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
17511 1024 * sizeof(dtrace_probe_t),
17512 sizeof(dtrace_probe_t),
17513 "dtrace.dtrace_probe_t");
17514
17515 /*
17516 * Create the dtrace lock group and attrs.
17517 */
17518 dtrace_lck_attr = lck_attr_alloc_init();
17519 		dtrace_lck_grp_attr = lck_grp_attr_alloc_init();
17520 dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);
17521
17522 /*
17523 * We have to initialize all locks explicitly
17524 */
17525 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
17526 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
17527 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
17528 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
17529 #if DEBUG
17530 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
17531 #endif
17532 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
17533
17534 /*
17535 * The cpu_core structure consists of per-CPU state available in any context.
17536 * On some architectures, this may mean that the page(s) containing the
17537 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
17538 * is up to the platform to assure that this is performed properly. Note that
17539 * the structure is sized to avoid false sharing.
17540 */
17541 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
17542 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
17543 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
17544
17545 /*
17546 * Initialize the CPU offline/online hooks.
17547 */
17548 dtrace_install_cpu_hooks();
17549
17550 dtrace_modctl_list = NULL;
17551
17552 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
17553 for (i = 0; i < ncpu; ++i) {
17554 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
17555 }
17556
17557 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
17558 for (i = 0; i < ncpu; ++i) {
17559 cpu_list[i].cpu_id = (processorid_t)i;
17560 			cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]); /* link the CPUs into a circular list */
17561 LIST_INIT(&cpu_list[i].cpu_cyc_list);
17562 lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
17563 }
17564
17565 lck_mtx_lock(&cpu_lock);
17566 for (i = 0; i < ncpu; ++i)
17567 /* FIXME: track CPU configuration a la CHUD Processor Pref Pane. */
17568 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
17569 lck_mtx_unlock(&cpu_lock);
17570
17571 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
17572
17573 dtrace_isa_init();
17574
17575 /*
17576 * See dtrace_impl.h for a description of dof modes.
17577 * The default is lazy dof.
17578 *
17579 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
17580 * makes no sense...
17581 */
17582 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
17583 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17584 }
17585
17586 /*
17587 * Sanity check of dof mode value.
17588 */
17589 switch (dtrace_dof_mode) {
17590 case DTRACE_DOF_MODE_NEVER:
17591 case DTRACE_DOF_MODE_LAZY_ON:
17592 /* valid modes, but nothing else we need to do */
17593 break;
17594
17595 case DTRACE_DOF_MODE_LAZY_OFF:
17596 case DTRACE_DOF_MODE_NON_LAZY:
17597 /* Cannot wait for a dtrace_open to init fasttrap */
17598 fasttrap_init();
17599 break;
17600
17601 default:
17602 			/* Invalid value; clamp to non-lazy */
17603 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
17604 fasttrap_init();
17605 break;
17606 }
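		/*
		 * Usage note (editorial addition): the mode can be forced from the
		 * boot loader, for example with `sudo nvram boot-args="dtrace_dof_mode=1"`;
		 * the accepted numeric values are the DTRACE_DOF_MODE_* constants
		 * defined in <sys/dtrace_impl.h>, and any other value is clamped to
		 * non-lazy above.
		 */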
17607
17608 /*
17609 * See dtrace_impl.h for a description of kernel symbol modes.
17610 * The default is to wait for symbols from userspace (lazy symbols).
17611 */
17612 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
17613 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17614 }
17615
17616 gDTraceInited = 1;
17617
17618 } else
17619 panic("dtrace_init: called twice!\n");
17620 }
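/*
 * Ordering note (editorial addition), derived from the comments in this file:
 * bsd_init() must call dtrace_init() before helper_init(), and
 * dtrace_postinit() only after every provider's *_init() routine has run,
 * since that is where the actual dtrace_attach() happens.
 */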
17621
17622 void
17623 dtrace_postinit(void)
17624 {
17625 /*
17626 	 * Called from bsd_init after all providers' *_init() routines have been
17627 	 * run. That way, anonymous DOF enabled under dtrace_attach() can safely
17628 	 * take effect.
17629 */
17630 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */
17631
17632 /*
17633 * Add the mach_kernel to the module list for lazy processing
17634 */
17635 struct kmod_info fake_kernel_kmod;
17636 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
17637
17638 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
17639 fake_kernel_kmod.id = 1;
17640 fake_kernel_kmod.address = g_kernel_kmod_info.address;
17641 fake_kernel_kmod.size = g_kernel_kmod_info.size;
17642
17643 if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
17644 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
17645 }
17646
17647 (void)OSKextRegisterKextsWithDTrace();
17648 }
17649 #undef DTRACE_MAJOR
17650
17651 /*
17652  * Routines used to register interest in CPUs being added to or removed
17653  * from the system. On Darwin these are currently no-ops.
17654 */
17655 void
17656 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
17657 {
17658 #pragma unused(ignore1,ignore2)
17659 }
17660
17661 void
17662 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
17663 {
17664 #pragma unused(ignore1,ignore2)
17665 }