1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
24 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /*
33 * DTrace - Dynamic Tracing for Solaris
34 *
35 * This is the implementation of the Solaris Dynamic Tracing framework
36 * (DTrace). The user-visible interface to DTrace is described at length in
37 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
38 * library, the in-kernel DTrace framework, and the DTrace providers are
39 * described in the block comments in the <sys/dtrace.h> header file. The
40 * internal architecture of DTrace is described in the block comments in the
41 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
42 * implementation very much assume mastery of all of these sources; if one has
43 * an unanswered question about the implementation, one should consult them
44 * first.
45 *
46 * The functions here are ordered roughly as follows:
47 *
48 * - Probe context functions
49 * - Probe hashing functions
50 * - Non-probe context utility functions
51 * - Matching functions
52 * - Provider-to-Framework API functions
53 * - Probe management functions
54 * - DIF object functions
55 * - Format functions
56 * - Predicate functions
57 * - ECB functions
58 * - Buffer functions
59 * - Enabling functions
60 * - DOF functions
61 * - Anonymous enabling functions
62 * - Process functions
63 * - Consumer state functions
64 * - Helper functions
65 * - Hook functions
66 * - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72 #include <sys/errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <sys/conf.h>
76 #include <sys/random.h>
77 #include <sys/systm.h>
78 #include <sys/dtrace_impl.h>
79 #include <sys/param.h>
80 #include <sys/proc_internal.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <miscfs/devfs/devfs.h>
84 #include <sys/malloc.h>
85 #include <sys/kernel_types.h>
86 #include <sys/proc_internal.h>
87 #include <sys/uio_internal.h>
88 #include <sys/kauth.h>
89 #include <vm/pmap.h>
90 #include <sys/user.h>
91 #include <mach/exception_types.h>
92 #include <sys/signalvar.h>
93 #include <mach/task.h>
94 #include <kern/zalloc.h>
95 #include <kern/ast.h>
96 #include <kern/sched_prim.h>
97 #include <kern/task.h>
98 #include <netinet/in.h>
99 #include <libkern/sysctl.h>
100 #include <sys/kdebug.h>
101
102 #if MONOTONIC
103 #include <kern/monotonic.h>
104 #include <machine/monotonic.h>
105 #endif /* MONOTONIC */
106
107 #include "dtrace_xoroshiro128_plus.h"
108
109 #include <IOKit/IOPlatformExpert.h>
110
111 #include <kern/cpu_data.h>
112 extern uint32_t pmap_find_phys(void *, uint64_t);
113 extern boolean_t pmap_valid_page(uint32_t);
114 extern void OSKextRegisterKextsWithDTrace(void);
115 extern kmod_info_t g_kernel_kmod_info;
116 extern void commpage_update_dof(boolean_t enabled);
117
118 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
119 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
120
121 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
122
123 extern void dtrace_suspend(void);
124 extern void dtrace_resume(void);
125 extern void dtrace_early_init(void);
126 extern int dtrace_keep_kernel_symbols(void);
127 extern void dtrace_init(void);
128 extern void helper_init(void);
129 extern void fasttrap_init(void);
130
131 static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
132 extern void dtrace_lazy_dofs_destroy(proc_t *);
133 extern void dtrace_postinit(void);
134
135 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
136 extern void dtrace_proc_exec(proc_t*);
137 extern void dtrace_proc_exit(proc_t*);
138
139 /*
140 * DTrace Tunable Variables
141 *
142 * The following variables may be dynamically tuned by using sysctl(8), the
143 * variables being stored in the kern.dtrace namespace. For example:
144 * sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
145 *
146 * In general, the only variables that one should be tuning this way are those
147 * that affect system-wide DTrace behavior, and for which the default behavior
148 * is undesirable. Most of these variables are tunable on a per-consumer
149 * basis using DTrace options, and need not be tuned on a system-wide basis.
150 * When tuning these variables, avoid pathological values; while some attempt
151 * is made to verify the integrity of these variables, they are not considered
152 * part of the supported interface to DTrace, and they are therefore not
153 * checked comprehensively.
154 */
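/*
 * APPLE NOTE: purely as an illustrative sketch (not part of this file), a
 * userland tool could inspect one of these tunables with sysctlbyname(3).
 * The tunable name matches the SYSCTL_PROC declarations below; the
 * surrounding code is example-only:
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <sys/sysctl.h>
 *
 *	int64_t val;
 *	size_t len = sizeof (val);
 *	if (sysctlbyname("kern.dtrace.dof_maxsize", &val, &len, NULL, 0) == 0)
 *		printf("dof_maxsize = %lld\n", (long long)val);
 */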
155 uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
156 uint64_t dtrace_buffer_memory_inuse = 0;
157 int dtrace_destructive_disallow = 0;
158 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
159 size_t dtrace_difo_maxsize = (256 * 1024);
160 dtrace_optval_t dtrace_dof_maxsize = (512 * 1024);
161 dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024);
162 dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024);
163 size_t dtrace_actions_max = (16 * 1024);
164 size_t dtrace_retain_max = 1024;
165 dtrace_optval_t dtrace_helper_actions_max = 32;
166 dtrace_optval_t dtrace_helper_providers_max = 64;
167 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
168 size_t dtrace_strsize_default = 256;
169 dtrace_optval_t dtrace_strsize_min = 8;
170 dtrace_optval_t dtrace_strsize_max = 65536;
171 dtrace_optval_t dtrace_cleanrate_default = 990099000; /* 1.01 hz */
172 dtrace_optval_t dtrace_cleanrate_min = 20000000; /* 50 hz */
173 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
174 dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
175 dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
176 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
177 dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
178 dtrace_optval_t dtrace_nspec_default = 1;
179 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
180 dtrace_optval_t dtrace_stackframes_default = 20;
181 dtrace_optval_t dtrace_ustackframes_default = 20;
182 dtrace_optval_t dtrace_jstackframes_default = 50;
183 dtrace_optval_t dtrace_jstackstrsize_default = 512;
184 dtrace_optval_t dtrace_buflimit_default = 75;
185 dtrace_optval_t dtrace_buflimit_min = 1;
186 dtrace_optval_t dtrace_buflimit_max = 99;
187 size_t dtrace_nprobes_default = 4;
188 int dtrace_msgdsize_max = 128;
189 hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
190 hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
191 int dtrace_devdepth_max = 32;
192 int dtrace_err_verbose;
193 hrtime_t dtrace_deadman_interval = NANOSEC;
194 hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
195 hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
196
197 /*
198 * DTrace External Variables
199 *
200 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
201 * available to DTrace consumers via the backtick (`) syntax. One of these,
202 * dtrace_zero, is made deliberately so: it is provided as a source of
203 * well-known, zero-filled memory. While this variable is not documented,
204 * it is used by some translators as an implementation detail.
205 */
206 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
207 unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */
208 /*
209 * DTrace Internal Variables
210 */
211 static dev_info_t *dtrace_devi; /* device info */
212 static vmem_t *dtrace_arena; /* probe ID arena */
213 static dtrace_probe_t **dtrace_probes; /* array of all probes */
214 static int dtrace_nprobes; /* number of probes */
215 static dtrace_provider_t *dtrace_provider; /* provider list */
216 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
217 static int dtrace_opens; /* number of opens */
218 static int dtrace_helpers; /* number of helpers */
219 static dtrace_hash_t *dtrace_strings;
220 static dtrace_hash_t *dtrace_byprov; /* probes hashed by provider */
221 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
222 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
223 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
224 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
225 static int dtrace_toxranges; /* number of toxic ranges */
226 static int dtrace_toxranges_max; /* size of toxic range array */
227 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
228 static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
229 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
230 static kthread_t *dtrace_panicked; /* panicking thread */
231 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
232 static dtrace_genid_t dtrace_probegen; /* current probe generation */
233 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
234 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
235 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
236 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
237
238 static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */
239
240 /*
241 * This doesn't quite fit as an internal variable, as it must be accessed in
242 * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
243 */
244 int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
245 static uint32_t dtrace_wake_clients;
246 static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */
247
248 /*
249 * To save memory, some common memory allocations are given a
250 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
251 * which means it would fall into the kalloc.128 bucket. With
252 * 20k elements allocated, the space saved is substantial.
253 */
254
255 struct zone *dtrace_probe_t_zone;
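/*
 * As a hedged sketch only: the zone itself is created during initialization
 * elsewhere; with the zalloc API of this era, a creation call might look like
 * the following (sizes illustrative, not normative):
 *
 *	dtrace_probe_t_zone = zinit(sizeof (dtrace_probe_t),
 *	    1024 * sizeof (dtrace_probe_t), sizeof (dtrace_probe_t),
 *	    "dtrace.dtrace_probe_t");
 */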
256
257 static int dtrace_module_unloaded(struct kmod_info *kmod);
258
259 /*
260 * DTrace Locking
261 * DTrace is protected by three (relatively coarse-grained) locks:
262 *
263 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
264 * including enabling state, probes, ECBs, consumer state, helper state,
265 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
266 * probe context is lock-free -- synchronization is handled via the
267 * dtrace_sync() cross call mechanism.
268 *
269 * (2) dtrace_provider_lock is required when manipulating provider state, or
270 * when provider state must be held constant.
271 *
272 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
273 * when meta provider state must be held constant.
274 *
275 * The lock ordering between these three locks is dtrace_meta_lock before
276 * dtrace_provider_lock before dtrace_lock. (In particular, there are
277 * several places where dtrace_provider_lock is held by the framework as it
278 * calls into the providers -- which then call back into the framework,
279 * grabbing dtrace_lock.)
280 *
281 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
282 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
283 * role as a coarse-grained lock; it is acquired before both of these locks.
284 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
285 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
286 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
287 * acquired _between_ dtrace_provider_lock and dtrace_lock.
288 */
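/*
 * Illustrative sketch only (not code from this file): a path that needed all
 * three framework locks would acquire and release them in the documented
 * order, e.g.:
 *
 *	lck_mtx_lock(&dtrace_meta_lock);
 *	lck_mtx_lock(&dtrace_provider_lock);
 *	lck_mtx_lock(&dtrace_lock);
 *	...
 *	lck_mtx_unlock(&dtrace_lock);
 *	lck_mtx_unlock(&dtrace_provider_lock);
 *	lck_mtx_unlock(&dtrace_meta_lock);
 */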
289
290
291 /*
292 * APPLE NOTE:
293 *
294 * For porting purposes, all kmutex_t vars have been changed
295 * to lck_mtx_t, which require explicit initialization.
296 *
297 * kmutex_t becomes lck_mtx_t
298 * mutex_enter() becomes lck_mtx_lock()
299 * mutex_exit() becomes lck_mtx_unlock()
300 *
301 * Lock asserts are changed like this:
302 *
303 * ASSERT(MUTEX_HELD(&cpu_lock));
304 * becomes:
305 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
306 *
307 */
308 static lck_mtx_t dtrace_lock; /* probe state lock */
309 static lck_mtx_t dtrace_provider_lock; /* provider state lock */
310 static lck_mtx_t dtrace_meta_lock; /* meta-provider state lock */
311 static lck_rw_t dtrace_dof_mode_lock; /* dof mode lock */
312
313 /*
314 * DTrace Provider Variables
315 *
316 * These are the variables relating to DTrace as a provider (that is, the
317 * provider of the BEGIN, END, and ERROR probes).
318 */
319 static dtrace_pattr_t dtrace_provider_attr = {
320 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
321 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
322 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
323 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
324 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
325 };
326
327 static void
328 dtrace_provide_nullop(void *arg, const dtrace_probedesc_t *desc)
329 {
330 #pragma unused(arg, desc)
331 }
332
333 static void
334 dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
335 {
336 #pragma unused(arg, ctl)
337 }
338
339 static int
340 dtrace_enable_nullop(void *arg, dtrace_id_t id, void *parg)
341 {
342 #pragma unused(arg, id, parg)
343 return (0);
344 }
345
346 static void
347 dtrace_disable_nullop(void *arg, dtrace_id_t id, void *parg)
348 {
349 #pragma unused(arg, id, parg)
350 }
351
352 static void
353 dtrace_suspend_nullop(void *arg, dtrace_id_t id, void *parg)
354 {
355 #pragma unused(arg, id, parg)
356 }
357
358 static void
359 dtrace_resume_nullop(void *arg, dtrace_id_t id, void *parg)
360 {
361 #pragma unused(arg, id, parg)
362 }
363
364 static void
365 dtrace_destroy_nullop(void *arg, dtrace_id_t id, void *parg)
366 {
367 #pragma unused(arg, id, parg)
368 }
369
370
371 static dtrace_pops_t dtrace_provider_ops = {
372 .dtps_provide = dtrace_provide_nullop,
373 .dtps_provide_module = dtrace_provide_module_nullop,
374 .dtps_enable = dtrace_enable_nullop,
375 .dtps_disable = dtrace_disable_nullop,
376 .dtps_suspend = dtrace_suspend_nullop,
377 .dtps_resume = dtrace_resume_nullop,
378 .dtps_getargdesc = NULL,
379 .dtps_getargval = NULL,
380 .dtps_usermode = NULL,
381 .dtps_destroy = dtrace_destroy_nullop,
382 };
383
384 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
385 static dtrace_id_t dtrace_probeid_end; /* special END probe */
386 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
387
388 /*
389 * DTrace Helper Tracing Variables
390 */
391 uint32_t dtrace_helptrace_next = 0;
392 uint32_t dtrace_helptrace_nlocals;
393 char *dtrace_helptrace_buffer;
394 size_t dtrace_helptrace_bufsize = 512 * 1024;
395
396 #if DEBUG
397 int dtrace_helptrace_enabled = 1;
398 #else
399 int dtrace_helptrace_enabled = 0;
400 #endif
401
402 #if defined (__arm64__)
403 /*
404 * The ioctl for adding helper DOF is based on the
405 * size of a user_addr_t. We need to recognize both
406 * U32 and U64 as the same action.
407 */
408 #define DTRACEHIOC_ADDDOF_U32 _IOW('h', 4, user32_addr_t)
409 #define DTRACEHIOC_ADDDOF_U64 _IOW('h', 4, user64_addr_t)
410 #endif /* __arm64__ */
411
412 /*
413 * DTrace Error Hashing
414 *
415 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
416 * table. This is very useful for checking coverage of tests that are
417 * expected to induce DIF or DOF processing errors, and may be useful for
418 * debugging problems in the DIF code generator or in DOF generation. The
419 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
420 */
421 #if DEBUG
422 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
423 static const char *dtrace_errlast;
424 static kthread_t *dtrace_errthread;
425 static lck_mtx_t dtrace_errlock;
426 #endif
427
428 /*
429 * DTrace Macros and Constants
430 *
431 * These are various macros that are useful in various spots in the
432 * implementation, along with a few random constants that have no meaning
433 * outside of the implementation. There is no real structure to this cpp
434 * mishmash -- but is there ever?
435 */
436
437 #define DTRACE_GETSTR(hash, elm) \
438 (hash->dth_getstr(elm, hash->dth_stroffs))
439
440 #define DTRACE_HASHSTR(hash, elm) \
441 dtrace_hash_str(DTRACE_GETSTR(hash, elm))
442
443 #define DTRACE_HASHNEXT(hash, elm) \
444 (void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)
445
446 #define DTRACE_HASHPREV(hash, elm) \
447 (void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)
448
449 #define DTRACE_HASHEQ(hash, lhs, rhs) \
450 (strcmp(DTRACE_GETSTR(hash, lhs), \
451 DTRACE_GETSTR(hash, rhs)) == 0)
452
453 #define DTRACE_AGGHASHSIZE_SLEW 17
454
455 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
456
457 /*
458 * The key for a thread-local variable consists of the lower 61 bits of the
459 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
460 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
461 * equal to a variable identifier. This is necessary (but not sufficient) to
462 * assure that global associative arrays never collide with thread-local
463 * variables. To guarantee that they cannot collide, we must also define the
464 * order for keying dynamic variables. That order is:
465 *
466 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
467 *
468 * Because the variable-key and the tls-key are in orthogonal spaces, there is
469 * no way for a global variable key signature to match a thread-local key
470 * signature.
471 */
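/*
 * Illustratively, the key produced by the DTRACE_TLS_THRKEY() variants below
 * is laid out as follows (an informal reading of the shifts and masks, not a
 * formal interface):
 *
 *	bits 63..61: interrupt state (at most 3 bits, asserted below)
 *	bits 60..0:  thread key material -- the current_thread() pointer (plus
 *	             the PID on 32-bit arm), offset by DIF_VARIABLE_MAX and
 *	             taken mod 2^61
 */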
472 #if defined (__x86_64__)
473 /* FIXME: two function calls!! */
474 #define DTRACE_TLS_THRKEY(where) { \
475 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
476 uint64_t thr = (uintptr_t)current_thread(); \
477 ASSERT(intr < (1 << 3)); \
478 (where) = ((thr + DIF_VARIABLE_MAX) & \
479 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
480 }
481 #elif defined(__arm__)
482 /* FIXME: three function calls!!! */
483 #define DTRACE_TLS_THRKEY(where) { \
484 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
485 uint64_t thr = (uintptr_t)current_thread(); \
486 uint_t pid = (uint_t)dtrace_proc_selfpid(); \
487 ASSERT(intr < (1 << 3)); \
488 (where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
489 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
490 }
491 #elif defined (__arm64__)
492 /* FIXME: two function calls!! */
493 #define DTRACE_TLS_THRKEY(where) { \
494 uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
495 uint64_t thr = (uintptr_t)current_thread(); \
496 ASSERT(intr < (1 << 3)); \
497 (where) = ((thr + DIF_VARIABLE_MAX) & \
498 (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
499 }
500 #else
501 #error Unknown architecture
502 #endif
503
504 #define DT_BSWAP_8(x) ((x) & 0xff)
505 #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
506 #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
507 #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
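/*
 * For example, DT_BSWAP_32(0x11223344) evaluates to 0x44332211; operands are
 * expected to be unsigned values of the appropriate width.
 */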
508
509 #define DT_MASK_LO 0x00000000FFFFFFFFULL
510
511 #define DTRACE_STORE(type, tomax, offset, what) \
512 *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
513
514
515 #define DTRACE_ALIGNCHECK(addr, size, flags) \
516 if (addr & (MIN(size,4) - 1)) { \
517 *flags |= CPU_DTRACE_BADALIGN; \
518 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
519 return (0); \
520 }
521
522 #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
523 do { \
524 if ((remp) != NULL) { \
525 *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
526 } \
527 } while (0)
528
529
530 /*
531 * Test whether a range of memory starting at testaddr of size testsz falls
532 * within the range of memory described by addr, sz. We take care to avoid
533 * problems with overflow and underflow of the unsigned quantities, and
534 * disallow all negative sizes. Ranges of size 0 are allowed.
535 */
536 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
537 ((testaddr) - (baseaddr) < (basesz) && \
538 (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
539 (testaddr) + (testsz) >= (testaddr))
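/*
 * For instance, with baseaddr = 0x1000, basesz = 0x100 and testaddr = 0xff0,
 * the unsigned difference (testaddr - baseaddr) wraps to a huge value and
 * fails the first comparison, so an underflowing test address cannot pass.
 * The final clause likewise rejects overflow of testaddr + testsz.
 */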
540
541 /*
542 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
543 * alloc_sz on the righthand side of the comparison in order to avoid overflow
544 * or underflow in the comparison with it. This is simpler than the INRANGE
545 * check above, because we know that the dtms_scratch_ptr is valid in the
546 * range. Allocations of size zero are allowed.
547 */
548 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
549 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
550 (mstate)->dtms_scratch_ptr >= (alloc_sz))
551
552 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
553
554 #if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
555 #define DTRACE_LOADFUNC(bits) \
556 /*CSTYLED*/ \
557 uint##bits##_t dtrace_load##bits(uintptr_t addr); \
558 \
559 uint##bits##_t \
560 dtrace_load##bits(uintptr_t addr) \
561 { \
562 size_t size = bits / NBBY; \
563 /*CSTYLED*/ \
564 uint##bits##_t rval = 0; \
565 int i; \
566 volatile uint16_t *flags = (volatile uint16_t *) \
567 &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
568 \
569 DTRACE_ALIGNCHECK(addr, size, flags); \
570 \
571 for (i = 0; i < dtrace_toxranges; i++) { \
572 if (addr >= dtrace_toxrange[i].dtt_limit) \
573 continue; \
574 \
575 if (addr + size <= dtrace_toxrange[i].dtt_base) \
576 continue; \
577 \
578 /* \
579 * This address falls within a toxic region; return 0. \
580 */ \
581 *flags |= CPU_DTRACE_BADADDR; \
582 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
583 return (0); \
584 } \
585 \
586 { \
587 volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits; \
588 *flags |= CPU_DTRACE_NOFAULT; \
589 recover = dtrace_sign_and_set_thread_recover(current_thread(), recover); \
590 /*CSTYLED*/ \
591 /* \
592 * PR6394061 - avoid device memory that is unpredictably \
593 * mapped and unmapped \
594 */ \
595 if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr))) \
596 rval = *((volatile uint##bits##_t *)addr); \
597 else { \
598 *flags |= CPU_DTRACE_BADADDR; \
599 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
600 return (0); \
601 } \
602 \
603 RECOVER_LABEL(bits); \
604 (void)dtrace_set_thread_recover(current_thread(), recover); \
605 *flags &= ~CPU_DTRACE_NOFAULT; \
606 } \
607 \
608 return (rval); \
609 }
610 #else /* all other architectures */
611 #error Unknown Architecture
612 #endif
613
614 #ifdef __LP64__
615 #define dtrace_loadptr dtrace_load64
616 #else
617 #define dtrace_loadptr dtrace_load32
618 #endif
619
620 #define DTRACE_DYNHASH_FREE 0
621 #define DTRACE_DYNHASH_SINK 1
622 #define DTRACE_DYNHASH_VALID 2
623
624 #define DTRACE_MATCH_FAIL -1
625 #define DTRACE_MATCH_NEXT 0
626 #define DTRACE_MATCH_DONE 1
627 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
628 #define DTRACE_STATE_ALIGN 64
629
630 #define DTRACE_FLAGS2FLT(flags) \
631 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
632 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
633 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
634 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
635 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
636 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
637 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
638 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
639 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
640 DTRACEFLT_UNKNOWN)
641
642 #define DTRACEACT_ISSTRING(act) \
643 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
644 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
645
646
647 static size_t dtrace_strlen(const char *, size_t);
648 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
649 static void dtrace_enabling_provide(dtrace_provider_t *);
650 static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
651 static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
652 static void dtrace_enabling_matchall(void);
653 static dtrace_state_t *dtrace_anon_grab(void);
654 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
655 dtrace_state_t *, uint64_t, uint64_t);
656 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
657 static void dtrace_buffer_drop(dtrace_buffer_t *);
658 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
659 dtrace_state_t *, dtrace_mstate_t *);
660 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
661 dtrace_optval_t);
662 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
663 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
664 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
665 dtrace_mstate_t *, dtrace_vstate_t *);
666 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
667 dtrace_mstate_t *, dtrace_vstate_t *);
668
669
670 /*
671 * DTrace sysctl handlers
672 *
673 * These declarations and functions are used for deeper DTrace configuration.
674 * Most of them are not set on a per-consumer basis and may impact other
675 * DTrace consumers. Correctness may not be guaranteed for all of these
676 * variables, so be careful about the values you use.
677 */
678
679 SYSCTL_DECL(_kern_dtrace);
680 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
681
682 static int
683 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
684 {
685 #pragma unused(oidp, arg2)
686 int changed, error;
687 int value = *(int *) arg1;
688
689 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
690 if (error || !changed)
691 return (error);
692
693 if (value != 0 && value != 1)
694 return (ERANGE);
695
696 lck_mtx_lock(&dtrace_lock);
697 dtrace_err_verbose = value;
698 lck_mtx_unlock(&dtrace_lock);
699
700 return (0);
701 }
702
703 /*
704 * kern.dtrace.err_verbose
705 *
706 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
707 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
708 */
709 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
710 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
711 &dtrace_err_verbose, 0,
712 sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
713
714 static int
715 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
716 {
717 #pragma unused(oidp, arg2, req)
718 int changed, error;
719 uint64_t value = *(uint64_t *) arg1;
720
721 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
722 if (error || !changed)
723 return (error);
724
725 if (value <= dtrace_buffer_memory_inuse)
726 return (ERANGE);
727
728 lck_mtx_lock(&dtrace_lock);
729 dtrace_buffer_memory_maxsize = value;
730 lck_mtx_unlock(&dtrace_lock);
731
732 return (0);
733 }
734
735 /*
736 * kern.dtrace.buffer_memory_maxsize
737 *
738 * Set the maximum size, in bytes, used by all the consumers' state buffers. By
739 * default the limit is PHYS_MEM / 3 for *all* consumers. Attempting to set a
740 * zero or negative value, or a value <= dtrace_buffer_memory_inuse, will fail.
741 */
742 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
743 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
744 &dtrace_buffer_memory_maxsize, 0,
745 sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
746
747 /*
748 * kern.dtrace.buffer_memory_inuse
749 *
750 * Current state buffer memory used, in bytes, by all the DTrace consumers.
751 * This value is read-only.
752 */
753 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
754 &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
755
756 static int
757 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
758 {
759 #pragma unused(oidp, arg2, req)
760 int changed, error;
761 size_t value = *(size_t*) arg1;
762
763 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
764 if (error || !changed)
765 return (error);
766
767 if (value <= 0)
768 return (ERANGE);
769
770 lck_mtx_lock(&dtrace_lock);
771 dtrace_difo_maxsize = value;
772 lck_mtx_unlock(&dtrace_lock);
773
774 return (0);
775 }
776
777 /*
778 * kern.dtrace.difo_maxsize
779 *
780 * Set the DIFO max size in bytes; see the definition of dtrace_difo_maxsize
781 * for the default value. Attempting to set a zero or negative size will
782 * result in a failure.
783 */
784 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
785 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
786 &dtrace_difo_maxsize, 0,
787 sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
788
789 static int
790 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
791 {
792 #pragma unused(oidp, arg2, req)
793 int changed, error;
794 dtrace_optval_t value = *(dtrace_optval_t *) arg1;
795
796 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
797 if (error || !changed)
798 return (error);
799
800 if (value <= 0)
801 return (ERANGE);
802
803 if (value >= dtrace_copy_maxsize())
804 return (ERANGE);
805
806 lck_mtx_lock(&dtrace_lock);
807 dtrace_dof_maxsize = value;
808 lck_mtx_unlock(&dtrace_lock);
809
810 return (0);
811 }
812
813 /*
814 * kern.dtrace.dof_maxsize
815 *
816 * Set the DOF max size in bytes; see the definition of dtrace_dof_maxsize for
817 * the default value. Attempting to set a zero or negative size will result
818 * in a failure.
819 */
820 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
821 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
822 &dtrace_dof_maxsize, 0,
823 sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
824
825 static int
826 sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
827 {
828 #pragma unused(oidp, arg2, req)
829 int changed, error;
830 dtrace_optval_t value = *(dtrace_optval_t*) arg1;
831
832 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
833 if (error || !changed)
834 return (error);
835
836 if (value <= 0)
837 return (ERANGE);
838 if (value > dtrace_statvar_maxsize_max)
839 return (ERANGE);
840
841 lck_mtx_lock(&dtrace_lock);
842 dtrace_statvar_maxsize = value;
843 lck_mtx_unlock(&dtrace_lock);
844
845 return (0);
846 }
847
848 /*
849 * kern.dtrace.global_maxsize
850 *
851 * Set the static variable max size in bytes; see the definition of
852 * dtrace_statvar_maxsize for the default value. Attempting to set a zero,
853 * negative, or too-large size will result in a failure.
854 */
855 SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
856 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
857 &dtrace_statvar_maxsize, 0,
858 sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
859
860
861 /*
862 * kern.dtrace.provide_private_probes
863 *
864 * Set whether the providers must provide the private probes. This is
865 * kept for compatibility, as the private probes are now always provided.
866 */
867 SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
868 CTLFLAG_RD | CTLFLAG_LOCKED,
869 (int *)NULL, 1, "provider must provide the private probes");
870
871 /*
872 * kern.dtrace.dof_mode
873 *
874 * Returns the current DOF mode.
875 * This value is read-only.
876 */
877 SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
878 &dtrace_dof_mode, 0, "dtrace dof mode");
879
880 /*
881 * DTrace Probe Context Functions
882 *
883 * These functions are called from probe context. Because probe context is
884 * any context in which C may be called, arbitrary locks may be held,
885 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
886 * As a result, functions called from probe context may only call other DTrace
887 * support functions -- they may not interact at all with the system at large.
888 * (Note that the ASSERT macro is made probe-context safe by redefining it in
889 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
890 * loads are to be performed from probe context, they _must_ be in terms of
891 * the safe dtrace_load*() variants.
892 *
893 * Some functions in this block are not actually called from probe context;
894 * for these functions, there will be a comment above the function reading
895 * "Note: not called from probe context."
896 */
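/*
 * For example (illustrative only), a probe-context function must not
 * dereference an arbitrary address directly:
 *
 *	val = *(uint64_t *)addr;	-- unsafe in probe context
 *	val = dtrace_load64(addr);	-- safe: faults and toxic ranges
 *					   are handled for us
 */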
897
898 int
899 dtrace_assfail(const char *a, const char *f, int l)
900 {
901 panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
902
903 /*
904 * We just need something here that even the most clever compiler
905 * cannot optimize away.
906 */
907 return (a[(uintptr_t)f]);
908 }
909
910 /*
911 * Atomically increment a specified error counter from probe context.
912 */
913 static void
914 dtrace_error(uint32_t *counter)
915 {
916 /*
917 * Most counters stored to in probe context are per-CPU counters.
918 * However, there are some error conditions that are sufficiently
919 * arcane that they don't merit per-CPU storage. If these counters
920 * are incremented concurrently on different CPUs, scalability will be
921 * adversely affected -- but we don't expect them to be white-hot in a
922 * correctly constructed enabling...
923 */
924 uint32_t oval, nval;
925
926 do {
927 oval = *counter;
928
929 if ((nval = oval + 1) == 0) {
930 /*
931 * If the counter would wrap, set it to 1 -- assuring
932 * that the counter is never zero when we have seen
933 * errors. (The counter must be 32-bits because we
934 * aren't guaranteed a 64-bit compare&swap operation.)
935 * To save this code both the infamy of being fingered
936 * by a priggish news story and the indignity of being
937 * the target of a neo-puritan witch trial, we're
938 * carefully avoiding any colorful description of the
939 * likelihood of this condition -- but suffice it to
940 * say that it is only slightly more likely than the
941 * overflow of predicate cache IDs, as discussed in
942 * dtrace_predicate_create().
943 */
944 nval = 1;
945 }
946 } while (dtrace_cas32(counter, oval, nval) != oval);
947 }
948
949 /*
950 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
951 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
952 */
953 DTRACE_LOADFUNC(8)
954 DTRACE_LOADFUNC(16)
955 DTRACE_LOADFUNC(32)
956 DTRACE_LOADFUNC(64)
957
958 static int
959 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
960 {
961 if (dest < mstate->dtms_scratch_base)
962 return (0);
963
964 if (dest + size < dest)
965 return (0);
966
967 if (dest + size > mstate->dtms_scratch_ptr)
968 return (0);
969
970 return (1);
971 }
972
973 static int
974 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
975 dtrace_statvar_t **svars, int nsvars)
976 {
977 int i;
978
979 size_t maxglobalsize, maxlocalsize;
980
981 maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
982 maxlocalsize = (maxglobalsize) * NCPU;
983
984 if (nsvars == 0)
985 return (0);
986
987 for (i = 0; i < nsvars; i++) {
988 dtrace_statvar_t *svar = svars[i];
989 uint8_t scope;
990 size_t size;
991
992 if (svar == NULL || (size = svar->dtsv_size) == 0)
993 continue;
994
995 scope = svar->dtsv_var.dtdv_scope;
996
997 /*
998 * We verify that our size is valid in the spirit of providing
999 * defense in depth: we want to prevent attackers from using
1000 * DTrace to escalate an orthogonal kernel heap corruption bug
1001 * into the ability to store to arbitrary locations in memory.
1002 */
1003 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
1004 (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
1005
1006 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1007 DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1008 svar->dtsv_size);
1009 return (1);
1010 }
1011 }
1012
1013 return (0);
1014 }
1015
1016 /*
1017 * Check to see if the address is within a memory region to which a store may
1018 * be issued. This includes the DTrace scratch areas, and any DTrace variable
1019 * region. The caller of dtrace_canstore() is responsible for performing any
1020 * alignment checks that are needed before stores are actually executed.
1021 */
1022 static int
1023 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1024 dtrace_vstate_t *vstate)
1025 {
1026 return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
1027 }
1028 /*
1029 * Implementation of dtrace_canstore which communicates the upper bound of the
1030 * allowed memory region.
1031 */
1032 static int
1033 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1034 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1035 {
1036 /*
1037 * First, check to see if the address is in scratch space...
1038 */
1039 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1040 mstate->dtms_scratch_size)) {
1041 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1042 mstate->dtms_scratch_size);
1043 return (1);
1044 }
1045 /*
1046 * Now check to see if it's a dynamic variable. This check will pick
1047 * up both thread-local variables and any global dynamically-allocated
1048 * variables.
1049 */
1050 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1051 vstate->dtvs_dynvars.dtds_size)) {
1052 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1053 uintptr_t base = (uintptr_t)dstate->dtds_base +
1054 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1055 uintptr_t chunkoffs;
1056 dtrace_dynvar_t *dvar;
1057
1058 /*
1059 * Before we assume that we can store here, we need to make
1060 * sure that it isn't in our metadata -- storing to our
1061 * dynamic variable metadata would corrupt our state. For
1062 * the range to not include any dynamic variable metadata,
1063 * it must:
1064 *
1065 * (1) Start above the hash table that is at the base of
1066 * the dynamic variable space
1067 *
1068 * (2) Have a starting chunk offset that is beyond the
1069 * dtrace_dynvar_t that is at the base of every chunk
1070 *
1071 * (3) Not span a chunk boundary
1072 *
1073 * (4) Not be in the tuple space of a dynamic variable
1074 *
1075 */
1076 if (addr < base)
1077 return (0);
1078
1079 chunkoffs = (addr - base) % dstate->dtds_chunksize;
1080
1081 if (chunkoffs < sizeof (dtrace_dynvar_t))
1082 return (0);
1083
1084 if (chunkoffs + sz > dstate->dtds_chunksize)
1085 return (0);
1086
1087 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1088
1089 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1090 return (0);
1091
1092 if (chunkoffs < sizeof (dtrace_dynvar_t) +
1093 ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
1094 return (0);
1095
1096 return (1);
1097 }
1098
1099 /*
1100 * Finally, check the static local and global variables. These checks
1101 * take the longest, so we perform them last.
1102 */
1103 if (dtrace_canstore_statvar(addr, sz, remain,
1104 vstate->dtvs_locals, vstate->dtvs_nlocals))
1105 return (1);
1106
1107 if (dtrace_canstore_statvar(addr, sz, remain,
1108 vstate->dtvs_globals, vstate->dtvs_nglobals))
1109 return (1);
1110
1111 return (0);
1112 }
1113
1114
1115 /*
1116 * Convenience routine to check to see if the address is within a memory
1117 * region in which a load may be issued given the user's privilege level;
1118 * if not, it sets the appropriate error flags and loads 'addr' into the
1119 * illegal value slot.
1120 *
1121 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
1122 * appropriate memory access protection.
1123 */
1124 int
1125 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1126 dtrace_vstate_t *vstate)
1127 {
1128 return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
1129 }
1130
1131 /*
1132 * Implementation of dtrace_canload which communicates the upper bound of the
1133 * allowed memory region.
1134 */
1135 static int
1136 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1137 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1138 {
1139 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1140
1141 /*
1142 * If we hold the privilege to read from kernel memory, then
1143 * everything is readable.
1144 */
1145 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1146 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1147 return (1);
1148 }
1149
1150 /*
1151 * You can obviously read that which you can store.
1152 */
1153 if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1154 return (1);
1155
1156 /*
1157 * We're allowed to read from our own string table.
1158 */
1159 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1160 mstate->dtms_difo->dtdo_strlen)) {
1161 DTRACE_RANGE_REMAIN(remain, addr,
1162 mstate->dtms_difo->dtdo_strtab,
1163 mstate->dtms_difo->dtdo_strlen);
1164 return (1);
1165 }
1166
1167 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1168 *illval = addr;
1169 return (0);
1170 }
1171
1172 /*
1173 * Convenience routine to check to see if a given string is within a memory
1174 * region in which a load may be issued given the user's privilege level;
1175 * this exists so that we don't need to issue unnecessary dtrace_strlen()
1176 * calls in the event that the user has all privileges.
1177 */
1178 static int
1179 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1180 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1181 {
1182 size_t rsize;
1183
1184 /*
1185 * If we hold the privilege to read from kernel memory, then
1186 * everything is readable.
1187 */
1188 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1189 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1190 return (1);
1191 }
1192
1193 /*
1194 * Even if the caller is uninterested in querying the remaining valid
1195 * range, it must still be computed here to ensure the access is allowed.
1196 */
1197 if (remain == NULL) {
1198 remain = &rsize;
1199 }
1200 if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
1201 size_t strsz;
1202 /*
1203 * Perform the strlen after determining the length of the
1204 * memory region which is accessible. This prevents timing
1205 * information from being used to find NULs in memory which is
1206 * not accessible to the caller.
1207 */
1208 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
1209 MIN(sz, *remain));
1210 if (strsz <= *remain) {
1211 return (1);
1212 }
1213 }
1214
1215 return (0);
1216 }
1217
1218 /*
1219 * Convenience routine to check to see if a given variable is within a memory
1220 * region in which a load may be issued given the user's privilege level.
1221 */
1222 static int
1223 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
1224 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1225 {
1226 size_t sz;
1227 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1228
1229 /*
1230 * Calculate the max size before performing any checks since even
1231 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1232 * return the max length via 'remain'.
1233 */
1234 if (type->dtdt_kind == DIF_TYPE_STRING) {
1235 dtrace_state_t *state = vstate->dtvs_state;
1236
1237 if (state != NULL) {
1238 sz = state->dts_options[DTRACEOPT_STRSIZE];
1239 } else {
1240 /*
1241 * In helper context, we have a NULL state; fall back
1242 * to using the system-wide default for the string size
1243 * in this case.
1244 */
1245 sz = dtrace_strsize_default;
1246 }
1247 } else {
1248 sz = type->dtdt_size;
1249 }
1250
1251 /*
1252 * If we hold the privilege to read from kernel memory, then
1253 * everything is readable.
1254 */
1255 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1256 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1257 return (1);
1258 }
1259
1260 if (type->dtdt_kind == DIF_TYPE_STRING) {
1261 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1262 vstate));
1263 }
1264 return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1265 vstate));
1266 }
1267
1268 #define isdigit(ch) ((ch) >= '0' && (ch) <= '9')
1269 #define islower(ch) ((ch) >= 'a' && (ch) <= 'z')
1270 #define isspace(ch) (((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
1271 ((ch) == '\t') || ((ch) == '\f'))
1272 #define isxdigit(ch) (isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
1273 ((ch) >= 'A' && (ch) <= 'F'))
1274 #define lisalnum(x) \
1275 (isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))
1276
1277 #define DIGIT(x) \
1278 (isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
1279
1280 /*
1281 * Convert a string to a signed integer using safe loads.
1282 */
1283 static int64_t
1284 dtrace_strtoll(char *input, int base, size_t limit)
1285 {
1286 uintptr_t pos = (uintptr_t)input;
1287 int64_t val = 0;
1288 int x;
1289 boolean_t neg = B_FALSE;
1290 char c, cc, ccc;
1291 uintptr_t end = pos + limit;
1292
1293 /*
1294 * Consume any whitespace preceding digits.
1295 */
1296 while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1297 pos++;
1298
1299 /*
1300 * Handle an explicit sign if one is present.
1301 */
1302 if (c == '-' || c == '+') {
1303 if (c == '-')
1304 neg = B_TRUE;
1305 c = dtrace_load8(++pos);
1306 }
1307
1308 /*
1309 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1310 * if present.
1311 */
1312 if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1313 cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1314 pos += 2;
1315 c = ccc;
1316 }
1317
1318 /*
1319 * Read in contiguous digits until the first non-digit character.
1320 */
1321 for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1322 c = dtrace_load8(++pos))
1323 val = val * base + x;
1324
1325 return (neg ? -val : val);
1326 }
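/*
 * For example (illustrative): dtrace_strtoll("  -0x1f", 16, 8) consumes the
 * leading whitespace, the sign, and the "0x" prefix, and returns -31;
 * parsing stops at the first character that is not a digit in 'base'.
 */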
1327
1328
1329 /*
1330 * Compare two strings using safe loads.
1331 */
1332 static int
1333 dtrace_strncmp(const char *s1, const char *s2, size_t limit)
1334 {
1335 uint8_t c1, c2;
1336 volatile uint16_t *flags;
1337
1338 if (s1 == s2 || limit == 0)
1339 return (0);
1340
1341 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1342
1343 do {
1344 if (s1 == NULL) {
1345 c1 = '\0';
1346 } else {
1347 c1 = dtrace_load8((uintptr_t)s1++);
1348 }
1349
1350 if (s2 == NULL) {
1351 c2 = '\0';
1352 } else {
1353 c2 = dtrace_load8((uintptr_t)s2++);
1354 }
1355
1356 if (c1 != c2)
1357 return (c1 - c2);
1358 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1359
1360 return (0);
1361 }
1362
1363 /*
1364 * Compute strlen(s) for a string using safe memory accesses. The additional
1365 * lim parameter is used to specify a maximum length to ensure completion.
1366 */
1367 static size_t
1368 dtrace_strlen(const char *s, size_t lim)
1369 {
1370 uint_t len;
1371
1372 for (len = 0; len != lim; len++) {
1373 if (dtrace_load8((uintptr_t)s++) == '\0')
1374 break;
1375 }
1376
1377 return (len);
1378 }
1379
1380 /*
1381 * Check if an address falls within a toxic region.
1382 */
1383 static int
1384 dtrace_istoxic(uintptr_t kaddr, size_t size)
1385 {
1386 uintptr_t taddr, tsize;
1387 int i;
1388
1389 for (i = 0; i < dtrace_toxranges; i++) {
1390 taddr = dtrace_toxrange[i].dtt_base;
1391 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1392
1393 if (kaddr - taddr < tsize) {
1394 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1395 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1396 return (1);
1397 }
1398
1399 if (taddr - kaddr < size) {
1400 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1401 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1402 return (1);
1403 }
1404 }
1405
1406 return (0);
1407 }
1408
1409 /*
1410 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
1411 * memory specified by the DIF program. The dst is assumed to be safe memory
1412 * that we can store to directly because it is managed by DTrace. As with
1413 * standard bcopy, overlapping copies are handled properly.
1414 */
1415 static void
1416 dtrace_bcopy(const void *src, void *dst, size_t len)
1417 {
1418 if (len != 0) {
1419 uint8_t *s1 = dst;
1420 const uint8_t *s2 = src;
1421
1422 if (s1 <= s2) {
1423 do {
1424 *s1++ = dtrace_load8((uintptr_t)s2++);
1425 } while (--len != 0);
1426 } else {
1427 s2 += len;
1428 s1 += len;
1429
1430 do {
1431 *--s1 = dtrace_load8((uintptr_t)--s2);
1432 } while (--len != 0);
1433 }
1434 }
1435 }
1436
1437 /*
1438 * Copy src to dst using safe memory accesses, up to either the specified
1439 * length, or the point that a nul byte is encountered. The src is assumed to
1440 * be unsafe memory specified by the DIF program. The dst is assumed to be
1441 * safe memory that we can store to directly because it is managed by DTrace.
1442 * Unlike dtrace_bcopy(), overlapping regions are not handled.
1443 */
1444 static void
1445 dtrace_strcpy(const void *src, void *dst, size_t len)
1446 {
1447 if (len != 0) {
1448 uint8_t *s1 = dst, c;
1449 const uint8_t *s2 = src;
1450
1451 do {
1452 *s1++ = c = dtrace_load8((uintptr_t)s2++);
1453 } while (--len != 0 && c != '\0');
1454 }
1455 }
1456
1457 /*
1458 * Copy src to dst, deriving the size and type from the specified (BYREF)
1459 * variable type. The src is assumed to be unsafe memory specified by the DIF
1460 * program. The dst is assumed to be DTrace variable memory that is of the
1461 * specified type; we assume that we can store to directly.
1462 */
1463 static void
1464 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1465 {
1466 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1467
1468 if (type->dtdt_kind == DIF_TYPE_STRING) {
1469 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1470 } else {
1471 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1472 }
1473 }
1474
1475 /*
1476 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1477 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1478 * safe memory that we can access directly because it is managed by DTrace.
1479 */
1480 static int
1481 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1482 {
1483 volatile uint16_t *flags;
1484
1485 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1486
1487 if (s1 == s2)
1488 return (0);
1489
1490 if (s1 == NULL || s2 == NULL)
1491 return (1);
1492
1493 if (s1 != s2 && len != 0) {
1494 const uint8_t *ps1 = s1;
1495 const uint8_t *ps2 = s2;
1496
1497 do {
1498 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1499 return (1);
1500 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1501 }
1502 return (0);
1503 }
1504
1505 /*
1506 * Zero the specified region using a simple byte-by-byte loop. Note that this
1507 * is for safe DTrace-managed memory only.
1508 */
1509 static void
1510 dtrace_bzero(void *dst, size_t len)
1511 {
1512 uchar_t *cp;
1513
1514 for (cp = dst; len != 0; len--)
1515 *cp++ = 0;
1516 }
1517
1518 static void
1519 dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
1520 {
1521 uint64_t result[2];
1522
1523 result[0] = addend1[0] + addend2[0];
1524 result[1] = addend1[1] + addend2[1] +
1525 (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
1526
1527 sum[0] = result[0];
1528 sum[1] = result[1];
1529 }
1530
1531 /*
1532 * Shift the 128-bit value in a by b. If b is positive, shift left.
1533 * If b is negative, shift right.
1534 */
1535 static void
1536 dtrace_shift_128(uint64_t *a, int b)
1537 {
1538 uint64_t mask;
1539
1540 if (b == 0)
1541 return;
1542
1543 if (b < 0) {
1544 b = -b;
1545 if (b >= 64) {
1546 a[0] = a[1] >> (b - 64);
1547 a[1] = 0;
1548 } else {
1549 a[0] >>= b;
1550 mask = 1LL << (64 - b);
1551 mask -= 1;
1552 a[0] |= ((a[1] & mask) << (64 - b));
1553 a[1] >>= b;
1554 }
1555 } else {
1556 if (b >= 64) {
1557 a[1] = a[0] << (b - 64);
1558 a[0] = 0;
1559 } else {
1560 a[1] <<= b;
1561 mask = a[0] >> (64 - b);
1562 a[1] |= mask;
1563 a[0] <<= b;
1564 }
1565 }
1566 }
1567
1568 /*
1569 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1570 * use native multiplication on those, and then re-combine into the
1571 * resulting 128-bit value.
1572 *
1573 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1574 * hi1 * hi2 << 64 +
1575 * hi1 * lo2 << 32 +
1576 * hi2 * lo1 << 32 +
1577 * lo1 * lo2
1578 */
1579 static void
1580 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1581 {
1582 uint64_t hi1, hi2, lo1, lo2;
1583 uint64_t tmp[2];
1584
1585 hi1 = factor1 >> 32;
1586 hi2 = factor2 >> 32;
1587
1588 lo1 = factor1 & DT_MASK_LO;
1589 lo2 = factor2 & DT_MASK_LO;
1590
1591 product[0] = lo1 * lo2;
1592 product[1] = hi1 * hi2;
1593
1594 tmp[0] = hi1 * lo2;
1595 tmp[1] = 0;
1596 dtrace_shift_128(tmp, 32);
1597 dtrace_add_128(product, tmp, product);
1598
1599 tmp[0] = hi2 * lo1;
1600 tmp[1] = 0;
1601 dtrace_shift_128(tmp, 32);
1602 dtrace_add_128(product, tmp, product);
1603 }
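/*
 * Worked example (illustrative): with factor1 = factor2 = 2^32 + 1, we have
 * lo1 = lo2 = hi1 = hi2 = 1, so product starts as { 1, 1 } and each shifted
 * cross term contributes 2^32 to the low word, yielding { 2^33 + 1, 1 },
 * i.e. 2^64 + 2^33 + 1 == (2^32 + 1)^2.
 */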
1604
1605 /*
1606 * This privilege check should be used by actions and subroutines to
1607 * verify that the user credentials of the process that enabled the
1608 * invoking ECB match the target credentials.
1609 */
1610 static int
1611 dtrace_priv_proc_common_user(dtrace_state_t *state)
1612 {
1613 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1614
1615 /*
1616 * We should always have a non-NULL state cred here, since if cred
1617 * is null (anonymous tracing), we fast-path bypass this routine.
1618 */
1619 ASSERT(s_cr != NULL);
1620
1621 if ((cr = dtrace_CRED()) != NULL &&
1622 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1623 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1624 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1625 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1626 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1627 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1628 return (1);
1629
1630 return (0);
1631 }
1632
1633 /*
1634 * This privilege check should be used by actions and subroutines to
1635 * verify that the zone of the process that enabled the invoking ECB
1636 * matches the target credentials.
1637 */
1638 static int
1639 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1640 {
1641 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1642 #pragma unused(cr, s_cr, state) /* __APPLE__ */
1643
1644 /*
1645 * We should always have a non-NULL state cred here, since if cred
1646 * is null (anonymous tracing), we fast-path bypass this routine.
1647 */
1648 ASSERT(s_cr != NULL);
1649
1650 return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1651 }
1652
1653 /*
1654 * This privilege check should be used by actions and subroutines to
1655 * verify that the process has not setuid or changed credentials.
1656 */
1657 static int
1658 dtrace_priv_proc_common_nocd(void)
1659 {
1660 return 1; /* Darwin omits "No Core Dump" flag. */
1661 }
1662
1663 static int
1664 dtrace_priv_proc_destructive(dtrace_state_t *state)
1665 {
1666 int action = state->dts_cred.dcr_action;
1667
1668 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1669 goto bad;
1670
1671 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1672 goto bad;
1673
1674 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1675 dtrace_priv_proc_common_zone(state) == 0)
1676 goto bad;
1677
1678 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1679 dtrace_priv_proc_common_user(state) == 0)
1680 goto bad;
1681
1682 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1683 dtrace_priv_proc_common_nocd() == 0)
1684 goto bad;
1685
1686 return (1);
1687
1688 bad:
1689 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1690
1691 return (0);
1692 }
1693
1694 static int
1695 dtrace_priv_proc_control(dtrace_state_t *state)
1696 {
1697 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1698 goto bad;
1699
1700 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1701 goto bad;
1702
1703 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1704 return (1);
1705
1706 if (dtrace_priv_proc_common_zone(state) &&
1707 dtrace_priv_proc_common_user(state) &&
1708 dtrace_priv_proc_common_nocd())
1709 return (1);
1710
1711 bad:
1712 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1713
1714 return (0);
1715 }
1716
1717 static int
1718 dtrace_priv_proc(dtrace_state_t *state)
1719 {
1720 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1721 goto bad;
1722
1723 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1724 goto bad;
1725
1726 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1727 return (1);
1728
1729 bad:
1730 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1731
1732 return (0);
1733 }
1734
1735 /*
1736 * The P_LNOATTACH check is an Apple-specific check.
1737 * We need a version of dtrace_priv_proc() that omits
1738 * that check for PID and EXECNAME accesses.
1739 */
1740 static int
1741 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1742 {
1743
1744 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1745 return (1);
1746
1747 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1748
1749 return (0);
1750 }
1751
1752 static int
1753 dtrace_priv_kernel(dtrace_state_t *state)
1754 {
1755 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1756 goto bad;
1757
1758 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1759 return (1);
1760
1761 bad:
1762 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1763
1764 return (0);
1765 }
1766
1767 static int
1768 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1769 {
1770 if (dtrace_is_restricted())
1771 goto bad;
1772
1773 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1774 return (1);
1775
1776 bad:
1777 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1778
1779 return (0);
1780 }
1781
1782 /*
1783 * Note: not called from probe context. This function is called
1784 * asynchronously (and at a regular interval) from outside of probe context to
1785 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1786 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1787 */
1788 static void
1789 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1790 {
1791 dtrace_dynvar_t *dirty;
1792 dtrace_dstate_percpu_t *dcpu;
1793 int i, work = 0;
1794
1795 for (i = 0; i < (int)NCPU; i++) {
1796 dcpu = &dstate->dtds_percpu[i];
1797
1798 ASSERT(dcpu->dtdsc_rinsing == NULL);
1799
1800 /*
1801 * If the dirty list is NULL, there is no dirty work to do.
1802 */
1803 if (dcpu->dtdsc_dirty == NULL)
1804 continue;
1805
1806 /*
1807 * If the clean list is non-NULL, then we're not going to do
1808 * any work for this CPU -- it means that there has not been
1809 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1810 * since the last time we cleaned house.
1811 */
1812 if (dcpu->dtdsc_clean != NULL)
1813 continue;
1814
1815 work = 1;
1816
1817 /*
1818 * Atomically move the dirty list aside.
1819 */
1820 do {
1821 dirty = dcpu->dtdsc_dirty;
1822
1823 /*
1824 * Before we zap the dirty list, set the rinsing list.
1825 * (This allows for a potential assertion in
1826 * dtrace_dynvar(): if a free dynamic variable appears
1827 * on a hash chain, either the dirty list or the
1828 * rinsing list for some CPU must be non-NULL.)
1829 */
1830 dcpu->dtdsc_rinsing = dirty;
1831 dtrace_membar_producer();
1832 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1833 dirty, NULL) != dirty);
1834 }
1835
1836 if (!work) {
1837 /*
1838 * We have no work to do; we can simply return.
1839 */
1840 return;
1841 }
1842
1843 dtrace_sync();
1844
1845 for (i = 0; i < (int)NCPU; i++) {
1846 dcpu = &dstate->dtds_percpu[i];
1847
1848 if (dcpu->dtdsc_rinsing == NULL)
1849 continue;
1850
1851 /*
1852 * We are now guaranteed that no hash chain contains a pointer
1853 * into this dirty list; we can make it clean.
1854 */
1855 ASSERT(dcpu->dtdsc_clean == NULL);
1856 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1857 dcpu->dtdsc_rinsing = NULL;
1858 }
1859
1860 /*
1861 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1862 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1863 * This prevents a race whereby a CPU incorrectly decides that
1864 * the state should be something other than DTRACE_DSTATE_CLEAN
1865 * after dtrace_dynvar_clean() has completed.
1866 */
1867 dtrace_sync();
1868
1869 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1870 }
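/*
 * A sketch of the per-CPU dynamic variable list lifecycle that the
 * routine above advances (see <sys/dtrace_impl.h> for the full
 * discussion):
 *
 *	free --(dtrace_dynvar() allocation)--> live, on a hash chain
 *	live --(deallocation)--> dirty
 *	dirty --(dtrace_dynvar_clean(), first pass)--> rinsing
 *	rinsing --(dtrace_sync(); second pass)--> clean
 *	clean --(allocator CAS)--> free
 *
 * The dtrace_sync() between the two passes guarantees that no probe in
 * flight still holds a pointer into a list being promoted from rinsing
 * to clean.
 */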
1871
1872 /*
1873 * Depending on the value of the op parameter, this function looks-up,
1874 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1875 * allocation is requested, this function will return a pointer to a
1876 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1877 * variable can be allocated. If NULL is returned, the appropriate counter
1878 * will be incremented.
1879 */
1880 static dtrace_dynvar_t *
1881 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1882 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1883 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1884 {
1885 uint64_t hashval = DTRACE_DYNHASH_VALID;
1886 dtrace_dynhash_t *hash = dstate->dtds_hash;
1887 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1888 processorid_t me = CPU->cpu_id, cpu = me;
1889 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1890 size_t bucket, ksize;
1891 size_t chunksize = dstate->dtds_chunksize;
1892 uintptr_t kdata, lock, nstate;
1893 uint_t i;
1894
1895 ASSERT(nkeys != 0);
1896
1897 /*
1898 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1899 * algorithm. For the by-value portions, we perform the algorithm in
1900 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1901 * bit, and seems to have only a minute effect on distribution. For
1902 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1903 * over each referenced byte. It's painful to do this, but it's much
1904 * better than pathological hash distribution. The efficacy of the
1905 * hashing algorithm (and a comparison with other algorithms) may be
1906 * found by running the ::dtrace_dynstat MDB dcmd.
1907 */
1908 for (i = 0; i < nkeys; i++) {
1909 if (key[i].dttk_size == 0) {
1910 uint64_t val = key[i].dttk_value;
1911
1912 hashval += (val >> 48) & 0xffff;
1913 hashval += (hashval << 10);
1914 hashval ^= (hashval >> 6);
1915
1916 hashval += (val >> 32) & 0xffff;
1917 hashval += (hashval << 10);
1918 hashval ^= (hashval >> 6);
1919
1920 hashval += (val >> 16) & 0xffff;
1921 hashval += (hashval << 10);
1922 hashval ^= (hashval >> 6);
1923
1924 hashval += val & 0xffff;
1925 hashval += (hashval << 10);
1926 hashval ^= (hashval >> 6);
1927 } else {
1928 /*
1929 * This is incredibly painful, but it beats the hell
1930 * out of the alternative.
1931 */
1932 uint64_t j, size = key[i].dttk_size;
1933 uintptr_t base = (uintptr_t)key[i].dttk_value;
1934
1935 if (!dtrace_canload(base, size, mstate, vstate))
1936 break;
1937
1938 for (j = 0; j < size; j++) {
1939 hashval += dtrace_load8(base + j);
1940 hashval += (hashval << 10);
1941 hashval ^= (hashval >> 6);
1942 }
1943 }
1944 }
1945
1946 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1947 return (NULL);
1948
1949 hashval += (hashval << 3);
1950 hashval ^= (hashval >> 11);
1951 hashval += (hashval << 15);
1952
1953 /*
1954 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1955 * comes out to be one of our two sentinel hash values. If this
1956 * actually happens, we set the hashval to be a value known to be a
1957 * non-sentinel value.
1958 */
1959 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1960 hashval = DTRACE_DYNHASH_VALID;
1961
1962 /*
1963 * Yes, it's painful to do a divide here. If the cycle count becomes
1964 * important here, tricks can be pulled to reduce it. (However, it's
1965 * critical that hash collisions be kept to an absolute minimum;
1966 * they're much more painful than a divide.) It's better to have a
1967 * solution that generates few collisions and still keeps things
1968 * relatively simple.
1969 */
1970 bucket = hashval % dstate->dtds_hashsize;
1971
1972 if (op == DTRACE_DYNVAR_DEALLOC) {
1973 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1974
1975 for (;;) {
1976 while ((lock = *lockp) & 1)
1977 continue;
1978
1979 if (dtrace_casptr((void *)(uintptr_t)lockp,
1980 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1981 break;
1982 }
1983
1984 dtrace_membar_producer();
1985 }
1986
1987 top:
1988 prev = NULL;
1989 lock = hash[bucket].dtdh_lock;
1990
1991 dtrace_membar_consumer();
1992
1993 start = hash[bucket].dtdh_chain;
1994 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1995 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1996 op != DTRACE_DYNVAR_DEALLOC));
1997
1998 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1999 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
2000 dtrace_key_t *dkey = &dtuple->dtt_key[0];
2001
2002 if (dvar->dtdv_hashval != hashval) {
2003 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
2004 /*
2005 * We've reached the sink, and therefore the
2006 * end of the hash chain; we can kick out of
2007 * the loop knowing that we have seen a valid
2008 * snapshot of state.
2009 */
2010 ASSERT(dvar->dtdv_next == NULL);
2011 ASSERT(dvar == &dtrace_dynhash_sink);
2012 break;
2013 }
2014
2015 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2016 /*
2017 * We've gone off the rails: somewhere along
2018 * the line, one of the members of this hash
2019 * chain was deleted. Note that we could also
2020 * detect this by simply letting this loop run
2021 * to completion, as we would eventually hit
2022 * the end of the dirty list. However, we
2023 * want to avoid running the length of the
2024 * dirty list unnecessarily (it might be quite
2025 * long), so we catch this as early as
2026 * possible by detecting the hash marker. In
2027 * this case, we simply set dvar to NULL and
2028 * break; the conditional after the loop will
2029 * send us back to top.
2030 */
2031 dvar = NULL;
2032 break;
2033 }
2034
2035 goto next;
2036 }
2037
2038 if (dtuple->dtt_nkeys != nkeys)
2039 goto next;
2040
2041 for (i = 0; i < nkeys; i++, dkey++) {
2042 if (dkey->dttk_size != key[i].dttk_size)
2043 goto next; /* size or type mismatch */
2044
2045 if (dkey->dttk_size != 0) {
2046 if (dtrace_bcmp(
2047 (void *)(uintptr_t)key[i].dttk_value,
2048 (void *)(uintptr_t)dkey->dttk_value,
2049 dkey->dttk_size))
2050 goto next;
2051 } else {
2052 if (dkey->dttk_value != key[i].dttk_value)
2053 goto next;
2054 }
2055 }
2056
2057 if (op != DTRACE_DYNVAR_DEALLOC)
2058 return (dvar);
2059
2060 ASSERT(dvar->dtdv_next == NULL ||
2061 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2062
2063 if (prev != NULL) {
2064 ASSERT(hash[bucket].dtdh_chain != dvar);
2065 ASSERT(start != dvar);
2066 ASSERT(prev->dtdv_next == dvar);
2067 prev->dtdv_next = dvar->dtdv_next;
2068 } else {
2069 if (dtrace_casptr(&hash[bucket].dtdh_chain,
2070 start, dvar->dtdv_next) != start) {
2071 /*
2072 * We have failed to atomically swing the
2073 * hash table head pointer, presumably because
2074 * of a conflicting allocation on another CPU.
2075 * We need to reread the hash chain and try
2076 * again.
2077 */
2078 goto top;
2079 }
2080 }
2081
2082 dtrace_membar_producer();
2083
2084 /*
2085 * Now set the hash value to indicate that it's free.
2086 */
2087 ASSERT(hash[bucket].dtdh_chain != dvar);
2088 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2089
2090 dtrace_membar_producer();
2091
2092 /*
2093 * Set the next pointer to point at the dirty list, and
2094 * atomically swing the dirty pointer to the newly freed dvar.
2095 */
2096 do {
2097 next = dcpu->dtdsc_dirty;
2098 dvar->dtdv_next = next;
2099 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2100
2101 /*
2102 * Finally, unlock this hash bucket.
2103 */
2104 ASSERT(hash[bucket].dtdh_lock == lock);
2105 ASSERT(lock & 1);
2106 hash[bucket].dtdh_lock++;
2107
2108 return (NULL);
2109 next:
2110 prev = dvar;
2111 continue;
2112 }
2113
2114 if (dvar == NULL) {
2115 /*
2116 * If dvar is NULL, it is because we went off the rails:
2117 * one of the elements that we traversed in the hash chain
2118 * was deleted while we were traversing it. In this case,
2119 * we assert that we aren't doing a dealloc (deallocs lock
2120 * the hash bucket to prevent themselves from racing with
2121 * one another), and retry the hash chain traversal.
2122 */
2123 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2124 goto top;
2125 }
2126
2127 if (op != DTRACE_DYNVAR_ALLOC) {
2128 /*
2129 * If we are not to allocate a new variable, we want to
2130 * return NULL now. Before we return, check that the value
2131 * of the lock word hasn't changed. If it has, we may have
2132 * seen an inconsistent snapshot.
2133 */
2134 if (op == DTRACE_DYNVAR_NOALLOC) {
2135 if (hash[bucket].dtdh_lock != lock)
2136 goto top;
2137 } else {
2138 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2139 ASSERT(hash[bucket].dtdh_lock == lock);
2140 ASSERT(lock & 1);
2141 hash[bucket].dtdh_lock++;
2142 }
2143
2144 return (NULL);
2145 }
2146
2147 /*
2148 * We need to allocate a new dynamic variable. The size we need is the
2149 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2150 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2151 * the size of any referred-to data (dsize). We then round the final
2152 * size up to the chunksize for allocation.
2153 */
2154 for (ksize = 0, i = 0; i < nkeys; i++)
2155 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2156
2157 /*
2158 * This should be pretty much impossible, but could happen if, say,
2159 * strange DIF specified the tuple. Ideally, this should be an
2160 * assertion and not an error condition -- but that requires that the
2161 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2162 * bullet-proof. (That is, it must not be able to be fooled by
2163 * malicious DIF.) Given the lack of backwards branches in DIF,
2164 * solving this would presumably not amount to solving the Halting
2165 * Problem -- but it still seems awfully hard.
2166 */
2167 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2168 ksize + dsize > chunksize) {
2169 dcpu->dtdsc_drops++;
2170 return (NULL);
2171 }
2172
2173 nstate = DTRACE_DSTATE_EMPTY;
2174
2175 do {
2176 retry:
2177 free = dcpu->dtdsc_free;
2178
2179 if (free == NULL) {
2180 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2181 void *rval;
2182
2183 if (clean == NULL) {
2184 /*
2185 * We're out of dynamic variable space on
2186 * this CPU. Unless we have tried all CPUs,
2187 * we'll try to allocate from a different
2188 * CPU.
2189 */
2190 switch (dstate->dtds_state) {
2191 case DTRACE_DSTATE_CLEAN: {
2192 void *sp = &dstate->dtds_state;
2193
2194 if (++cpu >= (int)NCPU)
2195 cpu = 0;
2196
2197 if (dcpu->dtdsc_dirty != NULL &&
2198 nstate == DTRACE_DSTATE_EMPTY)
2199 nstate = DTRACE_DSTATE_DIRTY;
2200
2201 if (dcpu->dtdsc_rinsing != NULL)
2202 nstate = DTRACE_DSTATE_RINSING;
2203
2204 dcpu = &dstate->dtds_percpu[cpu];
2205
2206 if (cpu != me)
2207 goto retry;
2208
2209 (void) dtrace_cas32(sp,
2210 DTRACE_DSTATE_CLEAN, nstate);
2211
2212 /*
2213 * To increment the correct bean
2214 * counter, take another lap.
2215 */
2216 goto retry;
2217 }
2218
2219 case DTRACE_DSTATE_DIRTY:
2220 dcpu->dtdsc_dirty_drops++;
2221 break;
2222
2223 case DTRACE_DSTATE_RINSING:
2224 dcpu->dtdsc_rinsing_drops++;
2225 break;
2226
2227 case DTRACE_DSTATE_EMPTY:
2228 dcpu->dtdsc_drops++;
2229 break;
2230 }
2231
2232 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2233 return (NULL);
2234 }
2235
2236 /*
2237 * The clean list appears to be non-empty. We want to
2238 * move the clean list to the free list; we start by
2239 * moving the clean pointer aside.
2240 */
2241 if (dtrace_casptr(&dcpu->dtdsc_clean,
2242 clean, NULL) != clean) {
2243 /*
2244 * We are in one of two situations:
2245 *
2246 * (a) The clean list was switched to the
2247 * free list by another CPU.
2248 *
2249 * (b) The clean list was added to by the
2250 * cleansing cyclic.
2251 *
2252 * In either of these situations, we can
2253 * just reattempt the free list allocation.
2254 */
2255 goto retry;
2256 }
2257
2258 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2259
2260 /*
2261 * Now we'll move the clean list to the free list.
2262 * It's impossible for this to fail: the only way
2263 * the free list can be updated is through this
2264 * code path, and only one CPU can own the clean list.
2265 * Thus, it would only be possible for this to fail if
2266 * this code were racing with dtrace_dynvar_clean().
2267 * (That is, if dtrace_dynvar_clean() updated the clean
2268 * list, and we ended up racing to update the free
2269 * list.) This race is prevented by the dtrace_sync()
2270 * in dtrace_dynvar_clean() -- which flushes the
2271 * owners of the clean lists out before resetting
2272 * the clean lists.
2273 */
2274 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2275 ASSERT(rval == NULL);
2276 goto retry;
2277 }
2278
2279 dvar = free;
2280 new_free = dvar->dtdv_next;
2281 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2282
2283 /*
2284 * We have now allocated a new chunk. We copy the tuple keys into the
2285 * tuple array and copy any referenced key data into the data space
2286 * following the tuple array. As we do this, we relocate dttk_value
2287 * in the final tuple to point to the key data address in the chunk.
2288 */
2289 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2290 dvar->dtdv_data = (void *)(kdata + ksize);
2291 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2292
2293 for (i = 0; i < nkeys; i++) {
2294 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2295 size_t kesize = key[i].dttk_size;
2296
2297 if (kesize != 0) {
2298 dtrace_bcopy(
2299 (const void *)(uintptr_t)key[i].dttk_value,
2300 (void *)kdata, kesize);
2301 dkey->dttk_value = kdata;
2302 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2303 } else {
2304 dkey->dttk_value = key[i].dttk_value;
2305 }
2306
2307 dkey->dttk_size = kesize;
2308 }
2309
2310 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2311 dvar->dtdv_hashval = hashval;
2312 dvar->dtdv_next = start;
2313
2314 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2315 return (dvar);
2316
2317 /*
2318 * The cas has failed. Either another CPU is adding an element to
2319 * this hash chain, or another CPU is deleting an element from this
2320 * hash chain. The simplest way to deal with both of these cases
2321 * (though not necessarily the most efficient) is to free our
2322 * allocated block and tail-call ourselves. Note that the free is
2323 * to the dirty list and _not_ to the free list. This is to prevent
2324 * races with allocators, above.
2325 */
2326 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2327
2328 dtrace_membar_producer();
2329
2330 do {
2331 free = dcpu->dtdsc_dirty;
2332 dvar->dtdv_next = free;
2333 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2334
2335 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2336 }
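/*
 * The key hashing above is Jenkins' "One-at-a-time" hash, processing
 * by-value keys in 16-bit chunks and by-reference keys byte-by-byte.  A
 * byte-wise userspace reference of the same algorithm, for comparison
 * (illustrative only; not compiled here):
 */
#if 0
#include <stddef.h>
#include <stdint.h>

static uint64_t
jenkins_one_at_a_time(const uint8_t *data, size_t len)
{
	uint64_t hashval = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		hashval += data[i];
		hashval += (hashval << 10);
		hashval ^= (hashval >> 6);
	}

	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	return (hashval);
}
#endif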
2337
2338 /*ARGSUSED*/
2339 static void
2340 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2341 {
2342 #pragma unused(arg) /* __APPLE__ */
2343 if ((int64_t)nval < (int64_t)*oval)
2344 *oval = nval;
2345 }
2346
2347 /*ARGSUSED*/
2348 static void
2349 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2350 {
2351 #pragma unused(arg) /* __APPLE__ */
2352 if ((int64_t)nval > (int64_t)*oval)
2353 *oval = nval;
2354 }
2355
2356 static void
2357 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2358 {
2359 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2360 int64_t val = (int64_t)nval;
2361
2362 if (val < 0) {
2363 for (i = 0; i < zero; i++) {
2364 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2365 quanta[i] += incr;
2366 return;
2367 }
2368 }
2369 } else {
2370 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2371 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2372 quanta[i - 1] += incr;
2373 return;
2374 }
2375 }
2376
2377 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2378 return;
2379 }
2380
2381 ASSERT(0);
2382 }
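/*
 * An illustrative walk through the bucketing above: for val = 6, the
 * loop stops at the first bucket whose DTRACE_QUANTIZE_BUCKETVAL()
 * exceeds 6 (namely 8) and credits the preceding bucket, whose value is
 * 4 -- i.e. the largest power-of-two bucket value that is <= val.
 * Negative values walk the buckets below DTRACE_QUANTIZE_ZEROBUCKET
 * analogously.
 */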
2383
2384 static void
2385 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2386 {
2387 uint64_t arg = *lquanta++;
2388 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2389 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2390 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2391 int32_t val = (int32_t)nval, level;
2392
2393 ASSERT(step != 0);
2394 ASSERT(levels != 0);
2395
2396 if (val < base) {
2397 /*
2398 * This is an underflow.
2399 */
2400 lquanta[0] += incr;
2401 return;
2402 }
2403
2404 level = (val - base) / step;
2405
2406 if (level < levels) {
2407 lquanta[level + 1] += incr;
2408 return;
2409 }
2410
2411 /*
2412 * This is an overflow.
2413 */
2414 lquanta[levels + 1] += incr;
2415 }
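/*
 * An illustrative example: for an lquantize() encoded with base = 0,
 * step = 10 and levels = 10, a value of 42 yields
 * level = (42 - 0) / 10 = 4 and increments lquanta[5], the slot for the
 * [40, 50) range.  Values below base land in lquanta[0] (underflow), and
 * values at or beyond base + levels * step land in lquanta[levels + 1]
 * (overflow).
 */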
2416
2417 static int
2418 dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
2419 int16_t nsteps, int64_t value)
2420 {
2421 int64_t this = 1, last, next;
2422 int base = 1, order;
2423
2424 for (order = 0; order < low; ++order)
2425 this *= factor;
2426
2427 /*
2428 * If our value is less than our factor taken to the power of the
2429 * low order of magnitude, it goes into the zeroth bucket.
2430 */
2431 if (value < this)
2432 return 0;
2433 else
2434 last = this;
2435
2436 for (this *= factor; order <= high; ++order) {
2437 int nbuckets = this > nsteps ? nsteps : this;
2438
2439 /*
2440 * We should not generally get log/linear quantizations
2441 * with a high magnitude that allows 64-bits to
2442 * overflow, but we nonetheless protect against this
2443 * by explicitly checking for overflow, and clamping
2444 * our value accordingly.
2445 */
2446 next = this * factor;
2447 if (next < this) {
2448 value = this - 1;
2449 }
2450
2451 /*
2452 * If our value lies within this order of magnitude,
2453 * determine its position by taking the offset within
2454 * the order of magnitude, dividing by the bucket
2455 * width, and adding to our (accumulated) base.
2456 */
2457 if (value < this) {
2458 return (base + (value - last) / (this / nbuckets));
2459 }
2460
2461 base += nbuckets - (nbuckets / factor);
2462 last = this;
2463 this = next;
2464 }
2465
2466 /*
2467 * Our value is greater than or equal to our factor taken to the
2468 * power of one plus the high magnitude -- return the top bucket.
2469 */
2470 return base;
2471 }
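/*
 * An illustrative example: with factor = 10, low = 0, high = 2 and
 * nsteps = 10, a value of 42 falls within the order of magnitude
 * [10, 100), whose bucket width is 100 / 10 = 10; the function returns
 * the accumulated base of 10 plus (42 - 10) / 10 = 3, i.e. bucket 13.
 * Values below 10^0 = 1 map to bucket 0, and values >= 10^3 = 1000 map
 * to the top bucket (28 in this configuration).
 */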
2472
2473 static void
2474 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2475 {
2476 uint64_t arg = *llquanta++;
2477 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2478 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2479 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2480 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2481
2482 llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
2483 }
2484
2485 /*ARGSUSED*/
2486 static void
2487 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2488 {
2489 #pragma unused(arg) /* __APPLE__ */
2490 data[0]++;
2491 data[1] += nval;
2492 }
2493
2494 /*ARGSUSED*/
2495 static void
2496 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2497 {
2498 #pragma unused(arg) /* __APPLE__ */
2499 int64_t snval = (int64_t)nval;
2500 uint64_t tmp[2];
2501
2502 data[0]++;
2503 data[1] += nval;
2504
2505 /*
2506 * What we want to say here is:
2507 *
2508 * data[2] += nval * nval;
2509 *
2510 * But given that nval is 64-bit, we could easily overflow, so
2511 * we do this as 128-bit arithmetic.
2512 */
2513 if (snval < 0)
2514 snval = -snval;
2515
2516 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2517 dtrace_add_128(data + 2, tmp, data + 2);
2518 }
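/*
 * A consumer can recover the standard deviation from these running
 * totals as
 *
 *	sqrt(data[2..3] / data[0] - (data[1] / data[0])^2)
 *
 * i.e. sqrt(E[x^2] - E[x]^2), where data[2..3] denotes the 128-bit sum
 * of squares maintained above.
 */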
2519
2520 /*ARGSUSED*/
2521 static void
2522 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2523 {
2524 #pragma unused(nval, arg) /* __APPLE__ */
2525 *oval = *oval + 1;
2526 }
2527
2528 /*ARGSUSED*/
2529 static void
2530 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2531 {
2532 #pragma unused(arg) /* __APPLE__ */
2533 *oval += nval;
2534 }
2535
2536 /*
2537 * Aggregate given the tuple in the principal data buffer, and the aggregating
2538 * action denoted by the specified dtrace_aggregation_t. The aggregation
2539 * buffer is specified as the buf parameter. This routine does not return
2540 * failure; if there is no space in the aggregation buffer, the data will be
2541 * dropped, and a corresponding counter incremented.
2542 */
2543 static void
2544 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2545 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2546 {
2547 #pragma unused(arg)
2548 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2549 uint32_t i, ndx, size, fsize;
2550 uint32_t align = sizeof (uint64_t) - 1;
2551 dtrace_aggbuffer_t *agb;
2552 dtrace_aggkey_t *key;
2553 uint32_t hashval = 0, limit, isstr;
2554 caddr_t tomax, data, kdata;
2555 dtrace_actkind_t action;
2556 dtrace_action_t *act;
2557 uintptr_t offs;
2558
2559 if (buf == NULL)
2560 return;
2561
2562 if (!agg->dtag_hasarg) {
2563 /*
2564 * Currently, only quantize(), lquantize() and llquantize() take
2565 * additional arguments, and they have the same semantics: an increment
2566 * value that defaults to 1 when not present. If additional
2567 * aggregating actions take arguments, the setting of the
2568 * default argument value will presumably have to become more
2569 * sophisticated...
2570 */
2571 arg = 1;
2572 }
2573
2574 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2575 size = rec->dtrd_offset - agg->dtag_base;
2576 fsize = size + rec->dtrd_size;
2577
2578 ASSERT(dbuf->dtb_tomax != NULL);
2579 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2580
2581 if ((tomax = buf->dtb_tomax) == NULL) {
2582 dtrace_buffer_drop(buf);
2583 return;
2584 }
2585
2586 /*
2587 * The metastructure is always at the bottom of the buffer.
2588 */
2589 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2590 sizeof (dtrace_aggbuffer_t));
2591
2592 if (buf->dtb_offset == 0) {
2593 /*
2594 * We just kludge up approximately 1/8th of the size to be
2595 * buckets. If this guess ends up being routinely
2596 * off-the-mark, we may need to dynamically readjust this
2597 * based on past performance.
2598 */
2599 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2600
2601 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2602 (uintptr_t)tomax || hashsize == 0) {
2603 /*
2604 * We've been given a ludicrously small buffer;
2605 * increment our drop count and leave.
2606 */
2607 dtrace_buffer_drop(buf);
2608 return;
2609 }
2610
2611 /*
2612 * And now, a pathetic attempt to try to get an odd (or
2613 * perchance, a prime) hash size for better hash distribution.
2614 */
2615 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2616 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2617
2618 agb->dtagb_hashsize = hashsize;
2619 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2620 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2621 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2622
2623 for (i = 0; i < agb->dtagb_hashsize; i++)
2624 agb->dtagb_hash[i] = NULL;
2625 }
2626
2627 ASSERT(agg->dtag_first != NULL);
2628 ASSERT(agg->dtag_first->dta_intuple);
2629
2630 /*
2631 * Calculate the hash value based on the key. Note that we _don't_
2632 * include the aggid in the hashing (but we will store it as part of
2633 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2634 * algorithm: a simple, quick algorithm that has no known funnels, and
2635 * gets good distribution in practice. The efficacy of the hashing
2636 * algorithm (and a comparison with other algorithms) may be found by
2637 * running the ::dtrace_aggstat MDB dcmd.
2638 */
2639 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2640 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2641 limit = i + act->dta_rec.dtrd_size;
2642 ASSERT(limit <= size);
2643 isstr = DTRACEACT_ISSTRING(act);
2644
2645 for (; i < limit; i++) {
2646 hashval += data[i];
2647 hashval += (hashval << 10);
2648 hashval ^= (hashval >> 6);
2649
2650 if (isstr && data[i] == '\0')
2651 break;
2652 }
2653 }
2654
2655 hashval += (hashval << 3);
2656 hashval ^= (hashval >> 11);
2657 hashval += (hashval << 15);
2658
2659 /*
2660 * Yes, the divide here is expensive -- but it's generally the least
2661 * of the performance issues given the amount of data that we iterate
2662 * over to compute hash values, compare data, etc.
2663 */
2664 ndx = hashval % agb->dtagb_hashsize;
2665
2666 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2667 ASSERT((caddr_t)key >= tomax);
2668 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2669
2670 if (hashval != key->dtak_hashval || key->dtak_size != size)
2671 continue;
2672
2673 kdata = key->dtak_data;
2674 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2675
2676 for (act = agg->dtag_first; act->dta_intuple;
2677 act = act->dta_next) {
2678 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2679 limit = i + act->dta_rec.dtrd_size;
2680 ASSERT(limit <= size);
2681 isstr = DTRACEACT_ISSTRING(act);
2682
2683 for (; i < limit; i++) {
2684 if (kdata[i] != data[i])
2685 goto next;
2686
2687 if (isstr && data[i] == '\0')
2688 break;
2689 }
2690 }
2691
2692 if (action != key->dtak_action) {
2693 /*
2694 * We are aggregating on the same value in the same
2695 * aggregation with two different aggregating actions.
2696 * (This should have been picked up in the compiler,
2697 * so we may be dealing with errant or devious DIF.)
2698 * This is an error condition; we indicate as much,
2699 * and return.
2700 */
2701 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2702 return;
2703 }
2704
2705 /*
2706 * This is a hit: we need to apply the aggregator to
2707 * the value at this key.
2708 */
2709 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2710 return;
2711 next:
2712 continue;
2713 }
2714
2715 /*
2716 * We didn't find it. We need to allocate some zero-filled space,
2717 * link it into the hash table appropriately, and apply the aggregator
2718 * to the (zero-filled) value.
2719 */
2720 offs = buf->dtb_offset;
2721 while (offs & (align - 1))
2722 offs += sizeof (uint32_t);
2723
2724 /*
2725 * If we don't have enough room to both allocate a new key _and_
2726 * its associated data, increment the drop count and return.
2727 */
2728 if ((uintptr_t)tomax + offs + fsize >
2729 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2730 dtrace_buffer_drop(buf);
2731 return;
2732 }
2733
2734 /*CONSTCOND*/
2735 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2736 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2737 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2738
2739 key->dtak_data = kdata = tomax + offs;
2740 buf->dtb_offset = offs + fsize;
2741
2742 /*
2743 * Now copy the data across.
2744 */
2745 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2746
2747 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2748 kdata[i] = data[i];
2749
2750 /*
2751 * Because strings are not zeroed out by default, we need to iterate
2752 * looking for actions that store strings, and we need to explicitly
2753 * pad these strings out with zeroes.
2754 */
2755 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2756 int nul;
2757
2758 if (!DTRACEACT_ISSTRING(act))
2759 continue;
2760
2761 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2762 limit = i + act->dta_rec.dtrd_size;
2763 ASSERT(limit <= size);
2764
2765 for (nul = 0; i < limit; i++) {
2766 if (nul) {
2767 kdata[i] = '\0';
2768 continue;
2769 }
2770
2771 if (data[i] != '\0')
2772 continue;
2773
2774 nul = 1;
2775 }
2776 }
2777
2778 for (i = size; i < fsize; i++)
2779 kdata[i] = 0;
2780
2781 key->dtak_hashval = hashval;
2782 key->dtak_size = size;
2783 key->dtak_action = action;
2784 key->dtak_next = agb->dtagb_hash[ndx];
2785 agb->dtagb_hash[ndx] = key;
2786
2787 /*
2788 * Finally, apply the aggregator.
2789 */
2790 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2791 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2792 }
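/*
 * A sketch of the aggregation buffer layout manipulated above, with
 * addresses increasing to the right:
 *
 *	tomax                                            tomax + dtb_size
 *	| key data (grows up) ->   ...   <- aggkeys | hash table | agb |
 *	^ dtb_offset                    ^ dtagb_free
 *
 * Key data (the aggid, the tuple and the aggregated value) is appended
 * upward from dtb_offset, while dtrace_aggkey_t structures are carved
 * downward from dtagb_free; the record is dropped when the two would
 * collide.
 */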
2793
2794 /*
2795 * Given consumer state, this routine finds a speculation in the INACTIVE
2796 * state and transitions it into the ACTIVE state. If there is no speculation
2797 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2798 * incremented -- it is up to the caller to take appropriate action.
2799 */
2800 static int
2801 dtrace_speculation(dtrace_state_t *state)
2802 {
2803 int i = 0;
2804 dtrace_speculation_state_t current;
2805 uint32_t *stat = &state->dts_speculations_unavail, count;
2806
2807 while (i < state->dts_nspeculations) {
2808 dtrace_speculation_t *spec = &state->dts_speculations[i];
2809
2810 current = spec->dtsp_state;
2811
2812 if (current != DTRACESPEC_INACTIVE) {
2813 if (current == DTRACESPEC_COMMITTINGMANY ||
2814 current == DTRACESPEC_COMMITTING ||
2815 current == DTRACESPEC_DISCARDING)
2816 stat = &state->dts_speculations_busy;
2817 i++;
2818 continue;
2819 }
2820
2821 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2822 current, DTRACESPEC_ACTIVE) == current)
2823 return (i + 1);
2824 }
2825
2826 /*
2827 * We couldn't find a speculation. If we found as much as a single
2828 * busy speculation buffer, we'll attribute this failure as "busy"
2829 * instead of "unavail".
2830 */
2831 do {
2832 count = *stat;
2833 } while (dtrace_cas32(stat, count, count + 1) != count);
2834
2835 return (0);
2836 }
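/*
 * A simplified summary of the speculation state transitions effected by
 * dtrace_speculation() above and by the commit/discard/buffer routines
 * below (the authoritative diagram is in <sys/dtrace_impl.h>):
 *
 *	INACTIVE --(speculation())--> ACTIVE
 *	ACTIVE --(speculate())--> ACTIVEONE --(second CPU)--> ACTIVEMANY
 *	ACTIVEONE --(commit() on the speculating CPU)--> COMMITTING
 *	ACTIVEONE/ACTIVEMANY --(commit() elsewhere)--> COMMITTINGMANY
 *	ACTIVEONE/ACTIVEMANY --(discard())--> DISCARDING
 *	COMMITTING --> INACTIVE; COMMITTINGMANY/DISCARDING --> INACTIVE
 *	    once every CPU has been cleaned
 *
 * Every transition is performed with dtrace_cas32() so that racing CPUs
 * observe a consistent state.
 */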
2837
2838 /*
2839 * This routine commits an active speculation. If the specified speculation
2840 * is not in a valid state to perform a commit(), this routine will silently do
2841 * nothing. The state of the specified speculation is transitioned according
2842 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2843 */
2844 static void
2845 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2846 dtrace_specid_t which)
2847 {
2848 dtrace_speculation_t *spec;
2849 dtrace_buffer_t *src, *dest;
2850 uintptr_t daddr, saddr, dlimit, slimit;
2851 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2852 intptr_t offs;
2853 uint64_t timestamp;
2854
2855 if (which == 0)
2856 return;
2857
2858 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2859 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2860 return;
2861 }
2862
2863 spec = &state->dts_speculations[which - 1];
2864 src = &spec->dtsp_buffer[cpu];
2865 dest = &state->dts_buffer[cpu];
2866
2867 do {
2868 current = spec->dtsp_state;
2869
2870 if (current == DTRACESPEC_COMMITTINGMANY)
2871 break;
2872
2873 switch (current) {
2874 case DTRACESPEC_INACTIVE:
2875 case DTRACESPEC_DISCARDING:
2876 return;
2877
2878 case DTRACESPEC_COMMITTING:
2879 /*
2880 * This is only possible if we are (a) commit()'ing
2881 * without having done a prior speculate() on this CPU
2882 * and (b) racing with another commit() on a different
2883 * CPU. There's nothing to do -- we just assert that
2884 * our offset is 0.
2885 */
2886 ASSERT(src->dtb_offset == 0);
2887 return;
2888
2889 case DTRACESPEC_ACTIVE:
2890 new = DTRACESPEC_COMMITTING;
2891 break;
2892
2893 case DTRACESPEC_ACTIVEONE:
2894 /*
2895 * This speculation is active on one CPU. If our
2896 * buffer offset is non-zero, we know that the one CPU
2897 * must be us. Otherwise, we are committing on a
2898 * different CPU from the speculate(), and we must
2899 * rely on being asynchronously cleaned.
2900 */
2901 if (src->dtb_offset != 0) {
2902 new = DTRACESPEC_COMMITTING;
2903 break;
2904 }
2905 /*FALLTHROUGH*/
2906
2907 case DTRACESPEC_ACTIVEMANY:
2908 new = DTRACESPEC_COMMITTINGMANY;
2909 break;
2910
2911 default:
2912 ASSERT(0);
2913 }
2914 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2915 current, new) != current);
2916
2917 /*
2918 * We have set the state to indicate that we are committing this
2919 * speculation. Now reserve the necessary space in the destination
2920 * buffer.
2921 */
2922 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2923 sizeof (uint64_t), state, NULL)) < 0) {
2924 dtrace_buffer_drop(dest);
2925 goto out;
2926 }
2927
2928 /*
2929 * We have sufficient space to copy the speculative buffer into the
2930 * primary buffer. First, modify the speculative buffer, filling
2931 * in the timestamp of all entries with the current time. The data
2932 * must have the commit() time rather than the time it was traced,
2933 * so that all entries in the primary buffer are in timestamp order.
2934 */
2935 timestamp = dtrace_gethrtime();
2936 saddr = (uintptr_t)src->dtb_tomax;
2937 slimit = saddr + src->dtb_offset;
2938 while (saddr < slimit) {
2939 size_t size;
2940 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2941
2942 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2943 saddr += sizeof (dtrace_epid_t);
2944 continue;
2945 }
2946
2947 ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
2948 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2949
2950 ASSERT(saddr + size <= slimit);
2951 ASSERT(size >= sizeof(dtrace_rechdr_t));
2952 ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
2953
2954 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2955
2956 saddr += size;
2957 }
2958
2959 /*
2960 * Copy the buffer across. (Note that this is a
2961 * highly suboptimal bcopy(); in the unlikely event that this becomes
2962 * a serious performance issue, a high-performance DTrace-specific
2963 * bcopy() should obviously be invented.)
2964 */
2965 daddr = (uintptr_t)dest->dtb_tomax + offs;
2966 dlimit = daddr + src->dtb_offset;
2967 saddr = (uintptr_t)src->dtb_tomax;
2968
2969 /*
2970 * First, the aligned portion.
2971 */
2972 while (dlimit - daddr >= sizeof (uint64_t)) {
2973 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2974
2975 daddr += sizeof (uint64_t);
2976 saddr += sizeof (uint64_t);
2977 }
2978
2979 /*
2980 * Now any left-over bit...
2981 */
2982 while (dlimit - daddr)
2983 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2984
2985 /*
2986 * Finally, commit the reserved space in the destination buffer.
2987 */
2988 dest->dtb_offset = offs + src->dtb_offset;
2989
2990 out:
2991 /*
2992 * If we're lucky enough to be the only active CPU on this speculation
2993 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2994 */
2995 if (current == DTRACESPEC_ACTIVE ||
2996 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2997 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2998 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2999 #pragma unused(rval) /* __APPLE__ */
3000
3001 ASSERT(rval == DTRACESPEC_COMMITTING);
3002 }
3003
3004 src->dtb_offset = 0;
3005 src->dtb_xamot_drops += src->dtb_drops;
3006 src->dtb_drops = 0;
3007 }
3008
3009 /*
3010 * This routine discards an active speculation. If the specified speculation
3011 * is not in a valid state to perform a discard(), this routine will silently
3012 * do nothing. The state of the specified speculation is transitioned
3013 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
3014 */
3015 static void
3016 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3017 dtrace_specid_t which)
3018 {
3019 dtrace_speculation_t *spec;
3020 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3021 dtrace_buffer_t *buf;
3022
3023 if (which == 0)
3024 return;
3025
3026 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3027 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3028 return;
3029 }
3030
3031 spec = &state->dts_speculations[which - 1];
3032 buf = &spec->dtsp_buffer[cpu];
3033
3034 do {
3035 current = spec->dtsp_state;
3036
3037 switch (current) {
3038 case DTRACESPEC_INACTIVE:
3039 case DTRACESPEC_COMMITTINGMANY:
3040 case DTRACESPEC_COMMITTING:
3041 case DTRACESPEC_DISCARDING:
3042 return;
3043
3044 case DTRACESPEC_ACTIVE:
3045 case DTRACESPEC_ACTIVEMANY:
3046 new = DTRACESPEC_DISCARDING;
3047 break;
3048
3049 case DTRACESPEC_ACTIVEONE:
3050 if (buf->dtb_offset != 0) {
3051 new = DTRACESPEC_INACTIVE;
3052 } else {
3053 new = DTRACESPEC_DISCARDING;
3054 }
3055 break;
3056
3057 default:
3058 ASSERT(0);
3059 }
3060 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3061 current, new) != current);
3062
3063 buf->dtb_offset = 0;
3064 buf->dtb_drops = 0;
3065 }
3066
3067 /*
3068 * Note: not called from probe context. This function is called
3069 * asynchronously from cross call context to clean any speculations that are
3070 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3071 * transitioned back to the INACTIVE state until all CPUs have cleaned the
3072 * speculation.
3073 */
3074 static void
3075 dtrace_speculation_clean_here(dtrace_state_t *state)
3076 {
3077 dtrace_icookie_t cookie;
3078 processorid_t cpu = CPU->cpu_id;
3079 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3080 dtrace_specid_t i;
3081
3082 cookie = dtrace_interrupt_disable();
3083
3084 if (dest->dtb_tomax == NULL) {
3085 dtrace_interrupt_enable(cookie);
3086 return;
3087 }
3088
3089 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3090 dtrace_speculation_t *spec = &state->dts_speculations[i];
3091 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3092
3093 if (src->dtb_tomax == NULL)
3094 continue;
3095
3096 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3097 src->dtb_offset = 0;
3098 continue;
3099 }
3100
3101 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3102 continue;
3103
3104 if (src->dtb_offset == 0)
3105 continue;
3106
3107 dtrace_speculation_commit(state, cpu, i + 1);
3108 }
3109
3110 dtrace_interrupt_enable(cookie);
3111 }
3112
3113 /*
3114 * Note: not called from probe context. This function is called
3115 * asynchronously (and at a regular interval) to clean any speculations that
3116 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3117 * is work to be done, it cross calls all CPUs to perform that work;
3118 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
3119 * INACTIVE state until they have been cleaned by all CPUs.
3120 */
3121 static void
3122 dtrace_speculation_clean(dtrace_state_t *state)
3123 {
3124 int work = 0;
3125 uint32_t rv;
3126 dtrace_specid_t i;
3127
3128 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3129 dtrace_speculation_t *spec = &state->dts_speculations[i];
3130
3131 ASSERT(!spec->dtsp_cleaning);
3132
3133 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3134 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3135 continue;
3136
3137 work++;
3138 spec->dtsp_cleaning = 1;
3139 }
3140
3141 if (!work)
3142 return;
3143
3144 dtrace_xcall(DTRACE_CPUALL,
3145 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3146
3147 /*
3148 * We now know that all CPUs have committed or discarded their
3149 * speculation buffers, as appropriate. We can now set the state
3150 * to inactive.
3151 */
3152 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3153 dtrace_speculation_t *spec = &state->dts_speculations[i];
3154 dtrace_speculation_state_t current, new;
3155
3156 if (!spec->dtsp_cleaning)
3157 continue;
3158
3159 current = spec->dtsp_state;
3160 ASSERT(current == DTRACESPEC_DISCARDING ||
3161 current == DTRACESPEC_COMMITTINGMANY);
3162
3163 new = DTRACESPEC_INACTIVE;
3164
3165 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3166 ASSERT(rv == current);
3167 spec->dtsp_cleaning = 0;
3168 }
3169 }
3170
3171 /*
3172 * Called as part of a speculate() to get the speculative buffer associated
3173 * with a given speculation. Returns NULL if the specified speculation is not
3174 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3175 * the active CPU is not the specified CPU -- the speculation will be
3176 * atomically transitioned into the ACTIVEMANY state.
3177 */
3178 static dtrace_buffer_t *
3179 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3180 dtrace_specid_t which)
3181 {
3182 dtrace_speculation_t *spec;
3183 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3184 dtrace_buffer_t *buf;
3185
3186 if (which == 0)
3187 return (NULL);
3188
3189 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3190 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3191 return (NULL);
3192 }
3193
3194 spec = &state->dts_speculations[which - 1];
3195 buf = &spec->dtsp_buffer[cpuid];
3196
3197 do {
3198 current = spec->dtsp_state;
3199
3200 switch (current) {
3201 case DTRACESPEC_INACTIVE:
3202 case DTRACESPEC_COMMITTINGMANY:
3203 case DTRACESPEC_DISCARDING:
3204 return (NULL);
3205
3206 case DTRACESPEC_COMMITTING:
3207 ASSERT(buf->dtb_offset == 0);
3208 return (NULL);
3209
3210 case DTRACESPEC_ACTIVEONE:
3211 /*
3212 * This speculation is currently active on one CPU.
3213 * Check the offset in the buffer; if it's non-zero,
3214 * that CPU must be us (and we leave the state alone).
3215 * If it's zero, assume that we're starting on a new
3216 * CPU -- and change the state to indicate that the
3217 * speculation is active on more than one CPU.
3218 */
3219 if (buf->dtb_offset != 0)
3220 return (buf);
3221
3222 new = DTRACESPEC_ACTIVEMANY;
3223 break;
3224
3225 case DTRACESPEC_ACTIVEMANY:
3226 return (buf);
3227
3228 case DTRACESPEC_ACTIVE:
3229 new = DTRACESPEC_ACTIVEONE;
3230 break;
3231
3232 default:
3233 ASSERT(0);
3234 }
3235 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3236 current, new) != current);
3237
3238 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3239 return (buf);
3240 }
3241
3242 /*
3243 * Return a string. In the event that the user lacks the privilege to access
3244 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3245 * don't fail access checking.
3246 *
3247 * dtrace_dif_variable() uses this routine as a helper for various
3248 * builtin values such as 'execname' and 'probefunc.'
3249 */
3250 static
3251 uintptr_t
3252 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3253 dtrace_mstate_t *mstate)
3254 {
3255 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3256 uintptr_t ret;
3257 size_t strsz;
3258
3259 /*
3260 * The easy case: this probe is allowed to read all of memory, so
3261 * we can just return this as a vanilla pointer.
3262 */
3263 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3264 return (addr);
3265
3266 /*
3267 * This is the tougher case: we copy the string in question from
3268 * kernel memory into scratch memory and return it that way: this
3269 * ensures that we won't trip up when access checking tests the
3270 * BYREF return value.
3271 */
3272 strsz = dtrace_strlen((char *)addr, size) + 1;
3273
3274 if (mstate->dtms_scratch_ptr + strsz >
3275 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3276 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3277 return (0);
3278 }
3279
3280 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3281 strsz);
3282 ret = mstate->dtms_scratch_ptr;
3283 mstate->dtms_scratch_ptr += strsz;
3284 return (ret);
3285 }
3286
3287 /*
3288 * This function implements the DIF emulator's variable lookups. The emulator
3289 * passes a reserved variable identifier and optional built-in array index.
3290 */
3291 static uint64_t
3292 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3293 uint64_t ndx)
3294 {
3295 /*
3296 * If we're accessing one of the uncached arguments, we'll turn this
3297 * into a reference in the args array.
3298 */
3299 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3300 ndx = v - DIF_VAR_ARG0;
3301 v = DIF_VAR_ARGS;
3302 }
3303
3304 switch (v) {
3305 case DIF_VAR_ARGS:
3306 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3307 if (ndx >= sizeof (mstate->dtms_arg) /
3308 sizeof (mstate->dtms_arg[0])) {
3309 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3310 dtrace_vstate_t *vstate = &state->dts_vstate;
3311 dtrace_provider_t *pv;
3312 uint64_t val;
3313
3314 pv = mstate->dtms_probe->dtpr_provider;
3315 if (pv->dtpv_pops.dtps_getargval != NULL)
3316 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3317 mstate->dtms_probe->dtpr_id,
3318 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3319 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3320 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3321 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3322 }
3323
3324 else
3325 val = dtrace_getarg(ndx, aframes, mstate, vstate);
3326
3327 /*
3328 * This is regrettably required to keep the compiler
3329 * from tail-optimizing the call to dtrace_getarg().
3330 * The condition always evaluates to true, but the
3331 * compiler has no way of figuring that out a priori.
3332 * (None of this would be necessary if the compiler
3333 * could be relied upon to _always_ tail-optimize
3334 * the call to dtrace_getarg() -- but it can't.)
3335 */
3336 if (mstate->dtms_probe != NULL)
3337 return (val);
3338
3339 ASSERT(0);
3340 }
3341
3342 return (mstate->dtms_arg[ndx]);
3343
3344 case DIF_VAR_UREGS: {
3345 thread_t thread;
3346
3347 if (!dtrace_priv_proc(state))
3348 return (0);
3349
3350 if ((thread = current_thread()) == NULL) {
3351 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3352 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3353 return (0);
3354 }
3355
3356 return (dtrace_getreg(find_user_regs(thread), ndx));
3357 }
3358
3359
3360 case DIF_VAR_CURTHREAD:
3361 if (!dtrace_priv_kernel(state))
3362 return (0);
3363
3364 return ((uint64_t)(uintptr_t)current_thread());
3365
3366 case DIF_VAR_TIMESTAMP:
3367 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3368 mstate->dtms_timestamp = dtrace_gethrtime();
3369 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3370 }
3371 return (mstate->dtms_timestamp);
3372
3373 case DIF_VAR_VTIMESTAMP:
3374 ASSERT(dtrace_vtime_references != 0);
3375 return (dtrace_get_thread_vtime(current_thread()));
3376
3377 case DIF_VAR_WALLTIMESTAMP:
3378 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3379 mstate->dtms_walltimestamp = dtrace_gethrestime();
3380 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3381 }
3382 return (mstate->dtms_walltimestamp);
3383
3384 case DIF_VAR_MACHTIMESTAMP:
3385 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3386 mstate->dtms_machtimestamp = mach_absolute_time();
3387 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3388 }
3389 return (mstate->dtms_machtimestamp);
3390
3391 case DIF_VAR_CPU:
3392 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3393
3394 case DIF_VAR_IPL:
3395 if (!dtrace_priv_kernel(state))
3396 return (0);
3397 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3398 mstate->dtms_ipl = dtrace_getipl();
3399 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3400 }
3401 return (mstate->dtms_ipl);
3402
3403 case DIF_VAR_EPID:
3404 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3405 return (mstate->dtms_epid);
3406
3407 case DIF_VAR_ID:
3408 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3409 return (mstate->dtms_probe->dtpr_id);
3410
3411 case DIF_VAR_STACKDEPTH:
3412 if (!dtrace_priv_kernel(state))
3413 return (0);
3414 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3415 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3416
3417 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3418 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3419 }
3420 return (mstate->dtms_stackdepth);
3421
3422 case DIF_VAR_USTACKDEPTH:
3423 if (!dtrace_priv_proc(state))
3424 return (0);
3425 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3426 /*
3427 * See comment in DIF_VAR_PID.
3428 */
3429 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3430 CPU_ON_INTR(CPU)) {
3431 mstate->dtms_ustackdepth = 0;
3432 } else {
3433 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3434 mstate->dtms_ustackdepth =
3435 dtrace_getustackdepth();
3436 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3437 }
3438 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3439 }
3440 return (mstate->dtms_ustackdepth);
3441
3442 case DIF_VAR_CALLER:
3443 if (!dtrace_priv_kernel(state))
3444 return (0);
3445 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3446 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3447
3448 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3449 /*
3450 * If this is an unanchored probe, we are
3451 * required to go through the slow path:
3452 * dtrace_caller() only guarantees correct
3453 * results for anchored probes.
3454 */
3455 pc_t caller[2];
3456
3457 dtrace_getpcstack(caller, 2, aframes,
3458 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3459 mstate->dtms_caller = caller[1];
3460 } else if ((mstate->dtms_caller =
3461 dtrace_caller(aframes)) == (uintptr_t)-1) {
3462 /*
3463 * We have failed to do this the quick way;
3464 * we must resort to the slower approach of
3465 * calling dtrace_getpcstack().
3466 */
3467 pc_t caller;
3468
3469 dtrace_getpcstack(&caller, 1, aframes, NULL);
3470 mstate->dtms_caller = caller;
3471 }
3472
3473 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3474 }
3475 return (mstate->dtms_caller);
3476
3477 case DIF_VAR_UCALLER:
3478 if (!dtrace_priv_proc(state))
3479 return (0);
3480
3481 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3482 uint64_t ustack[3];
3483
3484 /*
3485 * dtrace_getupcstack() fills in the first uint64_t
3486 * with the current PID. The second uint64_t will
3487 * be the program counter at user-level. The third
3488 * uint64_t will contain the caller, which is what
3489 * we're after.
3490 */
3491 ustack[2] = 0;
3492 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3493 dtrace_getupcstack(ustack, 3);
3494 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3495 mstate->dtms_ucaller = ustack[2];
3496 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3497 }
3498
3499 return (mstate->dtms_ucaller);
3500
3501 case DIF_VAR_PROBEPROV:
3502 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3503 return (dtrace_dif_varstr(
3504 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3505 state, mstate));
3506
3507 case DIF_VAR_PROBEMOD:
3508 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3509 return (dtrace_dif_varstr(
3510 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3511 state, mstate));
3512
3513 case DIF_VAR_PROBEFUNC:
3514 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3515 return (dtrace_dif_varstr(
3516 (uintptr_t)mstate->dtms_probe->dtpr_func,
3517 state, mstate));
3518
3519 case DIF_VAR_PROBENAME:
3520 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3521 return (dtrace_dif_varstr(
3522 (uintptr_t)mstate->dtms_probe->dtpr_name,
3523 state, mstate));
3524
3525 case DIF_VAR_PID:
3526 if (!dtrace_priv_proc_relaxed(state))
3527 return (0);
3528
3529 /*
3530 * Note that we are assuming that an unanchored probe is
3531 * always due to a high-level interrupt. (And we're assuming
3532 * that there is only a single high level interrupt.)
3533 */
3534 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3535 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3536 return 0;
3537
3538 return ((uint64_t)dtrace_proc_selfpid());
3539
3540 case DIF_VAR_PPID:
3541 if (!dtrace_priv_proc_relaxed(state))
3542 return (0);
3543
3544 /*
3545 * See comment in DIF_VAR_PID.
3546 */
3547 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3548 return (0);
3549
3550 return ((uint64_t)dtrace_proc_selfppid());
3551
3552 case DIF_VAR_TID:
3553 /* We do not need to check for null current_thread() */
3554 return thread_tid(current_thread()); /* globally unique */
3555
3556 case DIF_VAR_PTHREAD_SELF:
3557 if (!dtrace_priv_proc(state))
3558 return (0);
3559
3560 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3561 return 0;
3562
3563 case DIF_VAR_DISPATCHQADDR:
3564 if (!dtrace_priv_proc(state))
3565 return (0);
3566
3567 /* We do not need to check for null current_thread() */
3568 return thread_dispatchqaddr(current_thread());
3569
3570 case DIF_VAR_EXECNAME:
3571 {
3572 char *xname = (char *)mstate->dtms_scratch_ptr;
3573 size_t scratch_size = MAXCOMLEN+1;
3574
3575 /* The scratch allocation's lifetime is that of the clause. */
3576 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3577 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3578 return 0;
3579 }
3580
3581 if (!dtrace_priv_proc_relaxed(state))
3582 return (0);
3583
3584 mstate->dtms_scratch_ptr += scratch_size;
3585 proc_selfname( xname, scratch_size );
3586
3587 return ((uint64_t)(uintptr_t)xname);
3588 }
3589
3590
3591 case DIF_VAR_ZONENAME:
3592 {
3593 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3594 char *zname = (char *)mstate->dtms_scratch_ptr;
3595 size_t scratch_size = 6 + 1;
3596
3597 if (!dtrace_priv_proc(state))
3598 return (0);
3599
3600 /* The scratch allocation's lifetime is that of the clause. */
3601 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3602 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3603 return 0;
3604 }
3605
3606 mstate->dtms_scratch_ptr += scratch_size;
3607
3608 /* The kernel does not provide zonename; this always returns 'global'. */
3609 strlcpy(zname, "global", scratch_size);
3610
3611 return ((uint64_t)(uintptr_t)zname);
3612 }
3613
3614 #if MONOTONIC
3615 case DIF_VAR_CPUINSTRS:
3616 return mt_cur_cpu_instrs();
3617
3618 case DIF_VAR_CPUCYCLES:
3619 return mt_cur_cpu_cycles();
3620
3621 case DIF_VAR_VINSTRS:
3622 return mt_cur_thread_instrs();
3623
3624 case DIF_VAR_VCYCLES:
3625 return mt_cur_thread_cycles();
3626 #else /* MONOTONIC */
3627 case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
3628 case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
3629 case DIF_VAR_VINSTRS: /* FALLTHROUGH */
3630 case DIF_VAR_VCYCLES: /* FALLTHROUGH */
3631 return 0;
3632 #endif /* !MONOTONIC */
3633
3634 case DIF_VAR_UID:
3635 if (!dtrace_priv_proc_relaxed(state))
3636 return (0);
3637
3638 /*
3639 * See comment in DIF_VAR_PID.
3640 */
3641 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3642 return (0);
3643
3644 return ((uint64_t) dtrace_proc_selfruid());
3645
3646 case DIF_VAR_GID:
3647 if (!dtrace_priv_proc(state))
3648 return (0);
3649
3650 /*
3651 * See comment in DIF_VAR_PID.
3652 */
3653 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3654 return (0);
3655
3656 if (dtrace_CRED() != NULL)
3657 /* Credential does not require lazy initialization. */
3658 return ((uint64_t)kauth_getgid());
3659 else {
3660 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3661 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3662 return -1ULL;
3663 }
3664
3665 case DIF_VAR_ERRNO: {
3666 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3667 if (!dtrace_priv_proc(state))
3668 return (0);
3669
3670 /*
3671 * See comment in DIF_VAR_PID.
3672 */
3673 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3674 return (0);
3675
3676 if (uthread)
3677 return (uint64_t)uthread->t_dtrace_errno;
3678 else {
3679 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3680 return -1ULL;
3681 }
3682 }
3683
3684 default:
3685 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3686 return (0);
3687 }
3688 }
3689
3690 typedef enum dtrace_json_state {
3691 DTRACE_JSON_REST = 1,
3692 DTRACE_JSON_OBJECT,
3693 DTRACE_JSON_STRING,
3694 DTRACE_JSON_STRING_ESCAPE,
3695 DTRACE_JSON_STRING_ESCAPE_UNICODE,
3696 DTRACE_JSON_COLON,
3697 DTRACE_JSON_COMMA,
3698 DTRACE_JSON_VALUE,
3699 DTRACE_JSON_IDENTIFIER,
3700 DTRACE_JSON_NUMBER,
3701 DTRACE_JSON_NUMBER_FRAC,
3702 DTRACE_JSON_NUMBER_EXP,
3703 DTRACE_JSON_COLLECT_OBJECT
3704 } dtrace_json_state_t;
3705
3706 /*
3707 * This function possesses just enough knowledge about JSON to extract a single
3708 * value from a JSON string and store it in the scratch buffer. It is able
3709 * to extract nested object values, and members of arrays by index.
3710 *
3711 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3712 * be looked up as we descend into the object tree. e.g.
3713 *
3714 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3715 * with nelems = 5.
3716 *
3717 * The run time of this function must be bounded above by strsize to limit the
3718 * amount of work done in probe context. As such, it is implemented as a
3719 * simple state machine, reading one character at a time using safe loads
3720 * until we find the requested element, hit a parsing error or run off the
3721 * end of the object or string.
3722 *
3723 * As there is no way for a subroutine to return an error without interrupting
3724 * clause execution, we simply return NULL in the event of a missing key or any
3725 * other error condition. Each NULL return in this function is commented with
3726 * the error condition it represents -- parsing or otherwise.
3727 *
3728 * The set of states for the state machine closely matches the JSON
3729 * specification (http://json.org/). Briefly:
3730 *
3731 * DTRACE_JSON_REST:
3732 * Skip whitespace until we find either a top-level Object, moving
3733 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3734 *
3735 * DTRACE_JSON_OBJECT:
3736 * Locate the next key String in an Object. Sets a flag to denote
3737 * the next String as a key string and moves to DTRACE_JSON_STRING.
3738 *
3739 * DTRACE_JSON_COLON:
3740 * Skip whitespace until we find the colon that separates key Strings
3741 * from their values. Once found, move to DTRACE_JSON_VALUE.
3742 *
3743 * DTRACE_JSON_VALUE:
3744 * Detects the type of the next value (String, Number, Identifier, Object
3745 * or Array) and routes to the states that process that type. Here we also
3746 * deal with the element selector list if we are requested to traverse down
3747 * into the object tree.
3748 *
3749 * DTRACE_JSON_COMMA:
3750 * Skip whitespace until we find the comma that separates key-value pairs
3751 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3752 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3753 * states return to this state at the end of their value, unless otherwise
3754 * noted.
3755 *
3756 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3757 * Processes a Number literal from the JSON, including any exponent
3758 * component that may be present. Numbers are returned as strings, which
3759 * may be passed to strtoll() if an integer is required.
3760 *
3761 * DTRACE_JSON_IDENTIFIER:
3762 * Processes a "true", "false" or "null" literal in the JSON.
3763 *
3764 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3765 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3766 * Processes a String literal from the JSON, whether the String denotes
3767 * a key, a value or part of a larger Object. Handles all escape sequences
3768 * present in the specification, including four-digit unicode characters,
3769 * but merely includes the escape sequence without converting it to the
3770 * actual escaped character. If the String is flagged as a key, we
3771 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3772 *
3773 * DTRACE_JSON_COLLECT_OBJECT:
3774 * This state collects an entire Object (or Array), correctly handling
3775 * embedded strings. If the full element selector list matches this nested
3776 * object, we return the Object in full as a string. If not, we use this
3777 * state to skip to the next value at this level and continue processing.
3778 */
3779 static char *
3780 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3781 char *dest)
3782 {
3783 dtrace_json_state_t state = DTRACE_JSON_REST;
3784 int64_t array_elem = INT64_MIN;
3785 int64_t array_pos = 0;
3786 uint8_t escape_unicount = 0;
3787 boolean_t string_is_key = B_FALSE;
3788 boolean_t collect_object = B_FALSE;
3789 boolean_t found_key = B_FALSE;
3790 boolean_t in_array = B_FALSE;
3791 uint32_t braces = 0, brackets = 0;
3792 char *elem = elemlist;
3793 char *dd = dest;
3794 uintptr_t cur;
3795
3796 for (cur = json; cur < json + size; cur++) {
3797 char cc = dtrace_load8(cur);
3798 if (cc == '\0')
3799 return (NULL);
3800
3801 switch (state) {
3802 case DTRACE_JSON_REST:
3803 if (isspace(cc))
3804 break;
3805
3806 if (cc == '{') {
3807 state = DTRACE_JSON_OBJECT;
3808 break;
3809 }
3810
3811 if (cc == '[') {
3812 in_array = B_TRUE;
3813 array_pos = 0;
3814 array_elem = dtrace_strtoll(elem, 10, size);
3815 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3816 state = DTRACE_JSON_VALUE;
3817 break;
3818 }
3819
3820 /*
3821 * ERROR: expected to find a top-level object or array.
3822 */
3823 return (NULL);
3824 case DTRACE_JSON_OBJECT:
3825 if (isspace(cc))
3826 break;
3827
3828 if (cc == '"') {
3829 state = DTRACE_JSON_STRING;
3830 string_is_key = B_TRUE;
3831 break;
3832 }
3833
3834 /*
3835 * ERROR: either the object did not start with a key
3836 * string, or we've run off the end of the object
3837 * without finding the requested key.
3838 */
3839 return (NULL);
3840 case DTRACE_JSON_STRING:
3841 if (cc == '\\') {
3842 *dd++ = '\\';
3843 state = DTRACE_JSON_STRING_ESCAPE;
3844 break;
3845 }
3846
3847 if (cc == '"') {
3848 if (collect_object) {
3849 /*
3850 * We don't reset the dest here, as
3851 * the string is part of a larger
3852 * object being collected.
3853 */
3854 *dd++ = cc;
3855 collect_object = B_FALSE;
3856 state = DTRACE_JSON_COLLECT_OBJECT;
3857 break;
3858 }
3859 *dd = '\0';
3860 dd = dest; /* reset string buffer */
3861 if (string_is_key) {
3862 if (dtrace_strncmp(dest, elem,
3863 size) == 0)
3864 found_key = B_TRUE;
3865 } else if (found_key) {
3866 if (nelems > 1) {
3867 /*
3868 * We expected an object, not
3869 * this string.
3870 */
3871 return (NULL);
3872 }
3873 return (dest);
3874 }
3875 state = string_is_key ? DTRACE_JSON_COLON :
3876 DTRACE_JSON_COMMA;
3877 string_is_key = B_FALSE;
3878 break;
3879 }
3880
3881 *dd++ = cc;
3882 break;
3883 case DTRACE_JSON_STRING_ESCAPE:
3884 *dd++ = cc;
3885 if (cc == 'u') {
3886 escape_unicount = 0;
3887 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3888 } else {
3889 state = DTRACE_JSON_STRING;
3890 }
3891 break;
3892 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3893 if (!isxdigit(cc)) {
3894 /*
3895 * ERROR: invalid unicode escape, expected
3896 * four valid hexadecimal digits.
3897 */
3898 return (NULL);
3899 }
3900
3901 *dd++ = cc;
3902 if (++escape_unicount == 4)
3903 state = DTRACE_JSON_STRING;
3904 break;
3905 case DTRACE_JSON_COLON:
3906 if (isspace(cc))
3907 break;
3908
3909 if (cc == ':') {
3910 state = DTRACE_JSON_VALUE;
3911 break;
3912 }
3913
3914 /*
3915 * ERROR: expected a colon.
3916 */
3917 return (NULL);
3918 case DTRACE_JSON_COMMA:
3919 if (isspace(cc))
3920 break;
3921
3922 if (cc == ',') {
3923 if (in_array) {
3924 state = DTRACE_JSON_VALUE;
3925 if (++array_pos == array_elem)
3926 found_key = B_TRUE;
3927 } else {
3928 state = DTRACE_JSON_OBJECT;
3929 }
3930 break;
3931 }
3932
3933 /*
3934 * ERROR: either we hit an unexpected character, or
3935 * we reached the end of the object or array without
3936 * finding the requested key.
3937 */
3938 return (NULL);
3939 case DTRACE_JSON_IDENTIFIER:
3940 if (islower(cc)) {
3941 *dd++ = cc;
3942 break;
3943 }
3944
3945 *dd = '\0';
3946 dd = dest; /* reset string buffer */
3947
3948 if (dtrace_strncmp(dest, "true", 5) == 0 ||
3949 dtrace_strncmp(dest, "false", 6) == 0 ||
3950 dtrace_strncmp(dest, "null", 5) == 0) {
3951 if (found_key) {
3952 if (nelems > 1) {
3953 /*
3954 * ERROR: We expected an object,
3955 * not this identifier.
3956 */
3957 return (NULL);
3958 }
3959 return (dest);
3960 } else {
3961 cur--;
3962 state = DTRACE_JSON_COMMA;
3963 break;
3964 }
3965 }
3966
3967 /*
3968 * ERROR: we did not recognise the identifier as one
3969 * of those in the JSON specification.
3970 */
3971 return (NULL);
3972 case DTRACE_JSON_NUMBER:
3973 if (cc == '.') {
3974 *dd++ = cc;
3975 state = DTRACE_JSON_NUMBER_FRAC;
3976 break;
3977 }
3978
3979 if (cc == 'x' || cc == 'X') {
3980 /*
3981 * ERROR: specification explicitly excludes
3982 * hexadecimal or octal numbers.
3983 */
3984 return (NULL);
3985 }
3986
3987 /* FALLTHRU */
3988 case DTRACE_JSON_NUMBER_FRAC:
3989 if (cc == 'e' || cc == 'E') {
3990 *dd++ = cc;
3991 state = DTRACE_JSON_NUMBER_EXP;
3992 break;
3993 }
3994
3995 if (cc == '+' || cc == '-') {
3996 /*
3997 * ERROR: expect sign as part of exponent only.
3998 */
3999 return (NULL);
4000 }
4001 /* FALLTHRU */
4002 case DTRACE_JSON_NUMBER_EXP:
4003 if (isdigit(cc) || cc == '+' || cc == '-') {
4004 *dd++ = cc;
4005 break;
4006 }
4007
4008 *dd = '\0';
4009 dd = dest; /* reset string buffer */
4010 if (found_key) {
4011 if (nelems > 1) {
4012 /*
4013 * ERROR: We expected an object, not
4014 * this number.
4015 */
4016 return (NULL);
4017 }
4018 return (dest);
4019 }
4020
4021 cur--;
4022 state = DTRACE_JSON_COMMA;
4023 break;
4024 case DTRACE_JSON_VALUE:
4025 if (isspace(cc))
4026 break;
4027
4028 if (cc == '{' || cc == '[') {
4029 if (nelems > 1 && found_key) {
4030 in_array = cc == '[' ? B_TRUE : B_FALSE;
4031 /*
4032 * If our element selector directs us
4033 * to descend into this nested object,
4034 * then move to the next selector
4035 * element in the list and restart the
4036 * state machine.
4037 */
4038 while (*elem != '\0')
4039 elem++;
4040 elem++; /* skip the inter-element NUL */
4041 nelems--;
4042 dd = dest;
4043 if (in_array) {
4044 state = DTRACE_JSON_VALUE;
4045 array_pos = 0;
4046 array_elem = dtrace_strtoll(
4047 elem, 10, size);
4048 found_key = array_elem == 0 ?
4049 B_TRUE : B_FALSE;
4050 } else {
4051 found_key = B_FALSE;
4052 state = DTRACE_JSON_OBJECT;
4053 }
4054 break;
4055 }
4056
4057 /*
4058 * Otherwise, we wish to either skip this
4059 * nested object or return it in full.
4060 */
4061 if (cc == '[')
4062 brackets = 1;
4063 else
4064 braces = 1;
4065 *dd++ = cc;
4066 state = DTRACE_JSON_COLLECT_OBJECT;
4067 break;
4068 }
4069
4070 if (cc == '"') {
4071 state = DTRACE_JSON_STRING;
4072 break;
4073 }
4074
4075 if (islower(cc)) {
4076 /*
4077 * Here we deal with true, false and null.
4078 */
4079 *dd++ = cc;
4080 state = DTRACE_JSON_IDENTIFIER;
4081 break;
4082 }
4083
4084 if (cc == '-' || isdigit(cc)) {
4085 *dd++ = cc;
4086 state = DTRACE_JSON_NUMBER;
4087 break;
4088 }
4089
4090 /*
4091 * ERROR: unexpected character at start of value.
4092 */
4093 return (NULL);
4094 case DTRACE_JSON_COLLECT_OBJECT:
4095 if (cc == '\0')
4096 /*
4097 * ERROR: unexpected end of input.
4098 */
4099 return (NULL);
4100
4101 *dd++ = cc;
4102 if (cc == '"') {
4103 collect_object = B_TRUE;
4104 state = DTRACE_JSON_STRING;
4105 break;
4106 }
4107
4108 if (cc == ']') {
4109 if (brackets-- == 0) {
4110 /*
4111 * ERROR: unbalanced brackets.
4112 */
4113 return (NULL);
4114 }
4115 } else if (cc == '}') {
4116 if (braces-- == 0) {
4117 /*
4118 * ERROR: unbalanced braces.
4119 */
4120 return (NULL);
4121 }
4122 } else if (cc == '{') {
4123 braces++;
4124 } else if (cc == '[') {
4125 brackets++;
4126 }
4127
4128 if (brackets == 0 && braces == 0) {
4129 if (found_key) {
4130 *dd = '\0';
4131 return (dest);
4132 }
4133 dd = dest; /* reset string buffer */
4134 state = DTRACE_JSON_COMMA;
4135 }
4136 break;
4137 }
4138 }
4139 return (NULL);
4140 }
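
/*
 * Editorial sketch (hypothetical helper, not compiled into the kernel):
 * how a selector such as "foo[0].bar" is packed into the elemlist format
 * described above.  In the kernel proper this packing is performed by the
 * DIF_SUBR_JSON case in dtrace_dif_subr(), further below.
 */
static void
dtrace_json_usage_sketch(void)
{
	/* "foo[0].bar" --> "foo" NUL "0" NUL "bar" NUL, with nelems = 3 */
	char json[] = "{\"foo\": [{\"bar\": 42}]}";
	char elemlist[] = "foo\0" "0\0" "bar";	/* literal supplies final NUL */
	char dest[64];
	char *v;

	v = dtrace_json(sizeof (json) - 1, (uintptr_t)json, elemlist, 3, dest);
	(void)v;	/* v points into dest, which holds "42"; NULL on any error */
}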
4141
4142 /*
4143 * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
4144 * Notice that we don't bother validating the proper number of arguments or
4145 * their types in the tuple stack: argument interpretation is rendered safe
4146 * by our load safety, so the worst that can happen is that a bogus program
4147 * obtains bogus results.
4148 */
4149 static void
4150 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4151 dtrace_key_t *tupregs, int nargs,
4152 dtrace_mstate_t *mstate, dtrace_state_t *state)
4153 {
4154 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4155 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4156 dtrace_vstate_t *vstate = &state->dts_vstate;
4157
4158 #if !defined(__APPLE__)
4159 union {
4160 mutex_impl_t mi;
4161 uint64_t mx;
4162 } m;
4163
4164 union {
4165 krwlock_t ri;
4166 uintptr_t rw;
4167 } r;
4168 #else
4169 /* FIXME: awaits lock/mutex work */
4170 #endif /* __APPLE__ */
4171
4172 switch (subr) {
4173 case DIF_SUBR_RAND:
4174 regs[rd] = dtrace_xoroshiro128_plus_next(
4175 state->dts_rstate[CPU->cpu_id]);
4176 break;
4177
4178 #if !defined(__APPLE__)
4179 case DIF_SUBR_MUTEX_OWNED:
4180 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4181 mstate, vstate)) {
4182 regs[rd] = 0;
4183 break;
4184 }
4185
4186 m.mx = dtrace_load64(tupregs[0].dttk_value);
4187 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4188 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4189 else
4190 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4191 break;
4192
4193 case DIF_SUBR_MUTEX_OWNER:
4194 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4195 mstate, vstate)) {
4196 regs[rd] = 0;
4197 break;
4198 }
4199
4200 m.mx = dtrace_load64(tupregs[0].dttk_value);
4201 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4202 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4203 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4204 else
4205 regs[rd] = 0;
4206 break;
4207
4208 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4209 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4210 mstate, vstate)) {
4211 regs[rd] = 0;
4212 break;
4213 }
4214
4215 m.mx = dtrace_load64(tupregs[0].dttk_value);
4216 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4217 break;
4218
4219 case DIF_SUBR_MUTEX_TYPE_SPIN:
4220 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4221 mstate, vstate)) {
4222 regs[rd] = 0;
4223 break;
4224 }
4225
4226 m.mx = dtrace_load64(tupregs[0].dttk_value);
4227 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4228 break;
4229
4230 case DIF_SUBR_RW_READ_HELD: {
4231 uintptr_t tmp;
4232
4233 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4234 mstate, vstate)) {
4235 regs[rd] = 0;
4236 break;
4237 }
4238
4239 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4240 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4241 break;
4242 }
4243
4244 case DIF_SUBR_RW_WRITE_HELD:
4245 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4246 mstate, vstate)) {
4247 regs[rd] = 0;
4248 break;
4249 }
4250
4251 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4252 regs[rd] = _RW_WRITE_HELD(&r.ri);
4253 break;
4254
4255 case DIF_SUBR_RW_ISWRITER:
4256 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4257 mstate, vstate)) {
4258 regs[rd] = 0;
4259 break;
4260 }
4261
4262 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4263 regs[rd] = _RW_ISWRITER(&r.ri);
4264 break;
4265 #else
4266 /* FIXME: awaits lock/mutex work */
4267 #endif /* __APPLE__ */
4268
4269 case DIF_SUBR_BCOPY: {
4270 /*
4271 * We need to be sure that the destination is in the scratch
4272 * region -- no other region is allowed.
4273 */
4274 uintptr_t src = tupregs[0].dttk_value;
4275 uintptr_t dest = tupregs[1].dttk_value;
4276 size_t size = tupregs[2].dttk_value;
4277
4278 if (!dtrace_inscratch(dest, size, mstate)) {
4279 *flags |= CPU_DTRACE_BADADDR;
4280 *illval = regs[rd];
4281 break;
4282 }
4283
4284 if (!dtrace_canload(src, size, mstate, vstate)) {
4285 regs[rd] = 0;
4286 break;
4287 }
4288
4289 dtrace_bcopy((void *)src, (void *)dest, size);
4290 break;
4291 }
4292
4293 case DIF_SUBR_ALLOCA:
4294 case DIF_SUBR_COPYIN: {
4295 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4296 uint64_t size =
4297 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4298 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4299
4300 /*
4301 * Check whether the user can access kernel memory
4302 */
4303 if (dtrace_priv_kernel(state) == 0) {
4304 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4305 regs[rd] = 0;
4306 break;
4307 }
4308 /*
4309 * This action doesn't require any credential checks since
4310 * probes will not activate in user contexts to which the
4311 * enabling user does not have permissions.
4312 */
4313
4314 /*
4315 * Rounding up the user allocation size could have overflowed
4316 * a large, bogus allocation (like -1ULL) to 0.
4317 */
4318 if (scratch_size < size ||
4319 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4320 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4321 regs[rd] = 0;
4322 break;
4323 }
4324
4325 if (subr == DIF_SUBR_COPYIN) {
4326 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4327 if (dtrace_priv_proc(state))
4328 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4329 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4330 }
4331
4332 mstate->dtms_scratch_ptr += scratch_size;
4333 regs[rd] = dest;
4334 break;
4335 }
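
	/*
	 * Editorial worked example of the overflow check above: with one
	 * byte of alignment slack from P2ROUNDUP() and a bogus request of
	 * -1ULL, the scratch_size arithmetic wraps, and it is exactly the
	 * "scratch_size < size" test that rejects it:
	 *
	 *	dest         = scratch_ptr + 1         (rounded up to 8)
	 *	scratch_size = 1 + 0xffffffffffffffff  (wraps to 0)
	 *	0 < -1ULL                              (reject)
	 */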
4336
4337 case DIF_SUBR_COPYINTO: {
4338 uint64_t size = tupregs[1].dttk_value;
4339 uintptr_t dest = tupregs[2].dttk_value;
4340
4341 /*
4342 * This action doesn't require any credential checks since
4343 * probes will not activate in user contexts to which the
4344 * enabling user does not have permissions.
4345 */
4346 if (!dtrace_inscratch(dest, size, mstate)) {
4347 *flags |= CPU_DTRACE_BADADDR;
4348 *illval = regs[rd];
4349 break;
4350 }
4351
4352 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4353 if (dtrace_priv_proc(state))
4354 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4355 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4356 break;
4357 }
4358
4359 case DIF_SUBR_COPYINSTR: {
4360 uintptr_t dest = mstate->dtms_scratch_ptr;
4361 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4362
4363 if (nargs > 1 && tupregs[1].dttk_value < size)
4364 size = tupregs[1].dttk_value + 1;
4365
4366 /*
4367 * This action doesn't require any credential checks since
4368 * probes will not activate in user contexts to which the
4369 * enabling user does not have permissions.
4370 */
4371 if (!DTRACE_INSCRATCH(mstate, size)) {
4372 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4373 regs[rd] = 0;
4374 break;
4375 }
4376
4377 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4378 if (dtrace_priv_proc(state))
4379 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4380 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4381
4382 ((char *)dest)[size - 1] = '\0';
4383 mstate->dtms_scratch_ptr += size;
4384 regs[rd] = dest;
4385 break;
4386 }
4387
4388 case DIF_SUBR_MSGSIZE:
4389 case DIF_SUBR_MSGDSIZE: {
4390 /* Darwin does not implement SysV streams messages */
4391 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4392 regs[rd] = 0;
4393 break;
4394 }
4395
4396 case DIF_SUBR_PROGENYOF: {
4397 pid_t pid = tupregs[0].dttk_value;
4398 struct proc *p = current_proc();
4399 int rval = 0, lim = nprocs;
4400
4401 while (p && (lim-- > 0)) {
4402 pid_t ppid;
4403
4404 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
4405 if (*flags & CPU_DTRACE_FAULT)
4406 break;
4407
4408 if (ppid == pid) {
4409 rval = 1;
4410 break;
4411 }
4412
4413 if (ppid == 0)
4414 break; /* Can't climb process tree any further. */
4415
4416 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
4417 if (*flags & CPU_DTRACE_FAULT)
4418 break;
4419 }
4420
4421 regs[rd] = rval;
4422 break;
4423 }
4424
4425 case DIF_SUBR_SPECULATION:
4426 regs[rd] = dtrace_speculation(state);
4427 break;
4428
4429
4430 case DIF_SUBR_COPYOUT: {
4431 uintptr_t kaddr = tupregs[0].dttk_value;
4432 user_addr_t uaddr = tupregs[1].dttk_value;
4433 uint64_t size = tupregs[2].dttk_value;
4434
4435 if (!dtrace_destructive_disallow &&
4436 dtrace_priv_proc_control(state) &&
4437 !dtrace_istoxic(kaddr, size) &&
4438 dtrace_canload(kaddr, size, mstate, vstate)) {
4439 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4440 dtrace_copyout(kaddr, uaddr, size, flags);
4441 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4442 }
4443 break;
4444 }
4445
4446 case DIF_SUBR_COPYOUTSTR: {
4447 uintptr_t kaddr = tupregs[0].dttk_value;
4448 user_addr_t uaddr = tupregs[1].dttk_value;
4449 uint64_t size = tupregs[2].dttk_value;
4450 size_t lim;
4451
4452 if (!dtrace_destructive_disallow &&
4453 dtrace_priv_proc_control(state) &&
4454 !dtrace_istoxic(kaddr, size) &&
4455 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4456 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4457 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4458 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4459 }
4460 break;
4461 }
4462
4463 case DIF_SUBR_STRLEN: {
4464 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4465 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4466 size_t lim;
4467
4468 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4469 regs[rd] = 0;
4470 break;
4471 }
4472
4473 regs[rd] = dtrace_strlen((char *)addr, lim);
4474
4475 break;
4476 }
4477
4478 case DIF_SUBR_STRCHR:
4479 case DIF_SUBR_STRRCHR: {
4480 /*
4481 * We're going to iterate over the string looking for the
4482 * specified character. We will iterate until we have reached
4483 * the string length or we have found the character. If this
4484 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4485 * of the specified character instead of the first.
4486 */
4487 uintptr_t addr = tupregs[0].dttk_value;
4488 uintptr_t addr_limit;
4489 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4490 size_t lim;
4491 char c, target = (char)tupregs[1].dttk_value;
4492
4493 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4494 regs[rd] = 0;
4495 break;
4496 }
4497 addr_limit = addr + lim;
4498
4499 for (regs[rd] = 0; addr < addr_limit; addr++) {
4500 if ((c = dtrace_load8(addr)) == target) {
4501 regs[rd] = addr;
4502
4503 if (subr == DIF_SUBR_STRCHR)
4504 break;
4505 }
4506
4507 if (c == '\0')
4508 break;
4509 }
4510
4511 break;
4512 }
4513
4514 case DIF_SUBR_STRSTR:
4515 case DIF_SUBR_INDEX:
4516 case DIF_SUBR_RINDEX: {
4517 /*
4518 * We're going to iterate over the string looking for the
4519 * specified string. We will iterate until we have reached
4520 * the string length or we have found the string. (Yes, this
4521 * is done in the most naive way possible -- but considering
4522 * that the string we're searching for is likely to be
4523 * relatively short, the complexity of Rabin-Karp or similar
4524 * hardly seems merited.)
4525 */
4526 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4527 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4528 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4529 size_t len = dtrace_strlen(addr, size);
4530 size_t sublen = dtrace_strlen(substr, size);
4531 char *limit = addr + len, *orig = addr;
4532 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4533 int inc = 1;
4534
4535 regs[rd] = notfound;
4536
4537 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4538 regs[rd] = 0;
4539 break;
4540 }
4541
4542 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4543 vstate)) {
4544 regs[rd] = 0;
4545 break;
4546 }
4547
4548 /*
4549 * strstr() and index()/rindex() have similar semantics if
4550 * both strings are the empty string: strstr() returns a
4551 * pointer to the (empty) string, and index() and rindex()
4552 * both return index 0 (regardless of any position argument).
4553 */
4554 if (sublen == 0 && len == 0) {
4555 if (subr == DIF_SUBR_STRSTR)
4556 regs[rd] = (uintptr_t)addr;
4557 else
4558 regs[rd] = 0;
4559 break;
4560 }
4561
4562 if (subr != DIF_SUBR_STRSTR) {
4563 if (subr == DIF_SUBR_RINDEX) {
4564 limit = orig - 1;
4565 addr += len;
4566 inc = -1;
4567 }
4568
4569 /*
4570 * Both index() and rindex() take an optional position
4571 * argument that denotes the starting position.
4572 */
4573 if (nargs == 3) {
4574 int64_t pos = (int64_t)tupregs[2].dttk_value;
4575
4576 /*
4577 * If the position argument to index() is
4578 * negative, Perl implicitly clamps it at
4579 * zero. This semantic is a little surprising
4580 * given the special meaning of negative
4581 * positions to similar Perl functions like
4582 * substr(), but it appears to reflect a
4583 * notion that index() can start from a
4584 * negative index and increment its way up to
4585 * the string. Given this notion, Perl's
4586 * rindex() is at least self-consistent in
4587 * that it implicitly clamps positions greater
4588 * than the string length to be the string
4589 * length. Where Perl completely loses
4590 * coherence, however, is when the specified
4591 * substring is the empty string (""). In
4592 * this case, even if the position is
4593 * negative, rindex() returns 0 -- and even if
4594 * the position is greater than the length,
4595 * index() returns the string length. These
4596 * semantics violate the notion that index()
4597 * should never return a value less than the
4598 * specified position and that rindex() should
4599 * never return a value greater than the
4600 * specified position. (One assumes that
4601 * these semantics are artifacts of Perl's
4602 * implementation and not the results of
4603 * deliberate design -- it beggars belief that
4604 * even Larry Wall could desire such oddness.)
4605 * While in the abstract one would wish for
4606 * consistent position semantics across
4607 * substr(), index() and rindex() -- or at the
4608 * very least self-consistent position
4609 * semantics for index() and rindex() -- we
4610 * instead opt to keep with the extant Perl
4611 * semantics, in all their broken glory. (Do
4612 * we have more desire to maintain Perl's
4613 * semantics than Perl does? Probably.)
4614 */
4615 if (subr == DIF_SUBR_RINDEX) {
4616 if (pos < 0) {
4617 if (sublen == 0)
4618 regs[rd] = 0;
4619 break;
4620 }
4621
4622 if ((size_t)pos > len)
4623 pos = len;
4624 } else {
4625 if (pos < 0)
4626 pos = 0;
4627
4628 if ((size_t)pos >= len) {
4629 if (sublen == 0)
4630 regs[rd] = len;
4631 break;
4632 }
4633 }
4634
4635 addr = orig + pos;
4636 }
4637 }
4638
4639 for (regs[rd] = notfound; addr != limit; addr += inc) {
4640 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4641 if (subr != DIF_SUBR_STRSTR) {
4642 /*
4643 * As D index() and rindex() are
4644 * modeled on Perl (and not on awk),
4645 * we return a zero-based (and not a
4646 * one-based) index. (For you Perl
4647 * weenies: no, we're not going to add
4648 * $[ -- and shouldn't you be at a con
4649 * or something?)
4650 */
4651 regs[rd] = (uintptr_t)(addr - orig);
4652 break;
4653 }
4654
4655 ASSERT(subr == DIF_SUBR_STRSTR);
4656 regs[rd] = (uintptr_t)addr;
4657 break;
4658 }
4659 }
4660
4661 break;
4662 }
4663
4664 case DIF_SUBR_STRTOK: {
4665 uintptr_t addr = tupregs[0].dttk_value;
4666 uintptr_t tokaddr = tupregs[1].dttk_value;
4667 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4668 uintptr_t limit, toklimit;
4669 size_t clim;
4670 char *dest = (char *)mstate->dtms_scratch_ptr;
4671 uint8_t c = '\0', tokmap[32]; /* 256 / 8 */
4672 uint64_t i = 0;
4673
4674 /*
4675 * Check both the token buffer and (later) the input buffer,
4676 * since both could be non-scratch addresses.
4677 */
4678 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4679 regs[rd] = 0;
4680 break;
4681 }
4682 toklimit = tokaddr + clim;
4683
4684 if (!DTRACE_INSCRATCH(mstate, size)) {
4685 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4686 regs[rd] = 0;
4687 break;
4688 }
4689
4690 if (addr == 0) {
4691 /*
4692 * If the address specified is NULL, we use our saved
4693 * strtok pointer from the mstate. Note that this
4694 * means that the saved strtok pointer is _only_
4695 * valid within multiple enablings of the same probe --
4696 * it behaves like an implicit clause-local variable.
4697 */
4698 addr = mstate->dtms_strtok;
4699 limit = mstate->dtms_strtok_limit;
4700 } else {
4701 /*
4702 * If the user-specified address is non-NULL we must
4703 * access check it. This is the only time we have
4704 * a chance to do so, since this address may reside
4705 * in the string table of this clause -- future calls
4706 * (when we fetch addr from mstate->dtms_strtok)
4707 * would fail this access check.
4708 */
4709 if (!dtrace_strcanload(addr, size, &clim, mstate,
4710 vstate)) {
4711 regs[rd] = 0;
4712 break;
4713 }
4714 limit = addr + clim;
4715 }
4716
4717 /*
4718 * First, zero the token map, and then process the token
4719 * string -- setting a bit in the map for every character
4720 * found in the token string.
4721 */
4722 for (i = 0; i < (int)sizeof (tokmap); i++)
4723 tokmap[i] = 0;
4724
4725 for (; tokaddr < toklimit; tokaddr++) {
4726 if ((c = dtrace_load8(tokaddr)) == '\0')
4727 break;
4728
4729 ASSERT((c >> 3) < sizeof (tokmap));
4730 tokmap[c >> 3] |= (1 << (c & 0x7));
4731 }
4732
4733 for (; addr < limit; addr++) {
4734 /*
4735 * We're looking for a character that is _not_
4736 * contained in the token string.
4737 */
4738 if ((c = dtrace_load8(addr)) == '\0')
4739 break;
4740
4741 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4742 break;
4743 }
4744
4745 if (c == '\0') {
4746 /*
4747 * We reached the end of the string without finding
4748 * any character that was not in the token string.
4749 * We return NULL in this case, and we set the saved
4750 * address to NULL as well.
4751 */
4752 regs[rd] = 0;
4753 mstate->dtms_strtok = 0;
4754 mstate->dtms_strtok_limit = 0;
4755 break;
4756 }
4757
4758 /*
4759 * From here on, we're copying into the destination string.
4760 */
4761 for (i = 0; addr < limit && i < size - 1; addr++) {
4762 if ((c = dtrace_load8(addr)) == '\0')
4763 break;
4764
4765 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4766 break;
4767
4768 ASSERT(i < size);
4769 dest[i++] = c;
4770 }
4771
4772 ASSERT(i < size);
4773 dest[i] = '\0';
4774 regs[rd] = (uintptr_t)dest;
4775 mstate->dtms_scratch_ptr += size;
4776 mstate->dtms_strtok = addr;
4777 mstate->dtms_strtok_limit = limit;
4778 break;
4779 }
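
	/*
	 * Editorial sketch of the token bitmap above: one bit per possible
	 * byte value, so each membership test costs a shift, a mask and a
	 * load:
	 *
	 *	uint8_t map[32] = { 0 };		(256 bits)
	 *	map[':' >> 3] |= (1 << (':' & 0x7));	(add ':' to the set)
	 *	if (map[c >> 3] & (1 << (c & 0x7)))
	 *		--> c is a delimiter
	 */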
4780
4781 case DIF_SUBR_SUBSTR: {
4782 uintptr_t s = tupregs[0].dttk_value;
4783 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4784 char *d = (char *)mstate->dtms_scratch_ptr;
4785 int64_t index = (int64_t)tupregs[1].dttk_value;
4786 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4787 size_t len = dtrace_strlen((char *)s, size);
4788 int64_t i = 0;
4789
4790 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4791 regs[rd] = 0;
4792 break;
4793 }
4794
4795 if (!DTRACE_INSCRATCH(mstate, size)) {
4796 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4797 regs[rd] = 0;
4798 break;
4799 }
4800
4801 if (nargs <= 2)
4802 remaining = (int64_t)size;
4803
4804 if (index < 0) {
4805 index += len;
4806
4807 if (index < 0 && index + remaining > 0) {
4808 remaining += index;
4809 index = 0;
4810 }
4811 }
4812
4813 if ((size_t)index >= len || index < 0) {
4814 remaining = 0;
4815 } else if (remaining < 0) {
4816 remaining += len - index;
4817 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4818 remaining = size - index;
4819 }
4820
4821 for (i = 0; i < remaining; i++) {
4822 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4823 break;
4824 }
4825
4826 d[i] = '\0';
4827
4828 mstate->dtms_scratch_ptr += size;
4829 regs[rd] = (uintptr_t)d;
4830 break;
4831 }
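
	/*
	 * Editorial worked examples of the index/remaining clamping above:
	 *
	 *	substr("darwin", 2)	--> "rwin"  (remaining defaults to size)
	 *	substr("darwin", -3, 2)	--> "wi"    (index += len, giving 3)
	 *	substr("darwin", 9)	--> ""      (index >= len, remaining = 0)
	 *	substr("darwin", 1, -2)	--> "arw"   (remaining += len - index)
	 */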
4832
4833 case DIF_SUBR_GETMAJOR:
4834 regs[rd] = (uintptr_t)major((dev_t)tupregs[0].dttk_value);
4835 break;
4836
4837 case DIF_SUBR_GETMINOR:
4838 regs[rd] = (uintptr_t)minor((dev_t)tupregs[0].dttk_value);
4839 break;
4840
4841 case DIF_SUBR_DDI_PATHNAME: {
4842 /* APPLE NOTE: currently unsupported on Darwin */
4843 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4844 regs[rd] = 0;
4845 break;
4846 }
4847
4848 case DIF_SUBR_STRJOIN: {
4849 char *d = (char *)mstate->dtms_scratch_ptr;
4850 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4851 uintptr_t s1 = tupregs[0].dttk_value;
4852 uintptr_t s2 = tupregs[1].dttk_value;
4853 uint64_t i = 0, j = 0;
4854 size_t lim1, lim2;
4855 char c;
4856
4857 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4858 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4859 regs[rd] = 0;
4860 break;
4861 }
4862
4863 if (!DTRACE_INSCRATCH(mstate, size)) {
4864 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4865 regs[rd] = 0;
4866 break;
4867 }
4868
4869 for (;;) {
4870 if (i >= size) {
4871 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4872 regs[rd] = 0;
4873 break;
4874 }
4875 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4876 if ((d[i++] = c) == '\0') {
4877 i--;
4878 break;
4879 }
4880 }
4881
4882 for (;;) {
4883 if (i >= size) {
4884 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4885 regs[rd] = 0;
4886 break;
4887 }
4888 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4889 if ((d[i++] = c) == '\0')
4890 break;
4891 }
4892
4893 if (i < size) {
4894 mstate->dtms_scratch_ptr += i;
4895 regs[rd] = (uintptr_t)d;
4896 }
4897
4898 break;
4899 }
4900
4901 case DIF_SUBR_STRTOLL: {
4902 uintptr_t s = tupregs[0].dttk_value;
4903 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4904 size_t lim;
4905 int base = 10;
4906
4907 if (nargs > 1) {
4908 if ((base = tupregs[1].dttk_value) <= 1 ||
4909 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4910 *flags |= CPU_DTRACE_ILLOP;
4911 break;
4912 }
4913 }
4914
4915 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
4916 regs[rd] = INT64_MIN;
4917 break;
4918 }
4919
4920 regs[rd] = dtrace_strtoll((char *)s, base, lim);
4921 break;
4922 }
4923
4924 case DIF_SUBR_LLTOSTR: {
4925 int64_t i = (int64_t)tupregs[0].dttk_value;
4926 uint64_t val, digit;
4927 uint64_t size = 65; /* enough room for 2^64 in binary */
4928 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4929 int base = 10;
4930
4931 if (nargs > 1) {
4932 if ((base = tupregs[1].dttk_value) <= 1 ||
4933 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4934 *flags |= CPU_DTRACE_ILLOP;
4935 break;
4936 }
4937 }
4938
4939 val = (base == 10 && i < 0) ? i * -1 : i;
4940
4941 if (!DTRACE_INSCRATCH(mstate, size)) {
4942 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4943 regs[rd] = 0;
4944 break;
4945 }
4946
4947 for (*end-- = '\0'; val; val /= base) {
4948 if ((digit = val % base) <= '9' - '0') {
4949 *end-- = '0' + digit;
4950 } else {
4951 *end-- = 'a' + (digit - ('9' - '0') - 1);
4952 }
4953 }
4954
4955 if (i == 0 && base == 16)
4956 *end-- = '0';
4957
4958 if (base == 16)
4959 *end-- = 'x';
4960
4961 if (i == 0 || base == 8 || base == 16)
4962 *end-- = '0';
4963
4964 if (i < 0 && base == 10)
4965 *end-- = '-';
4966
4967 regs[rd] = (uintptr_t)end + 1;
4968 mstate->dtms_scratch_ptr += size;
4969 break;
4970 }
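
	/*
	 * Editorial sketch of the reverse digit generation above (zero and
	 * sign handling omitted; see the case itself): digits are written
	 * from the end of the scratch region backwards, so the result is
	 * simply end + 1:
	 *
	 *	char buf[65], *e = buf + sizeof (buf) - 1;
	 *	for (*e-- = '\0'; val != 0; val /= 10)
	 *		*e-- = '0' + (val % 10);
	 *	e + 1 --> "3501" for val == 3501
	 */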
4971
4972 case DIF_SUBR_HTONS:
4973 case DIF_SUBR_NTOHS:
4974 #ifdef _BIG_ENDIAN
4975 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4976 #else
4977 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4978 #endif
4979 break;
4980
4981
4982 case DIF_SUBR_HTONL:
4983 case DIF_SUBR_NTOHL:
4984 #ifdef _BIG_ENDIAN
4985 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4986 #else
4987 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4988 #endif
4989 break;
4990
4991
4992 case DIF_SUBR_HTONLL:
4993 case DIF_SUBR_NTOHLL:
4994 #ifdef _BIG_ENDIAN
4995 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4996 #else
4997 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4998 #endif
4999 break;
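
	/*
	 * Editorial note: on little-endian Darwin the htonX()/ntohX()
	 * family reduces to a full byte swap; for 16 bits, assuming
	 * DT_BSWAP_16 is the conventional swap macro, that is:
	 *
	 *	uint16_t n = (uint16_t)((h << 8) | (h >> 8));
	 *	0x1234 --> 0x3412
	 */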
5000
5001
5002 case DIF_SUBR_DIRNAME:
5003 case DIF_SUBR_BASENAME: {
5004 char *dest = (char *)mstate->dtms_scratch_ptr;
5005 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5006 uintptr_t src = tupregs[0].dttk_value;
5007 int i, j, len = dtrace_strlen((char *)src, size);
5008 int lastbase = -1, firstbase = -1, lastdir = -1;
5009 int start, end;
5010
5011 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5012 regs[rd] = 0;
5013 break;
5014 }
5015
5016 if (!DTRACE_INSCRATCH(mstate, size)) {
5017 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5018 regs[rd] = 0;
5019 break;
5020 }
5021
5022 /*
5023 * The basename and dirname for a zero-length string is
5024 * defined to be "."
5025 */
5026 if (len == 0) {
5027 len = 1;
5028 src = (uintptr_t)".";
5029 }
5030
5031 /*
5032 * Start from the back of the string, moving back toward the
5033 * front until we see a character that isn't a slash. That
5034 * character is the last character in the basename.
5035 */
5036 for (i = len - 1; i >= 0; i--) {
5037 if (dtrace_load8(src + i) != '/')
5038 break;
5039 }
5040
5041 if (i >= 0)
5042 lastbase = i;
5043
5044 /*
5045 * Starting from the last character in the basename, move
5046 * towards the front until we find a slash. The character
5047 * that we processed immediately before that is the first
5048 * character in the basename.
5049 */
5050 for (; i >= 0; i--) {
5051 if (dtrace_load8(src + i) == '/')
5052 break;
5053 }
5054
5055 if (i >= 0)
5056 firstbase = i + 1;
5057
5058 /*
5059 * Now keep going until we find a non-slash character. That
5060 * character is the last character in the dirname.
5061 */
5062 for (; i >= 0; i--) {
5063 if (dtrace_load8(src + i) != '/')
5064 break;
5065 }
5066
5067 if (i >= 0)
5068 lastdir = i;
5069
5070 ASSERT(!(lastbase == -1 && firstbase != -1));
5071 ASSERT(!(firstbase == -1 && lastdir != -1));
5072
5073 if (lastbase == -1) {
5074 /*
5075 * We didn't find a non-slash character. We know that
5076 * the length is non-zero, so the whole string must be
5077 * slashes. In either the dirname or the basename
5078 * case, we return '/'.
5079 */
5080 ASSERT(firstbase == -1);
5081 firstbase = lastbase = lastdir = 0;
5082 }
5083
5084 if (firstbase == -1) {
5085 /*
5086 * The entire string consists only of a basename
5087 * component. If we're looking for dirname, we need
5088 * to change our string to be just "."; if we're
5089 * looking for a basename, we'll just set the first
5090 * character of the basename to be 0.
5091 */
5092 if (subr == DIF_SUBR_DIRNAME) {
5093 ASSERT(lastdir == -1);
5094 src = (uintptr_t)".";
5095 lastdir = 0;
5096 } else {
5097 firstbase = 0;
5098 }
5099 }
5100
5101 if (subr == DIF_SUBR_DIRNAME) {
5102 if (lastdir == -1) {
5103 /*
5104 * We know that we have a slash in the name --
5105 * or lastdir would be set to 0, above. And
5106 * because lastdir is -1, we know that this
5107 * slash must be the first character. (That
5108 * is, the full string must be of the form
5109 * "/basename".) In this case, the last
5110 * character of the directory name is 0.
5111 */
5112 lastdir = 0;
5113 }
5114
5115 start = 0;
5116 end = lastdir;
5117 } else {
5118 ASSERT(subr == DIF_SUBR_BASENAME);
5119 ASSERT(firstbase != -1 && lastbase != -1);
5120 start = firstbase;
5121 end = lastbase;
5122 }
5123
5124 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
5125 dest[j] = dtrace_load8(src + i);
5126
5127 dest[j] = '\0';
5128 regs[rd] = (uintptr_t)dest;
5129 mstate->dtms_scratch_ptr += size;
5130 break;
5131 }
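
	/*
	 * Editorial worked example of the three scans above: for
	 * "/foo/bar//", lastbase indexes the final 'r', firstbase the 'b'
	 * and lastdir the second 'o', giving basename "bar" and dirname
	 * "/foo".  For "bar" alone, firstbase is forced to 0 and dirname
	 * degenerates to ".".
	 */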
5132
5133 case DIF_SUBR_CLEANPATH: {
5134 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5135 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5136 uintptr_t src = tupregs[0].dttk_value;
5137 size_t lim;
5138 size_t i = 0, j = 0;
5139
5140 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5141 regs[rd] = 0;
5142 break;
5143 }
5144
5145 if (!DTRACE_INSCRATCH(mstate, size)) {
5146 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5147 regs[rd] = 0;
5148 break;
5149 }
5150
5151 /*
5152 * Move forward, loading each character.
5153 */
5154 do {
5155 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5156 next:
5157 if ((uint64_t)(j + 5) >= size) /* 5 = strlen("/..c") + 1 for NUL */
5158 break;
5159
5160 if (c != '/') {
5161 dest[j++] = c;
5162 continue;
5163 }
5164
5165 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5166
5167 if (c == '/') {
5168 /*
5169 * We have two slashes -- we can just advance
5170 * to the next character.
5171 */
5172 goto next;
5173 }
5174
5175 if (c != '.') {
5176 /*
5177 * This is not "." and it's not ".." -- we can
5178 * just store the "/" and this character and
5179 * drive on.
5180 */
5181 dest[j++] = '/';
5182 dest[j++] = c;
5183 continue;
5184 }
5185
5186 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5187
5188 if (c == '/') {
5189 /*
5190 * This is a "/./" component. We're not going
5191 * to store anything in the destination buffer;
5192 * we're just going to go to the next component.
5193 */
5194 goto next;
5195 }
5196
5197 if (c != '.') {
5198 /*
5199 * This is not ".." -- we can just store the
5200 * "/." and this character and continue
5201 * processing.
5202 */
5203 dest[j++] = '/';
5204 dest[j++] = '.';
5205 dest[j++] = c;
5206 continue;
5207 }
5208
5209 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5210
5211 if (c != '/' && c != '\0') {
5212 /*
5213 * This is not ".." -- it's "..[mumble]".
5214 * We'll store the "/.." and this character
5215 * and continue processing.
5216 */
5217 dest[j++] = '/';
5218 dest[j++] = '.';
5219 dest[j++] = '.';
5220 dest[j++] = c;
5221 continue;
5222 }
5223
5224 /*
5225 * This is "/../" or "/..\0". We need to back up
5226 * our destination pointer until we find a "/".
5227 */
5228 i--;
5229 while (j != 0 && dest[--j] != '/')
5230 continue;
5231
5232 if (c == '\0')
5233 dest[++j] = '/';
5234 } while (c != '\0');
5235
5236 dest[j] = '\0';
5237 regs[rd] = (uintptr_t)dest;
5238 mstate->dtms_scratch_ptr += size;
5239 break;
5240 }
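
	/*
	 * Editorial worked example of the pass above: "/foo//./bar/../baz/"
	 * cleans to "/foo/baz/" -- doubled slashes and "." components are
	 * dropped as they are read, and each ".." backs dest up to the
	 * previous "/".
	 */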
5241
5242 case DIF_SUBR_INET_NTOA:
5243 case DIF_SUBR_INET_NTOA6:
5244 case DIF_SUBR_INET_NTOP: {
5245 size_t size;
5246 int af, argi, i;
5247 char *base, *end;
5248
5249 if (subr == DIF_SUBR_INET_NTOP) {
5250 af = (int)tupregs[0].dttk_value;
5251 argi = 1;
5252 } else {
5253 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5254 argi = 0;
5255 }
5256
5257 if (af == AF_INET) {
5258 #if !defined(__APPLE__)
5259 ipaddr_t ip4;
5260 #else
5261 uint32_t ip4;
5262 #endif /* __APPLE__ */
5263 uint8_t *ptr8, val;
5264
5265 /*
5266 * Safely load the IPv4 address.
5267 */
5268 #if !defined(__APPLE__)
5269 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5270 #else
5271 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
5272 mstate, vstate)) {
5273 regs[rd] = 0;
5274 break;
5275 }
5276
5277 dtrace_bcopy(
5278 (void *)(uintptr_t)tupregs[argi].dttk_value,
5279 (void *)(uintptr_t)&ip4, sizeof (ip4));
5280 #endif /* __APPLE__ */
5281 /*
5282 * Check an IPv4 string will fit in scratch.
5283 */
5284 #if !defined(__APPLE__)
5285 size = INET_ADDRSTRLEN;
5286 #else
5287 size = MAX_IPv4_STR_LEN;
5288 #endif /* __APPLE__ */
5289 if (!DTRACE_INSCRATCH(mstate, size)) {
5290 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5291 regs[rd] = 0;
5292 break;
5293 }
5294 base = (char *)mstate->dtms_scratch_ptr;
5295 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5296
5297 /*
5298 * Stringify as a dotted decimal quad.
5299 */
5300 *end-- = '\0';
5301 ptr8 = (uint8_t *)&ip4;
5302 for (i = 3; i >= 0; i--) {
5303 val = ptr8[i];
5304
5305 if (val == 0) {
5306 *end-- = '0';
5307 } else {
5308 for (; val; val /= 10) {
5309 *end-- = '0' + (val % 10);
5310 }
5311 }
5312
5313 if (i > 0)
5314 *end-- = '.';
5315 }
5316 ASSERT(end + 1 >= base);
5317
5318 } else if (af == AF_INET6) {
5319 #if defined(__APPLE__)
5320 #define _S6_un __u6_addr
5321 #define _S6_u8 __u6_addr8
5322 #endif /* __APPLE__ */
5323 struct in6_addr ip6;
5324 int firstzero, tryzero, numzero, v6end;
5325 uint16_t val;
5326 const char digits[] = "0123456789abcdef";
5327
5328 /*
5329 * Stringify using RFC 1884 convention 2 - 16 bit
5330 * hexadecimal values with a zero-run compression.
5331 * Lower case hexadecimal digits are used.
5332 * eg, fe80::214:4fff:fe0b:76c8.
5333 * The IPv4 embedded form is returned for inet_ntop,
5334 * just the IPv4 string is returned for inet_ntoa6.
5335 */
5336
5337 if (!dtrace_canload(tupregs[argi].dttk_value,
5338 sizeof(struct in6_addr), mstate, vstate)) {
5339 regs[rd] = 0;
5340 break;
5341 }
5342
5343 /*
5344 * Safely load the IPv6 address.
5345 */
5346 dtrace_bcopy(
5347 (void *)(uintptr_t)tupregs[argi].dttk_value,
5348 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5349
5350 /*
5351 * Check an IPv6 string will fit in scratch.
5352 */
5353 size = INET6_ADDRSTRLEN;
5354 if (!DTRACE_INSCRATCH(mstate, size)) {
5355 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5356 regs[rd] = 0;
5357 break;
5358 }
5359 base = (char *)mstate->dtms_scratch_ptr;
5360 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5361 *end-- = '\0';
5362
5363 /*
5364 * Find the longest run of 16 bit zero values
5365 * for the single allowed zero compression - "::".
5366 */
5367 firstzero = -1;
5368 tryzero = -1;
5369 numzero = 1;
5370 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5371 if (ip6._S6_un._S6_u8[i] == 0 &&
5372 tryzero == -1 && i % 2 == 0) {
5373 tryzero = i;
5374 continue;
5375 }
5376
5377 if (tryzero != -1 &&
5378 (ip6._S6_un._S6_u8[i] != 0 ||
5379 i == sizeof (struct in6_addr) - 1)) {
5380
5381 if (i - tryzero <= numzero) {
5382 tryzero = -1;
5383 continue;
5384 }
5385
5386 firstzero = tryzero;
5387 numzero = i - i % 2 - tryzero;
5388 tryzero = -1;
5389
5390 if (ip6._S6_un._S6_u8[i] == 0 &&
5391 i == sizeof (struct in6_addr) - 1)
5392 numzero += 2;
5393 }
5394 }
5395 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5396
5397 /*
5398 * Check for an IPv4 embedded address.
5399 */
5400 v6end = sizeof (struct in6_addr) - 2;
5401 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5402 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5403 for (i = sizeof (struct in6_addr) - 1;
5404 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5405 ASSERT(end >= base);
5406
5407 val = ip6._S6_un._S6_u8[i];
5408
5409 if (val == 0) {
5410 *end-- = '0';
5411 } else {
5412 for (; val; val /= 10) {
5413 *end-- = '0' + val % 10;
5414 }
5415 }
5416
5417 if (i > (int)DTRACE_V4MAPPED_OFFSET)
5418 *end-- = '.';
5419 }
5420
5421 if (subr == DIF_SUBR_INET_NTOA6)
5422 goto inetout;
5423
5424 /*
5425 * Set v6end to skip the IPv4 address that
5426 * we have already stringified.
5427 */
5428 v6end = 10;
5429 }
5430
5431 /*
5432 * Build the IPv6 string by working through the
5433 * address in reverse.
5434 */
5435 for (i = v6end; i >= 0; i -= 2) {
5436 ASSERT(end >= base);
5437
5438 if (i == firstzero + numzero - 2) {
5439 *end-- = ':';
5440 *end-- = ':';
5441 i -= numzero - 2;
5442 continue;
5443 }
5444
5445 if (i < 14 && i != firstzero - 2)
5446 *end-- = ':';
5447
5448 val = (ip6._S6_un._S6_u8[i] << 8) +
5449 ip6._S6_un._S6_u8[i + 1];
5450
5451 if (val == 0) {
5452 *end-- = '0';
5453 } else {
5454 for (; val; val /= 16) {
5455 *end-- = digits[val % 16];
5456 }
5457 }
5458 }
5459 ASSERT(end + 1 >= base);
5460
5461 #if defined(__APPLE__)
5462 #undef _S6_un
5463 #undef _S6_u8
5464 #endif /* __APPLE__ */
5465 } else {
5466 /*
5467 * The user didn't use AF_INET or AF_INET6.
5468 */
5469 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5470 regs[rd] = 0;
5471 break;
5472 }
5473
5474 inetout: regs[rd] = (uintptr_t)end + 1;
5475 mstate->dtms_scratch_ptr += size;
5476 break;
5477 }
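
	/*
	 * Editorial sketch of the zero-run selection above: scanning the
	 * eight 16-bit groups while tracking the longest run of zeros picks
	 * the run that "::" compresses, e.g. for
	 * fe80:0:0:0:214:4fff:fe0b:76c8 it picks groups 1..3:
	 *
	 *	int best = -1, bestlen = 0, run = 0;
	 *	for (k = 0; k < 8; k++) {
	 *		run = (g[k] == 0) ? run + 1 : 0;
	 *		if (run > bestlen) {
	 *			bestlen = run;
	 *			best = k - run + 1;
	 *		}
	 *	}
	 *	best == 1, bestlen == 3 --> fe80::214:4fff:fe0b:76c8
	 */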
5478
5479 case DIF_SUBR_JSON: {
5480 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5481 uintptr_t json = tupregs[0].dttk_value;
5482 size_t jsonlen = dtrace_strlen((char *)json, size);
5483 uintptr_t elem = tupregs[1].dttk_value;
5484 size_t elemlen = dtrace_strlen((char *)elem, size);
5485
5486 char *dest = (char *)mstate->dtms_scratch_ptr;
5487 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5488 char *ee = elemlist;
5489 int nelems = 1;
5490 uintptr_t cur;
5491
5492 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5493 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5494 regs[rd] = 0;
5495 break;
5496 }
5497
5498 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5499 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5500 regs[rd] = 0;
5501 break;
5502 }
5503
5504 /*
5505 * Read the element selector and split it up into a packed list
5506 * of strings.
5507 */
5508 for (cur = elem; cur < elem + elemlen; cur++) {
5509 char cc = dtrace_load8(cur);
5510
5511 if (cur == elem && cc == '[') {
5512 /*
5513 * If the first element selector key is
5514 * actually an array index then ignore the
5515 * bracket.
5516 */
5517 continue;
5518 }
5519
5520 if (cc == ']')
5521 continue;
5522
5523 if (cc == '.' || cc == '[') {
5524 nelems++;
5525 cc = '\0';
5526 }
5527
5528 *ee++ = cc;
5529 }
5530 *ee++ = '\0';
5531
5532 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5533 nelems, dest)) != 0)
5534 mstate->dtms_scratch_ptr += jsonlen + 1;
5535 break;
5536 }
5537
5538 case DIF_SUBR_TOUPPER:
5539 case DIF_SUBR_TOLOWER: {
5540 uintptr_t src = tupregs[0].dttk_value;
5541 char *dest = (char *)mstate->dtms_scratch_ptr;
5542 char lower, upper, base, c;
5543 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5544 size_t len = dtrace_strlen((char*) src, size);
5545 size_t i = 0;
5546
5547 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
5548 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
5549 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
5550
5551 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5552 regs[rd] = 0;
5553 break;
5554 }
5555
5556 if (!DTRACE_INSCRATCH(mstate, size)) {
5557 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5558 regs[rd] = 0;
5559 break;
5560 }
5561
5562 for (i = 0; i < size - 1; ++i) {
5563 if ((c = dtrace_load8(src + i)) == '\0')
5564 break;
5565 if (c >= lower && c <= upper)
5566 c = base + (c - lower);
5567 dest[i] = c;
5568 }
5569
5570 ASSERT(i < size);
5571
5572 dest[i] = '\0';
5573 regs[rd] = (uintptr_t) dest;
5574 mstate->dtms_scratch_ptr += size;
5575
5576 break;
5577 }
5578 case DIF_SUBR_STRIP:
5579 if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
5580 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5581 break;
5582 }
5583 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5584 (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
5585 break;
5586
5587 #if defined(__APPLE__)
5588 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5589 if (!dtrace_priv_kernel(state)) {
5590 regs[rd] = 0;
5591 } else {
5592 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
5593 }
5594
5595 break;
5596 }
5597
5598 case DIF_SUBR_KDEBUG_TRACE: {
5599 uint32_t debugid;
5600 uintptr_t args[4] = {0};
5601 int i;
5602
5603 if (nargs < 2 || nargs > 5) {
5604 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5605 break;
5606 }
5607
5608 if (dtrace_destructive_disallow)
5609 return;
5610
5611 debugid = tupregs[0].dttk_value;
5612 for (i = 0; i < nargs - 1; i++)
5613 args[i] = tupregs[i + 1].dttk_value;
5614
5615 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5616
5617 break;
5618 }
5619
5620 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5621 if (nargs != 3) {
5622 break;
5623 }
5624
5625 if (dtrace_destructive_disallow)
5626 return;
5627
5628 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5629 uint32_t debugid = tupregs[0].dttk_value;
5630 uint64_t str_id = tupregs[1].dttk_value;
5631 uintptr_t src = tupregs[2].dttk_value;
5632 size_t lim;
5633 char buf[size];
5634 char* str = NULL;
5635
5636 if (src != (uintptr_t)0) {
5637 str = buf;
5638 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5639 break;
5640 }
5641 dtrace_strcpy((void*)src, buf, size);
5642 }
5643
5644 (void)kernel_debug_string(debugid, &str_id, str);
5645 regs[rd] = str_id;
5646
5647 break;
5648 }
5649 #endif
5650
5651 }
5652 }
5653
5654 /*
5655 * Emulate the execution of DTrace IR instructions specified by the given
5656 * DIF object. This function is deliberately void of assertions as all of
5657 * the necessary checks are handled by a call to dtrace_difo_validate().
5658 */
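
/*
 * For reference (editorial; <sys/dtrace.h> is authoritative for the
 * encoding): each 32-bit dif_instr_t packs an 8-bit opcode and three
 * 8-bit register fields, decoded by the DIF_INSTR_* accessors used
 * below:
 *
 *	op = (instr >> 24) & 0xff;	DIF_INSTR_OP(instr)
 *	r1 = (instr >> 16) & 0xff;	DIF_INSTR_R1(instr)
 *	r2 = (instr >>  8) & 0xff;	DIF_INSTR_R2(instr)
 *	rd =  instr        & 0xff;	DIF_INSTR_RD(instr)
 */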
5659 static uint64_t
5660 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5661 dtrace_vstate_t *vstate, dtrace_state_t *state)
5662 {
5663 const dif_instr_t *text = difo->dtdo_buf;
5664 const uint_t textlen = difo->dtdo_len;
5665 const char *strtab = difo->dtdo_strtab;
5666 const uint64_t *inttab = difo->dtdo_inttab;
5667
5668 uint64_t rval = 0;
5669 dtrace_statvar_t *svar;
5670 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5671 dtrace_difv_t *v;
5672 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5673 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5674
5675 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5676 uint64_t regs[DIF_DIR_NREGS];
5677 uint64_t *tmp;
5678
5679 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5680 int64_t cc_r;
5681 uint_t pc = 0, id, opc = 0;
5682 uint8_t ttop = 0;
5683 dif_instr_t instr;
5684 uint_t r1, r2, rd;
5685
5686 /*
5687 * We stash the current DIF object into the machine state: we need it
5688 * for subsequent access checking.
5689 */
5690 mstate->dtms_difo = difo;
5691
5692 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5693
5694 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5695 opc = pc;
5696
5697 instr = text[pc++];
5698 r1 = DIF_INSTR_R1(instr);
5699 r2 = DIF_INSTR_R2(instr);
5700 rd = DIF_INSTR_RD(instr);
5701
5702 switch (DIF_INSTR_OP(instr)) {
5703 case DIF_OP_OR:
5704 regs[rd] = regs[r1] | regs[r2];
5705 break;
5706 case DIF_OP_XOR:
5707 regs[rd] = regs[r1] ^ regs[r2];
5708 break;
5709 case DIF_OP_AND:
5710 regs[rd] = regs[r1] & regs[r2];
5711 break;
5712 case DIF_OP_SLL:
5713 regs[rd] = regs[r1] << regs[r2];
5714 break;
5715 case DIF_OP_SRL:
5716 regs[rd] = regs[r1] >> regs[r2];
5717 break;
5718 case DIF_OP_SUB:
5719 regs[rd] = regs[r1] - regs[r2];
5720 break;
5721 case DIF_OP_ADD:
5722 regs[rd] = regs[r1] + regs[r2];
5723 break;
5724 case DIF_OP_MUL:
5725 regs[rd] = regs[r1] * regs[r2];
5726 break;
5727 case DIF_OP_SDIV:
5728 if (regs[r2] == 0) {
5729 regs[rd] = 0;
5730 *flags |= CPU_DTRACE_DIVZERO;
5731 } else {
5732 regs[rd] = (int64_t)regs[r1] /
5733 (int64_t)regs[r2];
5734 }
5735 break;
5736
5737 case DIF_OP_UDIV:
5738 if (regs[r2] == 0) {
5739 regs[rd] = 0;
5740 *flags |= CPU_DTRACE_DIVZERO;
5741 } else {
5742 regs[rd] = regs[r1] / regs[r2];
5743 }
5744 break;
5745
5746 case DIF_OP_SREM:
5747 if (regs[r2] == 0) {
5748 regs[rd] = 0;
5749 *flags |= CPU_DTRACE_DIVZERO;
5750 } else {
5751 regs[rd] = (int64_t)regs[r1] %
5752 (int64_t)regs[r2];
5753 }
5754 break;
5755
5756 case DIF_OP_UREM:
5757 if (regs[r2] == 0) {
5758 regs[rd] = 0;
5759 *flags |= CPU_DTRACE_DIVZERO;
5760 } else {
5761 regs[rd] = regs[r1] % regs[r2];
5762 }
5763 break;
5764
5765 case DIF_OP_NOT:
5766 regs[rd] = ~regs[r1];
5767 break;
5768 case DIF_OP_MOV:
5769 regs[rd] = regs[r1];
5770 break;
5771 case DIF_OP_CMP:
5772 cc_r = regs[r1] - regs[r2];
5773 cc_n = cc_r < 0;
5774 cc_z = cc_r == 0;
5775 cc_v = 0;
5776 cc_c = regs[r1] < regs[r2];
5777 break;
5778 case DIF_OP_TST:
5779 cc_n = cc_v = cc_c = 0;
5780 cc_z = regs[r1] == 0;
5781 break;
5782 case DIF_OP_BA:
5783 pc = DIF_INSTR_LABEL(instr);
5784 break;
5785 case DIF_OP_BE:
5786 if (cc_z)
5787 pc = DIF_INSTR_LABEL(instr);
5788 break;
5789 case DIF_OP_BNE:
5790 if (cc_z == 0)
5791 pc = DIF_INSTR_LABEL(instr);
5792 break;
5793 case DIF_OP_BG:
5794 if ((cc_z | (cc_n ^ cc_v)) == 0)
5795 pc = DIF_INSTR_LABEL(instr);
5796 break;
5797 case DIF_OP_BGU:
5798 if ((cc_c | cc_z) == 0)
5799 pc = DIF_INSTR_LABEL(instr);
5800 break;
5801 case DIF_OP_BGE:
5802 if ((cc_n ^ cc_v) == 0)
5803 pc = DIF_INSTR_LABEL(instr);
5804 break;
5805 case DIF_OP_BGEU:
5806 if (cc_c == 0)
5807 pc = DIF_INSTR_LABEL(instr);
5808 break;
5809 case DIF_OP_BL:
5810 if (cc_n ^ cc_v)
5811 pc = DIF_INSTR_LABEL(instr);
5812 break;
5813 case DIF_OP_BLU:
5814 if (cc_c)
5815 pc = DIF_INSTR_LABEL(instr);
5816 break;
5817 case DIF_OP_BLE:
5818 if (cc_z | (cc_n ^ cc_v))
5819 pc = DIF_INSTR_LABEL(instr);
5820 break;
5821 case DIF_OP_BLEU:
5822 if (cc_c | cc_z)
5823 pc = DIF_INSTR_LABEL(instr);
5824 break;
5825 case DIF_OP_RLDSB:
5826 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5827 *flags |= CPU_DTRACE_KPRIV;
5828 *illval = regs[r1];
5829 break;
5830 }
5831 /*FALLTHROUGH*/
5832 case DIF_OP_LDSB:
5833 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5834 break;
5835 case DIF_OP_RLDSH:
5836 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5837 *flags |= CPU_DTRACE_KPRIV;
5838 *illval = regs[r1];
5839 break;
5840 }
5841 /*FALLTHROUGH*/
5842 case DIF_OP_LDSH:
5843 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5844 break;
5845 case DIF_OP_RLDSW:
5846 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5847 *flags |= CPU_DTRACE_KPRIV;
5848 *illval = regs[r1];
5849 break;
5850 }
5851 /*FALLTHROUGH*/
5852 case DIF_OP_LDSW:
5853 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5854 break;
5855 case DIF_OP_RLDUB:
5856 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5857 *flags |= CPU_DTRACE_KPRIV;
5858 *illval = regs[r1];
5859 break;
5860 }
5861 /*FALLTHROUGH*/
5862 case DIF_OP_LDUB:
5863 regs[rd] = dtrace_load8(regs[r1]);
5864 break;
5865 case DIF_OP_RLDUH:
5866 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5867 *flags |= CPU_DTRACE_KPRIV;
5868 *illval = regs[r1];
5869 break;
5870 }
5871 /*FALLTHROUGH*/
5872 case DIF_OP_LDUH:
5873 regs[rd] = dtrace_load16(regs[r1]);
5874 break;
5875 case DIF_OP_RLDUW:
5876 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5877 *flags |= CPU_DTRACE_KPRIV;
5878 *illval = regs[r1];
5879 break;
5880 }
5881 /*FALLTHROUGH*/
5882 case DIF_OP_LDUW:
5883 regs[rd] = dtrace_load32(regs[r1]);
5884 break;
5885 case DIF_OP_RLDX:
5886 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5887 *flags |= CPU_DTRACE_KPRIV;
5888 *illval = regs[r1];
5889 break;
5890 }
5891 /*FALLTHROUGH*/
5892 case DIF_OP_LDX:
5893 regs[rd] = dtrace_load64(regs[r1]);
5894 break;
5895 /*
5896 * Darwin: a 32-bit kernel may fetch from a 64-bit user process,
5897 * so do not cast regs to uintptr_t in the user-load opcodes:
5898 * DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB,
5899 * DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX.
5900 */
5901 case DIF_OP_ULDSB:
5902 regs[rd] = (int8_t)
5903 dtrace_fuword8(regs[r1]);
5904 break;
5905 case DIF_OP_ULDSH:
5906 regs[rd] = (int16_t)
5907 dtrace_fuword16(regs[r1]);
5908 break;
5909 case DIF_OP_ULDSW:
5910 regs[rd] = (int32_t)
5911 dtrace_fuword32(regs[r1]);
5912 break;
5913 case DIF_OP_ULDUB:
5914 regs[rd] =
5915 dtrace_fuword8(regs[r1]);
5916 break;
5917 case DIF_OP_ULDUH:
5918 regs[rd] =
5919 dtrace_fuword16(regs[r1]);
5920 break;
5921 case DIF_OP_ULDUW:
5922 regs[rd] =
5923 dtrace_fuword32(regs[r1]);
5924 break;
5925 case DIF_OP_ULDX:
5926 regs[rd] =
5927 dtrace_fuword64(regs[r1]);
5928 break;
5929 case DIF_OP_RET:
5930 rval = regs[rd];
5931 pc = textlen;
5932 break;
5933 case DIF_OP_NOP:
5934 break;
5935 case DIF_OP_SETX:
5936 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5937 break;
5938 case DIF_OP_SETS:
5939 regs[rd] = (uint64_t)(uintptr_t)
5940 (strtab + DIF_INSTR_STRING(instr));
5941 break;
5942 case DIF_OP_SCMP: {
5943 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5944 uintptr_t s1 = regs[r1];
5945 uintptr_t s2 = regs[r2];
5946 size_t lim1 = sz, lim2 = sz;
5947
5948 if (s1 != 0 &&
5949 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5950 break;
5951 if (s2 != 0 &&
5952 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5953 break;
5954
5955 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5956 MIN(lim1, lim2));
5957
5958 cc_n = cc_r < 0;
5959 cc_z = cc_r == 0;
5960 cc_v = cc_c = 0;
5961 break;
5962 }
5963 case DIF_OP_LDGA:
5964 regs[rd] = dtrace_dif_variable(mstate, state,
5965 r1, regs[r2]);
5966 break;
5967 case DIF_OP_LDGS:
5968 id = DIF_INSTR_VAR(instr);
5969
5970 if (id >= DIF_VAR_OTHER_UBASE) {
5971 uintptr_t a;
5972
5973 id -= DIF_VAR_OTHER_UBASE;
5974 svar = vstate->dtvs_globals[id];
5975 ASSERT(svar != NULL);
5976 v = &svar->dtsv_var;
5977
5978 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5979 regs[rd] = svar->dtsv_data;
5980 break;
5981 }
5982
5983 a = (uintptr_t)svar->dtsv_data;
5984
5985 if (*(uint8_t *)a == UINT8_MAX) {
5986 /*
5987 * If the 0th byte is set to UINT8_MAX
5988 * then this is to be treated as a
5989 * reference to a NULL variable.
5990 */
5991 regs[rd] = 0;
5992 } else {
5993 regs[rd] = a + sizeof (uint64_t);
5994 }
5995
5996 break;
5997 }
5998
5999 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6000 break;
6001
6002 case DIF_OP_STGS:
6003 id = DIF_INSTR_VAR(instr);
6004
6005 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6006 id -= DIF_VAR_OTHER_UBASE;
6007
6008 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6009 svar = vstate->dtvs_globals[id];
6010 ASSERT(svar != NULL);
6011 v = &svar->dtsv_var;
6012
6013 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6014 uintptr_t a = (uintptr_t)svar->dtsv_data;
6015 size_t lim;
6016
6017 ASSERT(a != 0);
6018 ASSERT(svar->dtsv_size != 0);
6019
6020 if (regs[rd] == 0) {
6021 *(uint8_t *)a = UINT8_MAX;
6022 break;
6023 } else {
6024 *(uint8_t *)a = 0;
6025 a += sizeof (uint64_t);
6026 }
6027 if (!dtrace_vcanload(
6028 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6029 &lim, mstate, vstate))
6030 break;
6031
6032 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6033 (void *)a, &v->dtdv_type, lim);
6034 break;
6035 }
6036
6037 svar->dtsv_data = regs[rd];
6038 break;
6039
6040 case DIF_OP_LDTA:
6041 /*
6042 * There are no DTrace built-in thread-local arrays at
6043 * present. This opcode is saved for future work.
6044 */
6045 *flags |= CPU_DTRACE_ILLOP;
6046 regs[rd] = 0;
6047 break;
6048
6049 case DIF_OP_LDLS:
6050 id = DIF_INSTR_VAR(instr);
6051
6052 if (id < DIF_VAR_OTHER_UBASE) {
6053 /*
6054 * For now, this has no meaning.
6055 */
6056 regs[rd] = 0;
6057 break;
6058 }
6059
6060 id -= DIF_VAR_OTHER_UBASE;
6061
6062 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6063 ASSERT(vstate->dtvs_locals != NULL);
6064 svar = vstate->dtvs_locals[id];
6065 ASSERT(svar != NULL);
6066 v = &svar->dtsv_var;
6067
6068 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6069 uintptr_t a = (uintptr_t)svar->dtsv_data;
6070 size_t sz = v->dtdv_type.dtdt_size;
6071
6072 sz += sizeof (uint64_t);
6073 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6074 a += CPU->cpu_id * sz;
6075
6076 if (*(uint8_t *)a == UINT8_MAX) {
6077 /*
6078 * If the 0th byte is set to UINT8_MAX
6079 * then this is to be treated as a
6080 * reference to a NULL variable.
6081 */
6082 regs[rd] = 0;
6083 } else {
6084 regs[rd] = a + sizeof (uint64_t);
6085 }
6086
6087 break;
6088 }
6089
6090 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6091 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6092 regs[rd] = tmp[CPU->cpu_id];
6093 break;
6094
6095 case DIF_OP_STLS:
6096 id = DIF_INSTR_VAR(instr);
6097
6098 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6099 id -= DIF_VAR_OTHER_UBASE;
6100 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6101 ASSERT(vstate->dtvs_locals != NULL);
6102 svar = vstate->dtvs_locals[id];
6103 ASSERT(svar != NULL);
6104 v = &svar->dtsv_var;
6105
6106 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6107 uintptr_t a = (uintptr_t)svar->dtsv_data;
6108 size_t sz = v->dtdv_type.dtdt_size;
6109 size_t lim;
6110
6111 sz += sizeof (uint64_t);
6112 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6113 a += CPU->cpu_id * sz;
6114
6115 if (regs[rd] == 0) {
6116 *(uint8_t *)a = UINT8_MAX;
6117 break;
6118 } else {
6119 *(uint8_t *)a = 0;
6120 a += sizeof (uint64_t);
6121 }
6122
6123 if (!dtrace_vcanload(
6124 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6125 &lim, mstate, vstate))
6126 break;
6127
6128 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6129 (void *)a, &v->dtdv_type, lim);
6130 break;
6131 }
6132
6133 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6134 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6135 tmp[CPU->cpu_id] = regs[rd];
6136 break;
6137
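/*
 * DIF_OP_LDTS/DIF_OP_STTS implement D thread-local variables
 * (e.g. "self->x" in a D script): the dynamic-variable key is the
 * (variable id, thread key) tuple built just below.
 */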
6138 case DIF_OP_LDTS: {
6139 dtrace_dynvar_t *dvar;
6140 dtrace_key_t *key;
6141
6142 id = DIF_INSTR_VAR(instr);
6143 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6144 id -= DIF_VAR_OTHER_UBASE;
6145 v = &vstate->dtvs_tlocals[id];
6146
6147 key = &tupregs[DIF_DTR_NREGS];
6148 key[0].dttk_value = (uint64_t)id;
6149 key[0].dttk_size = 0;
6150 DTRACE_TLS_THRKEY(key[1].dttk_value);
6151 key[1].dttk_size = 0;
6152
6153 dvar = dtrace_dynvar(dstate, 2, key,
6154 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6155 mstate, vstate);
6156
6157 if (dvar == NULL) {
6158 regs[rd] = 0;
6159 break;
6160 }
6161
6162 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6163 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6164 } else {
6165 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6166 }
6167
6168 break;
6169 }
6170
6171 case DIF_OP_STTS: {
6172 dtrace_dynvar_t *dvar;
6173 dtrace_key_t *key;
6174
6175 id = DIF_INSTR_VAR(instr);
6176 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6177 id -= DIF_VAR_OTHER_UBASE;
6178 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6179
6180 key = &tupregs[DIF_DTR_NREGS];
6181 key[0].dttk_value = (uint64_t)id;
6182 key[0].dttk_size = 0;
6183 DTRACE_TLS_THRKEY(key[1].dttk_value);
6184 key[1].dttk_size = 0;
6185 v = &vstate->dtvs_tlocals[id];
6186
6187 dvar = dtrace_dynvar(dstate, 2, key,
6188 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6189 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6190 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6191 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6192
6193 /*
6194 * Given that we're storing to thread-local data,
6195 * we need to flush our predicate cache.
6196 */
6197 dtrace_set_thread_predcache(current_thread(), 0);
6198
6199 if (dvar == NULL)
6200 break;
6201
6202 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6203 size_t lim;
6204
6205 if (!dtrace_vcanload(
6206 (void *)(uintptr_t)regs[rd],
6207 &v->dtdv_type, &lim, mstate, vstate))
6208 break;
6209
6210 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6211 dvar->dtdv_data, &v->dtdv_type, lim);
6212 } else {
6213 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6214 }
6215
6216 break;
6217 }
6218
6219 case DIF_OP_SRA:
6220 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6221 break;
6222
6223 case DIF_OP_CALL:
6224 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6225 regs, tupregs, ttop, mstate, state);
6226 break;
6227
6228 case DIF_OP_PUSHTR:
6229 if (ttop == DIF_DTR_NREGS) {
6230 *flags |= CPU_DTRACE_TUPOFLOW;
6231 break;
6232 }
6233
6234 if (r1 == DIF_TYPE_STRING) {
6235 /*
6236 * If this is a string type and the size is 0,
6237 * we'll use the system-wide default string
6238 * size. Note that we are _not_ looking at
6239 * the value of the DTRACEOPT_STRSIZE option;
6240 * had this been set, we would expect to have
6241 * a non-zero size value in the "pushtr".
6242 */
6243 tupregs[ttop].dttk_size =
6244 dtrace_strlen((char *)(uintptr_t)regs[rd],
6245 regs[r2] ? regs[r2] :
6246 dtrace_strsize_default) + 1;
6247 } else {
6248 if (regs[r2] > LONG_MAX) {
6249 *flags |= CPU_DTRACE_ILLOP;
6250 break;
6251 }
6252 tupregs[ttop].dttk_size = regs[r2];
6253 }
6254
6255 tupregs[ttop++].dttk_value = regs[rd];
6256 break;
6257
6258 case DIF_OP_PUSHTV:
6259 if (ttop == DIF_DTR_NREGS) {
6260 *flags |= CPU_DTRACE_TUPOFLOW;
6261 break;
6262 }
6263
6264 tupregs[ttop].dttk_value = regs[rd];
6265 tupregs[ttop++].dttk_size = 0;
6266 break;
6267
6268 case DIF_OP_POPTS:
6269 if (ttop != 0)
6270 ttop--;
6271 break;
6272
6273 case DIF_OP_FLUSHTS:
6274 ttop = 0;
6275 break;
6276
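/*
 * DIF_OP_LDGAA/DIF_OP_LDTAA implement loads of global and thread-local
 * associative arrays (e.g. "a[execname]" and "self->a[execname]" in D);
 * the tuple registers pushed by pushtr/pushtv above form the key.
 */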
6277 case DIF_OP_LDGAA:
6278 case DIF_OP_LDTAA: {
6279 dtrace_dynvar_t *dvar;
6280 dtrace_key_t *key = tupregs;
6281 uint_t nkeys = ttop;
6282
6283 id = DIF_INSTR_VAR(instr);
6284 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6285 id -= DIF_VAR_OTHER_UBASE;
6286
6287 key[nkeys].dttk_value = (uint64_t)id;
6288 key[nkeys++].dttk_size = 0;
6289
6290 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6291 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6292 key[nkeys++].dttk_size = 0;
6293 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6294 v = &vstate->dtvs_tlocals[id];
6295 } else {
6296 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6297 v = &vstate->dtvs_globals[id]->dtsv_var;
6298 }
6299
6300 dvar = dtrace_dynvar(dstate, nkeys, key,
6301 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6302 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6303 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6304
6305 if (dvar == NULL) {
6306 regs[rd] = 0;
6307 break;
6308 }
6309
6310 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6311 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6312 } else {
6313 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6314 }
6315
6316 break;
6317 }
6318
6319 case DIF_OP_STGAA:
6320 case DIF_OP_STTAA: {
6321 dtrace_dynvar_t *dvar;
6322 dtrace_key_t *key = tupregs;
6323 uint_t nkeys = ttop;
6324
6325 id = DIF_INSTR_VAR(instr);
6326 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6327 id -= DIF_VAR_OTHER_UBASE;
6328
6329 key[nkeys].dttk_value = (uint64_t)id;
6330 key[nkeys++].dttk_size = 0;
6331
6332 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6333 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6334 key[nkeys++].dttk_size = 0;
6335 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6336 v = &vstate->dtvs_tlocals[id];
6337 } else {
6338 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6339 v = &vstate->dtvs_globals[id]->dtsv_var;
6340 }
6341
6342 dvar = dtrace_dynvar(dstate, nkeys, key,
6343 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6344 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6345 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6346 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6347
6348 if (dvar == NULL)
6349 break;
6350
6351 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6352 size_t lim;
6353
6354 if (!dtrace_vcanload(
6355 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6356 &lim, mstate, vstate))
6357 break;
6358
6359 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6360 dvar->dtdv_data, &v->dtdv_type, lim);
6361 } else {
6362 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6363 }
6364
6365 break;
6366 }
6367
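/*
 * DIF_OP_ALLOCS backs D's alloca() subroutine: it carves regs[r1]
 * bytes out of per-CPU scratch space, 8-byte aligned and zeroed.
 */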
6368 case DIF_OP_ALLOCS: {
6369 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6370 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6371
6372 /*
6373 * Rounding up the user allocation size could have
6374 * overflowed large, bogus allocations (like -1ULL) to
6375 * 0.
6376 */
6377 if (size < regs[r1] ||
6378 !DTRACE_INSCRATCH(mstate, size)) {
6379 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6380 regs[rd] = 0;
6381 break;
6382 }
6383
6384 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6385 mstate->dtms_scratch_ptr += size;
6386 regs[rd] = ptr;
6387 break;
6388 }
6389
6390 case DIF_OP_COPYS:
6391 if (!dtrace_canstore(regs[rd], regs[r2],
6392 mstate, vstate)) {
6393 *flags |= CPU_DTRACE_BADADDR;
6394 *illval = regs[rd];
6395 break;
6396 }
6397
6398 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6399 break;
6400
6401 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6402 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6403 break;
6404
6405 case DIF_OP_STB:
6406 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6407 *flags |= CPU_DTRACE_BADADDR;
6408 *illval = regs[rd];
6409 break;
6410 }
6411 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6412 break;
6413
6414 case DIF_OP_STH:
6415 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6416 *flags |= CPU_DTRACE_BADADDR;
6417 *illval = regs[rd];
6418 break;
6419 }
6420 if (regs[rd] & 1) {
6421 *flags |= CPU_DTRACE_BADALIGN;
6422 *illval = regs[rd];
6423 break;
6424 }
6425 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6426 break;
6427
6428 case DIF_OP_STW:
6429 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6430 *flags |= CPU_DTRACE_BADADDR;
6431 *illval = regs[rd];
6432 break;
6433 }
6434 if (regs[rd] & 3) {
6435 *flags |= CPU_DTRACE_BADALIGN;
6436 *illval = regs[rd];
6437 break;
6438 }
6439 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6440 break;
6441
6442 case DIF_OP_STX:
6443 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6444 *flags |= CPU_DTRACE_BADADDR;
6445 *illval = regs[rd];
6446 break;
6447 }
6448
6449 /*
6450 * Darwin: kmem_zalloc() buffers allocated by dtrace_difo_init()
6451 * are only guaranteed 4-byte alignment, so check that instead of 8.
6452 */
6453 if (regs[rd] & 3) {
6454 *flags |= CPU_DTRACE_BADALIGN;
6455 *illval = regs[rd];
6456 break;
6457 }
6458 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6459 break;
6460 case DIF_OP_STRIP:
6461 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6462 (void*)regs[r1], r2);
6463 break;
6464 }
6465 }
6466
6467 if (!(*flags & CPU_DTRACE_FAULT))
6468 return (rval);
6469
6470 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6471 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6472
6473 return (0);
6474 }
6475
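/*
 * Implements the destructive breakpoint() D action, which drops into
 * the kernel debugger via debug_enter(). Illustrative use from a
 * consumer with destructive permission (the probe specification is a
 * placeholder):
 *
 *	dtrace -w -n 'fbt::somefunc:entry { breakpoint(); }'
 */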
6476 static void
6477 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6478 {
6479 dtrace_probe_t *probe = ecb->dte_probe;
6480 dtrace_provider_t *prov = probe->dtpr_provider;
6481 char c[DTRACE_FULLNAMELEN + 80], *str;
6482 const char *msg = "dtrace: breakpoint action at probe ";
6483 const char *ecbmsg = " (ecb ";
6484 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6485 uintptr_t val = (uintptr_t)ecb;
6486 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6487
6488 if (dtrace_destructive_disallow)
6489 return;
6490
6491 /*
6492 * It's impossible to be taking action on the NULL probe.
6493 */
6494 ASSERT(probe != NULL);
6495
6496 /*
6497 * This is a poor man's (destitute man's?) sprintf(): we want to
6498 * print the provider name, module name, function name and name of
6499 * the probe, along with the hex address of the ECB with the breakpoint
6500 * action -- all of which we must place in the character buffer by
6501 * hand.
6502 */
6503 while (*msg != '\0')
6504 c[i++] = *msg++;
6505
6506 for (str = prov->dtpv_name; *str != '\0'; str++)
6507 c[i++] = *str;
6508 c[i++] = ':';
6509
6510 for (str = probe->dtpr_mod; *str != '\0'; str++)
6511 c[i++] = *str;
6512 c[i++] = ':';
6513
6514 for (str = probe->dtpr_func; *str != '\0'; str++)
6515 c[i++] = *str;
6516 c[i++] = ':';
6517
6518 for (str = probe->dtpr_name; *str != '\0'; str++)
6519 c[i++] = *str;
6520
6521 while (*ecbmsg != '\0')
6522 c[i++] = *ecbmsg++;
6523
6524 while (shift >= 0) {
6525 mask = (uintptr_t)0xf << shift;
6526
6527 if (val >= ((uintptr_t)1 << shift))
6528 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6529 shift -= 4;
6530 }
6531
6532 c[i++] = ')';
6533 c[i] = '\0';
6534
6535 debug_enter(c);
6536 }
6537
6538 static void
6539 dtrace_action_panic(dtrace_ecb_t *ecb)
6540 {
6541 dtrace_probe_t *probe = ecb->dte_probe;
6542
6543 /*
6544 * It's impossible to be taking action on the NULL probe.
6545 */
6546 ASSERT(probe != NULL);
6547
6548 if (dtrace_destructive_disallow)
6549 return;
6550
6551 if (dtrace_panicked != NULL)
6552 return;
6553
6554 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6555 return;
6556
6557 /*
6558 * We won the right to panic. (We want to be sure that only one
6559 * thread calls panic() from dtrace_probe(), and that panic() is
6560 * called exactly once.)
6561 */
6562 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6563 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6564 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6565
6566 /*
6567 * APPLE NOTE: this was for an old Mac OS X debug feature
6568 * allowing a return from panic(). Revisit someday.
6569 */
6570 dtrace_panicked = NULL;
6571 }
6572
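/*
 * Implements the destructive raise() D action, which sends the
 * specified signal to the current process when it next leaves the
 * kernel -- e.g. (illustrative):
 *
 *	syscall::write:entry /pid == $target/ { raise(SIGINT); }
 */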
6573 static void
6574 dtrace_action_raise(uint64_t sig)
6575 {
6576 if (dtrace_destructive_disallow)
6577 return;
6578
6579 if (sig >= NSIG) {
6580 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6581 return;
6582 }
6583
6584 /*
6585 * raise() has a queue depth of 1 -- we ignore all subsequent
6586 * invocations of the raise() action.
6587 */
6588
6589 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6590
6591 if (uthread && uthread->t_dtrace_sig == 0) {
6592 uthread->t_dtrace_sig = sig;
6593 act_set_astbsd(current_thread());
6594 }
6595 }
6596
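/*
 * Implements the destructive stop() D action: the current process is
 * marked for suspension when it next leaves the kernel -- e.g.
 * (illustrative) "proc:::exec-success { stop(); }".
 */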
6597 static void
6598 dtrace_action_stop(void)
6599 {
6600 if (dtrace_destructive_disallow)
6601 return;
6602
6603 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6604 if (uthread) {
6605 /*
6606 * The currently running process will be suspended via task_suspend()
6607 * when it next leaves the kernel.
6608 */
6609 uthread->t_dtrace_stop = 1;
6610 act_set_astbsd(current_thread());
6611 }
6612 }
6613
6614
6615 /*
6616 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6617 * Both activate only when the currently running process next leaves the
6618 * kernel.
6619 */
6620 static void
6621 dtrace_action_pidresume(uint64_t pid)
6622 {
6623 if (dtrace_destructive_disallow)
6624 return;
6625
6626 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6627 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6628 return;
6629 }
6630 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6631
6632 /*
6633 * When the currently running process leaves the kernel, it attempts to
6634 * task_resume the process (denoted by pid), if that pid appears to have
6635 * been stopped by dtrace_action_stop().
6636 * The currently running process has a pidresume() queue depth of 1 --
6637 * subsequent invocations of the pidresume() action are ignored.
6638 */
6639
6640 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6641 uthread->t_dtrace_resumepid = pid;
6642 act_set_astbsd(current_thread());
6643 }
6644 }
6645
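/*
 * Implements the destructive chill() D action, which spins in probe
 * context for the given number of nanoseconds -- e.g. (illustrative)
 * "chill(500000)" to dwell for 500 microseconds -- bounded by the
 * dtrace_chill_interval and dtrace_chill_max tunables checked below.
 */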
6646 static void
6647 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6648 {
6649 hrtime_t now;
6650 volatile uint16_t *flags;
6651 dtrace_cpu_t *cpu = CPU;
6652
6653 if (dtrace_destructive_disallow)
6654 return;
6655
6656 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6657
6658 now = dtrace_gethrtime();
6659
6660 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6661 /*
6662 * We need to advance the mark to the current time.
6663 */
6664 cpu->cpu_dtrace_chillmark = now;
6665 cpu->cpu_dtrace_chilled = 0;
6666 }
6667
6668 /*
6669 * Now check to see if the requested chill time would take us over
6670 * the maximum amount of time allowed in the chill interval. (Or
6671 * worse, if the calculation itself induces overflow.)
6672 */
6673 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6674 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6675 *flags |= CPU_DTRACE_ILLOP;
6676 return;
6677 }
6678
6679 while (dtrace_gethrtime() - now < val)
6680 continue;
6681
6682 /*
6683 * Normally, we assure that the value of the variable "timestamp" does
6684 * not change within an ECB. The presence of chill() represents an
6685 * exception to this rule, however.
6686 */
6687 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6688 cpu->cpu_dtrace_chilled += val;
6689 }
6690
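/*
 * Slow path for the ustack()/jstack() D actions when string space has
 * been requested (e.g. "ustack(20, 512)" -- illustrative): the user
 * stack is walked and the per-process USTACK helper (if any) is
 * invoked to translate each frame into a symbol string.
 */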
6691 static void
6692 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6693 uint64_t *buf, uint64_t arg)
6694 {
6695 int nframes = DTRACE_USTACK_NFRAMES(arg);
6696 int strsize = DTRACE_USTACK_STRSIZE(arg);
6697 uint64_t *pcs = &buf[1], *fps;
6698 char *str = (char *)&pcs[nframes];
6699 int size, offs = 0, i, j;
6700 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6701 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6702 char *sym;
6703
6704 /*
6705 * Should be taking a faster path if string space has not been
6706 * allocated.
6707 */
6708 ASSERT(strsize != 0);
6709
6710 /*
6711 * We will first allocate some temporary space for the frame pointers.
6712 */
6713 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6714 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6715 (nframes * sizeof (uint64_t));
6716
6717 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6718 /*
6719 * Not enough room for our frame pointers -- need to indicate
6720 * that we ran out of scratch space.
6721 */
6722 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6723 return;
6724 }
6725
6726 mstate->dtms_scratch_ptr += size;
6727 saved = mstate->dtms_scratch_ptr;
6728
6729 /*
6730 * Now get a stack with both program counters and frame pointers.
6731 */
6732 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6733 dtrace_getufpstack(buf, fps, nframes + 1);
6734 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6735
6736 /*
6737 * If that faulted, we're cooked.
6738 */
6739 if (*flags & CPU_DTRACE_FAULT)
6740 goto out;
6741
6742 /*
6743 * Now we want to walk up the stack, calling the USTACK helper. For
6744 * each iteration, we restore the scratch pointer.
6745 */
6746 for (i = 0; i < nframes; i++) {
6747 mstate->dtms_scratch_ptr = saved;
6748
6749 if (offs >= strsize)
6750 break;
6751
6752 sym = (char *)(uintptr_t)dtrace_helper(
6753 DTRACE_HELPER_ACTION_USTACK,
6754 mstate, state, pcs[i], fps[i]);
6755
6756 /*
6757 * If we faulted while running the helper, we're going to
6758 * clear the fault and null out the corresponding string.
6759 */
6760 if (*flags & CPU_DTRACE_FAULT) {
6761 *flags &= ~CPU_DTRACE_FAULT;
6762 str[offs++] = '\0';
6763 continue;
6764 }
6765
6766 if (sym == NULL) {
6767 str[offs++] = '\0';
6768 continue;
6769 }
6770
6771 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6772
6773 /*
6774 * Now copy in the string that the helper returned to us.
6775 */
6776 for (j = 0; offs + j < strsize; j++) {
6777 if ((str[offs + j] = sym[j]) == '\0')
6778 break;
6779 }
6780
6781 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6782
6783 offs += j + 1;
6784 }
6785
6786 if (offs >= strsize) {
6787 /*
6788 * If we didn't have room for all of the strings, we don't
6789 * abort processing -- this needn't be a fatal error -- but we
6790 * still want to increment a counter (dts_stkstroverflows) to
6791 * allow this condition to be warned about. (If this is from
6792 * a jstack() action, it is easily tuned via jstackstrsize.)
6793 */
6794 dtrace_error(&state->dts_stkstroverflows);
6795 }
6796
6797 while (offs < strsize)
6798 str[offs++] = '\0';
6799
6800 out:
6801 mstate->dtms_scratch_ptr = old;
6802 }
6803
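/*
 * Copies a by-reference record (a string or a tracemem()-style byte
 * blob, from kernel or user memory according to dtkind) into the
 * buffer at *valoffsp. Called from dtrace_probe() for actions whose
 * return type carries DIF_TF_BYREF or DIF_TF_BYUREF.
 */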
6804 static void
6805 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6806 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6807 {
6808 volatile uint16_t *flags;
6809 uint64_t val = *valp;
6810 size_t valoffs = *valoffsp;
6811
6812 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6813 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6814
6815 /*
6816 * If this is a string, we're going to only load until we find the zero
6817 * byte -- after which we'll store zero bytes.
6818 */
6819 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6820 char c = '\0' + 1;
6821 size_t s;
6822
6823 for (s = 0; s < size; s++) {
6824 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6825 c = dtrace_load8(val++);
6826 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6827 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6828 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6829 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6830 if (*flags & CPU_DTRACE_FAULT)
6831 break;
6832 }
6833
6834 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6835
6836 if (c == '\0' && intuple)
6837 break;
6838 }
6839 } else {
6840 uint8_t c;
6841 while (valoffs < end) {
6842 if (dtkind == DIF_TF_BYREF) {
6843 c = dtrace_load8(val++);
6844 } else if (dtkind == DIF_TF_BYUREF) {
6845 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6846 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6847 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6848 if (*flags & CPU_DTRACE_FAULT)
6849 break;
6850 }
6851
6852 DTRACE_STORE(uint8_t, tomax,
6853 valoffs++, c);
6854 }
6855 }
6856
6857 *valp = val;
6858 *valoffsp = valoffs;
6859 }
6860
6861 /*
6862 * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6863 * defined, we also assert that we are not recursing unless the probe ID is an
6864 * error probe.
6865 */
6866 static dtrace_icookie_t
6867 dtrace_probe_enter(dtrace_id_t id)
6868 {
6869 thread_t thread = current_thread();
6870 uint16_t inprobe;
6871
6872 dtrace_icookie_t cookie;
6873
6874 cookie = dtrace_interrupt_disable();
6875
6876 /*
6877 * Unless this is an ERROR probe, we are not allowed to recurse in
6878 * dtrace_probe(). Recursing into a DTrace probe usually means that a
6879 * function is instrumented that should not have been instrumented or
6880 * that the ordering guarantee of the records will be violated,
6881 * resulting in unexpected output. If there is an exception to this
6882 * assertion, a new case should be added.
6883 */
6884 inprobe = dtrace_get_thread_inprobe(thread);
6885 VERIFY(inprobe == 0 ||
6886 id == dtrace_probeid_error);
6887 ASSERT(inprobe < UINT16_MAX);
6888 dtrace_set_thread_inprobe(thread, inprobe + 1);
6889
6890 return (cookie);
6891 }
6892
6893 /*
6894 * Clears the per-thread inprobe flag and enables interrupts.
6895 */
6896 static void
6897 dtrace_probe_exit(dtrace_icookie_t cookie)
6898 {
6899 thread_t thread = current_thread();
6900 uint16_t inprobe = dtrace_get_thread_inprobe(thread);
6901
6902 ASSERT(inprobe > 0);
6903 dtrace_set_thread_inprobe(thread, inprobe - 1);
6904
6905 #if INTERRUPT_MASKED_DEBUG
6906 ml_spin_debug_reset(thread);
6907 #endif /* INTERRUPT_MASKED_DEBUG */
6908
6909 dtrace_interrupt_enable(cookie);
6910 }
6911
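/*
 * A provider fires a probe by calling dtrace_probe() with the id it was
 * given by dtrace_probe_create() and up to five arguments. A minimal
 * sketch (hypothetical provider code):
 *
 *	if (my_probe_enabled)
 *		dtrace_probe(my_probe_id, (uint64_t)arg, 0, 0, 0, 0);
 */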
6912 /*
6913 * If you're looking for the epicenter of DTrace, you just found it. This
6914 * is the function called by the provider to fire a probe -- from which all
6915 * subsequent probe-context DTrace activity emanates.
6916 */
6917 void
6918 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6919 uint64_t arg2, uint64_t arg3, uint64_t arg4)
6920 {
6921 processorid_t cpuid;
6922 dtrace_icookie_t cookie;
6923 dtrace_probe_t *probe;
6924 dtrace_mstate_t mstate;
6925 dtrace_ecb_t *ecb;
6926 dtrace_action_t *act;
6927 intptr_t offs;
6928 size_t size;
6929 int vtime, onintr;
6930 volatile uint16_t *flags;
6931 hrtime_t now;
6932
6933 cookie = dtrace_probe_enter(id);
6934 probe = dtrace_probes[id - 1];
6935 cpuid = CPU->cpu_id;
6936 onintr = CPU_ON_INTR(CPU);
6937
6938 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6939 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6940 /*
6941 * We have hit in the predicate cache; we know that
6942 * this predicate would evaluate to be false.
6943 */
6944 dtrace_probe_exit(cookie);
6945 return;
6946 }
6947
6948 if (panic_quiesce) {
6949 /*
6950 * We don't trace anything if we're panicking.
6951 */
6952 dtrace_probe_exit(cookie);
6953 return;
6954 }
6955
6956 #if !defined(__APPLE__)
6957 now = dtrace_gethrtime();
6958 vtime = dtrace_vtime_references != 0;
6959
6960 if (vtime && curthread->t_dtrace_start)
6961 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6962 #else
6963 /*
6964 * APPLE NOTE: The time spent entering DTrace and arriving
6965 * at this point is attributed to the current thread.
6966 * Instead it should accrue to DTrace. FIXME
6967 */
6968 vtime = dtrace_vtime_references != 0;
6969
6970 if (vtime)
6971 {
6972 int64_t dtrace_accum_time, recent_vtime;
6973 thread_t thread = current_thread();
6974
6975 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6976
6977 if (dtrace_accum_time >= 0) {
6978 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6979
6980 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6981
6982 dtrace_set_thread_vtime(thread, recent_vtime);
6983 }
6984 }
6985
6986 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6987 #endif /* __APPLE__ */
6988
6989 /*
6990 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
6991 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
6992 * However the provider has no access to ECB context, so passes
6993 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
6994 * Detect that here and cons up a viable state (from the probe_id).
6995 */
6996 if (dtrace_probeid_error == id && 0 == arg0) {
6997 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6998 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6999 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7000
7001 if (NULL != ftp_ecb) {
7002 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7003
7004 arg0 = (uint64_t)(uintptr_t)ftp_state;
7005 arg1 = ftp_ecb->dte_epid;
7006 /*
7007 * args[2-4] established by caller.
7008 */
7009 ftp_state->dts_arg_error_illval = -1; /* arg5 */
7010 }
7011 }
7012
7013 mstate.dtms_difo = NULL;
7014 mstate.dtms_probe = probe;
7015 mstate.dtms_strtok = 0;
7016 mstate.dtms_arg[0] = arg0;
7017 mstate.dtms_arg[1] = arg1;
7018 mstate.dtms_arg[2] = arg2;
7019 mstate.dtms_arg[3] = arg3;
7020 mstate.dtms_arg[4] = arg4;
7021
7022 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7023
7024 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7025 dtrace_predicate_t *pred = ecb->dte_predicate;
7026 dtrace_state_t *state = ecb->dte_state;
7027 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7028 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7029 dtrace_vstate_t *vstate = &state->dts_vstate;
7030 dtrace_provider_t *prov = probe->dtpr_provider;
7031 uint64_t tracememsize = 0;
7032 int committed = 0;
7033 caddr_t tomax;
7034
7035 /*
7036 * A little subtlety with the following (seemingly innocuous)
7037 * declaration of the automatic 'val': by looking at the
7038 * code, you might think that it could be declared in the
7039 * action processing loop, below. (That is, it's only used in
7040 * the action processing loop.) However, it must be declared
7041 * out of that scope because in the case of DIF expression
7042 * arguments to aggregating actions, one iteration of the
7043 * action loop will use the last iteration's value.
7044 */
7045 uint64_t val = 0;
7050
7051 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7052 *flags &= ~CPU_DTRACE_ERROR;
7053
7054 if (prov == dtrace_provider) {
7055 /*
7056 * If dtrace itself is the provider of this probe,
7057 * we're only going to continue processing the ECB if
7058 * arg0 (the dtrace_state_t) is equal to the ECB's
7059 * creating state. (This prevents disjoint consumers
7060 * from seeing one another's metaprobes.)
7061 */
7062 if (arg0 != (uint64_t)(uintptr_t)state)
7063 continue;
7064 }
7065
7066 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7067 /*
7068 * We're not currently active. If our provider isn't
7069 * the dtrace pseudo provider, we're not interested.
7070 */
7071 if (prov != dtrace_provider)
7072 continue;
7073
7074 /*
7075 * Now we must further check if we are in the BEGIN
7076 * probe. If we are, we will only continue processing
7077 * if we're still in WARMUP -- if one BEGIN enabling
7078 * has invoked the exit() action, we don't want to
7079 * evaluate subsequent BEGIN enablings.
7080 */
7081 if (probe->dtpr_id == dtrace_probeid_begin &&
7082 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7083 ASSERT(state->dts_activity ==
7084 DTRACE_ACTIVITY_DRAINING);
7085 continue;
7086 }
7087 }
7088
7089 if (ecb->dte_cond) {
7090 /*
7091 * If the dte_cond bits indicate that this
7092 * consumer is only allowed to see user-mode firings
7093 * of this probe, call the provider's dtps_usermode()
7094 * entry point to check that the probe was fired
7095 * while in a user context. Skip this ECB if that's
7096 * not the case.
7097 */
7098 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7099 prov->dtpv_pops.dtps_usermode &&
7100 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7101 probe->dtpr_id, probe->dtpr_arg) == 0)
7102 continue;
7103
7104 /*
7105 * This is more subtle than it looks. We have to be
7106 * absolutely certain that CRED() isn't going to
7107 * change out from under us so it's only legit to
7108 * examine that structure if we're in constrained
7109 situations. Currently, the only time we'll do this
7110 check is if a non-super-user has enabled the
7111 * profile or syscall providers -- providers that
7112 * allow visibility of all processes. For the
7113 * profile case, the check above will ensure that
7114 * we're examining a user context.
7115 */
7116 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7117 cred_t *cr;
7118 cred_t *s_cr =
7119 ecb->dte_state->dts_cred.dcr_cred;
7120 proc_t *proc;
7121 #pragma unused(proc) /* __APPLE__ */
7122
7123 ASSERT(s_cr != NULL);
7124
7125 /*
7126 * XXX this is hackish, but so is setting a variable
7127 * XXX in a McCarthy OR...
7128 */
7129 if ((cr = dtrace_CRED()) == NULL ||
7130 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
7131 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
7132 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
7133 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
7134 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
7135 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
7136 #if !defined(__APPLE__)
7137 (proc = ttoproc(curthread)) == NULL ||
7138 (proc->p_flag & SNOCD))
7139 #else
7140 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
7141 #endif /* __APPLE__ */
7142 continue;
7143 }
7144
7145 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7146 cred_t *cr;
7147 cred_t *s_cr =
7148 ecb->dte_state->dts_cred.dcr_cred;
7149 #pragma unused(cr, s_cr) /* __APPLE__ */
7150
7151 ASSERT(s_cr != NULL);
7152
7153 #if !defined(__APPLE__)
7154 if ((cr = CRED()) == NULL ||
7155 s_cr->cr_zone->zone_id !=
7156 cr->cr_zone->zone_id)
7157 continue;
7158 #else
7159 /* APPLE NOTE: Darwin doesn't do zones. */
7160 #endif /* __APPLE__ */
7161 }
7162 }
7163
7164 if (now - state->dts_alive > dtrace_deadman_timeout) {
7165 /*
7166 * We seem to be dead. Unless we (a) have kernel
7167 * destructive permissions, (b) have explicitly enabled
7168 * destructive actions and (c) destructive actions have
7169 * not been disabled, we're going to transition into
7170 * the KILLED state, from which no further processing
7171 * on this state will be performed.
7172 */
7173 if (!dtrace_priv_kernel_destructive(state) ||
7174 !state->dts_cred.dcr_destructive ||
7175 dtrace_destructive_disallow) {
7176 void *activity = &state->dts_activity;
7177 dtrace_activity_t current;
7178
7179 do {
7180 current = state->dts_activity;
7181 } while (dtrace_cas32(activity, current,
7182 DTRACE_ACTIVITY_KILLED) != current);
7183
7184 continue;
7185 }
7186 }
7187
7188 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7189 ecb->dte_alignment, state, &mstate)) < 0)
7190 continue;
7191
7192 tomax = buf->dtb_tomax;
7193 ASSERT(tomax != NULL);
7194
7195 /*
7196 * Build and store the record header corresponding to the ECB.
7197 */
7198 if (ecb->dte_size != 0) {
7199 dtrace_rechdr_t dtrh;
7200
7201 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7202 mstate.dtms_timestamp = dtrace_gethrtime();
7203 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7204 }
7205
7206 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7207
7208 dtrh.dtrh_epid = ecb->dte_epid;
7209 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7210 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7211 }
7212
7213 mstate.dtms_epid = ecb->dte_epid;
7214 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7215
7216 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7217 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7218 else
7219 mstate.dtms_access = 0;
7220
7221 if (pred != NULL) {
7222 dtrace_difo_t *dp = pred->dtp_difo;
7223 uint64_t rval;
7224
7225 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7226
7227 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7228 dtrace_cacheid_t cid = probe->dtpr_predcache;
7229
7230 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7231 /*
7232 * Update the predicate cache...
7233 */
7234 ASSERT(cid == pred->dtp_cacheid);
7235
7236 dtrace_set_thread_predcache(current_thread(), cid);
7237 }
7238
7239 continue;
7240 }
7241 }
7242
7243 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7244 act != NULL; act = act->dta_next) {
7245 size_t valoffs;
7246 dtrace_difo_t *dp;
7247 dtrace_recdesc_t *rec = &act->dta_rec;
7248
7249 size = rec->dtrd_size;
7250 valoffs = offs + rec->dtrd_offset;
7251
7252 if (DTRACEACT_ISAGG(act->dta_kind)) {
7253 uint64_t v = 0xbad;
7254 dtrace_aggregation_t *agg;
7255
7256 agg = (dtrace_aggregation_t *)act;
7257
7258 if ((dp = act->dta_difo) != NULL)
7259 v = dtrace_dif_emulate(dp,
7260 &mstate, vstate, state);
7261
7262 if (*flags & CPU_DTRACE_ERROR)
7263 continue;
7264
7265 /*
7266 * Note that we always pass the expression
7267 * value from the previous iteration of the
7268 * action loop. This value will only be used
7269 * if there is an expression argument to the
7270 * aggregating action, denoted by the
7271 * dtag_hasarg field.
7272 */
7273 dtrace_aggregate(agg, buf,
7274 offs, aggbuf, v, val);
7275 continue;
7276 }
7277
7278 switch (act->dta_kind) {
7279 case DTRACEACT_STOP:
7280 if (dtrace_priv_proc_destructive(state))
7281 dtrace_action_stop();
7282 continue;
7283
7284 case DTRACEACT_BREAKPOINT:
7285 if (dtrace_priv_kernel_destructive(state))
7286 dtrace_action_breakpoint(ecb);
7287 continue;
7288
7289 case DTRACEACT_PANIC:
7290 if (dtrace_priv_kernel_destructive(state))
7291 dtrace_action_panic(ecb);
7292 continue;
7293
7294 case DTRACEACT_STACK:
7295 if (!dtrace_priv_kernel(state))
7296 continue;
7297
7298 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7299 size / sizeof (pc_t), probe->dtpr_aframes,
7300 DTRACE_ANCHORED(probe) ? NULL :
7301 (uint32_t *)(uintptr_t)arg0);
7302 continue;
7303
7304 case DTRACEACT_JSTACK:
7305 case DTRACEACT_USTACK:
7306 if (!dtrace_priv_proc(state))
7307 continue;
7308
7309 /*
7310 * See comment in DIF_VAR_PID.
7311 */
7312 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7313 CPU_ON_INTR(CPU)) {
7314 int depth = DTRACE_USTACK_NFRAMES(
7315 rec->dtrd_arg) + 1;
7316
7317 dtrace_bzero((void *)(tomax + valoffs),
7318 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7319 + depth * sizeof (uint64_t));
7320
7321 continue;
7322 }
7323
7324 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7325 curproc->p_dtrace_helpers != NULL) {
7326 /*
7327 * This is the slow path -- we have
7328 * allocated string space, and we're
7329 * getting the stack of a process that
7330 * has helpers. Call into a separate
7331 * routine to perform this processing.
7332 */
7333 dtrace_action_ustack(&mstate, state,
7334 (uint64_t *)(tomax + valoffs),
7335 rec->dtrd_arg);
7336 continue;
7337 }
7338
7339 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7340 dtrace_getupcstack((uint64_t *)
7341 (tomax + valoffs),
7342 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7343 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7344 continue;
7345
7346 default:
7347 break;
7348 }
7349
7350 dp = act->dta_difo;
7351 ASSERT(dp != NULL);
7352
7353 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7354
7355 if (*flags & CPU_DTRACE_ERROR)
7356 continue;
7357
7358 switch (act->dta_kind) {
7359 case DTRACEACT_SPECULATE: {
7360 dtrace_rechdr_t *dtrh = NULL;
7361
7362 ASSERT(buf == &state->dts_buffer[cpuid]);
7363 buf = dtrace_speculation_buffer(state,
7364 cpuid, val);
7365
7366 if (buf == NULL) {
7367 *flags |= CPU_DTRACE_DROP;
7368 continue;
7369 }
7370
7371 offs = dtrace_buffer_reserve(buf,
7372 ecb->dte_needed, ecb->dte_alignment,
7373 state, NULL);
7374
7375 if (offs < 0) {
7376 *flags |= CPU_DTRACE_DROP;
7377 continue;
7378 }
7379
7380 tomax = buf->dtb_tomax;
7381 ASSERT(tomax != NULL);
7382
7383 if (ecb->dte_size == 0)
7384 continue;
7385
7386 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7387 dtrh = ((void *)(tomax + offs));
7388 dtrh->dtrh_epid = ecb->dte_epid;
7389
7390 /*
7391 * When the speculation is committed, all of
7392 * the records in the speculative buffer will
7393 * have their timestamps set to the commit
7394 * time. Until then, it is set to a sentinel
7395 * value, for debuggability.
7396 */
7397 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7398
7399 continue;
7400 }
7401
7402 case DTRACEACT_CHILL:
7403 if (dtrace_priv_kernel_destructive(state))
7404 dtrace_action_chill(&mstate, val);
7405 continue;
7406
7407 case DTRACEACT_RAISE:
7408 if (dtrace_priv_proc_destructive(state))
7409 dtrace_action_raise(val);
7410 continue;
7411
7412 case DTRACEACT_PIDRESUME: /* __APPLE__ */
7413 if (dtrace_priv_proc_destructive(state))
7414 dtrace_action_pidresume(val);
7415 continue;
7416
7417 case DTRACEACT_COMMIT:
7418 ASSERT(!committed);
7419
7420 /*
7421 * We need to commit our buffer state.
7422 */
7423 if (ecb->dte_size)
7424 buf->dtb_offset = offs + ecb->dte_size;
7425 buf = &state->dts_buffer[cpuid];
7426 dtrace_speculation_commit(state, cpuid, val);
7427 committed = 1;
7428 continue;
7429
7430 case DTRACEACT_DISCARD:
7431 dtrace_speculation_discard(state, cpuid, val);
7432 continue;
7433
7434 case DTRACEACT_DIFEXPR:
7435 case DTRACEACT_LIBACT:
7436 case DTRACEACT_PRINTF:
7437 case DTRACEACT_PRINTA:
7438 case DTRACEACT_SYSTEM:
7439 case DTRACEACT_FREOPEN:
7440 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
7441 case DTRACEACT_TRACEMEM:
7442 break;
7443
7444 case DTRACEACT_TRACEMEM_DYNSIZE:
7445 tracememsize = val;
7446 break;
7447
7448 case DTRACEACT_SYM:
7449 case DTRACEACT_MOD:
7450 if (!dtrace_priv_kernel(state))
7451 continue;
7452 break;
7453
7454 case DTRACEACT_USYM:
7455 case DTRACEACT_UMOD:
7456 case DTRACEACT_UADDR: {
7457 if (!dtrace_priv_proc(state))
7458 continue;
7459
7460 DTRACE_STORE(uint64_t, tomax,
7461 valoffs, (uint64_t)dtrace_proc_selfpid());
7462 DTRACE_STORE(uint64_t, tomax,
7463 valoffs + sizeof (uint64_t), val);
7464
7465 continue;
7466 }
7467
7468 case DTRACEACT_EXIT: {
7469 /*
7470 * For the exit action, we are going to attempt
7471 * to atomically set our activity to be
7472 * draining. If this fails (either because
7473 * another CPU has beat us to the exit action,
7474 * or because our current activity is something
7475 * other than ACTIVE or WARMUP), we will
7476 * continue. This assures that the exit action
7477 * can be successfully recorded at most once
7478 * when we're in the ACTIVE state. If we're
7479 * encountering the exit() action while in
7480 * COOLDOWN, however, we want to honor the new
7481 * status code. (We know that we're the only
7482 * thread in COOLDOWN, so there is no race.)
7483 */
7484 void *activity = &state->dts_activity;
7485 dtrace_activity_t current = state->dts_activity;
7486
7487 if (current == DTRACE_ACTIVITY_COOLDOWN)
7488 break;
7489
7490 if (current != DTRACE_ACTIVITY_WARMUP)
7491 current = DTRACE_ACTIVITY_ACTIVE;
7492
7493 if (dtrace_cas32(activity, current,
7494 DTRACE_ACTIVITY_DRAINING) != current) {
7495 *flags |= CPU_DTRACE_DROP;
7496 continue;
7497 }
7498
7499 break;
7500 }
7501
7502 default:
7503 ASSERT(0);
7504 }
7505
7506 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
7507 uintptr_t end = valoffs + size;
7508
7509 if (tracememsize != 0 &&
7510 valoffs + tracememsize < end)
7511 {
7512 end = valoffs + tracememsize;
7513 tracememsize = 0;
7514 }
7515
7516 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7517 !dtrace_vcanload((void *)(uintptr_t)val,
7518 &dp->dtdo_rtype, NULL, &mstate, vstate))
7519 {
7520 continue;
7521 }
7522
7523 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7524 &val, end, act->dta_intuple,
7525 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7526 DIF_TF_BYREF: DIF_TF_BYUREF);
7527
7528 continue;
7529 }
7530
7531 switch (size) {
7532 case 0:
7533 break;
7534
7535 case sizeof (uint8_t):
7536 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7537 break;
7538 case sizeof (uint16_t):
7539 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7540 break;
7541 case sizeof (uint32_t):
7542 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7543 break;
7544 case sizeof (uint64_t):
7545 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7546 break;
7547 default:
7548 /*
7549 * Any other size should have been returned by
7550 * reference, not by value.
7551 */
7552 ASSERT(0);
7553 break;
7554 }
7555 }
7556
7557 if (*flags & CPU_DTRACE_DROP)
7558 continue;
7559
7560 if (*flags & CPU_DTRACE_FAULT) {
7561 int ndx;
7562 dtrace_action_t *err;
7563
7564 buf->dtb_errors++;
7565
7566 if (probe->dtpr_id == dtrace_probeid_error) {
7567 /*
7568 * There's nothing we can do -- we had an
7569 * error on the error probe. We bump an
7570 * error counter to at least indicate that
7571 * this condition happened.
7572 */
7573 dtrace_error(&state->dts_dblerrors);
7574 continue;
7575 }
7576
7577 if (vtime) {
7578 /*
7579 * Before recursing on dtrace_probe(), we
7580 * need to explicitly clear out our start
7581 * time to prevent it from being accumulated
7582 * into t_dtrace_vtime.
7583 */
7584
7585 /*
7586 * Darwin sets the sign bit on t_dtrace_tracing
7587 * to suspend accumulation to it.
7588 */
7589 dtrace_set_thread_tracing(current_thread(),
7590 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7591
7592 }
7593
7594 /*
7595 * Iterate over the actions to figure out which action
7596 * we were processing when we experienced the error.
7597 * Note that act points _past_ the faulting action; if
7598 * act is ecb->dte_action, the fault was in the
7599 * predicate, if it's ecb->dte_action->dta_next it's
7600 * in action #1, and so on.
7601 */
7602 for (err = ecb->dte_action, ndx = 0;
7603 err != act; err = err->dta_next, ndx++)
7604 continue;
7605
7606 dtrace_probe_error(state, ecb->dte_epid, ndx,
7607 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7608 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7609 cpu_core[cpuid].cpuc_dtrace_illval);
7610
7611 continue;
7612 }
7613
7614 if (!committed)
7615 buf->dtb_offset = offs + ecb->dte_size;
7616 }
7617
7618 /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti
7619  * is attributed to the current thread. Instead it should accrue to DTrace. */
7620 if (vtime) {
7621 thread_t thread = current_thread();
7622 int64_t t = dtrace_get_thread_tracing(thread);
7623
7624 if (t >= 0) {
7625 /* Usual case, accumulate time spent here into t_dtrace_tracing */
7626 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7627 } else {
7628 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7629 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7630 }
7631 }
7632
7633 dtrace_probe_exit(cookie);
7634 }
7635
7636 /*
7637 * DTrace Probe Hashing Functions
7638 *
7639 * The functions in this section (and indeed, the functions in remaining
7640 * sections) are not _called_ from probe context. (Any exceptions to this are
7641 * marked with a "Note:".) Rather, they are called from elsewhere in the
7642 * DTrace framework to look-up probes in, add probes to and remove probes from
7643 * the DTrace probe hashes. (Each probe is hashed by each element of the
7644 * probe tuple -- allowing for fast lookups, regardless of what was
7645 * specified.)
7646 */
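/*
 * This is the classic PJW/ELF string hash; the tables below are always
 * sized to a power of two, so a hash value is mapped to a bucket by
 * masking with dth_mask rather than by a modulo.
 */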
7647 static uint_t
7648 dtrace_hash_str(const char *p)
7649 {
7650 unsigned int g;
7651 uint_t hval = 0;
7652
7653 while (*p) {
7654 hval = (hval << 4) + *p++;
7655 if ((g = (hval & 0xf0000000)) != 0)
7656 hval ^= g >> 24;
7657 hval &= ~g;
7658 }
7659 return (hval);
7660 }
7661
7662 static const char*
7663 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7664 {
7665 #pragma unused(offs)
7666 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7667 return probe->dtpr_provider->dtpv_name;
7668 }
7669
7670 static const char*
7671 dtrace_strkey_offset(void *elm, uintptr_t offs)
7672 {
7673 return ((char *)((uintptr_t)(elm) + offs));
7674 }
7675
7676 static const char*
7677 dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
7678 {
7679 return *((char **)((uintptr_t)(elm) + offs));
7680 }
7681
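/*
 * Creates a hash whose key is the string that 'func' returns for an
 * element. A sketch of a caller, patterned on the probe hashes created
 * elsewhere in this file (hashing a dtrace_probe_t by module name):
 *
 *	hash = dtrace_hash_create(dtrace_strkey_deref_offset,
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 */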
7682 static dtrace_hash_t *
7683 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7684 {
7685 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7686
7687 hash->dth_getstr = func;
7688 hash->dth_stroffs = arg;
7689 hash->dth_nextoffs = nextoffs;
7690 hash->dth_prevoffs = prevoffs;
7691
7692 hash->dth_size = 1;
7693 hash->dth_mask = hash->dth_size - 1;
7694
7695 hash->dth_tab = kmem_zalloc(hash->dth_size *
7696 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7697
7698 return (hash);
7699 }
7700
7701 /*
7702 * APPLE NOTE: dtrace_hash_destroy is not used.
7703 * It is called by dtrace_detach which is not
7704 * currently implemented. Revisit someday.
7705 */
7706 #if !defined(__APPLE__)
7707 static void
7708 dtrace_hash_destroy(dtrace_hash_t *hash)
7709 {
7710 #if DEBUG
7711 int i;
7712
7713 for (i = 0; i < hash->dth_size; i++)
7714 ASSERT(hash->dth_tab[i] == NULL);
7715 #endif
7716
7717 kmem_free(hash->dth_tab,
7718 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7719 kmem_free(hash, sizeof (dtrace_hash_t));
7720 }
7721 #endif /* __APPLE__ */
7722
7723 static void
7724 dtrace_hash_resize(dtrace_hash_t *hash)
7725 {
7726 int size = hash->dth_size, i, ndx;
7727 int new_size = hash->dth_size << 1;
7728 int new_mask = new_size - 1;
7729 dtrace_hashbucket_t **new_tab, *bucket, *next;
7730
7731 ASSERT((new_size & new_mask) == 0);
7732
7733 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7734
7735 for (i = 0; i < size; i++) {
7736 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7737 void *elm = bucket->dthb_chain;
7738
7739 ASSERT(elm != NULL);
7740 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7741
7742 next = bucket->dthb_next;
7743 bucket->dthb_next = new_tab[ndx];
7744 new_tab[ndx] = bucket;
7745 }
7746 }
7747
7748 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7749 hash->dth_tab = new_tab;
7750 hash->dth_size = new_size;
7751 hash->dth_mask = new_mask;
7752 }
7753
7754 static void
7755 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7756 {
7757 int hashval = DTRACE_HASHSTR(hash, new);
7758 int ndx = hashval & hash->dth_mask;
7759 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7760 void **nextp, **prevp;
7761
7762 for (; bucket != NULL; bucket = bucket->dthb_next) {
7763 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7764 goto add;
7765 }
7766
7767 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7768 dtrace_hash_resize(hash);
7769 dtrace_hash_add(hash, new);
7770 return;
7771 }
7772
7773 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7774 bucket->dthb_next = hash->dth_tab[ndx];
7775 hash->dth_tab[ndx] = bucket;
7776 hash->dth_nbuckets++;
7777
7778 add:
7779 nextp = DTRACE_HASHNEXT(hash, new);
7780 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7781 *nextp = bucket->dthb_chain;
7782
7783 if (bucket->dthb_chain != NULL) {
7784 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7785 ASSERT(*prevp == NULL);
7786 *prevp = new;
7787 }
7788
7789 bucket->dthb_chain = new;
7790 bucket->dthb_len++;
7791 }
7792
7793 static void *
7794 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7795 {
7796 int hashval = dtrace_hash_str(str);
7797 int ndx = hashval & hash->dth_mask;
7798 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7799
7800 for (; bucket != NULL; bucket = bucket->dthb_next) {
7801 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7802 return (bucket->dthb_chain);
7803 }
7804
7805 return (NULL);
7806 }
7807
7808 static dtrace_probe_t *
7809 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7810 {
7811 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7812 }
7813
7814 static int
7815 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7816 {
7817 int hashval = DTRACE_HASHSTR(hash, template);
7818 int ndx = hashval & hash->dth_mask;
7819 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7820
7821 for (; bucket != NULL; bucket = bucket->dthb_next) {
7822 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7823 return (bucket->dthb_len);
7824 }
7825
7826 return (0);
7827 }
7828
7829 static void
7830 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7831 {
7832 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7833 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7834
7835 void **prevp = DTRACE_HASHPREV(hash, elm);
7836 void **nextp = DTRACE_HASHNEXT(hash, elm);
7837
7838 /*
7839 * Find the bucket that we're removing this elm from.
7840 */
7841 for (; bucket != NULL; bucket = bucket->dthb_next) {
7842 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7843 break;
7844 }
7845
7846 ASSERT(bucket != NULL);
7847
7848 if (*prevp == NULL) {
7849 if (*nextp == NULL) {
7850 /*
7851 * The removed element was the only element on this
7852 * bucket; we need to remove the bucket.
7853 */
7854 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7855
7856 ASSERT(bucket->dthb_chain == elm);
7857 ASSERT(b != NULL);
7858
7859 if (b == bucket) {
7860 hash->dth_tab[ndx] = bucket->dthb_next;
7861 } else {
7862 while (b->dthb_next != bucket)
7863 b = b->dthb_next;
7864 b->dthb_next = bucket->dthb_next;
7865 }
7866
7867 ASSERT(hash->dth_nbuckets > 0);
7868 hash->dth_nbuckets--;
7869 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7870 return;
7871 }
7872
7873 bucket->dthb_chain = *nextp;
7874 } else {
7875 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7876 }
7877
7878 if (*nextp != NULL)
7879 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7880 }
7881
7882 /*
7883 * DTrace Utility Functions
7884 *
7885 * These are assorted utility functions that are _not_ called from probe context.
7886 */
7887 static int
7888 dtrace_badattr(const dtrace_attribute_t *a)
7889 {
7890 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7891 a->dtat_data > DTRACE_STABILITY_MAX ||
7892 a->dtat_class > DTRACE_CLASS_MAX);
7893 }
7894
7895 /*
7896 * Returns a dtrace-managed, reference-counted copy of a string,
7897 * deduplicating copies of the same string.
7898 * If the specified string is NULL, returns an empty string.
7899 */
7900 static char *
7901 dtrace_strref(const char *str)
7902 {
7903 dtrace_string_t *s = NULL;
7904 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7905
7906 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7907
7908 if (str == NULL)
7909 str = "";
7910
7911 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7912 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7913 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7914 continue;
7915 }
7916 ASSERT(s->dtst_refcount != UINT32_MAX);
7917 s->dtst_refcount++;
7918 return s->dtst_str;
7919 }
7920
7921 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
7922 s->dtst_refcount = 1;
7923 (void) strlcpy(s->dtst_str, str, bufsize);
7924
7925 dtrace_hash_add(dtrace_strings, s);
7926
7927 return s->dtst_str;
7928 }
7929
7930 static void
7931 dtrace_strunref(const char *str)
7932 {
7933 ASSERT(str != NULL);
7934 dtrace_string_t *s = NULL;
7935 size_t bufsize = strlen(str) + 1;
7936
7937 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7938
7939 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7940 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
7941 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7942 continue;
7943 }
7944 ASSERT(s->dtst_refcount != 0);
7945 s->dtst_refcount--;
7946 if (s->dtst_refcount == 0) {
7947 dtrace_hash_remove(dtrace_strings, s);
7948 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
7949 }
7950 return;
7951 }
7952 panic("attempt to unref non-existent string %s", str);
7953 }
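/*
 * A minimal usage sketch for the two routines above (hypothetical helper;
 * not part of the original source).  Every dtrace_strref() must eventually
 * be balanced by a dtrace_strunref() of the returned pointer, and both
 * must be called with dtrace_lock held.
 */
#if 0 /* illustrative sketch */
static void
example_strref_usage(const char *name)
{
	char *interned;

	lck_mtx_lock(&dtrace_lock);
	interned = dtrace_strref(name);		/* refcount >= 1 */
	/* ... use the interned, deduplicated copy ... */
	dtrace_strunref(interned);		/* drop the ref; may free */
	lck_mtx_unlock(&dtrace_lock);
}
#endif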
7954
7955 #define DTRACE_ISALPHA(c) \
7956 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
7957
7958 static int
7959 dtrace_badname(const char *s)
7960 {
7961 char c;
7962
7963 if (s == NULL || (c = *s++) == '\0')
7964 return (0);
7965
7966 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
7967 return (1);
7968
7969 while ((c = *s++) != '\0') {
7970 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
7971 c != '-' && c != '_' && c != '.' && c != '`')
7972 return (1);
7973 }
7974
7975 return (0);
7976 }
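/*
 * Examples of the name grammar enforced above (illustrative): a name must
 * begin with an alphabetic character, '-', '_' or '.', and may continue
 * with those characters, digits, or the '`' scoping character:
 *
 *	dtrace_badname("fbt") == 0		(valid)
 *	dtrace_badname("mach_kernel`foo") == 0	(valid; '`' is allowed
 *						 after the first character)
 *	dtrace_badname("9lives") == 1		(leading digit)
 *	dtrace_badname("bad name") == 1		(space not allowed)
 */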
7977
7978 static void
7979 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7980 {
7981 uint32_t priv;
7982
7983 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7984 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
7985 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
7986 }
7987 else {
7988 priv = DTRACE_PRIV_ALL;
7989 }
7990 *uidp = 0;
7991 *zoneidp = 0;
7992 } else {
7993 *uidp = crgetuid(cr);
7994 *zoneidp = crgetzoneid(cr);
7995
7996 priv = 0;
7997 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7998 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7999 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8000 priv |= DTRACE_PRIV_USER;
8001 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8002 priv |= DTRACE_PRIV_PROC;
8003 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8004 priv |= DTRACE_PRIV_OWNER;
8005 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8006 priv |= DTRACE_PRIV_ZONEOWNER;
8007 }
8008
8009 *privp = priv;
8010 }
8011
8012 #ifdef DTRACE_ERRDEBUG
8013 static void
8014 dtrace_errdebug(const char *str)
8015 {
8016 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8017 int occupied = 0;
8018
8019 lck_mtx_lock(&dtrace_errlock);
8020 dtrace_errlast = str;
8021 dtrace_errthread = (kthread_t *)current_thread();
8022
8023 while (occupied++ < DTRACE_ERRHASHSZ) {
8024 if (dtrace_errhash[hval].dter_msg == str) {
8025 dtrace_errhash[hval].dter_count++;
8026 goto out;
8027 }
8028
8029 if (dtrace_errhash[hval].dter_msg != NULL) {
8030 hval = (hval + 1) % DTRACE_ERRHASHSZ;
8031 continue;
8032 }
8033
8034 dtrace_errhash[hval].dter_msg = str;
8035 dtrace_errhash[hval].dter_count = 1;
8036 goto out;
8037 }
8038
8039 panic("dtrace: undersized error hash");
8040 out:
8041 lck_mtx_unlock(&dtrace_errlock);
8042 }
8043 #endif
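/*
 * The error hash above is a simple open-addressed table with linear
 * probing, keyed on the message pointer.  Illustrative walk-through: if
 * two distinct messages hash to the same initial slot, the second scans
 * forward (wrapping modulo DTRACE_ERRHASHSZ) until it finds its own entry
 * or an empty slot; only if all DTRACE_ERRHASHSZ slots are held by other
 * messages does the "undersized error hash" panic fire.
 */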
8044
8045 /*
8046 * DTrace Matching Functions
8047 *
8048 * These functions are used to match groups of probes, given some elements of
8049 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8050 */
8051 static int
8052 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8053 zoneid_t zoneid)
8054 {
8055 if (priv != DTRACE_PRIV_ALL) {
8056 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8057 uint32_t match = priv & ppriv;
8058
8059 /*
8060 * No PRIV_DTRACE_* privileges...
8061 */
8062 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8063 DTRACE_PRIV_KERNEL)) == 0)
8064 return (0);
8065
8066 /*
8067 * No matching bits, but there were bits to match...
8068 */
8069 if (match == 0 && ppriv != 0)
8070 return (0);
8071
8072 /*
8073 * Need to have permissions to the process, but don't...
8074 */
8075 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8076 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8077 return (0);
8078 }
8079
8080 /*
8081 * Need to be in the same zone unless we possess the
8082 * privilege to examine all zones.
8083 */
8084 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8085 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8086 return (0);
8087 }
8088 }
8089
8090 return (1);
8091 }
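/*
 * A worked example of the checks above (illustrative): take a probe whose
 * provider registered with dtpp_flags = DTRACE_PRIV_PROC |
 * DTRACE_PRIV_OWNER and dtpp_uid = 501.  For a consumer holding only
 * DTRACE_PRIV_PROC as uid 502, match == DTRACE_PRIV_PROC, which leaves
 * DTRACE_PRIV_OWNER set in (ppriv & ~match); since 502 != 501, the
 * function returns 0.  The same consumer running as uid 501 passes every
 * check and matches.
 */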
8092
8093 /*
8094 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8095 * consists of input pattern strings and an ops-vector to evaluate them.
8096 * This function returns >0 for match, 0 for no match, and <0 for error.
8097 */
8098 static int
8099 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8100 uint32_t priv, uid_t uid, zoneid_t zoneid)
8101 {
8102 dtrace_provider_t *pvp = prp->dtpr_provider;
8103 int rv;
8104
8105 if (pvp->dtpv_defunct)
8106 return (0);
8107
8108 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8109 return (rv);
8110
8111 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8112 return (rv);
8113
8114 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8115 return (rv);
8116
8117 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8118 return (rv);
8119
8120 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8121 return (0);
8122
8123 return (rv);
8124 }
8125
8126 /*
8127 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8128 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
8129 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8130 * In addition, all of the recursion cases except for '*' matching have been
8131 * unwound. For '*', we still implement recursive evaluation, but a depth
8132 * counter is maintained and matching is aborted if we recurse too deep.
8133 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8134 */
8135 static int
8136 dtrace_match_glob(const char *s, const char *p, int depth)
8137 {
8138 const char *olds;
8139 char s1, c;
8140 int gs;
8141
8142 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8143 return (-1);
8144
8145 if (s == NULL)
8146 s = ""; /* treat NULL as empty string */
8147
8148 top:
8149 olds = s;
8150 s1 = *s++;
8151
8152 if (p == NULL)
8153 return (0);
8154
8155 if ((c = *p++) == '\0')
8156 return (s1 == '\0');
8157
8158 switch (c) {
8159 case '[': {
8160 int ok = 0, notflag = 0;
8161 char lc = '\0';
8162
8163 if (s1 == '\0')
8164 return (0);
8165
8166 if (*p == '!') {
8167 notflag = 1;
8168 p++;
8169 }
8170
8171 if ((c = *p++) == '\0')
8172 return (0);
8173
8174 do {
8175 if (c == '-' && lc != '\0' && *p != ']') {
8176 if ((c = *p++) == '\0')
8177 return (0);
8178 if (c == '\\' && (c = *p++) == '\0')
8179 return (0);
8180
8181 if (notflag) {
8182 if (s1 < lc || s1 > c)
8183 ok++;
8184 else
8185 return (0);
8186 } else if (lc <= s1 && s1 <= c)
8187 ok++;
8188
8189 } else if (c == '\\' && (c = *p++) == '\0')
8190 return (0);
8191
8192 lc = c; /* save left-hand 'c' for next iteration */
8193
8194 if (notflag) {
8195 if (s1 != c)
8196 ok++;
8197 else
8198 return (0);
8199 } else if (s1 == c)
8200 ok++;
8201
8202 if ((c = *p++) == '\0')
8203 return (0);
8204
8205 } while (c != ']');
8206
8207 if (ok)
8208 goto top;
8209
8210 return (0);
8211 }
8212
8213 case '\\':
8214 if ((c = *p++) == '\0')
8215 return (0);
8216 /*FALLTHRU*/
8217
8218 default:
8219 if (c != s1)
8220 return (0);
8221 /*FALLTHRU*/
8222
8223 case '?':
8224 if (s1 != '\0')
8225 goto top;
8226 return (0);
8227
8228 case '*':
8229 while (*p == '*')
8230 p++; /* consecutive *'s are identical to a single one */
8231
8232 if (*p == '\0')
8233 return (1);
8234
8235 for (s = olds; *s != '\0'; s++) {
8236 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8237 return (gs);
8238 }
8239
8240 return (0);
8241 }
8242 }
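/*
 * Illustrative results for the matcher above (not part of the original
 * source):
 *
 *	dtrace_match_glob("read", "read", 0) > 0	(literal match)
 *	dtrace_match_glob("readv", "read*", 0) > 0	('*' matches "v")
 *	dtrace_match_glob("read", "r??d", 0) > 0	('?' matches one char)
 *	dtrace_match_glob("read", "[a-r]*", 0) > 0	('r' falls in [a-r])
 *	dtrace_match_glob("write", "read*", 0) == 0	(no match)
 *
 * A return of -1 is possible only when nested '*' evaluation recurses past
 * DTRACE_PROBEKEY_MAXDEPTH.
 */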
8243
8244 /*ARGSUSED*/
8245 static int
8246 dtrace_match_string(const char *s, const char *p, int depth)
8247 {
8248 #pragma unused(depth) /* __APPLE__ */
8249 return (s != NULL && s == p);
8250 }
8251
8252 /*ARGSUSED*/
8253 static int
8254 dtrace_match_module(const char *s, const char *p, int depth)
8255 {
8256 #pragma unused(depth) /* __APPLE__ */
8257 size_t len;
8258 if (s == NULL || p == NULL)
8259 return (0);
8260
8261 len = strlen(p);
8262
8263 if (strncmp(p, s, len) != 0)
8264 return (0);
8265
8266 if (s[len] == '.' || s[len] == '\0')
8267 return (1);
8268
8269 return (0);
8270 }
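/*
 * Illustrative results for the module matcher above: it accepts an exact
 * module name or one extended by a '.'-separated suffix, e.g.
 *
 *	dtrace_match_module("foo", "foo", 0) == 1	(exact)
 *	dtrace_match_module("foo.3", "foo", 0) == 1	(version suffix)
 *	dtrace_match_module("foobar", "foo", 0) == 0	(no '.' boundary)
 */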
8271
8272 /*ARGSUSED*/
8273 static int
8274 dtrace_match_nul(const char *s, const char *p, int depth)
8275 {
8276 #pragma unused(s, p, depth) /* __APPLE__ */
8277 return (1); /* always match the empty pattern */
8278 }
8279
8280 /*ARGSUSED*/
8281 static int
8282 dtrace_match_nonzero(const char *s, const char *p, int depth)
8283 {
8284 #pragma unused(p, depth) /* __APPLE__ */
8285 return (s != NULL && s[0] != '\0');
8286 }
8287
8288 static int
8289 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8290 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
8291 {
8292 dtrace_probe_t *probe;
8293 dtrace_provider_t prov_template = {
8294 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8295 };
8296
8297 dtrace_probe_t template = {
8298 .dtpr_provider = &prov_template,
8299 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8300 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8301 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8302 };
8303
8304 dtrace_hash_t *hash = NULL;
8305 int len, rc, best = INT_MAX, nmatched = 0;
8306 dtrace_id_t i;
8307
8308 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8309
8310 /*
8311 * If the probe ID is specified in the key, just lookup by ID and
8312 * invoke the match callback once if a matching probe is found.
8313 */
8314 if (pkp->dtpk_id != DTRACE_IDNONE) {
8315 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8316 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8317 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8318 return (DTRACE_MATCH_FAIL);
8319 nmatched++;
8320 }
8321 return (nmatched);
8322 }
8323
8324 /*
8325 * We want to find the most distinct of the provider name, module name,
8326 * function name, and probe name. So for each one that is not a glob
8327 * pattern or empty string, we perform a lookup in the corresponding
8328 * hash and use the hash table with the fewest collisions to do our
8329 * search.
8330 */
8331 if (pkp->dtpk_pmatch == &dtrace_match_string &&
8332 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
8333 best = len;
8334 hash = dtrace_byprov;
8335 }
8336
8337 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8338 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8339 best = len;
8340 hash = dtrace_bymod;
8341 }
8342
8343 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8344 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8345 best = len;
8346 hash = dtrace_byfunc;
8347 }
8348
8349 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8350 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8351 best = len;
8352 hash = dtrace_byname;
8353 }
8354
8355 /*
8356 * If we did not select a hash table, iterate over every probe and
8357 * invoke our callback for each one that matches our input probe key.
8358 */
8359 if (hash == NULL) {
8360 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
8361 if ((probe = dtrace_probes[i]) == NULL ||
8362 dtrace_match_probe(probe, pkp, priv, uid,
8363 zoneid) <= 0)
8364 continue;
8365
8366 nmatched++;
8367
8368 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8369 if (rc == DTRACE_MATCH_FAIL)
8370 return (DTRACE_MATCH_FAIL);
8371 break;
8372 }
8373 }
8374
8375 return (nmatched);
8376 }
8377
8378 /*
8379 * If we selected a hash table, iterate over each probe of the same key
8380 * name and invoke the callback for every probe that matches the other
8381 * attributes of our input probe key.
8382 */
8383 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8384 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8385
8386 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8387 continue;
8388
8389 nmatched++;
8390
8391 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8392 if (rc == DTRACE_MATCH_FAIL)
8393 return (DTRACE_MATCH_FAIL);
8394 break;
8395 }
8396 }
8397
8398 return (nmatched);
8399 }
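/*
 * An illustrative walk-through of the selection above (not part of the
 * original source): for the description syscall::read:entry, the provider
 * ("syscall"), function ("read") and name ("entry") keys all classify as
 * dtrace_match_string, so dtrace_byprov, dtrace_byfunc and dtrace_byname
 * are each asked for their collision counts, and the chain with the
 * fewest same-keyed probes -- typically dtrace_byfunc for "read" -- is
 * walked, with dtrace_match_probe() filtering it on the remaining fields.
 * A description with no non-glob, non-empty fields selects no hash at all
 * and falls back to a linear scan of dtrace_probes[].
 */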
8400
8401 /*
8402 * Return the function pointer dtrace_match_probe() should use to compare the
8403 * specified pattern with a string. For NULL or empty patterns, we select
8404 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8405 * For non-empty non-glob strings, we use dtrace_match_string().
8406 */
8407 static dtrace_probekey_f *
8408 dtrace_probekey_func(const char *p)
8409 {
8410 char c;
8411
8412 if (p == NULL || *p == '\0')
8413 return (&dtrace_match_nul);
8414
8415 while ((c = *p++) != '\0') {
8416 if (c == '[' || c == '?' || c == '*' || c == '\\')
8417 return (&dtrace_match_glob);
8418 }
8419
8420 return (&dtrace_match_string);
8421 }
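/*
 * Classification examples for the routine above (illustrative):
 *
 *	dtrace_probekey_func(NULL) == &dtrace_match_nul
 *	dtrace_probekey_func("") == &dtrace_match_nul
 *	dtrace_probekey_func("read*") == &dtrace_match_glob
 *	dtrace_probekey_func("read") == &dtrace_match_string
 */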
8422
8423 static dtrace_probekey_f *
8424 dtrace_probekey_module_func(const char *p)
8425 {
8426 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8427
8428 dtrace_probekey_f *f = dtrace_probekey_func(p);
8429 if (f == &dtrace_match_string) {
8430 dtrace_probe_t template = {
8431 .dtpr_mod = (char *)(uintptr_t)p,
8432 };
8433 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
8434 return (&dtrace_match_module);
8435 }
8436 return (&dtrace_match_string);
8437 }
8438 return f;
8439 }
8440
8441 /*
8442 * Build a probe comparison key for use with dtrace_match_probe() from the
8443 * given probe description. By convention, a null key only matches anchored
8444 * probes: if each field is the empty string, reset dtpk_fmatch to
8445 * dtrace_match_nonzero().
8446 */
8447 static void
8448 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8449 {
8450
8451 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
8452 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8453
8454 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
8455 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
8456
8457 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
8458 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8459
8460 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
8461 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8462
8463 pkp->dtpk_id = pdp->dtpd_id;
8464
8465 if (pkp->dtpk_id == DTRACE_IDNONE &&
8466 pkp->dtpk_pmatch == &dtrace_match_nul &&
8467 pkp->dtpk_mmatch == &dtrace_match_nul &&
8468 pkp->dtpk_fmatch == &dtrace_match_nul &&
8469 pkp->dtpk_nmatch == &dtrace_match_nul)
8470 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8471 }
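/*
 * Example of the null-key convention above (illustrative): for the empty
 * description ':::' every field classifies as dtrace_match_nul, so
 * dtpk_fmatch is reset to dtrace_match_nonzero -- the key then matches
 * only probes with a non-empty function name (anchored probes), skipping
 * unanchored probes such as profile:::tick-1sec.
 */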
8472
8473 static void
8474 dtrace_probekey_release(dtrace_probekey_t *pkp)
8475 {
8476 dtrace_strunref(pkp->dtpk_prov);
8477 dtrace_strunref(pkp->dtpk_mod);
8478 dtrace_strunref(pkp->dtpk_func);
8479 dtrace_strunref(pkp->dtpk_name);
8480 }
8481
8482 static int
8483 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
8484 {
8485 if (desc == NULL)
8486 return 1;
8487
8488 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
8489
8490 return func((char*)data, desc->dtpd_provider, 0);
8491 }
8492
8493 /*
8494 * DTrace Provider-to-Framework API Functions
8495 *
8496 * These functions implement much of the Provider-to-Framework API, as
8497 * described in <sys/dtrace.h>. The parts of the API not in this section are
8498 * the functions in the API for probe management (found below), and
8499 * dtrace_probe() itself (found above).
8500 */
8501
8502 /*
8503 * Register the calling provider with the DTrace framework. This should
8504 * generally be called by DTrace providers in their attach(9E) entry point.
8505 */
8506 int
8507 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8508 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8509 {
8510 dtrace_provider_t *provider;
8511
8512 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8513 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8514 "arguments", name ? name : "<NULL>");
8515 return (EINVAL);
8516 }
8517
8518 if (name[0] == '\0' || dtrace_badname(name)) {
8519 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8520 "provider name", name);
8521 return (EINVAL);
8522 }
8523
8524 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8525 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8526 pops->dtps_destroy == NULL ||
8527 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8528 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8529 "provider ops", name);
8530 return (EINVAL);
8531 }
8532
8533 if (dtrace_badattr(&pap->dtpa_provider) ||
8534 dtrace_badattr(&pap->dtpa_mod) ||
8535 dtrace_badattr(&pap->dtpa_func) ||
8536 dtrace_badattr(&pap->dtpa_name) ||
8537 dtrace_badattr(&pap->dtpa_args)) {
8538 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8539 "provider attributes", name);
8540 return (EINVAL);
8541 }
8542
8543 if (priv & ~DTRACE_PRIV_ALL) {
8544 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8545 "privilege attributes", name);
8546 return (EINVAL);
8547 }
8548
8549 if ((priv & DTRACE_PRIV_KERNEL) &&
8550 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8551 pops->dtps_usermode == NULL) {
8552 cmn_err(CE_WARN, "failed to register provider '%s': need "
8553 "dtps_usermode() op for given privilege attributes", name);
8554 return (EINVAL);
8555 }
8556
8557 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8558
8559 provider->dtpv_attr = *pap;
8560 provider->dtpv_priv.dtpp_flags = priv;
8561 if (cr != NULL) {
8562 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8563 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8564 }
8565 provider->dtpv_pops = *pops;
8566
8567 if (pops->dtps_provide == NULL) {
8568 ASSERT(pops->dtps_provide_module != NULL);
8569 provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8570 }
8571
8572 if (pops->dtps_provide_module == NULL) {
8573 ASSERT(pops->dtps_provide != NULL);
8574 provider->dtpv_pops.dtps_provide_module =
8575 dtrace_provide_module_nullop;
8576 }
8577
8578 if (pops->dtps_suspend == NULL) {
8579 ASSERT(pops->dtps_resume == NULL);
8580 provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8581 provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8582 }
8583
8584 provider->dtpv_arg = arg;
8585 *idp = (dtrace_provider_id_t)provider;
8586
8587 if (pops == &dtrace_provider_ops) {
8588 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8589 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8590
8591 provider->dtpv_name = dtrace_strref(name);
8592
8593 ASSERT(dtrace_anon.dta_enabling == NULL);
8594
8595 /*
8596 * We make sure that the DTrace provider is at the head of
8597 * the provider chain.
8598 */
8599 provider->dtpv_next = dtrace_provider;
8600 dtrace_provider = provider;
8601 return (0);
8602 }
8603
8604 lck_mtx_lock(&dtrace_provider_lock);
8605 lck_mtx_lock(&dtrace_lock);
8606
8607 provider->dtpv_name = dtrace_strref(name);
8608
8609 /*
8610 * If there is at least one provider registered, we'll add this
8611 * provider after the first provider.
8612 */
8613 if (dtrace_provider != NULL) {
8614 provider->dtpv_next = dtrace_provider->dtpv_next;
8615 dtrace_provider->dtpv_next = provider;
8616 } else {
8617 dtrace_provider = provider;
8618 }
8619
8620 if (dtrace_retained != NULL) {
8621 dtrace_enabling_provide(provider);
8622
8623 /*
8624 * Now we need to call dtrace_enabling_matchall_with_cond() --
8625 * with a condition matching the provider name we just added,
8626 * which will acquire cpu_lock and dtrace_lock. We therefore need
8627 * to drop all of our locks before calling into it...
8628 */
8629 lck_mtx_unlock(&dtrace_lock);
8630 lck_mtx_unlock(&dtrace_provider_lock);
8631
8632 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8633 dtrace_enabling_matchall_with_cond(&cond);
8634
8635 return (0);
8636 }
8637
8638 lck_mtx_unlock(&dtrace_lock);
8639 lck_mtx_unlock(&dtrace_provider_lock);
8640
8641 return (0);
8642 }
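/*
 * A minimal registration sketch (hypothetical provider; the "example"
 * names are invented for illustration and are not part of the original
 * source).  The ops vector must satisfy the checks above: dtps_provide
 * (or dtps_provide_module), dtps_enable, dtps_disable and dtps_destroy
 * are mandatory, and dtps_suspend/dtps_resume may only be given as a
 * pair.
 */
#if 0 /* illustrative sketch */
static dtrace_pattr_t example_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
};

static dtrace_pops_t example_pops = {
	.dtps_provide = example_provide,	/* create probes on demand */
	.dtps_enable = example_enable,
	.dtps_disable = example_disable,
	.dtps_destroy = example_destroy,
};

static dtrace_provider_id_t example_id;

static int
example_attach(void)
{
	/* returns 0 on success, or EINVAL for any of the failures above */
	return (dtrace_register("example", &example_attr,
	    DTRACE_PRIV_KERNEL, NULL, &example_pops, NULL, &example_id));
}
#endif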
8643
8644 /*
8645 * Unregister the specified provider from the DTrace framework. This should
8646 * generally be called by DTrace providers in their detach(9E) entry point.
8647 */
8648 int
8649 dtrace_unregister(dtrace_provider_id_t id)
8650 {
8651 dtrace_provider_t *old = (dtrace_provider_t *)id;
8652 dtrace_provider_t *prev = NULL;
8653 int self = 0;
8654 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8655 dtrace_probe_t template = {
8656 .dtpr_provider = old
8657 };
8658
8659 if (old->dtpv_pops.dtps_enable ==
8660 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8661 /*
8662 * If DTrace itself is the provider, we're called with locks
8663 * already held.
8664 */
8665 ASSERT(old == dtrace_provider);
8666 ASSERT(dtrace_devi != NULL);
8667 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8668 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8669 self = 1;
8670
8671 if (dtrace_provider->dtpv_next != NULL) {
8672 /*
8673 * There's another provider here; return failure.
8674 */
8675 return (EBUSY);
8676 }
8677 } else {
8678 lck_mtx_lock(&dtrace_provider_lock);
8679 lck_mtx_lock(&mod_lock);
8680 lck_mtx_lock(&dtrace_lock);
8681 }
8682
8683 /*
8684 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8685 * probes, we refuse to let providers slither away, unless this
8686 * provider has already been explicitly invalidated.
8687 */
8688 if (!old->dtpv_defunct &&
8689 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8690 dtrace_anon.dta_state->dts_necbs > 0))) {
8691 if (!self) {
8692 lck_mtx_unlock(&dtrace_lock);
8693 lck_mtx_unlock(&mod_lock);
8694 lck_mtx_unlock(&dtrace_provider_lock);
8695 }
8696 return (EBUSY);
8697 }
8698
8699 /*
8700 * Attempt to destroy the probes associated with this provider.
8701 */
8702 if (old->dtpv_ecb_count != 0) {
8703 /*
8704 * We have at least one ECB; we can't remove this provider.
8705 */
8706 if (!self) {
8707 lck_mtx_unlock(&dtrace_lock);
8708 lck_mtx_unlock(&mod_lock);
8709 lck_mtx_unlock(&dtrace_provider_lock);
8710 }
8711 return (EBUSY);
8712 }
8713
8714 /*
8715 * All of the probes for this provider are disabled; we can safely
8716 * remove all of them from their hash chains and from the probe array.
8717 */
8718 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8719 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8720 if (probe->dtpr_provider != old)
8721 continue;
8722
8723 dtrace_probes[probe->dtpr_id - 1] = NULL;
8724 old->dtpv_probe_count--;
8725
8726 dtrace_hash_remove(dtrace_bymod, probe);
8727 dtrace_hash_remove(dtrace_byfunc, probe);
8728 dtrace_hash_remove(dtrace_byname, probe);
8729
8730 if (first == NULL) {
8731 first = probe;
8732 probe->dtpr_nextmod = NULL;
8733 } else {
8734 /*
8735 * Use nextmod as the chain of probes to remove
8736 */
8737 probe->dtpr_nextmod = first;
8738 first = probe;
8739 }
8740 }
8741
8742 for (probe = first; probe != NULL; probe = next) {
8743 next = probe->dtpr_nextmod;
8744 dtrace_hash_remove(dtrace_byprov, probe);
8745 }
8746
8747 /*
8748 * The provider's probes have been removed from the hash chains and
8749 * from the probe array. Now issue a dtrace_sync() to be sure that
8750 * everyone has cleared out from any probe array processing.
8751 */
8752 dtrace_sync();
8753
8754 for (probe = first; probe != NULL; probe = next) {
8755 next = probe->dtpr_nextmod;
8756
8757 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8758 probe->dtpr_arg);
8759 dtrace_strunref(probe->dtpr_mod);
8760 dtrace_strunref(probe->dtpr_func);
8761 dtrace_strunref(probe->dtpr_name);
8762 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8763 zfree(dtrace_probe_t_zone, probe);
8764 }
8765
8766 if ((prev = dtrace_provider) == old) {
8767 ASSERT(self || dtrace_devi == NULL);
8768 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8769 dtrace_provider = old->dtpv_next;
8770 } else {
8771 while (prev != NULL && prev->dtpv_next != old)
8772 prev = prev->dtpv_next;
8773
8774 if (prev == NULL) {
8775 panic("attempt to unregister non-existent "
8776 "dtrace provider %p\n", (void *)id);
8777 }
8778
8779 prev->dtpv_next = old->dtpv_next;
8780 }
8781
8782 dtrace_strunref(old->dtpv_name);
8783
8784 if (!self) {
8785 lck_mtx_unlock(&dtrace_lock);
8786 lck_mtx_unlock(&mod_lock);
8787 lck_mtx_unlock(&dtrace_provider_lock);
8788 }
8789
8790 kmem_free(old, sizeof (dtrace_provider_t));
8791
8792 return (0);
8793 }
8794
8795 /*
8796 * Invalidate the specified provider. All subsequent probe lookups for the
8797 * specified provider will fail, but its probes will not be removed.
8798 */
8799 void
8800 dtrace_invalidate(dtrace_provider_id_t id)
8801 {
8802 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8803
8804 ASSERT(pvp->dtpv_pops.dtps_enable !=
8805 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8806
8807 lck_mtx_lock(&dtrace_provider_lock);
8808 lck_mtx_lock(&dtrace_lock);
8809
8810 pvp->dtpv_defunct = 1;
8811
8812 lck_mtx_unlock(&dtrace_lock);
8813 lck_mtx_unlock(&dtrace_provider_lock);
8814 }
8815
8816 /*
8817 * Indicate whether or not DTrace has attached.
8818 */
8819 int
8820 dtrace_attached(void)
8821 {
8822 /*
8823 * dtrace_provider will be non-NULL iff the DTrace driver has
8824 * attached. (It's non-NULL because DTrace is always itself a
8825 * provider.)
8826 */
8827 return (dtrace_provider != NULL);
8828 }
8829
8830 /*
8831 * Remove all the unenabled probes for the given provider. This function is
8832 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8833 * -- just as many of its associated probes as it can.
8834 */
8835 int
8836 dtrace_condense(dtrace_provider_id_t id)
8837 {
8838 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8839 dtrace_probe_t *probe, *first = NULL;
8840 dtrace_probe_t template = {
8841 .dtpr_provider = prov
8842 };
8843
8844 /*
8845 * Make sure this isn't the dtrace provider itself.
8846 */
8847 ASSERT(prov->dtpv_pops.dtps_enable !=
8848 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8849
8850 lck_mtx_lock(&dtrace_provider_lock);
8851 lck_mtx_lock(&dtrace_lock);
8852
8853 /*
8854 * Attempt to destroy the probes associated with this provider.
8855 */
8856 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8857 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8858
8859 if (probe->dtpr_provider != prov)
8860 continue;
8861
8862 if (probe->dtpr_ecb != NULL)
8863 continue;
8864
8865 dtrace_probes[probe->dtpr_id - 1] = NULL;
8866 prov->dtpv_probe_count--;
8867
8868 dtrace_hash_remove(dtrace_bymod, probe);
8869 dtrace_hash_remove(dtrace_byfunc, probe);
8870 dtrace_hash_remove(dtrace_byname, probe);
8871
8872 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8873 probe->dtpr_arg);
8874 dtrace_strunref(probe->dtpr_mod);
8875 dtrace_strunref(probe->dtpr_func);
8876 dtrace_strunref(probe->dtpr_name);
8877 if (first == NULL) {
8878 first = probe;
8879 probe->dtpr_nextmod = NULL;
8880 } else {
8881 /*
8882 * Use nextmod as the chain of probes to remove
8883 */
8884 probe->dtpr_nextmod = first;
8885 first = probe;
8886 }
8887 }
8888
8889 for (probe = first; probe != NULL; probe = first) {
8890 first = probe->dtpr_nextmod;
8891 dtrace_hash_remove(dtrace_byprov, probe);
8892 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8893 zfree(dtrace_probe_t_zone, probe);
8894 }
8895
8896 lck_mtx_unlock(&dtrace_lock);
8897 lck_mtx_unlock(&dtrace_provider_lock);
8898
8899 return (0);
8900 }
8901
8902 /*
8903 * DTrace Probe Management Functions
8904 *
8905 * The functions in this section perform the DTrace probe management,
8906 * including functions to create probes, look-up probes, and call into the
8907 * providers to request that probes be provided. Some of these functions are
8908 * in the Provider-to-Framework API; these functions can be identified by the
8909 * fact that they are not declared "static".
8910 */
8911
8912 /*
8913 * Create a probe with the specified module name, function name, and name.
8914 */
8915 dtrace_id_t
8916 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8917 const char *func, const char *name, int aframes, void *arg)
8918 {
8919 dtrace_probe_t *probe, **probes;
8920 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8921 dtrace_id_t id;
8922
8923 if (provider == dtrace_provider) {
8924 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8925 } else {
8926 lck_mtx_lock(&dtrace_lock);
8927 }
8928
8929 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8930 VM_BESTFIT | VM_SLEEP);
8931
8932 probe = zalloc(dtrace_probe_t_zone);
8933 bzero(probe, sizeof (dtrace_probe_t));
8934
8935 probe->dtpr_id = id;
8936 probe->dtpr_gen = dtrace_probegen++;
8937 probe->dtpr_mod = dtrace_strref(mod);
8938 probe->dtpr_func = dtrace_strref(func);
8939 probe->dtpr_name = dtrace_strref(name);
8940 probe->dtpr_arg = arg;
8941 probe->dtpr_aframes = aframes;
8942 probe->dtpr_provider = provider;
8943
8944 dtrace_hash_add(dtrace_byprov, probe);
8945 dtrace_hash_add(dtrace_bymod, probe);
8946 dtrace_hash_add(dtrace_byfunc, probe);
8947 dtrace_hash_add(dtrace_byname, probe);
8948
8949 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8950 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8951 size_t nsize = osize * 2;
8952
8953 probes = kmem_zalloc(nsize, KM_SLEEP);
8954
8955 dtrace_probe_t **oprobes = dtrace_probes;
8956
8957 bcopy(oprobes, probes, osize);
8958 dtrace_membar_producer();
8959 dtrace_probes = probes;
8960
8961 dtrace_sync();
8962
8963 /*
8964 * All CPUs are now seeing the new probes array; we can
8965 * safely free the old array.
8966 */
8967 kmem_free(oprobes, osize);
8968 dtrace_nprobes *= 2;
8969
8970 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8971 }
8972
8973 ASSERT(dtrace_probes[id - 1] == NULL);
8974 dtrace_probes[id - 1] = probe;
8975 provider->dtpv_probe_count++;
8976
8977 if (provider != dtrace_provider)
8978 lck_mtx_unlock(&dtrace_lock);
8979
8980 return (id);
8981 }
8982
8983 static dtrace_probe_t *
8984 dtrace_probe_lookup_id(dtrace_id_t id)
8985 {
8986 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8987
8988 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8989 return (NULL);
8990
8991 return (dtrace_probes[id - 1]);
8992 }
8993
8994 static int
8995 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
8996 {
8997 #pragma unused(arg2)
8998 *((dtrace_id_t *)arg1) = probe->dtpr_id;
8999
9000 return (DTRACE_MATCH_DONE);
9001 }
9002
9003 /*
9004 * Look up a probe based on provider and one or more of module name, function
9005 * name and probe name.
9006 */
9007 dtrace_id_t
9008 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9009 const char *func, const char *name)
9010 {
9011 dtrace_probekey_t pkey;
9012 dtrace_id_t id;
9013 int match;
9014
9015 lck_mtx_lock(&dtrace_lock);
9016
9017 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
9018 pkey.dtpk_pmatch = &dtrace_match_string;
9019 pkey.dtpk_mod = dtrace_strref(mod);
9020 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9021 pkey.dtpk_func = dtrace_strref(func);
9022 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9023 pkey.dtpk_name = dtrace_strref(name);
9024 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9025 pkey.dtpk_id = DTRACE_IDNONE;
9026
9027 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9028 dtrace_probe_lookup_match, &id, NULL);
9029
9030 dtrace_probekey_release(&pkey);
9031
9032 lck_mtx_unlock(&dtrace_lock);
9033
9034 ASSERT(match == 1 || match == 0);
9035 return (match ? id : 0);
9036 }
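/*
 * A common idiom built on the two routines above (hypothetical sketch;
 * example_id and example_provide continue the invented provider from the
 * registration sketch earlier): a dtps_provide callback looks a probe up
 * first, so that repeated provide requests do not create duplicates.
 */
#if 0 /* illustrative sketch */
static void
example_provide(void *arg, const dtrace_probedesc_t *desc)
{
#pragma unused(arg, desc)
	if (dtrace_probe_lookup(example_id, "mach_kernel",
	    "example_func", "entry") != 0)
		return;		/* already provided */

	(void) dtrace_probe_create(example_id, "mach_kernel",
	    "example_func", "entry", 0 /* aframes */, NULL /* arg */);
}
#endif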
9037
9038 /*
9039 * Returns the probe argument associated with the specified probe.
9040 */
9041 void *
9042 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9043 {
9044 dtrace_probe_t *probe;
9045 void *rval = NULL;
9046
9047 lck_mtx_lock(&dtrace_lock);
9048
9049 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9050 probe->dtpr_provider == (dtrace_provider_t *)id)
9051 rval = probe->dtpr_arg;
9052
9053 lck_mtx_unlock(&dtrace_lock);
9054
9055 return (rval);
9056 }
9057
9058 /*
9059 * Copy a probe into a probe description.
9060 */
9061 static void
9062 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9063 {
9064 bzero(pdp, sizeof (dtrace_probedesc_t));
9065 pdp->dtpd_id = prp->dtpr_id;
9066
9067 /* APPLE NOTE: Darwin employs size-bounded string operations. */
9068 (void) strlcpy(pdp->dtpd_provider,
9069 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9070
9071 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
9072 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
9073 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
9074 }
9075
9076 /*
9077 * Called to indicate that a probe -- or probes -- should be provided by a
9078 * specified provider. If the specified description is NULL, the provider will
9079 * be told to provide all of its probes. (This is done whenever a new
9080 * consumer comes along, or whenever a retained enabling is to be matched.) If
9081 * the specified description is non-NULL, the provider is given the
9082 * opportunity to dynamically provide the specified probe, allowing providers
9083 * to support the creation of probes on-the-fly. (So-called _autocreated_
9084 * probes.) If the provider is NULL, the operations will be applied to all
9085 * providers; if the provider is non-NULL the operations will only be applied
9086 * to the specified provider. The dtrace_provider_lock must be held, and the
9087 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9088 * will need to grab the dtrace_lock when it reenters the framework through
9089 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9090 */
9091 static void
9092 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9093 {
9094 struct modctl *ctl;
9095 int all = 0;
9096
9097 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9098
9099 if (prv == NULL) {
9100 all = 1;
9101 prv = dtrace_provider;
9102 }
9103
9104 do {
9105 /*
9106 * First, call the blanket provide operation.
9107 */
9108 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9109
9110 /*
9111 * Now call the per-module provide operation. We will grab
9112 * mod_lock to prevent the list from being modified. Note
9113 * that this also prevents the mod_busy bits from changing.
9114 * (mod_busy can only be changed with mod_lock held.)
9115 */
9116 lck_mtx_lock(&mod_lock);
9117
9118 ctl = dtrace_modctl_list;
9119 while (ctl) {
9120 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9121 ctl = ctl->mod_next;
9122 }
9123
9124 lck_mtx_unlock(&mod_lock);
9125 } while (all && (prv = prv->dtpv_next) != NULL);
9126 }
9127
9128 /*
9129 * Iterate over each probe, and call the Framework-to-Provider API function
9130 * denoted by offs.
9131 */
9132 static void
9133 dtrace_probe_foreach(uintptr_t offs)
9134 {
9135 dtrace_provider_t *prov;
9136 void (*func)(void *, dtrace_id_t, void *);
9137 dtrace_probe_t *probe;
9138 dtrace_icookie_t cookie;
9139 int i;
9140
9141 /*
9142 * We disable interrupts to walk through the probe array. This is
9143 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9144 * won't see stale data.
9145 */
9146 cookie = dtrace_interrupt_disable();
9147
9148 for (i = 0; i < dtrace_nprobes; i++) {
9149 if ((probe = dtrace_probes[i]) == NULL)
9150 continue;
9151
9152 if (probe->dtpr_ecb == NULL) {
9153 /*
9154 * This probe isn't enabled -- don't call the function.
9155 */
9156 continue;
9157 }
9158
9159 prov = probe->dtpr_provider;
9160 func = *((void(**)(void *, dtrace_id_t, void *))
9161 ((uintptr_t)&prov->dtpv_pops + offs));
9162
9163 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9164 }
9165
9166 dtrace_interrupt_enable(cookie);
9167 }
9168
9169 static int
9170 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
9171 {
9172 dtrace_probekey_t pkey;
9173 uint32_t priv;
9174 uid_t uid;
9175 zoneid_t zoneid;
9176 int err;
9177
9178 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9179
9180 dtrace_ecb_create_cache = NULL;
9181
9182 if (desc == NULL) {
9183 /*
9184 * If we're passed a NULL description, we're being asked to
9185 * create an ECB with a NULL probe.
9186 */
9187 (void) dtrace_ecb_create_enable(NULL, enab, ep);
9188 return (0);
9189 }
9190
9191 dtrace_probekey(desc, &pkey);
9192 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9193 &priv, &uid, &zoneid);
9194
9195 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
9196
9197 dtrace_probekey_release(&pkey);
9198
9199 return err;
9200 }
9201
9202 /*
9203 * DTrace Helper Provider Functions
9204 */
9205 static void
9206 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9207 {
9208 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9209 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9210 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9211 }
9212
9213 static void
9214 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9215 const dof_provider_t *dofprov, char *strtab)
9216 {
9217 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9218 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9219 dofprov->dofpv_provattr);
9220 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9221 dofprov->dofpv_modattr);
9222 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9223 dofprov->dofpv_funcattr);
9224 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9225 dofprov->dofpv_nameattr);
9226 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9227 dofprov->dofpv_argsattr);
9228 }
9229
9230 static void
9231 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9232 {
9233 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9234 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9235 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9236 dof_provider_t *provider;
9237 dof_probe_t *probe;
9238 uint32_t *off, *enoff;
9239 uint8_t *arg;
9240 char *strtab;
9241 uint_t i, nprobes;
9242 dtrace_helper_provdesc_t dhpv;
9243 dtrace_helper_probedesc_t dhpb;
9244 dtrace_meta_t *meta = dtrace_meta_pid;
9245 dtrace_mops_t *mops = &meta->dtm_mops;
9246 void *parg;
9247
9248 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9249 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9250 provider->dofpv_strtab * dof->dofh_secsize);
9251 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9252 provider->dofpv_probes * dof->dofh_secsize);
9253 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9254 provider->dofpv_prargs * dof->dofh_secsize);
9255 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9256 provider->dofpv_proffs * dof->dofh_secsize);
9257
9258 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9259 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9260 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9261 enoff = NULL;
9262
9263 /*
9264 * See dtrace_helper_provider_validate().
9265 */
9266 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9267 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9268 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9269 provider->dofpv_prenoffs * dof->dofh_secsize);
9270 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9271 }
9272
9273 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9274
9275 /*
9276 * Create the provider.
9277 */
9278 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9279
9280 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9281 return;
9282
9283 meta->dtm_count++;
9284
9285 /*
9286 * Create the probes.
9287 */
9288 for (i = 0; i < nprobes; i++) {
9289 probe = (dof_probe_t *)(uintptr_t)(daddr +
9290 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9291
9292 dhpb.dthpb_mod = dhp->dofhp_mod;
9293 dhpb.dthpb_func = strtab + probe->dofpr_func;
9294 dhpb.dthpb_name = strtab + probe->dofpr_name;
9295 #if !defined(__APPLE__)
9296 dhpb.dthpb_base = probe->dofpr_addr;
9297 #else
9298 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
9299 #endif
9300 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9301 dhpb.dthpb_noffs = probe->dofpr_noffs;
9302 if (enoff != NULL) {
9303 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9304 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9305 } else {
9306 dhpb.dthpb_enoffs = NULL;
9307 dhpb.dthpb_nenoffs = 0;
9308 }
9309 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9310 dhpb.dthpb_nargc = probe->dofpr_nargc;
9311 dhpb.dthpb_xargc = probe->dofpr_xargc;
9312 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9313 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9314
9315 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9316 }
9317
9318 /*
9319 * Since we just created probes, we need to match our retained
9320 * enablings against them -- knowing, as a precondition, that we
9321 * have only added probes from this provider.
9322 */
9323 char *prov_name = mops->dtms_provider_name(parg);
9324 ASSERT(prov_name != NULL);
9325 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9326
9327 dtrace_enabling_matchall_with_cond(&cond);
9328 }
9329
9330 static void
9331 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
9332 {
9333 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9334 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9335 uint32_t i;
9336
9337 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9338
9339 for (i = 0; i < dof->dofh_secnum; i++) {
9340 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9341 dof->dofh_secoff + i * dof->dofh_secsize);
9342
9343 if (sec->dofs_type != DOF_SECT_PROVIDER)
9344 continue;
9345
9346 dtrace_helper_provide_one(dhp, sec, p);
9347 }
9348 }
9349
9350 static void
9351 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9352 {
9353 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9354 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9355 dof_sec_t *str_sec;
9356 dof_provider_t *provider;
9357 char *strtab;
9358 dtrace_helper_provdesc_t dhpv;
9359 dtrace_meta_t *meta = dtrace_meta_pid;
9360 dtrace_mops_t *mops = &meta->dtm_mops;
9361
9362 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9363 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9364 provider->dofpv_strtab * dof->dofh_secsize);
9365
9366 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9367
9368 /*
9369 * Create the provider.
9370 */
9371 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9372
9373 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9374
9375 meta->dtm_count--;
9376 }
9377
9378 static void
9379 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
9380 {
9381 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9382 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9383 uint32_t i;
9384
9385 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9386
9387 for (i = 0; i < dof->dofh_secnum; i++) {
9388 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9389 dof->dofh_secoff + i * dof->dofh_secsize);
9390
9391 if (sec->dofs_type != DOF_SECT_PROVIDER)
9392 continue;
9393
9394 dtrace_helper_provider_remove_one(dhp, sec, p);
9395 }
9396 }
9397
9398 /*
9399 * DTrace Meta Provider-to-Framework API Functions
9400 *
9401 * These functions implement the Meta Provider-to-Framework API, as described
9402 * in <sys/dtrace.h>.
9403 */
9404 int
9405 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9406 dtrace_meta_provider_id_t *idp)
9407 {
9408 dtrace_meta_t *meta;
9409 dtrace_helpers_t *help, *next;
9410 uint_t i;
9411
9412 *idp = DTRACE_METAPROVNONE;
9413
9414 /*
9415 * We strictly don't need the name, but we hold onto it for
9416 * debuggability. All hail error queues!
9417 */
9418 if (name == NULL) {
9419 cmn_err(CE_WARN, "failed to register meta-provider: "
9420 "invalid name");
9421 return (EINVAL);
9422 }
9423
9424 if (mops == NULL ||
9425 mops->dtms_create_probe == NULL ||
9426 mops->dtms_provide_proc == NULL ||
9427 mops->dtms_remove_proc == NULL) {
9428 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9429 "invalid ops", name);
9430 return (EINVAL);
9431 }
9432
9433 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9434 meta->dtm_mops = *mops;
9435 meta->dtm_arg = arg;
9436
9437 lck_mtx_lock(&dtrace_meta_lock);
9438 lck_mtx_lock(&dtrace_lock);
9439
9440 if (dtrace_meta_pid != NULL) {
9441 lck_mtx_unlock(&dtrace_lock);
9442 lck_mtx_unlock(&dtrace_meta_lock);
9443 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9444 "user-land meta-provider exists", name);
9445 kmem_free(meta, sizeof (dtrace_meta_t));
9446 return (EINVAL);
9447 }
9448
9449 meta->dtm_name = dtrace_strref(name);
9450
9451 dtrace_meta_pid = meta;
9452 *idp = (dtrace_meta_provider_id_t)meta;
9453
9454 /*
9455 * If there are providers and probes ready to go, pass them
9456 * off to the new meta provider now.
9457 */
9458
9459 help = dtrace_deferred_pid;
9460 dtrace_deferred_pid = NULL;
9461
9462 lck_mtx_unlock(&dtrace_lock);
9463
9464 while (help != NULL) {
9465 for (i = 0; i < help->dthps_nprovs; i++) {
9466 proc_t *p = proc_find(help->dthps_pid);
9467 if (p == PROC_NULL)
9468 continue;
9469 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9470 p);
9471 proc_rele(p);
9472 }
9473
9474 next = help->dthps_next;
9475 help->dthps_next = NULL;
9476 help->dthps_prev = NULL;
9477 help->dthps_deferred = 0;
9478 help = next;
9479 }
9480
9481 lck_mtx_unlock(&dtrace_meta_lock);
9482
9483 return (0);
9484 }
9485
9486 int
9487 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9488 {
9489 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9490
9491 lck_mtx_lock(&dtrace_meta_lock);
9492 lck_mtx_lock(&dtrace_lock);
9493
9494 if (old == dtrace_meta_pid) {
9495 pp = &dtrace_meta_pid;
9496 } else {
9497 panic("attempt to unregister non-existent "
9498 "dtrace meta-provider %p\n", (void *)old);
9499 }
9500
9501 if (old->dtm_count != 0) {
9502 lck_mtx_unlock(&dtrace_lock);
9503 lck_mtx_unlock(&dtrace_meta_lock);
9504 return (EBUSY);
9505 }
9506
9507 *pp = NULL;
9508
9509 dtrace_strunref(old->dtm_name);
9510
9511 lck_mtx_unlock(&dtrace_lock);
9512 lck_mtx_unlock(&dtrace_meta_lock);
9513
9514 kmem_free(old, sizeof (dtrace_meta_t));
9515
9516 return (0);
9517 }
9518
9519
9520 /*
9521 * DTrace DIF Object Functions
9522 */
9523 static int
9524 dtrace_difo_err(uint_t pc, const char *format, ...)
9525 {
9526 if (dtrace_err_verbose) {
9527 va_list alist;
9528
9529 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9530 va_start(alist, format);
9531 (void) vuprintf(format, alist);
9532 va_end(alist);
9533 }
9534
9535 #ifdef DTRACE_ERRDEBUG
9536 dtrace_errdebug(format);
9537 #endif
9538 return (1);
9539 }
9540
9541 /*
9542 * Validate a DTrace DIF object by checking the IR instructions. The following
9543 * rules are currently enforced by dtrace_difo_validate():
9544 *
9545 * 1. Each instruction must have a valid opcode
9546 * 2. Each register, string, variable, or subroutine reference must be valid
9547 * 3. No instruction can modify register %r0 (must be zero)
9548 * 4. All instruction reserved bits must be set to zero
9549 * 5. The last instruction must be a "ret" instruction
9550 * 6. All branch targets must reference a valid instruction _after_ the branch
9551 */
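/*
 * An illustrative DIFO body satisfying rules 1-6 (assuming the
 * DIF_INSTR_* constructor macros from <sys/dtrace.h>): the two-instruction
 * program
 *
 *	DIF_INSTR_SETX(0, 2)		setx integer-table[0], %r2
 *	DIF_INSTR_RET(2)		ret  %r2
 *
 * uses valid opcodes and registers (for nregs > 2 and an integer table of
 * at least one entry), never writes %r0, contains no branches, and ends
 * in "ret" -- so it validates cleanly and returns integer-table entry 0.
 */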
9552 static int
9553 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9554 cred_t *cr)
9555 {
9556 int err = 0;
9557 uint_t i;
9558
9559 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9560 int kcheckload;
9561 uint_t pc;
9562 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9563
9564 kcheckload = cr == NULL ||
9565 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9566
9567 dp->dtdo_destructive = 0;
9568
9569 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9570 dif_instr_t instr = dp->dtdo_buf[pc];
9571
9572 uint_t r1 = DIF_INSTR_R1(instr);
9573 uint_t r2 = DIF_INSTR_R2(instr);
9574 uint_t rd = DIF_INSTR_RD(instr);
9575 uint_t rs = DIF_INSTR_RS(instr);
9576 uint_t label = DIF_INSTR_LABEL(instr);
9577 uint_t v = DIF_INSTR_VAR(instr);
9578 uint_t subr = DIF_INSTR_SUBR(instr);
9579 uint_t type = DIF_INSTR_TYPE(instr);
9580 uint_t op = DIF_INSTR_OP(instr);
9581
9582 switch (op) {
9583 case DIF_OP_OR:
9584 case DIF_OP_XOR:
9585 case DIF_OP_AND:
9586 case DIF_OP_SLL:
9587 case DIF_OP_SRL:
9588 case DIF_OP_SRA:
9589 case DIF_OP_SUB:
9590 case DIF_OP_ADD:
9591 case DIF_OP_MUL:
9592 case DIF_OP_SDIV:
9593 case DIF_OP_UDIV:
9594 case DIF_OP_SREM:
9595 case DIF_OP_UREM:
9596 case DIF_OP_COPYS:
9597 if (r1 >= nregs)
9598 err += efunc(pc, "invalid register %u\n", r1);
9599 if (r2 >= nregs)
9600 err += efunc(pc, "invalid register %u\n", r2);
9601 if (rd >= nregs)
9602 err += efunc(pc, "invalid register %u\n", rd);
9603 if (rd == 0)
9604 err += efunc(pc, "cannot write to %%r0\n");
9605 break;
9606 case DIF_OP_NOT:
9607 case DIF_OP_MOV:
9608 case DIF_OP_ALLOCS:
9609 if (r1 >= nregs)
9610 err += efunc(pc, "invalid register %u\n", r1);
9611 if (r2 != 0)
9612 err += efunc(pc, "non-zero reserved bits\n");
9613 if (rd >= nregs)
9614 err += efunc(pc, "invalid register %u\n", rd);
9615 if (rd == 0)
9616 err += efunc(pc, "cannot write to %%r0\n");
9617 break;
9618 case DIF_OP_LDSB:
9619 case DIF_OP_LDSH:
9620 case DIF_OP_LDSW:
9621 case DIF_OP_LDUB:
9622 case DIF_OP_LDUH:
9623 case DIF_OP_LDUW:
9624 case DIF_OP_LDX:
9625 if (r1 >= nregs)
9626 err += efunc(pc, "invalid register %u\n", r1);
9627 if (r2 != 0)
9628 err += efunc(pc, "non-zero reserved bits\n");
9629 if (rd >= nregs)
9630 err += efunc(pc, "invalid register %u\n", rd);
9631 if (rd == 0)
9632 err += efunc(pc, "cannot write to %%r0\n");
9633 if (kcheckload)
9634 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9635 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9636 break;
9637 case DIF_OP_RLDSB:
9638 case DIF_OP_RLDSH:
9639 case DIF_OP_RLDSW:
9640 case DIF_OP_RLDUB:
9641 case DIF_OP_RLDUH:
9642 case DIF_OP_RLDUW:
9643 case DIF_OP_RLDX:
9644 if (r1 >= nregs)
9645 err += efunc(pc, "invalid register %u\n", r1);
9646 if (r2 != 0)
9647 err += efunc(pc, "non-zero reserved bits\n");
9648 if (rd >= nregs)
9649 err += efunc(pc, "invalid register %u\n", rd);
9650 if (rd == 0)
9651 err += efunc(pc, "cannot write to %%r0\n");
9652 break;
9653 case DIF_OP_ULDSB:
9654 case DIF_OP_ULDSH:
9655 case DIF_OP_ULDSW:
9656 case DIF_OP_ULDUB:
9657 case DIF_OP_ULDUH:
9658 case DIF_OP_ULDUW:
9659 case DIF_OP_ULDX:
9660 if (r1 >= nregs)
9661 err += efunc(pc, "invalid register %u\n", r1);
9662 if (r2 != 0)
9663 err += efunc(pc, "non-zero reserved bits\n");
9664 if (rd >= nregs)
9665 err += efunc(pc, "invalid register %u\n", rd);
9666 if (rd == 0)
9667 err += efunc(pc, "cannot write to %%r0\n");
9668 break;
9669 case DIF_OP_STB:
9670 case DIF_OP_STH:
9671 case DIF_OP_STW:
9672 case DIF_OP_STX:
9673 if (r1 >= nregs)
9674 err += efunc(pc, "invalid register %u\n", r1);
9675 if (r2 != 0)
9676 err += efunc(pc, "non-zero reserved bits\n");
9677 if (rd >= nregs)
9678 err += efunc(pc, "invalid register %u\n", rd);
9679 if (rd == 0)
9680 err += efunc(pc, "cannot write to 0 address\n");
9681 break;
9682 case DIF_OP_CMP:
9683 case DIF_OP_SCMP:
9684 if (r1 >= nregs)
9685 err += efunc(pc, "invalid register %u\n", r1);
9686 if (r2 >= nregs)
9687 err += efunc(pc, "invalid register %u\n", r2);
9688 if (rd != 0)
9689 err += efunc(pc, "non-zero reserved bits\n");
9690 break;
9691 case DIF_OP_TST:
9692 if (r1 >= nregs)
9693 err += efunc(pc, "invalid register %u\n", r1);
9694 if (r2 != 0 || rd != 0)
9695 err += efunc(pc, "non-zero reserved bits\n");
9696 break;
9697 case DIF_OP_BA:
9698 case DIF_OP_BE:
9699 case DIF_OP_BNE:
9700 case DIF_OP_BG:
9701 case DIF_OP_BGU:
9702 case DIF_OP_BGE:
9703 case DIF_OP_BGEU:
9704 case DIF_OP_BL:
9705 case DIF_OP_BLU:
9706 case DIF_OP_BLE:
9707 case DIF_OP_BLEU:
9708 if (label >= dp->dtdo_len) {
9709 err += efunc(pc, "invalid branch target %u\n",
9710 label);
9711 }
9712 if (label <= pc) {
9713 err += efunc(pc, "backward branch to %u\n",
9714 label);
9715 }
9716 break;
9717 case DIF_OP_RET:
9718 if (r1 != 0 || r2 != 0)
9719 err += efunc(pc, "non-zero reserved bits\n");
9720 if (rd >= nregs)
9721 err += efunc(pc, "invalid register %u\n", rd);
9722 break;
9723 case DIF_OP_NOP:
9724 case DIF_OP_POPTS:
9725 case DIF_OP_FLUSHTS:
9726 if (r1 != 0 || r2 != 0 || rd != 0)
9727 err += efunc(pc, "non-zero reserved bits\n");
9728 break;
9729 case DIF_OP_SETX:
9730 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9731 err += efunc(pc, "invalid integer ref %u\n",
9732 DIF_INSTR_INTEGER(instr));
9733 }
9734 if (rd >= nregs)
9735 err += efunc(pc, "invalid register %u\n", rd);
9736 if (rd == 0)
9737 err += efunc(pc, "cannot write to %%r0\n");
9738 break;
9739 case DIF_OP_SETS:
9740 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9741 err += efunc(pc, "invalid string ref %u\n",
9742 DIF_INSTR_STRING(instr));
9743 }
9744 if (rd >= nregs)
9745 err += efunc(pc, "invalid register %u\n", rd);
9746 if (rd == 0)
9747 err += efunc(pc, "cannot write to %%r0\n");
9748 break;
9749 case DIF_OP_LDGA:
9750 case DIF_OP_LDTA:
9751 if (r1 > DIF_VAR_ARRAY_MAX)
9752 err += efunc(pc, "invalid array %u\n", r1);
9753 if (r2 >= nregs)
9754 err += efunc(pc, "invalid register %u\n", r2);
9755 if (rd >= nregs)
9756 err += efunc(pc, "invalid register %u\n", rd);
9757 if (rd == 0)
9758 err += efunc(pc, "cannot write to %%r0\n");
9759 break;
9760 case DIF_OP_LDGS:
9761 case DIF_OP_LDTS:
9762 case DIF_OP_LDLS:
9763 case DIF_OP_LDGAA:
9764 case DIF_OP_LDTAA:
9765 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9766 err += efunc(pc, "invalid variable %u\n", v);
9767 if (rd >= nregs)
9768 err += efunc(pc, "invalid register %u\n", rd);
9769 if (rd == 0)
9770 err += efunc(pc, "cannot write to %%r0\n");
9771 break;
9772 case DIF_OP_STGS:
9773 case DIF_OP_STTS:
9774 case DIF_OP_STLS:
9775 case DIF_OP_STGAA:
9776 case DIF_OP_STTAA:
9777 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9778 err += efunc(pc, "invalid variable %u\n", v);
9779 if (rs >= nregs)
9780 err += efunc(pc, "invalid register %u\n", rs);
9781 break;
9782 case DIF_OP_CALL:
9783 if (subr > DIF_SUBR_MAX &&
9784 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9785 err += efunc(pc, "invalid subr %u\n", subr);
9786 if (rd >= nregs)
9787 err += efunc(pc, "invalid register %u\n", rd);
9788 if (rd == 0)
9789 err += efunc(pc, "cannot write to %%r0\n");
9790
9791 if (subr == DIF_SUBR_COPYOUT ||
9792 subr == DIF_SUBR_COPYOUTSTR ||
9793 subr == DIF_SUBR_KDEBUG_TRACE ||
9794 subr == DIF_SUBR_KDEBUG_TRACE_STRING) {
9795 dp->dtdo_destructive = 1;
9796 }
9797 break;
9798 case DIF_OP_PUSHTR:
9799 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9800 err += efunc(pc, "invalid ref type %u\n", type);
9801 if (r2 >= nregs)
9802 err += efunc(pc, "invalid register %u\n", r2);
9803 if (rs >= nregs)
9804 err += efunc(pc, "invalid register %u\n", rs);
9805 break;
9806 case DIF_OP_PUSHTV:
9807 if (type != DIF_TYPE_CTF)
9808 err += efunc(pc, "invalid val type %u\n", type);
9809 if (r2 >= nregs)
9810 err += efunc(pc, "invalid register %u\n", r2);
9811 if (rs >= nregs)
9812 err += efunc(pc, "invalid register %u\n", rs);
9813 break;
9814 case DIF_OP_STRIP:
9815 if (r1 >= nregs)
9816 err += efunc(pc, "invalid register %u\n", r1);
9817 if (!dtrace_is_valid_ptrauth_key(r2))
9818 err += efunc(pc, "invalid key\n");
9819 if (rd >= nregs)
9820 err += efunc(pc, "invalid register %u\n", rd);
9821 if (rd == 0)
9822 err += efunc(pc, "cannot write to %%r0\n");
9823 break;
9824 default:
9825 err += efunc(pc, "invalid opcode %u\n",
9826 DIF_INSTR_OP(instr));
9827 }
9828 }
9829
9830 if (dp->dtdo_len != 0 &&
9831 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9832 err += efunc(dp->dtdo_len - 1,
9833 "expected 'ret' as last DIF instruction\n");
9834 }
9835
9836 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9837 /*
9838 * If we're not returning by reference, the size must be either
9839 * 0 or the size of one of the base types.
9840 */
9841 switch (dp->dtdo_rtype.dtdt_size) {
9842 case 0:
9843 case sizeof (uint8_t):
9844 case sizeof (uint16_t):
9845 case sizeof (uint32_t):
9846 case sizeof (uint64_t):
9847 break;
9848
9849 default:
9850 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9851 }
9852 }
9853
9854 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9855 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9856 dtrace_diftype_t *vt, *et;
9857 uint_t id;
9858 int ndx;
9859
9860 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9861 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9862 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9863 err += efunc(i, "unrecognized variable scope %d\n",
9864 v->dtdv_scope);
9865 break;
9866 }
9867
9868 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9869 v->dtdv_kind != DIFV_KIND_SCALAR) {
9870 err += efunc(i, "unrecognized variable type %d\n",
9871 v->dtdv_kind);
9872 break;
9873 }
9874
9875 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9876 err += efunc(i, "%d exceeds variable id limit\n", id);
9877 break;
9878 }
9879
9880 if (id < DIF_VAR_OTHER_UBASE)
9881 continue;
9882
9883 /*
9884 * For user-defined variables, we need to check that this
9885 * definition is identical to any previous definition that we
9886 * encountered.
9887 */
9888 ndx = id - DIF_VAR_OTHER_UBASE;
9889
9890 switch (v->dtdv_scope) {
9891 case DIFV_SCOPE_GLOBAL:
9892 if (maxglobal == -1 || ndx > maxglobal)
9893 maxglobal = ndx;
9894
9895 if (ndx < vstate->dtvs_nglobals) {
9896 dtrace_statvar_t *svar;
9897
9898 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9899 existing = &svar->dtsv_var;
9900 }
9901
9902 break;
9903
9904 case DIFV_SCOPE_THREAD:
9905 if (maxtlocal == -1 || ndx > maxtlocal)
9906 maxtlocal = ndx;
9907
9908 if (ndx < vstate->dtvs_ntlocals)
9909 existing = &vstate->dtvs_tlocals[ndx];
9910 break;
9911
9912 case DIFV_SCOPE_LOCAL:
9913 if (maxlocal == -1 || ndx > maxlocal)
9914 maxlocal = ndx;
9915 if (ndx < vstate->dtvs_nlocals) {
9916 dtrace_statvar_t *svar;
9917
9918 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9919 existing = &svar->dtsv_var;
9920 }
9921
9922 break;
9923 }
9924
9925 vt = &v->dtdv_type;
9926
9927 if (vt->dtdt_flags & DIF_TF_BYREF) {
9928 if (vt->dtdt_size == 0) {
9929 err += efunc(i, "zero-sized variable\n");
9930 break;
9931 }
9932
9933 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
9934 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
9935 vt->dtdt_size > dtrace_statvar_maxsize) {
9936 err += efunc(i, "oversized by-ref static\n");
9937 break;
9938 }
9939 }
9940
9941 if (existing == NULL || existing->dtdv_id == 0)
9942 continue;
9943
9944 ASSERT(existing->dtdv_id == v->dtdv_id);
9945 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9946
9947 if (existing->dtdv_kind != v->dtdv_kind)
9948 err += efunc(i, "%d changed variable kind\n", id);
9949
9950 et = &existing->dtdv_type;
9951
9952 if (vt->dtdt_flags != et->dtdt_flags) {
9953 err += efunc(i, "%d changed variable type flags\n", id);
9954 break;
9955 }
9956
9957 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9958 err += efunc(i, "%d changed variable type size\n", id);
9959 break;
9960 }
9961 }
9962
9963 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9964 dif_instr_t instr = dp->dtdo_buf[pc];
9965
9966 uint_t v = DIF_INSTR_VAR(instr);
9967 uint_t op = DIF_INSTR_OP(instr);
9968
9969 switch (op) {
9970 case DIF_OP_LDGS:
9971 case DIF_OP_LDGAA:
9972 case DIF_OP_STGS:
9973 case DIF_OP_STGAA:
9974 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
9975 err += efunc(pc, "invalid variable %u\n", v);
9976 break;
9977 case DIF_OP_LDTS:
9978 case DIF_OP_LDTAA:
9979 case DIF_OP_STTS:
9980 case DIF_OP_STTAA:
9981 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
9982 err += efunc(pc, "invalid variable %u\n", v);
9983 break;
9984 case DIF_OP_LDLS:
9985 case DIF_OP_STLS:
9986 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
9987 err += efunc(pc, "invalid variable %u\n", v);
9988 break;
9989 default:
9990 break;
9991 }
9992 }
9993
9994 return (err);
9995 }
9996
9997 /*
9998 * Validate a DTrace DIF object that is to be used as a helper. Helpers
9999 * are much more constrained than normal DIFOs. Specifically, they may
10000 * not:
10001 *
10002 * 1. Make calls to subroutines other than copyin(), copyinstr(), or
10003 * miscellaneous string routines.
10004 * 2. Access DTrace variables other than the args[] array, and the
10005 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10006 * 3. Have thread-local variables.
10007 * 4. Have dynamic variables.
10008 */
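/*
 * An illustrative sketch (hypothetical DIF, using the mnemonics of the
 * opcodes handled below): a helper body along the lines of
 *
 *	call	copyinstr, %r2		! allowed subroutine
 *	ldgs	DIF_VAR_EXECNAME, %r3	! allowed variable
 *	ret	%r3
 *
 * passes this validator, whereas substituting a call to copyout() would
 * fail with "invalid subr", and a thread-local load (ldts) would fail
 * with "illegal dynamic variable load".
 */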
10009 static int
10010 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10011 {
10012 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10013 int err = 0;
10014 uint_t pc;
10015
10016 for (pc = 0; pc < dp->dtdo_len; pc++) {
10017 dif_instr_t instr = dp->dtdo_buf[pc];
10018
10019 uint_t v = DIF_INSTR_VAR(instr);
10020 uint_t subr = DIF_INSTR_SUBR(instr);
10021 uint_t op = DIF_INSTR_OP(instr);
10022
10023 switch (op) {
10024 case DIF_OP_OR:
10025 case DIF_OP_XOR:
10026 case DIF_OP_AND:
10027 case DIF_OP_SLL:
10028 case DIF_OP_SRL:
10029 case DIF_OP_SRA:
10030 case DIF_OP_SUB:
10031 case DIF_OP_ADD:
10032 case DIF_OP_MUL:
10033 case DIF_OP_SDIV:
10034 case DIF_OP_UDIV:
10035 case DIF_OP_SREM:
10036 case DIF_OP_UREM:
10037 case DIF_OP_COPYS:
10038 case DIF_OP_NOT:
10039 case DIF_OP_MOV:
10040 case DIF_OP_RLDSB:
10041 case DIF_OP_RLDSH:
10042 case DIF_OP_RLDSW:
10043 case DIF_OP_RLDUB:
10044 case DIF_OP_RLDUH:
10045 case DIF_OP_RLDUW:
10046 case DIF_OP_RLDX:
10047 case DIF_OP_ULDSB:
10048 case DIF_OP_ULDSH:
10049 case DIF_OP_ULDSW:
10050 case DIF_OP_ULDUB:
10051 case DIF_OP_ULDUH:
10052 case DIF_OP_ULDUW:
10053 case DIF_OP_ULDX:
10054 case DIF_OP_STB:
10055 case DIF_OP_STH:
10056 case DIF_OP_STW:
10057 case DIF_OP_STX:
10058 case DIF_OP_ALLOCS:
10059 case DIF_OP_CMP:
10060 case DIF_OP_SCMP:
10061 case DIF_OP_TST:
10062 case DIF_OP_BA:
10063 case DIF_OP_BE:
10064 case DIF_OP_BNE:
10065 case DIF_OP_BG:
10066 case DIF_OP_BGU:
10067 case DIF_OP_BGE:
10068 case DIF_OP_BGEU:
10069 case DIF_OP_BL:
10070 case DIF_OP_BLU:
10071 case DIF_OP_BLE:
10072 case DIF_OP_BLEU:
10073 case DIF_OP_RET:
10074 case DIF_OP_NOP:
10075 case DIF_OP_POPTS:
10076 case DIF_OP_FLUSHTS:
10077 case DIF_OP_SETX:
10078 case DIF_OP_SETS:
10079 case DIF_OP_LDGA:
10080 case DIF_OP_LDLS:
10081 case DIF_OP_STGS:
10082 case DIF_OP_STLS:
10083 case DIF_OP_PUSHTR:
10084 case DIF_OP_PUSHTV:
10085 break;
10086
10087 case DIF_OP_LDGS:
10088 if (v >= DIF_VAR_OTHER_UBASE)
10089 break;
10090
10091 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10092 break;
10093
10094 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10095 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10096 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10097 v == DIF_VAR_UID || v == DIF_VAR_GID)
10098 break;
10099
10100 err += efunc(pc, "illegal variable %u\n", v);
10101 break;
10102
10103 case DIF_OP_LDTA:
10104 case DIF_OP_LDTS:
10105 case DIF_OP_LDGAA:
10106 case DIF_OP_LDTAA:
10107 err += efunc(pc, "illegal dynamic variable load\n");
10108 break;
10109
10110 case DIF_OP_STTS:
10111 case DIF_OP_STGAA:
10112 case DIF_OP_STTAA:
10113 err += efunc(pc, "illegal dynamic variable store\n");
10114 break;
10115
10116 case DIF_OP_CALL:
10117 if (subr == DIF_SUBR_ALLOCA ||
10118 subr == DIF_SUBR_BCOPY ||
10119 subr == DIF_SUBR_COPYIN ||
10120 subr == DIF_SUBR_COPYINTO ||
10121 subr == DIF_SUBR_COPYINSTR ||
10122 subr == DIF_SUBR_INDEX ||
10123 subr == DIF_SUBR_INET_NTOA ||
10124 subr == DIF_SUBR_INET_NTOA6 ||
10125 subr == DIF_SUBR_INET_NTOP ||
10126 subr == DIF_SUBR_JSON ||
10127 subr == DIF_SUBR_LLTOSTR ||
10128 subr == DIF_SUBR_STRTOLL ||
10129 subr == DIF_SUBR_RINDEX ||
10130 subr == DIF_SUBR_STRCHR ||
10131 subr == DIF_SUBR_STRJOIN ||
10132 subr == DIF_SUBR_STRRCHR ||
10133 subr == DIF_SUBR_STRSTR ||
10134 subr == DIF_SUBR_KDEBUG_TRACE ||
10135 subr == DIF_SUBR_KDEBUG_TRACE_STRING ||
10136 subr == DIF_SUBR_HTONS ||
10137 subr == DIF_SUBR_HTONL ||
10138 subr == DIF_SUBR_HTONLL ||
10139 subr == DIF_SUBR_NTOHS ||
10140 subr == DIF_SUBR_NTOHL ||
10141 subr == DIF_SUBR_NTOHLL)
10142 break;
10143
10144 err += efunc(pc, "invalid subr %u\n", subr);
10145 break;
10146
10147 default:
10148 err += efunc(pc, "invalid opcode %u\n",
10149 DIF_INSTR_OP(instr));
10150 }
10151 }
10152
10153 return (err);
10154 }
10155
10156 /*
10157 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10158 * basis; 0 if not.
10159 */
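/*
 * For example, a predicate such as /execname == "foo"/ depends only on
 * variables in the list below and performs no memory loads, so its
 * result may be cached per-thread; any predicate referencing, say,
 * timestamp -- or loading memory -- must be re-evaluated on every
 * probe firing.
 */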
10160 static int
10161 dtrace_difo_cacheable(dtrace_difo_t *dp)
10162 {
10163 uint_t i;
10164
10165 if (dp == NULL)
10166 return (0);
10167
10168 for (i = 0; i < dp->dtdo_varlen; i++) {
10169 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10170
10171 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10172 continue;
10173
10174 switch (v->dtdv_id) {
10175 case DIF_VAR_CURTHREAD:
10176 case DIF_VAR_PID:
10177 case DIF_VAR_TID:
10178 case DIF_VAR_EXECNAME:
10179 case DIF_VAR_ZONENAME:
10180 break;
10181
10182 default:
10183 return (0);
10184 }
10185 }
10186
10187 /*
10188 * This DIF object may be cacheable. Now we need to look for any
10189 * array loading instructions, any memory loading instructions, or
10190 * any stores to thread-local variables.
10191 */
10192 for (i = 0; i < dp->dtdo_len; i++) {
10193 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10194
10195 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10196 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10197 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10198 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10199 return (0);
10200 }
10201
10202 return (1);
10203 }
10204
10205 static void
10206 dtrace_difo_hold(dtrace_difo_t *dp)
10207 {
10208 uint_t i;
10209
10210 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10211
10212 dp->dtdo_refcnt++;
10213 ASSERT(dp->dtdo_refcnt != 0);
10214
10215 /*
10216 * We need to check this DIF object for references to the variable
10217 * DIF_VAR_VTIMESTAMP.
10218 */
10219 for (i = 0; i < dp->dtdo_varlen; i++) {
10220 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10221
10222 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10223 continue;
10224
10225 if (dtrace_vtime_references++ == 0)
10226 dtrace_vtime_enable();
10227 }
10228 }
10229
10230 /*
10231 * This routine calculates the dynamic variable chunksize for a given DIF
10232 * object. The calculation is not fool-proof, and can probably be tricked by
10233 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10234 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10235 * if a dynamic variable size exceeds the chunksize.
10236 */
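/*
 * A worked example (hypothetical sizes): for an enabling along the lines
 * of self->x[copyinstr(arg0)] = 1 with a 32-byte string key, the PUSHTR
 * contributes one 32-byte key and the STTAA adds two zero-sized keys
 * (the thread and the variable id), so the loop below computes roughly
 *
 *	sizeof (dtrace_dynvar_t)	variable header
 *	+ 2 * sizeof (dtrace_key_t)	additional key descriptors
 *	+ P2ROUNDUP(32, 8)		key data
 *	+ sizeof (uint64_t)		the stored value
 *
 * rounded up to an 8-byte boundary.
 */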
10237 static void
10238 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10239 {
10240 uint64_t sval = 0;
10241 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10242 const dif_instr_t *text = dp->dtdo_buf;
10243 uint_t pc, srd = 0;
10244 uint_t ttop = 0;
10245 size_t size, ksize;
10246 uint_t id, i;
10247
10248 for (pc = 0; pc < dp->dtdo_len; pc++) {
10249 dif_instr_t instr = text[pc];
10250 uint_t op = DIF_INSTR_OP(instr);
10251 uint_t rd = DIF_INSTR_RD(instr);
10252 uint_t r1 = DIF_INSTR_R1(instr);
10253 uint_t nkeys = 0;
10254 uchar_t scope;
10255
10256 dtrace_key_t *key = tupregs;
10257
10258 switch (op) {
10259 case DIF_OP_SETX:
10260 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10261 srd = rd;
10262 continue;
10263
10264 case DIF_OP_STTS:
10265 key = &tupregs[DIF_DTR_NREGS];
10266 key[0].dttk_size = 0;
10267 key[1].dttk_size = 0;
10268 nkeys = 2;
10269 scope = DIFV_SCOPE_THREAD;
10270 break;
10271
10272 case DIF_OP_STGAA:
10273 case DIF_OP_STTAA:
10274 nkeys = ttop;
10275
10276 if (op == DIF_OP_STTAA)
10277 key[nkeys++].dttk_size = 0;
10278
10279 key[nkeys++].dttk_size = 0;
10280
10281 if (op == DIF_OP_STTAA) {
10282 scope = DIFV_SCOPE_THREAD;
10283 } else {
10284 scope = DIFV_SCOPE_GLOBAL;
10285 }
10286
10287 break;
10288
10289 case DIF_OP_PUSHTR:
10290 if (ttop == DIF_DTR_NREGS)
10291 return;
10292
10293 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10294 /*
10295 * If the register for the size of the "pushtr"
10296 * is %r0 (or the value is 0) and the type is
10297 * a string, we'll use the system-wide default
10298 * string size.
10299 */
10300 tupregs[ttop++].dttk_size =
10301 dtrace_strsize_default;
10302 } else {
10303 if (srd == 0)
10304 return;
10305
10306 if (sval > LONG_MAX)
10307 return;
10308
10309 tupregs[ttop++].dttk_size = sval;
10310 }
10311
10312 break;
10313
10314 case DIF_OP_PUSHTV:
10315 if (ttop == DIF_DTR_NREGS)
10316 return;
10317
10318 tupregs[ttop++].dttk_size = 0;
10319 break;
10320
10321 case DIF_OP_FLUSHTS:
10322 ttop = 0;
10323 break;
10324
10325 case DIF_OP_POPTS:
10326 if (ttop != 0)
10327 ttop--;
10328 break;
10329 }
10330
10331 sval = 0;
10332 srd = 0;
10333
10334 if (nkeys == 0)
10335 continue;
10336
10337 /*
10338 * We have a dynamic variable allocation; calculate its size.
10339 */
10340 for (ksize = 0, i = 0; i < nkeys; i++)
10341 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10342
10343 size = sizeof (dtrace_dynvar_t);
10344 size += sizeof (dtrace_key_t) * (nkeys - 1);
10345 size += ksize;
10346
10347 /*
10348 * Now we need to determine the size of the stored data.
10349 */
10350 id = DIF_INSTR_VAR(instr);
10351
10352 for (i = 0; i < dp->dtdo_varlen; i++) {
10353 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10354
10355 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10356 size += v->dtdv_type.dtdt_size;
10357 break;
10358 }
10359 }
10360
10361 if (i == dp->dtdo_varlen)
10362 return;
10363
10364 /*
10365 * We have the size. If this is larger than the chunk size
10366 * for our dynamic variable state, reset the chunk size.
10367 */
10368 size = P2ROUNDUP(size, sizeof (uint64_t));
10369
10370 /*
10371 * Before setting the chunk size, check that we're not going
10372 * to set it to a negative value...
10373 */
10374 if (size > LONG_MAX)
10375 return;
10376
10377 /*
10378 * ...and make certain that we didn't badly overflow.
10379 */
10380 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10381 return;
10382
10383 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10384 vstate->dtvs_dynvars.dtds_chunksize = size;
10385 }
10386 }
10387
10388 static void
10389 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10390 {
10391 int oldsvars, osz, nsz, otlocals, ntlocals;
10392 uint_t i, id;
10393
10394 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10395 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10396
10397 for (i = 0; i < dp->dtdo_varlen; i++) {
10398 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10399 dtrace_statvar_t *svar;
10400 dtrace_statvar_t ***svarp = NULL;
10401 size_t dsize = 0;
10402 uint8_t scope = v->dtdv_scope;
10403 int *np = NULL;
10404
10405 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10406 continue;
10407
10408 id -= DIF_VAR_OTHER_UBASE;
10409
10410 switch (scope) {
10411 case DIFV_SCOPE_THREAD:
10412 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10413 dtrace_difv_t *tlocals;
10414
10415 if ((ntlocals = (otlocals << 1)) == 0)
10416 ntlocals = 1;
10417
10418 osz = otlocals * sizeof (dtrace_difv_t);
10419 nsz = ntlocals * sizeof (dtrace_difv_t);
10420
10421 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10422
10423 if (osz != 0) {
10424 bcopy(vstate->dtvs_tlocals,
10425 tlocals, osz);
10426 kmem_free(vstate->dtvs_tlocals, osz);
10427 }
10428
10429 vstate->dtvs_tlocals = tlocals;
10430 vstate->dtvs_ntlocals = ntlocals;
10431 }
10432
10433 vstate->dtvs_tlocals[id] = *v;
10434 continue;
10435
10436 case DIFV_SCOPE_LOCAL:
10437 np = &vstate->dtvs_nlocals;
10438 svarp = &vstate->dtvs_locals;
10439
10440 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10441 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10442 sizeof (uint64_t));
10443 else
10444 dsize = (int)NCPU * sizeof (uint64_t);
10445
10446 break;
10447
10448 case DIFV_SCOPE_GLOBAL:
10449 np = &vstate->dtvs_nglobals;
10450 svarp = &vstate->dtvs_globals;
10451
10452 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10453 dsize = v->dtdv_type.dtdt_size +
10454 sizeof (uint64_t);
10455
10456 break;
10457
10458 default:
10459 ASSERT(0);
10460 }
10461
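/*
 * Grow the per-scope statics array geometrically until it covers this
 * variable's index: e.g. (hypothetically) a variable with id 5 arriving
 * when only 4 slots exist doubles the array to 8 entries, copying the
 * old pointers and freeing the old array.
 */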
10462 while (id >= (uint_t)(oldsvars = *np)) {
10463 dtrace_statvar_t **statics;
10464 int newsvars, oldsize, newsize;
10465
10466 if ((newsvars = (oldsvars << 1)) == 0)
10467 newsvars = 1;
10468
10469 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10470 newsize = newsvars * sizeof (dtrace_statvar_t *);
10471
10472 statics = kmem_zalloc(newsize, KM_SLEEP);
10473
10474 if (oldsize != 0) {
10475 bcopy(*svarp, statics, oldsize);
10476 kmem_free(*svarp, oldsize);
10477 }
10478
10479 *svarp = statics;
10480 *np = newsvars;
10481 }
10482
10483 if ((svar = (*svarp)[id]) == NULL) {
10484 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10485 svar->dtsv_var = *v;
10486
10487 if ((svar->dtsv_size = dsize) != 0) {
10488 svar->dtsv_data = (uint64_t)(uintptr_t)
10489 kmem_zalloc(dsize, KM_SLEEP);
10490 }
10491
10492 (*svarp)[id] = svar;
10493 }
10494
10495 svar->dtsv_refcnt++;
10496 }
10497
10498 dtrace_difo_chunksize(dp, vstate);
10499 dtrace_difo_hold(dp);
10500 }
10501
10502 static dtrace_difo_t *
10503 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10504 {
10505 dtrace_difo_t *new;
10506 size_t sz;
10507
10508 ASSERT(dp->dtdo_buf != NULL);
10509 ASSERT(dp->dtdo_refcnt != 0);
10510
10511 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10512
10513 ASSERT(dp->dtdo_buf != NULL);
10514 sz = dp->dtdo_len * sizeof (dif_instr_t);
10515 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10516 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10517 new->dtdo_len = dp->dtdo_len;
10518
10519 if (dp->dtdo_strtab != NULL) {
10520 ASSERT(dp->dtdo_strlen != 0);
10521 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10522 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10523 new->dtdo_strlen = dp->dtdo_strlen;
10524 }
10525
10526 if (dp->dtdo_inttab != NULL) {
10527 ASSERT(dp->dtdo_intlen != 0);
10528 sz = dp->dtdo_intlen * sizeof (uint64_t);
10529 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10530 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10531 new->dtdo_intlen = dp->dtdo_intlen;
10532 }
10533
10534 if (dp->dtdo_vartab != NULL) {
10535 ASSERT(dp->dtdo_varlen != 0);
10536 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10537 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10538 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10539 new->dtdo_varlen = dp->dtdo_varlen;
10540 }
10541
10542 dtrace_difo_init(new, vstate);
10543 return (new);
10544 }
10545
10546 static void
10547 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10548 {
10549 uint_t i;
10550
10551 ASSERT(dp->dtdo_refcnt == 0);
10552
10553 for (i = 0; i < dp->dtdo_varlen; i++) {
10554 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10555 dtrace_statvar_t *svar;
10556 dtrace_statvar_t **svarp = NULL;
10557 uint_t id;
10558 uint8_t scope = v->dtdv_scope;
10559 int *np = NULL;
10560
10561 switch (scope) {
10562 case DIFV_SCOPE_THREAD:
10563 continue;
10564
10565 case DIFV_SCOPE_LOCAL:
10566 np = &vstate->dtvs_nlocals;
10567 svarp = vstate->dtvs_locals;
10568 break;
10569
10570 case DIFV_SCOPE_GLOBAL:
10571 np = &vstate->dtvs_nglobals;
10572 svarp = vstate->dtvs_globals;
10573 break;
10574
10575 default:
10576 ASSERT(0);
10577 }
10578
10579 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10580 continue;
10581
10582 id -= DIF_VAR_OTHER_UBASE;
10583
10584 ASSERT(id < (uint_t)*np);
10585
10586 svar = svarp[id];
10587 ASSERT(svar != NULL);
10588 ASSERT(svar->dtsv_refcnt > 0);
10589
10590 if (--svar->dtsv_refcnt > 0)
10591 continue;
10592
10593 if (svar->dtsv_size != 0) {
10594 ASSERT(svar->dtsv_data != 0);
10595 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10596 svar->dtsv_size);
10597 }
10598
10599 kmem_free(svar, sizeof (dtrace_statvar_t));
10600 svarp[id] = NULL;
10601 }
10602
10603 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10604 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10605 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10606 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10607
10608 kmem_free(dp, sizeof (dtrace_difo_t));
10609 }
10610
10611 static void
10612 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10613 {
10614 uint_t i;
10615
10616 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10617 ASSERT(dp->dtdo_refcnt != 0);
10618
10619 for (i = 0; i < dp->dtdo_varlen; i++) {
10620 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10621
10622 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10623 continue;
10624
10625 ASSERT(dtrace_vtime_references > 0);
10626 if (--dtrace_vtime_references == 0)
10627 dtrace_vtime_disable();
10628 }
10629
10630 if (--dp->dtdo_refcnt == 0)
10631 dtrace_difo_destroy(dp, vstate);
10632 }
10633
10634 /*
10635 * DTrace Format Functions
10636 */
10637
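/*
 * Format strings (e.g. printf() arguments) are interned per-state in
 * dts_formats and reference-counted. Note the one-based convention for
 * the returned index -- a minimal sketch of a caller:
 *
 *	uint16_t fmt = dtrace_format_add(state, "%s %d\n");
 *	if (fmt == 0)
 *		...too many formats; fail silently...
 *	else
 *		...the record refers to state->dts_formats[fmt - 1]...
 */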
10638 static dtrace_format_t*
10639 dtrace_format_new(char *str)
10640 {
10641 dtrace_format_t *fmt = NULL;
10642 size_t bufsize = strlen(str) + 1;
10643
10644 fmt = kmem_zalloc(sizeof(*fmt) + bufsize, KM_SLEEP);
10645
10646 fmt->dtf_refcount = 1;
10647 (void) strlcpy(fmt->dtf_str, str, bufsize);
10648
10649 return fmt;
10650 }
10651
10652 static uint16_t
10653 dtrace_format_add(dtrace_state_t *state, char *str)
10654 {
10655 dtrace_format_t **new;
10656 uint16_t ndx;
10657
10658 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10659 if (state->dts_formats[ndx] == NULL) {
10660 state->dts_formats[ndx] = dtrace_format_new(str);
10661 return (ndx + 1);
10662 }
10663 else if (strcmp(state->dts_formats[ndx]->dtf_str, str) == 0) {
10664 VERIFY(state->dts_formats[ndx]->dtf_refcount < UINT64_MAX);
10665 state->dts_formats[ndx]->dtf_refcount++;
10666 return (ndx + 1);
10667 }
10668 }
10669
10670 if (state->dts_nformats == USHRT_MAX) {
10671 /*
10672 * This is only likely if a denial-of-service attack is being
10673 * attempted. As such, it's okay to fail silently here.
10674 */
10675 return (0);
10676 }
10677
10678 /*
10679 * For simplicity, we always resize the formats array to be exactly the
10680 * number of formats.
10681 */
10682 ndx = state->dts_nformats++;
10683 new = kmem_alloc((ndx + 1) * sizeof (*state->dts_formats), KM_SLEEP);
10684
10685 if (state->dts_formats != NULL) {
10686 ASSERT(ndx != 0);
10687 bcopy(state->dts_formats, new, ndx * sizeof (*state->dts_formats));
10688 kmem_free(state->dts_formats, ndx * sizeof (*state->dts_formats));
10689 }
10690
10691 state->dts_formats = new;
10692 state->dts_formats[ndx] = dtrace_format_new(str);
10693
10694 return (ndx + 1);
10695 }
10696
10697 static void
10698 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10699 {
10700 dtrace_format_t *fmt;
10701
10702 ASSERT(state->dts_formats != NULL);
10703 ASSERT(format <= state->dts_nformats);
10704
10705 fmt = state->dts_formats[format - 1];
10706
10707 ASSERT(fmt != NULL);
10708 VERIFY(fmt->dtf_refcount > 0);
10709
10710 fmt->dtf_refcount--;
10711
10712 if (fmt->dtf_refcount == 0) {
10713 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10714 state->dts_formats[format - 1] = NULL;
10715 }
10716 }
10717
10718 static void
10719 dtrace_format_destroy(dtrace_state_t *state)
10720 {
10721 int i;
10722
10723 if (state->dts_nformats == 0) {
10724 ASSERT(state->dts_formats == NULL);
10725 return;
10726 }
10727
10728 ASSERT(state->dts_formats != NULL);
10729
10730 for (i = 0; i < state->dts_nformats; i++) {
10731 dtrace_format_t *fmt = state->dts_formats[i];
10732
10733 if (fmt == NULL)
10734 continue;
10735
10736 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10737 }
10738
10739 kmem_free(state->dts_formats, state->dts_nformats * sizeof (*state->dts_formats));
10740 state->dts_nformats = 0;
10741 state->dts_formats = NULL;
10742 }
10743
10744 /*
10745 * DTrace Predicate Functions
10746 */
10747 static dtrace_predicate_t *
10748 dtrace_predicate_create(dtrace_difo_t *dp)
10749 {
10750 dtrace_predicate_t *pred;
10751
10752 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10753 ASSERT(dp->dtdo_refcnt != 0);
10754
10755 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10756 pred->dtp_difo = dp;
10757 pred->dtp_refcnt = 1;
10758
10759 if (!dtrace_difo_cacheable(dp))
10760 return (pred);
10761
10762 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10763 /*
10764 * This is only theoretically possible -- we have had 2^32
10765 * cacheable predicates on this machine. We cannot allow any
10766 * more predicates to become cacheable: as unlikely as it is,
10767 * there may be a thread caching a (now stale) predicate cache
10768 * ID. (N.B.: we are successfully resisting the temptation to
10769 * have this cmn_err() "Holy shit -- we executed this code!")
10770 */
10771 return (pred);
10772 }
10773
10774 pred->dtp_cacheid = dtrace_predcache_id++;
10775
10776 return (pred);
10777 }
10778
10779 static void
10780 dtrace_predicate_hold(dtrace_predicate_t *pred)
10781 {
10782 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10783 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10784 ASSERT(pred->dtp_refcnt > 0);
10785
10786 pred->dtp_refcnt++;
10787 }
10788
10789 static void
10790 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10791 {
10792 dtrace_difo_t *dp = pred->dtp_difo;
10793 #pragma unused(dp) /* __APPLE__ */
10794
10795 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10796 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10797 ASSERT(pred->dtp_refcnt > 0);
10798
10799 if (--pred->dtp_refcnt == 0) {
10800 dtrace_difo_release(pred->dtp_difo, vstate);
10801 kmem_free(pred, sizeof (dtrace_predicate_t));
10802 }
10803 }
10804
10805 /*
10806 * DTrace Action Description Functions
10807 */
10808 static dtrace_actdesc_t *
10809 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10810 uint64_t uarg, uint64_t arg)
10811 {
10812 dtrace_actdesc_t *act;
10813
10814 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10815 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10816
10817 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10818 act->dtad_kind = kind;
10819 act->dtad_ntuple = ntuple;
10820 act->dtad_uarg = uarg;
10821 act->dtad_arg = arg;
10822 act->dtad_refcnt = 1;
10823
10824 return (act);
10825 }
10826
10827 static void
10828 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10829 {
10830 ASSERT(act->dtad_refcnt >= 1);
10831 act->dtad_refcnt++;
10832 }
10833
10834 static void
10835 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10836 {
10837 dtrace_actkind_t kind = act->dtad_kind;
10838 dtrace_difo_t *dp;
10839
10840 ASSERT(act->dtad_refcnt >= 1);
10841
10842 if (--act->dtad_refcnt != 0)
10843 return;
10844
10845 if ((dp = act->dtad_difo) != NULL)
10846 dtrace_difo_release(dp, vstate);
10847
10848 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10849 char *str = (char *)(uintptr_t)act->dtad_arg;
10850
10851 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10852 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10853
10854 if (str != NULL)
10855 kmem_free(str, strlen(str) + 1);
10856 }
10857
10858 kmem_free(act, sizeof (dtrace_actdesc_t));
10859 }
10860
10861 /*
10862 * DTrace ECB Functions
10863 */
10864 static dtrace_ecb_t *
10865 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10866 {
10867 dtrace_ecb_t *ecb;
10868 dtrace_epid_t epid;
10869
10870 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10871
10872 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10873 ecb->dte_predicate = NULL;
10874 ecb->dte_probe = probe;
10875
10876 /*
10877 * The default size is the size of the default action: recording
10878 * the header.
10879 */
10880 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10881 ecb->dte_alignment = sizeof (dtrace_epid_t);
10882
10883 epid = state->dts_epid++;
10884
10885 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10886 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10887 int necbs = state->dts_necbs << 1;
10888
10889 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10890
10891 if (necbs == 0) {
10892 ASSERT(oecbs == NULL);
10893 necbs = 1;
10894 }
10895
10896 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10897
10898 if (oecbs != NULL)
10899 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10900
10901 dtrace_membar_producer();
10902 state->dts_ecbs = ecbs;
10903
10904 if (oecbs != NULL) {
10905 /*
10906 * If this state is active, we must dtrace_sync()
10907 * before we can free the old dts_ecbs array: we're
10908 * coming in hot, and there may be active ring
10909 * buffer processing (which indexes into the dts_ecbs
10910 * array) on another CPU.
10911 */
10912 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10913 dtrace_sync();
10914
10915 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10916 }
10917
10918 dtrace_membar_producer();
10919 state->dts_necbs = necbs;
10920 }
10921
10922 ecb->dte_state = state;
10923
10924 ASSERT(state->dts_ecbs[epid - 1] == NULL);
10925 dtrace_membar_producer();
10926 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10927
10928 return (ecb);
10929 }
10930
10931 static int
10932 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10933 {
10934 dtrace_probe_t *probe = ecb->dte_probe;
10935
10936 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10937 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10938 ASSERT(ecb->dte_next == NULL);
10939
10940 if (probe == NULL) {
10941 /*
10942 * This is the NULL probe -- there's nothing to do.
10943 */
10944 return (0);
10945 }
10946
10947 probe->dtpr_provider->dtpv_ecb_count++;
10948 if (probe->dtpr_ecb == NULL) {
10949 dtrace_provider_t *prov = probe->dtpr_provider;
10950
10951 /*
10952 * We're the first ECB on this probe.
10953 */
10954 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10955
10956 if (ecb->dte_predicate != NULL)
10957 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10958
10959 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10960 probe->dtpr_id, probe->dtpr_arg));
10961 } else {
10962 /*
10963 * This probe is already active. Swing the last pointer to
10964 * point to the new ECB, and issue a dtrace_sync() to assure
10965 * that all CPUs have seen the change.
10966 */
10967 ASSERT(probe->dtpr_ecb_last != NULL);
10968 probe->dtpr_ecb_last->dte_next = ecb;
10969 probe->dtpr_ecb_last = ecb;
10970 probe->dtpr_predcache = 0;
10971
10972 dtrace_sync();
10973 return (0);
10974 }
10975 }
10976
10977 static int
10978 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10979 {
10980 dtrace_action_t *act;
10981 uint32_t curneeded = UINT32_MAX;
10982 uint32_t aggbase = UINT32_MAX;
10983
10984 /*
10985 * If we record anything, we always record the dtrace_rechdr_t. (And
10986 * we always record it first.)
10987 */
10988 ecb->dte_size = sizeof (dtrace_rechdr_t);
10989 ecb->dte_alignment = sizeof (dtrace_epid_t);
10990
10991 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10992 dtrace_recdesc_t *rec = &act->dta_rec;
10993 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10994
10995 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
10996
10997 if (DTRACEACT_ISAGG(act->dta_kind)) {
10998 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10999
11000 ASSERT(rec->dtrd_size != 0);
11001 ASSERT(agg->dtag_first != NULL);
11002 ASSERT(act->dta_prev->dta_intuple);
11003 ASSERT(aggbase != UINT32_MAX);
11004 ASSERT(curneeded != UINT32_MAX);
11005
11006 agg->dtag_base = aggbase;
11007 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11008 rec->dtrd_offset = curneeded;
11009 if (curneeded + rec->dtrd_size < curneeded)
11010 return (EINVAL);
11011 curneeded += rec->dtrd_size;
11012 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11013
11014 aggbase = UINT32_MAX;
11015 curneeded = UINT32_MAX;
11016 } else if (act->dta_intuple) {
11017 if (curneeded == UINT32_MAX) {
11018 /*
11019 * This is the first record in a tuple. Align
11020 * curneeded to be at offset 4 in an 8-byte
11021 * aligned block.
11022 */
11023 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11024 ASSERT(aggbase == UINT32_MAX);
11025
11026 curneeded = P2PHASEUP(ecb->dte_size,
11027 sizeof (uint64_t), sizeof (dtrace_aggid_t));
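/*
 * P2PHASEUP(x, 8, 4) yields the smallest value >= x congruent to 4
 * modulo 8: e.g. (hypothetically) dte_size 16 gives curneeded 20, so
 * the 4-byte aggregation id computed below lands at aggbase = 16 --
 * the start of an 8-byte aligned block, as the ASSERT verifies.
 */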
11028
11029 aggbase = curneeded - sizeof (dtrace_aggid_t);
11030 ASSERT(IS_P2ALIGNED(aggbase,
11031 sizeof (uint64_t)));
11032 }
11033
11034 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11035 rec->dtrd_offset = curneeded;
11036 if (curneeded + rec->dtrd_size < curneeded)
11037 return (EINVAL);
11038 curneeded += rec->dtrd_size;
11039 } else {
11040 /* tuples must be followed by an aggregation */
11041 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11042 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11043 rec->dtrd_offset = ecb->dte_size;
11044 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11045 return (EINVAL);
11046 ecb->dte_size += rec->dtrd_size;
11047 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11048 }
11049 }
11050
11051 if ((act = ecb->dte_action) != NULL &&
11052 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11053 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11054 /*
11055 * If the size is still sizeof (dtrace_rechdr_t), then all
11056 * actions store no data; set the size to 0.
11057 */
11058 ecb->dte_size = 0;
11059 }
11060
11061 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11062 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11063 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11064 return (0);
11065 }
11066
11067 static dtrace_action_t *
11068 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11069 {
11070 dtrace_aggregation_t *agg;
11071 size_t size = sizeof (uint64_t);
11072 int ntuple = desc->dtad_ntuple;
11073 dtrace_action_t *act;
11074 dtrace_recdesc_t *frec;
11075 dtrace_aggid_t aggid;
11076 dtrace_state_t *state = ecb->dte_state;
11077
11078 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11079 agg->dtag_ecb = ecb;
11080
11081 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11082
11083 switch (desc->dtad_kind) {
11084 case DTRACEAGG_MIN:
11085 agg->dtag_initial = INT64_MAX;
11086 agg->dtag_aggregate = dtrace_aggregate_min;
11087 break;
11088
11089 case DTRACEAGG_MAX:
11090 agg->dtag_initial = INT64_MIN;
11091 agg->dtag_aggregate = dtrace_aggregate_max;
11092 break;
11093
11094 case DTRACEAGG_COUNT:
11095 agg->dtag_aggregate = dtrace_aggregate_count;
11096 break;
11097
11098 case DTRACEAGG_QUANTIZE:
11099 agg->dtag_aggregate = dtrace_aggregate_quantize;
11100 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11101 sizeof (uint64_t);
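/*
 * That is: a 64-bit value quantizes into 63 positive and 63 negative
 * power-of-two buckets plus a zero bucket --
 * (64 - 1) * 2 + 1 = 127 uint64_t counters.
 */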
11102 break;
11103
11104 case DTRACEAGG_LQUANTIZE: {
11105 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11106 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11107
11108 agg->dtag_initial = desc->dtad_arg;
11109 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11110
11111 if (step == 0 || levels == 0)
11112 goto err;
11113
11114 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11115 break;
11116 }
11117
11118 case DTRACEAGG_LLQUANTIZE: {
11119 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11120 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11121 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11122 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11123 int64_t v;
11124
11125 agg->dtag_initial = desc->dtad_arg;
11126 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11127
11128 if (factor < 2 || low >= high || nsteps < factor)
11129 goto err;
11130
11131 /*
11132 * Now check that the number of steps evenly divides a power
11133 * of the factor. (This assures both integer bucket size and
11134 * linearity within each magnitude.)
11135 */
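/*
 * For example (hypothetical arguments): factor = 10, nsteps = 20
 * passes -- v walks 10, 100, and 100 % 20 == 0 with 20 % 10 == 0 --
 * whereas nsteps = 15 fails, as no power of 10 is divisible by 15.
 */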
11136 for (v = factor; v < nsteps; v *= factor)
11137 continue;
11138
11139 if ((v % nsteps) || (nsteps % factor))
11140 goto err;
11141
11142 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11143 break;
11144 }
11145
11146 case DTRACEAGG_AVG:
11147 agg->dtag_aggregate = dtrace_aggregate_avg;
11148 size = sizeof (uint64_t) * 2;
11149 break;
11150
11151 case DTRACEAGG_STDDEV:
11152 agg->dtag_aggregate = dtrace_aggregate_stddev;
11153 size = sizeof (uint64_t) * 4;
11154 break;
11155
11156 case DTRACEAGG_SUM:
11157 agg->dtag_aggregate = dtrace_aggregate_sum;
11158 break;
11159
11160 default:
11161 goto err;
11162 }
11163
11164 agg->dtag_action.dta_rec.dtrd_size = size;
11165
11166 if (ntuple == 0)
11167 goto err;
11168
11169 /*
11170 * We must make sure that we have enough actions for the n-tuple.
11171 */
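/*
 * For example (hypothetical clause): @a[pid, execname] = count()
 * arrives with ntuple == 2; the backward walk below skips over the two
 * key-recording actions to find dtag_first. Running out of
 * non-aggregating predecessors first means the tuple is malformed.
 */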
11172 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11173 if (DTRACEACT_ISAGG(act->dta_kind))
11174 break;
11175
11176 if (--ntuple == 0) {
11177 /*
11178 * This is the action with which our n-tuple begins.
11179 */
11180 agg->dtag_first = act;
11181 goto success;
11182 }
11183 }
11184
11185 /*
11186 * This n-tuple is short by ntuple elements. Return failure.
11187 */
11188 ASSERT(ntuple != 0);
11189 err:
11190 kmem_free(agg, sizeof (dtrace_aggregation_t));
11191 return (NULL);
11192
11193 success:
11194 /*
11195 * If the last action in the tuple has a size of zero, it's actually
11196 * an expression argument for the aggregating action.
11197 */
11198 ASSERT(ecb->dte_action_last != NULL);
11199 act = ecb->dte_action_last;
11200
11201 if (act->dta_kind == DTRACEACT_DIFEXPR) {
11202 ASSERT(act->dta_difo != NULL);
11203
11204 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11205 agg->dtag_hasarg = 1;
11206 }
11207
11208 /*
11209 * We need to allocate an id for this aggregation.
11210 */
11211 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11212 VM_BESTFIT | VM_SLEEP);
11213
11214 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
11215 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11216 dtrace_aggregation_t **aggs;
11217 int naggs = state->dts_naggregations << 1;
11218 int onaggs = state->dts_naggregations;
11219
11220 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
11221
11222 if (naggs == 0) {
11223 ASSERT(oaggs == NULL);
11224 naggs = 1;
11225 }
11226
11227 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11228
11229 if (oaggs != NULL) {
11230 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11231 kmem_free(oaggs, onaggs * sizeof (*aggs));
11232 }
11233
11234 state->dts_aggregations = aggs;
11235 state->dts_naggregations = naggs;
11236 }
11237
11238 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11239 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11240
11241 frec = &agg->dtag_first->dta_rec;
11242 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11243 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11244
11245 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11246 ASSERT(!act->dta_intuple);
11247 act->dta_intuple = 1;
11248 }
11249
11250 return (&agg->dtag_action);
11251 }
11252
11253 static void
11254 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11255 {
11256 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11257 dtrace_state_t *state = ecb->dte_state;
11258 dtrace_aggid_t aggid = agg->dtag_id;
11259
11260 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11261 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11262
11263 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11264 state->dts_aggregations[aggid - 1] = NULL;
11265
11266 kmem_free(agg, sizeof (dtrace_aggregation_t));
11267 }
11268
11269 static int
11270 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11271 {
11272 dtrace_action_t *action, *last;
11273 dtrace_difo_t *dp = desc->dtad_difo;
11274 uint32_t size = 0, align = sizeof (uint8_t), mask;
11275 uint16_t format = 0;
11276 dtrace_recdesc_t *rec;
11277 dtrace_state_t *state = ecb->dte_state;
11278 dtrace_optval_t *opt = state->dts_options;
11279 dtrace_optval_t nframes = 0, strsize;
11280 uint64_t arg = desc->dtad_arg;
11281
11282 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11283 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11284
11285 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11286 /*
11287 * If this is an aggregating action, there must be neither
11288 * a speculate nor a commit on the action chain.
11289 */
11290 dtrace_action_t *act;
11291
11292 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11293 if (act->dta_kind == DTRACEACT_COMMIT)
11294 return (EINVAL);
11295
11296 if (act->dta_kind == DTRACEACT_SPECULATE)
11297 return (EINVAL);
11298 }
11299
11300 action = dtrace_ecb_aggregation_create(ecb, desc);
11301
11302 if (action == NULL)
11303 return (EINVAL);
11304 } else {
11305 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11306 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11307 dp != NULL && dp->dtdo_destructive)) {
11308 state->dts_destructive = 1;
11309 }
11310
11311 switch (desc->dtad_kind) {
11312 case DTRACEACT_PRINTF:
11313 case DTRACEACT_PRINTA:
11314 case DTRACEACT_SYSTEM:
11315 case DTRACEACT_FREOPEN:
11316 case DTRACEACT_DIFEXPR:
11317 /*
11318 * We know that our arg is a string -- turn it into a
11319 * format.
11320 */
11321 if (arg == 0) {
11322 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11323 desc->dtad_kind == DTRACEACT_DIFEXPR);
11324 format = 0;
11325 } else {
11326 ASSERT(arg != 0);
11327 ASSERT(arg > KERNELBASE);
11328 format = dtrace_format_add(state,
11329 (char *)(uintptr_t)arg);
11330 }
11331
11332 /*FALLTHROUGH*/
11333 case DTRACEACT_LIBACT:
11334 case DTRACEACT_TRACEMEM:
11335 case DTRACEACT_TRACEMEM_DYNSIZE:
11336 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
11337 if (dp == NULL)
11338 return (EINVAL);
11339
11340 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11341 break;
11342
11343 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11344 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11345 return (EINVAL);
11346
11347 size = opt[DTRACEOPT_STRSIZE];
11348 }
11349
11350 break;
11351
11352 case DTRACEACT_STACK:
11353 if ((nframes = arg) == 0) {
11354 nframes = opt[DTRACEOPT_STACKFRAMES];
11355 ASSERT(nframes > 0);
11356 arg = nframes;
11357 }
11358
11359 size = nframes * sizeof (pc_t);
11360 break;
11361
11362 case DTRACEACT_JSTACK:
11363 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11364 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11365
11366 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11367 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11368
11369 arg = DTRACE_USTACK_ARG(nframes, strsize);
11370
11371 /*FALLTHROUGH*/
11372 case DTRACEACT_USTACK:
11373 if (desc->dtad_kind != DTRACEACT_JSTACK &&
11374 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11375 strsize = DTRACE_USTACK_STRSIZE(arg);
11376 nframes = opt[DTRACEOPT_USTACKFRAMES];
11377 ASSERT(nframes > 0);
11378 arg = DTRACE_USTACK_ARG(nframes, strsize);
11379 }
11380
11381 /*
11382 * Save a slot for the pid.
11383 */
11384 size = (nframes + 1) * sizeof (uint64_t);
11385 size += DTRACE_USTACK_STRSIZE(arg);
11386 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11387
11388 break;
11389
11390 case DTRACEACT_SYM:
11391 case DTRACEACT_MOD:
11392 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11393 sizeof (uint64_t)) ||
11394 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11395 return (EINVAL);
11396 break;
11397
11398 case DTRACEACT_USYM:
11399 case DTRACEACT_UMOD:
11400 case DTRACEACT_UADDR:
11401 if (dp == NULL ||
11402 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11403 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11404 return (EINVAL);
11405
11406 /*
11407 * We have a slot for the pid, plus a slot for the
11408 * argument. To keep things simple (aligned with
11409 * bitness-neutral sizing), we store each as a 64-bit
11410 * quantity.
11411 */
11412 size = 2 * sizeof (uint64_t);
11413 break;
11414
11415 case DTRACEACT_STOP:
11416 case DTRACEACT_BREAKPOINT:
11417 case DTRACEACT_PANIC:
11418 break;
11419
11420 case DTRACEACT_CHILL:
11421 case DTRACEACT_DISCARD:
11422 case DTRACEACT_RAISE:
11423 case DTRACEACT_PIDRESUME: /* __APPLE__ */
11424 if (dp == NULL)
11425 return (EINVAL);
11426 break;
11427
11428 case DTRACEACT_EXIT:
11429 if (dp == NULL ||
11430 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11431 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11432 return (EINVAL);
11433 break;
11434
11435 case DTRACEACT_SPECULATE:
11436 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11437 return (EINVAL);
11438
11439 if (dp == NULL)
11440 return (EINVAL);
11441
11442 state->dts_speculates = 1;
11443 break;
11444
11445 case DTRACEACT_COMMIT: {
11446 dtrace_action_t *act = ecb->dte_action;
11447
11448 for (; act != NULL; act = act->dta_next) {
11449 if (act->dta_kind == DTRACEACT_COMMIT)
11450 return (EINVAL);
11451 }
11452
11453 if (dp == NULL)
11454 return (EINVAL);
11455 break;
11456 }
11457
11458 default:
11459 return (EINVAL);
11460 }
11461
11462 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11463 /*
11464 * If this is a data-storing action or a speculate,
11465 * we must be sure that there isn't a commit on the
11466 * action chain.
11467 */
11468 dtrace_action_t *act = ecb->dte_action;
11469
11470 for (; act != NULL; act = act->dta_next) {
11471 if (act->dta_kind == DTRACEACT_COMMIT)
11472 return (EINVAL);
11473 }
11474 }
11475
11476 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11477 action->dta_rec.dtrd_size = size;
11478 }
11479
11480 action->dta_refcnt = 1;
11481 rec = &action->dta_rec;
11482 size = rec->dtrd_size;
11483
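/*
 * Derive the record's alignment from its size: the loop below selects
 * the largest power of two (up to 8) that evenly divides the size, so
 * e.g. a 12-byte record is 4-byte aligned and a 16-byte record is
 * 8-byte aligned; odd sizes fall through to the default of 1.
 */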
11484 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11485 if (!(size & mask)) {
11486 align = mask + 1;
11487 break;
11488 }
11489 }
11490
11491 action->dta_kind = desc->dtad_kind;
11492
11493 if ((action->dta_difo = dp) != NULL)
11494 dtrace_difo_hold(dp);
11495
11496 rec->dtrd_action = action->dta_kind;
11497 rec->dtrd_arg = arg;
11498 rec->dtrd_uarg = desc->dtad_uarg;
11499 rec->dtrd_alignment = (uint16_t)align;
11500 rec->dtrd_format = format;
11501
11502 if ((last = ecb->dte_action_last) != NULL) {
11503 ASSERT(ecb->dte_action != NULL);
11504 action->dta_prev = last;
11505 last->dta_next = action;
11506 } else {
11507 ASSERT(ecb->dte_action == NULL);
11508 ecb->dte_action = action;
11509 }
11510
11511 ecb->dte_action_last = action;
11512
11513 return (0);
11514 }
11515
11516 static void
11517 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11518 {
11519 dtrace_action_t *act = ecb->dte_action, *next;
11520 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11521 dtrace_difo_t *dp;
11522 uint16_t format;
11523
11524 if (act != NULL && act->dta_refcnt > 1) {
11525 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11526 act->dta_refcnt--;
11527 } else {
11528 for (; act != NULL; act = next) {
11529 next = act->dta_next;
11530 ASSERT(next != NULL || act == ecb->dte_action_last);
11531 ASSERT(act->dta_refcnt == 1);
11532
11533 if ((format = act->dta_rec.dtrd_format) != 0)
11534 dtrace_format_remove(ecb->dte_state, format);
11535
11536 if ((dp = act->dta_difo) != NULL)
11537 dtrace_difo_release(dp, vstate);
11538
11539 if (DTRACEACT_ISAGG(act->dta_kind)) {
11540 dtrace_ecb_aggregation_destroy(ecb, act);
11541 } else {
11542 kmem_free(act, sizeof (dtrace_action_t));
11543 }
11544 }
11545 }
11546
11547 ecb->dte_action = NULL;
11548 ecb->dte_action_last = NULL;
11549 ecb->dte_size = 0;
11550 }
11551
11552 static void
11553 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11554 {
11555 /*
11556 * We disable the ECB by removing it from its probe.
11557 */
11558 dtrace_ecb_t *pecb, *prev = NULL;
11559 dtrace_probe_t *probe = ecb->dte_probe;
11560
11561 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11562
11563 if (probe == NULL) {
11564 /*
11565 * This is the NULL probe; there is nothing to disable.
11566 */
11567 return;
11568 }
11569
11570 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11571 if (pecb == ecb)
11572 break;
11573 prev = pecb;
11574 }
11575
11576 ASSERT(pecb != NULL);
11577
11578 if (prev == NULL) {
11579 probe->dtpr_ecb = ecb->dte_next;
11580 } else {
11581 prev->dte_next = ecb->dte_next;
11582 }
11583
11584 if (ecb == probe->dtpr_ecb_last) {
11585 ASSERT(ecb->dte_next == NULL);
11586 probe->dtpr_ecb_last = prev;
11587 }
11588
11589 probe->dtpr_provider->dtpv_ecb_count--;
11590 /*
11591 * The ECB has been disconnected from the probe; now sync to assure
11592 * that all CPUs have seen the change before returning.
11593 */
11594 dtrace_sync();
11595
11596 if (probe->dtpr_ecb == NULL) {
11597 /*
11598 * That was the last ECB on the probe; clear the predicate
11599 * cache ID for the probe, disable it and sync one more time
11600 * to assure that we'll never hit it again.
11601 */
11602 dtrace_provider_t *prov = probe->dtpr_provider;
11603
11604 ASSERT(ecb->dte_next == NULL);
11605 ASSERT(probe->dtpr_ecb_last == NULL);
11606 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11607 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11608 probe->dtpr_id, probe->dtpr_arg);
11609 dtrace_sync();
11610 } else {
11611 /*
11612 * There is at least one ECB remaining on the probe. If there
11613 * is _exactly_ one, set the probe's predicate cache ID to be
11614 * the predicate cache ID of the remaining ECB.
11615 */
11616 ASSERT(probe->dtpr_ecb_last != NULL);
11617 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11618
11619 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11620 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11621
11622 ASSERT(probe->dtpr_ecb->dte_next == NULL);
11623
11624 if (p != NULL)
11625 probe->dtpr_predcache = p->dtp_cacheid;
11626 }
11627
11628 ecb->dte_next = NULL;
11629 }
11630 }
11631
11632 static void
11633 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11634 {
11635 dtrace_state_t *state = ecb->dte_state;
11636 dtrace_vstate_t *vstate = &state->dts_vstate;
11637 dtrace_predicate_t *pred;
11638 dtrace_epid_t epid = ecb->dte_epid;
11639
11640 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11641 ASSERT(ecb->dte_next == NULL);
11642 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11643
11644 if ((pred = ecb->dte_predicate) != NULL)
11645 dtrace_predicate_release(pred, vstate);
11646
11647 dtrace_ecb_action_remove(ecb);
11648
11649 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11650 state->dts_ecbs[epid - 1] = NULL;
11651
11652 kmem_free(ecb, sizeof (dtrace_ecb_t));
11653 }
11654
11655 static dtrace_ecb_t *
11656 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11657 dtrace_enabling_t *enab)
11658 {
11659 dtrace_ecb_t *ecb;
11660 dtrace_predicate_t *pred;
11661 dtrace_actdesc_t *act;
11662 dtrace_provider_t *prov;
11663 dtrace_ecbdesc_t *desc = enab->dten_current;
11664
11665 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11666 ASSERT(state != NULL);
11667
11668 ecb = dtrace_ecb_add(state, probe);
11669 ecb->dte_uarg = desc->dted_uarg;
11670
11671 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11672 dtrace_predicate_hold(pred);
11673 ecb->dte_predicate = pred;
11674 }
11675
11676 if (probe != NULL) {
11677 /*
11678 * If the provider shows more leg than the consumer is old
11679 * enough to see, we need to enable the appropriate implicit
11680 * predicate bits to prevent the ecb from activating at
11681 * revealing times.
11682 *
11683 * Providers specifying DTRACE_PRIV_USER at register time
11684 * are stating that they need the /proc-style privilege
11685 * model to be enforced, and this is what DTRACE_COND_OWNER
11686 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11687 */
11688 prov = probe->dtpr_provider;
11689 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11690 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11691 ecb->dte_cond |= DTRACE_COND_OWNER;
11692
11693 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11694 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11695 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11696
11697 /*
11698 * If the provider shows us kernel innards and the user
11699 * is lacking sufficient privilege, enable the
11700 * DTRACE_COND_USERMODE implicit predicate.
11701 */
11702 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11703 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11704 ecb->dte_cond |= DTRACE_COND_USERMODE;
11705 }
11706
11707 if (dtrace_ecb_create_cache != NULL) {
11708 /*
11709 * If we have a cached ecb, we'll use its action list instead
11710 * of creating our own (saving both time and space).
11711 */
11712 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11713 dtrace_action_t *act_if = cached->dte_action;
11714
11715 if (act_if != NULL) {
11716 ASSERT(act_if->dta_refcnt > 0);
11717 act_if->dta_refcnt++;
11718 ecb->dte_action = act_if;
11719 ecb->dte_action_last = cached->dte_action_last;
11720 ecb->dte_needed = cached->dte_needed;
11721 ecb->dte_size = cached->dte_size;
11722 ecb->dte_alignment = cached->dte_alignment;
11723 }
11724
11725 return (ecb);
11726 }
11727
11728 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11729 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11730 dtrace_ecb_destroy(ecb);
11731 return (NULL);
11732 }
11733 }
11734
11735 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11736 dtrace_ecb_destroy(ecb);
11737 return (NULL);
11738 }
11739
11740 return (dtrace_ecb_create_cache = ecb);
11741 }
11742
11743 static int
11744 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11745 {
11746 dtrace_ecb_t *ecb;
11747 dtrace_enabling_t *enab = arg1;
11748 dtrace_ecbdesc_t *ep = arg2;
11749 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11750
11751 ASSERT(state != NULL);
11752
11753 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11754 /*
11755 * This probe was created in a generation for which this
11756 * enabling has previously created ECBs; we don't want to
11757 * enable it again, so just kick out.
11758 */
11759 return (DTRACE_MATCH_NEXT);
11760 }
11761
11762 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11763 return (DTRACE_MATCH_DONE);
11764
11765 if (dtrace_ecb_enable(ecb) < 0)
11766 return (DTRACE_MATCH_FAIL);
11767
11768 return (DTRACE_MATCH_NEXT);
11769 }
11770
11771 static dtrace_ecb_t *
11772 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11773 {
11774 dtrace_ecb_t *ecb;
11775 #pragma unused(ecb) /* __APPLE__ */
11776
11777 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11778
11779 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11780 return (NULL);
11781
11782 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11783 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11784
11785 return (state->dts_ecbs[id - 1]);
11786 }
11787
11788 static dtrace_aggregation_t *
11789 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11790 {
11791 dtrace_aggregation_t *agg;
11792 #pragma unused(agg) /* __APPLE__ */
11793
11794 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11795
11796 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11797 return (NULL);
11798
11799 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11800 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11801 agg->dtag_id == id);
11802
11803 return (state->dts_aggregations[id - 1]);
11804 }
11805
11806 /*
11807 * DTrace Buffer Functions
11808 *
11809 * The following functions manipulate DTrace buffers. Most of these functions
11810 * are called in the context of establishing or processing consumer state;
11811 * exceptions are explicitly noted.
11812 */
11813
11814 /*
11815 * Note: called from cross call context. This function switches the two
11816 * buffers on a given CPU. The atomicity of this operation is assured by
11817 * disabling interrupts while the actual switch takes place; the disabling of
11818 * interrupts serializes the execution with any execution of dtrace_probe() on
11819 * the same CPU.
11820 */
11821 static void
11822 dtrace_buffer_switch(dtrace_buffer_t *buf)
11823 {
11824 caddr_t tomax = buf->dtb_tomax;
11825 caddr_t xamot = buf->dtb_xamot;
11826 dtrace_icookie_t cookie;
11827 hrtime_t now;
11828
11829 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11830 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11831
11832 cookie = dtrace_interrupt_disable();
11833 now = dtrace_gethrtime();
11834 buf->dtb_tomax = xamot;
11835 buf->dtb_xamot = tomax;
11836 buf->dtb_xamot_drops = buf->dtb_drops;
11837 buf->dtb_xamot_offset = buf->dtb_offset;
11838 buf->dtb_xamot_errors = buf->dtb_errors;
11839 buf->dtb_xamot_flags = buf->dtb_flags;
11840 buf->dtb_offset = 0;
11841 buf->dtb_drops = 0;
11842 buf->dtb_errors = 0;
11843 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11844 buf->dtb_interval = now - buf->dtb_switched;
11845 buf->dtb_switched = now;
11846 buf->dtb_cur_limit = buf->dtb_limit;
11847
11848 dtrace_interrupt_enable(cookie);
11849 }
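
/*
 * An illustrative sketch (hypothetical helper name, assuming the
 * dtrace_xcall() cross call interface declared in <sys/dtrace_impl.h>)
 * of how a consumer-driven snapshot would drive the switch above.
 */
static void
example_buffer_snapshot(dtrace_state_t *state, processorid_t cpu)
{
        dtrace_buffer_t *buf = &state->dts_buffer[cpu];

        /*
         * Run the switch on the owning CPU; the interrupt disabling in
         * dtrace_buffer_switch() serializes it against dtrace_probe().
         */
        dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);

        /*
         * buf->dtb_xamot now holds buf->dtb_xamot_offset bytes of stable
         * data; probes continue writing to the freshly reset dtb_tomax.
         */
}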
11850
11851 /*
11852 * Note: called from cross call context. This function activates a buffer
11853 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11854 * is guaranteed by the disabling of interrupts.
11855 */
11856 static void
11857 dtrace_buffer_activate(dtrace_state_t *state)
11858 {
11859 dtrace_buffer_t *buf;
11860 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11861
11862 buf = &state->dts_buffer[CPU->cpu_id];
11863
11864 if (buf->dtb_tomax != NULL) {
11865 /*
11866 * We might like to assert that the buffer is marked inactive,
11867 * but this isn't necessarily true: the CPU that processes the
11868 * BEGIN probe has its buffer activated manually. In this case,
11869 * we take the (harmless) action of re-clearing the INACTIVE
11870 * bit.
11871 */
11872 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11873 }
11874
11875 dtrace_interrupt_enable(cookie);
11876 }
11877
11878 static int
11879 dtrace_buffer_canalloc(size_t size)
11880 {
11881 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11882 return (B_FALSE);
11883 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11884 return (B_FALSE);
11885
11886 return (B_TRUE);
11887 }
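
/*
 * The ordering of the two tests above matters: the first guards the
 * addition in the second against unsigned wraparound. For example, with
 * dtrace_buffer_memory_inuse = UINT64_MAX - 8 and size = 64, the sum
 * (size + dtrace_buffer_memory_inuse) wraps to 55 -- below any plausible
 * maxsize -- so the second test alone would wrongly allow the allocation.
 */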
11888
11889 static int
11890 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11891 processorid_t cpu)
11892 {
11893 dtrace_cpu_t *cp;
11894 dtrace_buffer_t *buf;
11895 size_t size_before_alloc = dtrace_buffer_memory_inuse;
11896
11897 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11898 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11899
11900 if (size > (size_t)dtrace_nonroot_maxsize &&
11901 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11902 return (EFBIG);
11903
11904 cp = cpu_list;
11905
11906 do {
11907 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11908 continue;
11909
11910 buf = &bufs[cp->cpu_id];
11911
11912 /*
11913 * If there is already a buffer allocated for this CPU, it
11914 * is only possible that this is a DR (dynamic reconfiguration)
11915 * event; in this case, the buffer size must match our specified size.
11916 */
11917 if (buf->dtb_tomax != NULL) {
11918 ASSERT(buf->dtb_size == size);
11919 continue;
11920 }
11921
11922 ASSERT(buf->dtb_xamot == NULL);
11923
11924 /* DTrace, please do not eat all the memory. */
11925 if (dtrace_buffer_canalloc(size) == B_FALSE)
11926 goto err;
11927 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11928 goto err;
11929 dtrace_buffer_memory_inuse += size;
11930
11931 /* Ensure that the limit is always lower than the size */
11932 limit = limit == size ? limit - 1 : limit;
11933 buf->dtb_cur_limit = limit;
11934 buf->dtb_limit = limit;
11935 buf->dtb_size = size;
11936 buf->dtb_flags = flags;
11937 buf->dtb_offset = 0;
11938 buf->dtb_drops = 0;
11939
11940 if (flags & DTRACEBUF_NOSWITCH)
11941 continue;
11942
11943 /* DTrace, please do not eat all the memory. */
11944 if (dtrace_buffer_canalloc(size) == B_FALSE)
11945 goto err;
11946 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11947 goto err;
11948 dtrace_buffer_memory_inuse += size;
11949 } while ((cp = cp->cpu_next) != cpu_list);
11950
11951 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
11952
11953 return (0);
11954
11955 err:
11956 cp = cpu_list;
11957
11958 do {
11959 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11960 continue;
11961
11962 buf = &bufs[cp->cpu_id];
11963
11964 if (buf->dtb_xamot != NULL) {
11965 ASSERT(buf->dtb_tomax != NULL);
11966 ASSERT(buf->dtb_size == size);
11967 kmem_free(buf->dtb_xamot, size);
11968 }
11969
11970 if (buf->dtb_tomax != NULL) {
11971 ASSERT(buf->dtb_size == size);
11972 kmem_free(buf->dtb_tomax, size);
11973 }
11974
11975 buf->dtb_tomax = NULL;
11976 buf->dtb_xamot = NULL;
11977 buf->dtb_size = 0;
11978 } while ((cp = cp->cpu_next) != cpu_list);
11979
11980 /* Restore the size saved before allocating memory */
11981 dtrace_buffer_memory_inuse = size_before_alloc;
11982
11983 return (ENOMEM);
11984 }
11985
11986 /*
11987 * Note: called from probe context. This function just increments the drop
11988 * count on a buffer. It has been made a function to allow for the
11989 * possibility of understanding the source of mysterious drop counts. (A
11990 * problem for which one may be particularly disappointed that DTrace cannot
11991 * be used to understand DTrace.)
11992 */
11993 static void
11994 dtrace_buffer_drop(dtrace_buffer_t *buf)
11995 {
11996 buf->dtb_drops++;
11997 }
11998
11999 /*
12000 * Note: called from probe context. This function is called to reserve space
12001 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12002 * mstate. Returns the new offset in the buffer, or a negative value if an
12003 * error has occurred.
12004 */
12005 static intptr_t
12006 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12007 dtrace_state_t *state, dtrace_mstate_t *mstate)
12008 {
12009 intptr_t offs = buf->dtb_offset, soffs;
12010 intptr_t woffs;
12011 caddr_t tomax;
12012 size_t total_off;
12013
12014 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12015 return (-1);
12016
12017 if ((tomax = buf->dtb_tomax) == NULL) {
12018 dtrace_buffer_drop(buf);
12019 return (-1);
12020 }
12021
12022 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12023 while (offs & (align - 1)) {
12024 /*
12025 * Assert that our alignment is off by a number which
12026 * is itself sizeof (uint32_t) aligned.
12027 */
12028 ASSERT(!((align - (offs & (align - 1))) &
12029 (sizeof (uint32_t) - 1)));
12030 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12031 offs += sizeof (uint32_t);
12032 }
12033
12034 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12035 if (buf->dtb_cur_limit == buf->dtb_limit) {
12036 buf->dtb_cur_limit = buf->dtb_size;
12037
12038 os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12039 /*
12040 * Set an AST on the current processor
12041 * so that we can wake up the process
12042 * outside of probe context, when we know
12043 * it is safe to do so.
12044 */
12045 minor_t minor = getminor(state->dts_dev);
12046 ASSERT(minor < 32);
12047
12048 os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
12049 ast_dtrace_on();
12050 }
12051 if ((uint64_t)soffs > buf->dtb_size) {
12052 dtrace_buffer_drop(buf);
12053 return (-1);
12054 }
12055 }
12056
12057 if (mstate == NULL)
12058 return (offs);
12059
12060 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12061 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12062 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12063
12064 return (offs);
12065 }
12066
12067 if (buf->dtb_flags & DTRACEBUF_FILL) {
12068 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12069 (buf->dtb_flags & DTRACEBUF_FULL))
12070 return (-1);
12071 goto out;
12072 }
12073
12074 total_off = needed + (offs & (align - 1));
12075
12076 /*
12077 * For a ring buffer, life is quite a bit more complicated. Before
12078 * we can store any padding, we need to adjust our wrapping offset.
12079 * (If we've never before wrapped or we're not about to, no adjustment
12080 * is required.)
12081 */
12082 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12083 offs + total_off > buf->dtb_size) {
12084 woffs = buf->dtb_xamot_offset;
12085
12086 if (offs + total_off > buf->dtb_size) {
12087 /*
12088 * We can't fit in the end of the buffer. First, a
12089 * sanity check that we can fit in the buffer at all.
12090 */
12091 if (total_off > buf->dtb_size) {
12092 dtrace_buffer_drop(buf);
12093 return (-1);
12094 }
12095
12096 /*
12097 * We're going to be storing at the top of the buffer,
12098 * so now we need to deal with the wrapped offset. We
12099 * only reset our wrapped offset to 0 if it is
12100 * currently greater than the current offset. If it
12101 * is less than the current offset, it is because a
12102 * previous allocation induced a wrap -- but the
12103 * allocation didn't subsequently take the space due
12104 * to an error or false predicate evaluation. In this
12105 * case, we'll just leave the wrapped offset alone: if
12106 * the wrapped offset hasn't been advanced far enough
12107 * for this allocation, it will be adjusted in the
12108 * lower loop.
12109 */
12110 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12111 if (woffs >= offs)
12112 woffs = 0;
12113 } else {
12114 woffs = 0;
12115 }
12116
12117 /*
12118 * Now we know that we're going to be storing to the
12119 * top of the buffer and that there is room for us
12120 * there. We need to clear the buffer from the current
12121 * offset to the end (there may be old gunk there).
12122 */
12123 while ((uint64_t)offs < buf->dtb_size)
12124 tomax[offs++] = 0;
12125
12126 /*
12127 * We need to set our offset to zero. And because we
12128 * are wrapping, we need to set the bit indicating as
12129 * much. We can also adjust our needed space back
12130 * down to the space required by the ECB -- we know
12131 * that the top of the buffer is aligned.
12132 */
12133 offs = 0;
12134 total_off = needed;
12135 buf->dtb_flags |= DTRACEBUF_WRAPPED;
12136 } else {
12137 /*
12138 * There is room for us in the buffer, so we simply
12139 * need to check the wrapped offset.
12140 */
12141 if (woffs < offs) {
12142 /*
12143 * The wrapped offset is less than the offset.
12144 * This can happen if we allocated buffer space
12145 * that induced a wrap, but then we didn't
12146 * subsequently take the space due to an error
12147 * or false predicate evaluation. This is
12148 * okay; we know that _this_ allocation isn't
12149 * going to induce a wrap. We still can't
12150 * reset the wrapped offset to be zero,
12151 * however: the space may have been trashed in
12152 * the previous failed probe attempt. But at
12153 * least the wrapped offset doesn't need to
12154 * be adjusted at all...
12155 */
12156 goto out;
12157 }
12158 }
12159
12160 while (offs + total_off > (size_t)woffs) {
12161 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12162 size_t size;
12163
12164 if (epid == DTRACE_EPIDNONE) {
12165 size = sizeof (uint32_t);
12166 } else {
12167 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12168 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12169
12170 size = state->dts_ecbs[epid - 1]->dte_size;
12171 }
12172
12173 ASSERT(woffs + size <= buf->dtb_size);
12174 ASSERT(size != 0);
12175
12176 if (woffs + size == buf->dtb_size) {
12177 /*
12178 * We've reached the end of the buffer; we want
12179 * to set the wrapped offset to 0 and break
12180 * out. However, if the offs is 0, then we're
12181 * in a strange edge-condition: the amount of
12182 * space that we want to reserve plus the size
12183 * of the record that we're overwriting is
12184 * greater than the size of the buffer. This
12185 * is problematic because if we reserve the
12186 * space but subsequently don't consume it (due
12187 * to a failed predicate or error) the wrapped
12188 * offset will be 0 -- yet the EPID at offset 0
12189 * will not be committed. This situation is
12190 * relatively easy to deal with: if we're in
12191 * this case, the buffer is indistinguishable
12192 * from one that hasn't wrapped; we need only
12193 * finish the job by clearing the wrapped bit,
12194 * explicitly setting the offset to be 0, and
12195 * zero'ing out the old data in the buffer.
12196 */
12197 if (offs == 0) {
12198 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12199 buf->dtb_offset = 0;
12200 woffs = total_off;
12201
12202 while ((uint64_t)woffs < buf->dtb_size)
12203 tomax[woffs++] = 0;
12204 }
12205
12206 woffs = 0;
12207 break;
12208 }
12209
12210 woffs += size;
12211 }
12212
12213 /*
12214 * We have a wrapped offset. It may be that the wrapped offset
12215 * has become zero -- that's okay.
12216 */
12217 buf->dtb_xamot_offset = woffs;
12218 }
12219
12220 out:
12221 /*
12222 * Now we can plow the buffer with any necessary padding.
12223 */
12224 while (offs & (align - 1)) {
12225 /*
12226 * Assert that our alignment is off by a number which
12227 * is itself sizeof (uint32_t) aligned.
12228 */
12229 ASSERT(!((align - (offs & (align - 1))) &
12230 (sizeof (uint32_t) - 1)));
12231 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12232 offs += sizeof (uint32_t);
12233 }
12234
12235 if (buf->dtb_flags & DTRACEBUF_FILL) {
12236 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12237 buf->dtb_flags |= DTRACEBUF_FULL;
12238 return (-1);
12239 }
12240 }
12241
12242 if (mstate == NULL)
12243 return (offs);
12244
12245 /*
12246 * For ring buffers and fill buffers, the scratch space is always
12247 * the inactive buffer.
12248 */
12249 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12250 mstate->dtms_scratch_size = buf->dtb_size;
12251 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12252
12253 return (offs);
12254 }
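
/*
 * An illustrative sketch (hypothetical helper name) of the consumer side
 * of the layout established above: each record begins with a uint32_t
 * EPID, DTRACE_EPIDNONE words are alignment padding, and record lengths
 * come from the owning ECB -- the same convention that the wrapped-offset
 * advance loop above relies upon.
 */
static void
example_buffer_walk(dtrace_state_t *state, caddr_t base, size_t len)
{
        size_t offs = 0;

        while (offs + sizeof (uint32_t) <= len) {
                dtrace_epid_t epid = *(uint32_t *)(base + offs);

                if (epid == DTRACE_EPIDNONE) {
                        offs += sizeof (uint32_t);      /* padding word */
                        continue;
                }

                ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
                ASSERT(state->dts_ecbs[epid - 1] != NULL);

                /* The record payload would be decoded here... */
                offs += state->dts_ecbs[epid - 1]->dte_size;
        }
}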
12255
12256 static void
12257 dtrace_buffer_polish(dtrace_buffer_t *buf)
12258 {
12259 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12260 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12261
12262 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12263 return;
12264
12265 /*
12266 * We need to polish the ring buffer. There are three cases:
12267 *
12268 * - The first (and presumably most common) is that there is no gap
12269 * between the buffer offset and the wrapped offset. In this case,
12270 * there is nothing in the buffer that isn't valid data; we can
12271 * mark the buffer as polished and return.
12272 *
12273 * - The second (less common than the first but still more common
12274 * than the third) is that there is a gap between the buffer offset
12275 * and the wrapped offset, and the wrapped offset is larger than the
12276 * buffer offset. This can happen because of an alignment issue, or
12277 * can happen because of a call to dtrace_buffer_reserve() that
12278 * didn't subsequently consume the buffer space. In this case,
12279 * we need to zero the data from the buffer offset to the wrapped
12280 * offset.
12281 *
12282 * - The third (and least common) is that there is a gap between the
12283 * buffer offset and the wrapped offset, but the wrapped offset is
12284 * _less_ than the buffer offset. This can only happen because a
12285 * call to dtrace_buffer_reserve() induced a wrap, but the space
12286 * was not subsequently consumed. In this case, we need to zero the
12287 * space from the offset to the end of the buffer _and_ from the
12288 * top of the buffer to the wrapped offset.
12289 */
12290 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12291 bzero(buf->dtb_tomax + buf->dtb_offset,
12292 buf->dtb_xamot_offset - buf->dtb_offset);
12293 }
12294
12295 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12296 bzero(buf->dtb_tomax + buf->dtb_offset,
12297 buf->dtb_size - buf->dtb_offset);
12298 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12299 }
12300 }
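
/*
 * A worked example of the second case above: with dtb_offset = 0x40 and
 * dtb_xamot_offset = 0x60 on a wrapped buffer, the bytes in [0x40, 0x60)
 * were reserved but never committed; zeroing them makes them parse as
 * DTRACE_EPIDNONE padding rather than as stale record data.
 */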
12301
12302 static void
12303 dtrace_buffer_free(dtrace_buffer_t *bufs)
12304 {
12305 int i;
12306
12307 for (i = 0; i < (int)NCPU; i++) {
12308 dtrace_buffer_t *buf = &bufs[i];
12309
12310 if (buf->dtb_tomax == NULL) {
12311 ASSERT(buf->dtb_xamot == NULL);
12312 ASSERT(buf->dtb_size == 0);
12313 continue;
12314 }
12315
12316 if (buf->dtb_xamot != NULL) {
12317 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12318 kmem_free(buf->dtb_xamot, buf->dtb_size);
12319
12320 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12321 dtrace_buffer_memory_inuse -= buf->dtb_size;
12322 }
12323
12324 kmem_free(buf->dtb_tomax, buf->dtb_size);
12325 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12326 dtrace_buffer_memory_inuse -= buf->dtb_size;
12327
12328 buf->dtb_size = 0;
12329 buf->dtb_tomax = NULL;
12330 buf->dtb_xamot = NULL;
12331 }
12332 }
12333
12334 /*
12335 * DTrace Enabling Functions
12336 */
12337 static dtrace_enabling_t *
12338 dtrace_enabling_create(dtrace_vstate_t *vstate)
12339 {
12340 dtrace_enabling_t *enab;
12341
12342 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12343 enab->dten_vstate = vstate;
12344
12345 return (enab);
12346 }
12347
12348 static void
12349 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12350 {
12351 dtrace_ecbdesc_t **ndesc;
12352 size_t osize, nsize;
12353
12354 /*
12355 * We can't add to enablings after we've enabled them, or after we've
12356 * retained them.
12357 */
12358 ASSERT(enab->dten_probegen == 0);
12359 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12360
12361 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
12362 if (ecb == NULL) return;
12363
12364 if (enab->dten_ndesc < enab->dten_maxdesc) {
12365 enab->dten_desc[enab->dten_ndesc++] = ecb;
12366 return;
12367 }
12368
12369 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12370
12371 if (enab->dten_maxdesc == 0) {
12372 enab->dten_maxdesc = 1;
12373 } else {
12374 enab->dten_maxdesc <<= 1;
12375 }
12376
12377 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12378
12379 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12380 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12381 bcopy(enab->dten_desc, ndesc, osize);
12382 kmem_free(enab->dten_desc, osize);
12383
12384 enab->dten_desc = ndesc;
12385 enab->dten_desc[enab->dten_ndesc++] = ecb;
12386 }
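
/*
 * A brief note on the growth policy above: doubling dten_maxdesc keeps
 * descriptor insertion amortized O(1) -- growing to a capacity of n
 * copies at most 1 + 2 + 4 + ... + n/2 < n descriptors across all
 * resizes -- at the cost of up to 2x transient over-allocation.
 */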
12387
12388 static void
12389 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12390 dtrace_probedesc_t *pd)
12391 {
12392 dtrace_ecbdesc_t *new;
12393 dtrace_predicate_t *pred;
12394 dtrace_actdesc_t *act;
12395
12396 /*
12397 * We're going to create a new ECB description that matches the
12398 * specified ECB in every way, but has the specified probe description.
12399 */
12400 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12401
12402 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12403 dtrace_predicate_hold(pred);
12404
12405 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12406 dtrace_actdesc_hold(act);
12407
12408 new->dted_action = ecb->dted_action;
12409 new->dted_pred = ecb->dted_pred;
12410 new->dted_probe = *pd;
12411 new->dted_uarg = ecb->dted_uarg;
12412
12413 dtrace_enabling_add(enab, new);
12414 }
12415
12416 static void
12417 dtrace_enabling_dump(dtrace_enabling_t *enab)
12418 {
12419 int i;
12420
12421 for (i = 0; i < enab->dten_ndesc; i++) {
12422 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12423
12424 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12425 desc->dtpd_provider, desc->dtpd_mod,
12426 desc->dtpd_func, desc->dtpd_name);
12427 }
12428 }
12429
12430 static void
12431 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12432 {
12433 int i;
12434 dtrace_ecbdesc_t *ep;
12435 dtrace_vstate_t *vstate = enab->dten_vstate;
12436
12437 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12438
12439 for (i = 0; i < enab->dten_ndesc; i++) {
12440 dtrace_actdesc_t *act, *next;
12441 dtrace_predicate_t *pred;
12442
12443 ep = enab->dten_desc[i];
12444
12445 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12446 dtrace_predicate_release(pred, vstate);
12447
12448 for (act = ep->dted_action; act != NULL; act = next) {
12449 next = act->dtad_next;
12450 dtrace_actdesc_release(act, vstate);
12451 }
12452
12453 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12454 }
12455
12456 kmem_free(enab->dten_desc,
12457 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12458
12459 /*
12460 * If this was a retained enabling, decrement the dts_nretained count
12461 * and take it off of the dtrace_retained list.
12462 */
12463 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12464 dtrace_retained == enab) {
12465 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12466 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12467 enab->dten_vstate->dtvs_state->dts_nretained--;
12468 dtrace_retained_gen++;
12469 }
12470
12471 if (enab->dten_prev == NULL) {
12472 if (dtrace_retained == enab) {
12473 dtrace_retained = enab->dten_next;
12474
12475 if (dtrace_retained != NULL)
12476 dtrace_retained->dten_prev = NULL;
12477 }
12478 } else {
12479 ASSERT(enab != dtrace_retained);
12480 ASSERT(dtrace_retained != NULL);
12481 enab->dten_prev->dten_next = enab->dten_next;
12482 }
12483
12484 if (enab->dten_next != NULL) {
12485 ASSERT(dtrace_retained != NULL);
12486 enab->dten_next->dten_prev = enab->dten_prev;
12487 }
12488
12489 kmem_free(enab, sizeof (dtrace_enabling_t));
12490 }
12491
12492 static int
12493 dtrace_enabling_retain(dtrace_enabling_t *enab)
12494 {
12495 dtrace_state_t *state;
12496
12497 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12498 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12499 ASSERT(enab->dten_vstate != NULL);
12500
12501 state = enab->dten_vstate->dtvs_state;
12502 ASSERT(state != NULL);
12503
12504 /*
12505 * We only allow each state to retain dtrace_retain_max enablings.
12506 */
12507 if (state->dts_nretained >= dtrace_retain_max)
12508 return (ENOSPC);
12509
12510 state->dts_nretained++;
12511 dtrace_retained_gen++;
12512
12513 if (dtrace_retained == NULL) {
12514 dtrace_retained = enab;
12515 return (0);
12516 }
12517
12518 enab->dten_next = dtrace_retained;
12519 dtrace_retained->dten_prev = enab;
12520 dtrace_retained = enab;
12521
12522 return (0);
12523 }
12524
12525 static int
12526 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12527 dtrace_probedesc_t *create)
12528 {
12529 dtrace_enabling_t *new, *enab;
12530 int found = 0, err = ENOENT;
12531
12532 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12533 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12534 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12535 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12536 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12537
12538 new = dtrace_enabling_create(&state->dts_vstate);
12539
12540 /*
12541 * Iterate over all retained enablings, looking for enablings that
12542 * match the specified state.
12543 */
12544 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12545 int i;
12546
12547 /*
12548 * dtvs_state can only be NULL for helper enablings -- and
12549 * helper enablings can't be retained.
12550 */
12551 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12552
12553 if (enab->dten_vstate->dtvs_state != state)
12554 continue;
12555
12556 /*
12557 * Now iterate over each probe description; we're looking for
12558 * an exact match to the specified probe description.
12559 */
12560 for (i = 0; i < enab->dten_ndesc; i++) {
12561 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12562 dtrace_probedesc_t *pd = &ep->dted_probe;
12563
12564 /* APPLE NOTE: Darwin employs size bounded string operation. */
12565 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12566 continue;
12567
12568 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12569 continue;
12570
12571 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12572 continue;
12573
12574 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12575 continue;
12576
12577 /*
12578 * We have a winning probe! Add it to our growing
12579 * enabling.
12580 */
12581 found = 1;
12582 dtrace_enabling_addlike(new, ep, create);
12583 }
12584 }
12585
12586 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12587 dtrace_enabling_destroy(new);
12588 return (err);
12589 }
12590
12591 return (0);
12592 }
12593
12594 static void
12595 dtrace_enabling_retract(dtrace_state_t *state)
12596 {
12597 dtrace_enabling_t *enab, *next;
12598
12599 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12600
12601 /*
12602 * Iterate over all retained enablings, destroying those retained
12603 * for the specified state.
12604 */
12605 for (enab = dtrace_retained; enab != NULL; enab = next) {
12606 next = enab->dten_next;
12607
12608 /*
12609 * dtvs_state can only be NULL for helper enablings -- and
12610 * helper enablings can't be retained.
12611 */
12612 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12613
12614 if (enab->dten_vstate->dtvs_state == state) {
12615 ASSERT(state->dts_nretained > 0);
12616 dtrace_enabling_destroy(enab);
12617 }
12618 }
12619
12620 ASSERT(state->dts_nretained == 0);
12621 }
12622
12623 static int
12624 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
12625 {
12626 int i = 0;
12627 int total_matched = 0, matched = 0;
12628
12629 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12630 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12631
12632 for (i = 0; i < enab->dten_ndesc; i++) {
12633 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12634
12635 enab->dten_current = ep;
12636 enab->dten_error = 0;
12637
12638 /*
12639 * Before doing a dtrace_probe_enable(), which is really
12640 * expensive, check that this enabling satisfies the match
12641 * precondition, if we have one.
12642 */
12643 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
12644 continue;
12645 }
12646 /*
12647 * If a provider failed to enable a probe then get out and
12648 * let the consumer know we failed.
12649 */
12650 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12651 return (EBUSY);
12652
12653 total_matched += matched;
12654
12655 if (enab->dten_error != 0) {
12656 /*
12657 * If we get an error half-way through enabling the
12658 * probes, we kick out -- perhaps with some number of
12659 * them enabled. Leaving enabled probes enabled may
12660 * be slightly confusing for user-level, but we expect
12661 * that no one will attempt to actually drive on in
12662 * the face of such errors. If this is an anonymous
12663 * enabling (indicated with a NULL nmatched pointer),
12664 * we cmn_err() a message. We aren't expecting to
12665 * get such an error -- insofar as it can exist at all,
12666 * it would be a result of corrupted DOF in the driver
12667 * properties.
12668 */
12669 if (nmatched == NULL) {
12670 cmn_err(CE_WARN, "dtrace_enabling_match() "
12671 "error on %p: %d", (void *)ep,
12672 enab->dten_error);
12673 }
12674
12675 return (enab->dten_error);
12676 }
12677
12678 ep->dted_probegen = dtrace_probegen;
12679 }
12680
12681 if (nmatched != NULL)
12682 *nmatched = total_matched;
12683
12684 return (0);
12685 }
12686
12687 static void
12688 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12689 {
12690 dtrace_enabling_t *enab;
12691
12692 lck_mtx_lock(&cpu_lock);
12693 lck_mtx_lock(&dtrace_lock);
12694
12695 /*
12696 * Iterate over all retained enablings to see if any probes match
12697 * against them. We only perform this operation on enablings for which
12698 * we have sufficient permissions by virtue of being in the global zone
12699 * or in the same zone as the DTrace client. Because we can be called
12700 * after dtrace_detach() has been called, we cannot assert that there
12701 * are retained enablings. We can safely load from dtrace_retained,
12702 * however: the taskq_destroy() at the end of dtrace_detach() will
12703 * block pending our completion.
12704 */
12705
12706 /*
12707 * Darwin doesn't do zones.
12708 * Behave as if always in "global" zone."
12709 */
12710 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12711 (void) dtrace_enabling_match(enab, NULL, cond);
12712 }
12713
12714 lck_mtx_unlock(&dtrace_lock);
12715 lck_mtx_unlock(&cpu_lock);
12716
12717 }
12718
12719 static void
12720 dtrace_enabling_matchall(void)
12721 {
12722 dtrace_enabling_matchall_with_cond(NULL);
12723 }
12724
12725
12726
12727 /*
12728 * If an enabling is to be enabled without having matched probes (that is, if
12729 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12730 * enabling must be _primed_ by creating an ECB for every ECB description.
12731 * This must be done to assure that we know the number of speculations, the
12732 * number of aggregations, the minimum buffer size needed, etc. before we
12733 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12734 * enabling any probes, we create ECBs for every ECB description, but with a
12735 * NULL probe -- which is exactly what this function does.
12736 */
12737 static void
12738 dtrace_enabling_prime(dtrace_state_t *state)
12739 {
12740 dtrace_enabling_t *enab;
12741 int i;
12742
12743 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12744 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12745
12746 if (enab->dten_vstate->dtvs_state != state)
12747 continue;
12748
12749 /*
12750 * We don't want to prime an enabling more than once, lest
12751 * we allow a malicious user to induce resource exhaustion.
12752 * (The ECBs that result from priming an enabling aren't
12753 * leaked -- but they also aren't deallocated until the
12754 * consumer state is destroyed.)
12755 */
12756 if (enab->dten_primed)
12757 continue;
12758
12759 for (i = 0; i < enab->dten_ndesc; i++) {
12760 enab->dten_current = enab->dten_desc[i];
12761 (void) dtrace_probe_enable(NULL, enab, NULL);
12762 }
12763
12764 enab->dten_primed = 1;
12765 }
12766 }
12767
12768 /*
12769 * Called to indicate that probes should be provided due to retained
12770 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12771 * must take an initial lap through the retained enablings, calling the
12772 * dtps_provide() entry point explicitly to allow for autocreated probes.
12773 */
12774 static void
12775 dtrace_enabling_provide(dtrace_provider_t *prv)
12776 {
12777 int i, all = 0;
12778 dtrace_probedesc_t desc;
12779 dtrace_genid_t gen;
12780
12781 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12782 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12783
12784 if (prv == NULL) {
12785 all = 1;
12786 prv = dtrace_provider;
12787 }
12788
12789 do {
12790 dtrace_enabling_t *enab;
12791 void *parg = prv->dtpv_arg;
12792
12793 retry:
12794 gen = dtrace_retained_gen;
12795 for (enab = dtrace_retained; enab != NULL;
12796 enab = enab->dten_next) {
12797 for (i = 0; i < enab->dten_ndesc; i++) {
12798 desc = enab->dten_desc[i]->dted_probe;
12799 lck_mtx_unlock(&dtrace_lock);
12800 prv->dtpv_pops.dtps_provide(parg, &desc);
12801 lck_mtx_lock(&dtrace_lock);
12802 /*
12803 * Process the retained enablings again if
12804 * they have changed while we weren't holding
12805 * dtrace_lock.
12806 */
12807 if (gen != dtrace_retained_gen)
12808 goto retry;
12809 }
12810 }
12811 } while (all && (prv = prv->dtpv_next) != NULL);
12812
12813 lck_mtx_unlock(&dtrace_lock);
12814 dtrace_probe_provide(NULL, all ? NULL : prv);
12815 lck_mtx_lock(&dtrace_lock);
12816 }
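
/*
 * A note on the retry loop above: dtrace_retained_gen acts as a sequence
 * counter for the retained list. Because dtrace_lock is dropped around
 * each dtps_provide() call, the list may be mutated underneath us; every
 * mutation bumps the generation (see dtrace_enabling_retain() and
 * dtrace_enabling_destroy()), so a changed generation means the walk is
 * stale and must restart.
 */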
12817
12818 /*
12819 * DTrace DOF Functions
12820 */
12821 /*ARGSUSED*/
12822 static void
12823 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12824 {
12825 #pragma unused(dof) /* __APPLE__ */
12826 if (dtrace_err_verbose)
12827 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12828
12829 #ifdef DTRACE_ERRDEBUG
12830 dtrace_errdebug(str);
12831 #endif
12832 }
12833
12834 /*
12835 * Create DOF out of a currently enabled state. Right now, we only create
12836 * DOF containing the run-time options -- but this could be expanded to create
12837 * complete DOF representing the enabled state.
12838 */
12839 static dof_hdr_t *
12840 dtrace_dof_create(dtrace_state_t *state)
12841 {
12842 dof_hdr_t *dof;
12843 dof_sec_t *sec;
12844 dof_optdesc_t *opt;
12845 int i, len = sizeof (dof_hdr_t) +
12846 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12847 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12848
12849 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12850
12851 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12852 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12853 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12854 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12855 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12856
12857 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12858 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12859 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12860 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12861 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12862 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12863
12864 dof->dofh_flags = 0;
12865 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12866 dof->dofh_secsize = sizeof (dof_sec_t);
12867 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12868 dof->dofh_secoff = sizeof (dof_hdr_t);
12869 dof->dofh_loadsz = len;
12870 dof->dofh_filesz = len;
12871 dof->dofh_pad = 0;
12872
12873 /*
12874 * Fill in the option section header...
12875 */
12876 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12877 sec->dofs_type = DOF_SECT_OPTDESC;
12878 sec->dofs_align = sizeof (uint64_t);
12879 sec->dofs_flags = DOF_SECF_LOAD;
12880 sec->dofs_entsize = sizeof (dof_optdesc_t);
12881
12882 opt = (dof_optdesc_t *)((uintptr_t)sec +
12883 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12884
12885 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12886 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12887
12888 for (i = 0; i < DTRACEOPT_MAX; i++) {
12889 opt[i].dofo_option = i;
12890 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12891 opt[i].dofo_value = state->dts_options[i];
12892 }
12893
12894 return (dof);
12895 }
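
/*
 * An illustrative sketch (hypothetical helper name) of consuming the DOF
 * built above: locate the single OPTDESC section via the header and scan
 * it for one option's value. dtrace_dof_options() below is the fully
 * validated version of this walk for arbitrary DOF.
 */
static uint64_t
example_dof_option(dof_hdr_t *dof, uint32_t option)
{
        dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof + dof->dofh_secoff);
        dof_optdesc_t *opt = (dof_optdesc_t *)((uintptr_t)dof +
            sec->dofs_offset);
        uint64_t i;

        ASSERT(sec->dofs_type == DOF_SECT_OPTDESC);

        for (i = 0; i < sec->dofs_size / sec->dofs_entsize; i++) {
                if (opt[i].dofo_option == option)
                        return (opt[i].dofo_value);
        }

        return ((uint64_t)DTRACEOPT_UNSET);
}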
12896
12897 static dof_hdr_t *
12898 dtrace_dof_copyin(user_addr_t uarg, int *errp)
12899 {
12900 dof_hdr_t hdr, *dof;
12901
12902 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12903
12904 /*
12905 * First, we're going to copyin() the sizeof (dof_hdr_t).
12906 */
12907 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12908 dtrace_dof_error(NULL, "failed to copyin DOF header");
12909 *errp = EFAULT;
12910 return (NULL);
12911 }
12912
12913 /*
12914 * Now we'll allocate the entire DOF and copy it in -- provided
12915 * that the length isn't outrageous.
12916 */
12917 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12918 dtrace_dof_error(&hdr, "load size exceeds maximum");
12919 *errp = E2BIG;
12920 return (NULL);
12921 }
12922
12923 if (hdr.dofh_loadsz < sizeof (hdr)) {
12924 dtrace_dof_error(&hdr, "invalid load size");
12925 *errp = EINVAL;
12926 return (NULL);
12927 }
12928
12929 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12930
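/*
 * Note that dofh_loadsz is checked again after the full copyin(): the
 * user buffer may change between the two copies, and a header whose
 * claimed load size no longer matched the size actually allocated and
 * copied would let later parsing walk past the allocation.
 */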
12931 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
12932 dof->dofh_loadsz != hdr.dofh_loadsz) {
12933 kmem_free_aligned(dof, hdr.dofh_loadsz);
12934 *errp = EFAULT;
12935 return (NULL);
12936 }
12937
12938 return (dof);
12939 }
12940
12941 static dof_hdr_t *
12942 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12943 {
12944 dof_hdr_t hdr, *dof;
12945
12946 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12947
12948 /*
12949 * First, we're going to read in sizeof (dof_hdr_t) from the target process.
12950 */
12951 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12952 dtrace_dof_error(NULL, "failed to copyin DOF header");
12953 *errp = EFAULT;
12954 return (NULL);
12955 }
12956
12957 /*
12958 * Now we'll allocate the entire DOF and copy it in -- provided
12959 * that the length isn't outrageous.
12960 */
12961 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12962 dtrace_dof_error(&hdr, "load size exceeds maximum");
12963 *errp = E2BIG;
12964 return (NULL);
12965 }
12966
12967 if (hdr.dofh_loadsz < sizeof (hdr)) {
12968 dtrace_dof_error(&hdr, "invalid load size");
12969 *errp = EINVAL;
12970 return (NULL);
12971 }
12972
12973 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12974
12975 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12976 kmem_free_aligned(dof, hdr.dofh_loadsz);
12977 *errp = EFAULT;
12978 return (NULL);
12979 }
12980
12981 return (dof);
12982 }
12983
12984 static void
12985 dtrace_dof_destroy(dof_hdr_t *dof)
12986 {
12987 kmem_free_aligned(dof, dof->dofh_loadsz);
12988 }
12989
12990 static dof_hdr_t *
12991 dtrace_dof_property(const char *name)
12992 {
12993 unsigned int len = 0;
12994 dof_hdr_t *dof;
12995
12996 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
12997 return NULL;
12998 }
12999
13000 if (!PEReadNVRAMProperty(name, NULL, &len)) {
13001 return NULL;
13002 }
13003
13004 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
13005
13006 if (!PEReadNVRAMProperty(name, dof, &len)) {
13007 dtrace_dof_destroy(dof);
13008 dtrace_dof_error(NULL, "unreadable DOF");
13009 return NULL;
13010 }
13011
13012 if (len < sizeof (dof_hdr_t)) {
13013 dtrace_dof_destroy(dof);
13014 dtrace_dof_error(NULL, "truncated header");
13015 return (NULL);
13016 }
13017
13018 if (len < dof->dofh_loadsz) {
13019 dtrace_dof_destroy(dof);
13020 dtrace_dof_error(NULL, "truncated DOF");
13021 return (NULL);
13022 }
13023
13024 if (len != dof->dofh_loadsz) {
13025 dtrace_dof_destroy(dof);
13026 dtrace_dof_error(NULL, "invalid DOF size");
13027 return (NULL);
13028 }
13029
13030 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13031 dtrace_dof_destroy(dof);
13032 dtrace_dof_error(NULL, "oversized DOF");
13033 return (NULL);
13034 }
13035
13036 return (dof);
13037 }
13038
13039 /*
13040 * Return the dof_sec_t pointer corresponding to a given section index. If the
13041 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13042 * a type other than DOF_SECT_NONE is specified, the header is checked against
13043 * this type and NULL is returned if the types do not match.
13044 */
13045 static dof_sec_t *
13046 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13047 {
13048 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13049 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13050
13051 if (i >= dof->dofh_secnum) {
13052 dtrace_dof_error(dof, "referenced section index is invalid");
13053 return (NULL);
13054 }
13055
13056 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13057 dtrace_dof_error(dof, "referenced section is not loadable");
13058 return (NULL);
13059 }
13060
13061 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13062 dtrace_dof_error(dof, "referenced section is the wrong type");
13063 return (NULL);
13064 }
13065
13066 return (sec);
13067 }
13068
13069 static dtrace_probedesc_t *
13070 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13071 {
13072 dof_probedesc_t *probe;
13073 dof_sec_t *strtab;
13074 uintptr_t daddr = (uintptr_t)dof;
13075 uintptr_t str;
13076 size_t size;
13077
13078 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13079 dtrace_dof_error(dof, "invalid probe section");
13080 return (NULL);
13081 }
13082
13083 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13084 dtrace_dof_error(dof, "bad alignment in probe description");
13085 return (NULL);
13086 }
13087
13088 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13089 dtrace_dof_error(dof, "truncated probe description");
13090 return (NULL);
13091 }
13092
13093 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13094 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13095
13096 if (strtab == NULL)
13097 return (NULL);
13098
13099 str = daddr + strtab->dofs_offset;
13100 size = strtab->dofs_size;
13101
13102 if (probe->dofp_provider >= strtab->dofs_size) {
13103 dtrace_dof_error(dof, "corrupt probe provider");
13104 return (NULL);
13105 }
13106
13107 (void) strncpy(desc->dtpd_provider,
13108 (char *)(str + probe->dofp_provider),
13109 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13110
13111 /* APPLE NOTE: Darwin employs size bounded string operation. */
13112 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
13113
13114 if (probe->dofp_mod >= strtab->dofs_size) {
13115 dtrace_dof_error(dof, "corrupt probe module");
13116 return (NULL);
13117 }
13118
13119 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13120 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13121
13122 /* APPLE NOTE: Darwin employs size bounded string operation. */
13123 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
13124
13125 if (probe->dofp_func >= strtab->dofs_size) {
13126 dtrace_dof_error(dof, "corrupt probe function");
13127 return (NULL);
13128 }
13129
13130 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13131 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13132
13133 /* APPLE NOTE: Darwin employs size bounded string operation. */
13134 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
13135
13136 if (probe->dofp_name >= strtab->dofs_size) {
13137 dtrace_dof_error(dof, "corrupt probe name");
13138 return (NULL);
13139 }
13140
13141 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13142 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13143
13144 /* APPLE NOTE: Darwin employs size bounded string operation. */
13145 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
13146
13147 return (desc);
13148 }
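
/*
 * An illustrative sketch (hypothetical helper name) of the bounded-copy
 * pattern repeated four times above: never copy past the end of the DOF
 * string table or the destination, and force NUL termination, which
 * strncpy() omits whenever the source fills the buffer.
 */
static void
example_strtab_copy(char *dst, size_t dstsize, uintptr_t str, size_t size,
    uint64_t offset)
{
        (void) strncpy(dst, (char *)(str + offset),
            MIN(dstsize - 1, size - offset));
        dst[dstsize - 1] = '\0';
}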
13149
13150 static dtrace_difo_t *
13151 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13152 cred_t *cr)
13153 {
13154 dtrace_difo_t *dp;
13155 size_t ttl = 0;
13156 dof_difohdr_t *dofd;
13157 uintptr_t daddr = (uintptr_t)dof;
13158 size_t max_size = dtrace_difo_maxsize;
13159 uint_t i;
13160 int l, n;
13161
13162
13163 static const struct {
13164 int section;
13165 int bufoffs;
13166 int lenoffs;
13167 int entsize;
13168 int align;
13169 const char *msg;
13170 } difo[] = {
13171 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13172 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13173 sizeof (dif_instr_t), "multiple DIF sections" },
13174
13175 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13176 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13177 sizeof (uint64_t), "multiple integer tables" },
13178
13179 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13180 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13181 sizeof (char), "multiple string tables" },
13182
13183 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13184 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13185 sizeof (uint_t), "multiple variable tables" },
13186
13187 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13188 };
13189
13190 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13191 dtrace_dof_error(dof, "invalid DIFO header section");
13192 return (NULL);
13193 }
13194
13195 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13196 dtrace_dof_error(dof, "bad alignment in DIFO header");
13197 return (NULL);
13198 }
13199
13200 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13201 sec->dofs_size % sizeof (dof_secidx_t)) {
13202 dtrace_dof_error(dof, "bad size in DIFO header");
13203 return (NULL);
13204 }
13205
13206 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13207 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13208
13209 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13210 dp->dtdo_rtype = dofd->dofd_rtype;
13211
13212 for (l = 0; l < n; l++) {
13213 dof_sec_t *subsec;
13214 void **bufp;
13215 uint32_t *lenp;
13216
13217 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13218 dofd->dofd_links[l])) == NULL)
13219 goto err; /* invalid section link */
13220
13221 if (ttl + subsec->dofs_size > max_size) {
13222 dtrace_dof_error(dof, "exceeds maximum size");
13223 goto err;
13224 }
13225
13226 ttl += subsec->dofs_size;
13227
13228 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13229
13230 if (subsec->dofs_type != (uint32_t)difo[i].section)
13231 continue;
13232
13233 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13234 dtrace_dof_error(dof, "section not loaded");
13235 goto err;
13236 }
13237
13238 if (subsec->dofs_align != (uint32_t)difo[i].align) {
13239 dtrace_dof_error(dof, "bad alignment");
13240 goto err;
13241 }
13242
13243 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13244 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13245
13246 if (*bufp != NULL) {
13247 dtrace_dof_error(dof, difo[i].msg);
13248 goto err;
13249 }
13250
13251 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
13252 dtrace_dof_error(dof, "entry size mismatch");
13253 goto err;
13254 }
13255
13256 if (subsec->dofs_entsize != 0 &&
13257 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13258 dtrace_dof_error(dof, "corrupt entry size");
13259 goto err;
13260 }
13261
13262 *lenp = subsec->dofs_size;
13263 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13264 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13265 *bufp, subsec->dofs_size);
13266
13267 if (subsec->dofs_entsize != 0)
13268 *lenp /= subsec->dofs_entsize;
13269
13270 break;
13271 }
13272
13273 /*
13274 * If we encounter a loadable DIFO sub-section that is not
13275 * known to us, assume this is a broken program and fail.
13276 */
13277 if (difo[i].section == DOF_SECT_NONE &&
13278 (subsec->dofs_flags & DOF_SECF_LOAD)) {
13279 dtrace_dof_error(dof, "unrecognized DIFO subsection");
13280 goto err;
13281 }
13282 }
13283
13284 if (dp->dtdo_buf == NULL) {
13285 /*
13286 * We can't have a DIF object without DIF text.
13287 */
13288 dtrace_dof_error(dof, "missing DIF text");
13289 goto err;
13290 }
13291
13292 /*
13293 * Before we validate the DIF object, run through the variable table
13294 * looking for the strings -- if any of their sizes are zero, we'll set
13295 * their size to be the system-wide default string size. Note that
13296 * this should _not_ happen if the "strsize" option has been set --
13297 * in this case, the compiler should have set the size to reflect the
13298 * setting of the option.
13299 */
13300 for (i = 0; i < dp->dtdo_varlen; i++) {
13301 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13302 dtrace_diftype_t *t = &v->dtdv_type;
13303
13304 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13305 continue;
13306
13307 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13308 t->dtdt_size = dtrace_strsize_default;
13309 }
13310
13311 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13312 goto err;
13313
13314 dtrace_difo_init(dp, vstate);
13315 return (dp);
13316
13317 err:
13318 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13319 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13320 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13321 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13322
13323 kmem_free(dp, sizeof (dtrace_difo_t));
13324 return (NULL);
13325 }
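
/*
 * A design note on the loader above: the static difo[] table maps each
 * recognized subsection type to the dtrace_difo_t buffer and length
 * fields it fills in (via offsetof()), so supporting a new table kind
 * means adding a table row rather than another parsing clause. The
 * (*bufp != NULL) test is what turns a duplicate section into the
 * per-row "multiple ..." error message.
 */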
13326
13327 static dtrace_predicate_t *
13328 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13329 cred_t *cr)
13330 {
13331 dtrace_difo_t *dp;
13332
13333 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13334 return (NULL);
13335
13336 return (dtrace_predicate_create(dp));
13337 }
13338
13339 static dtrace_actdesc_t *
13340 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13341 cred_t *cr)
13342 {
13343 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13344 dof_actdesc_t *desc;
13345 dof_sec_t *difosec;
13346 size_t offs;
13347 uintptr_t daddr = (uintptr_t)dof;
13348 uint64_t arg;
13349 dtrace_actkind_t kind;
13350
13351 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13352 dtrace_dof_error(dof, "invalid action section");
13353 return (NULL);
13354 }
13355
13356 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13357 dtrace_dof_error(dof, "truncated action description");
13358 return (NULL);
13359 }
13360
13361 if (sec->dofs_align != sizeof (uint64_t)) {
13362 dtrace_dof_error(dof, "bad alignment in action description");
13363 return (NULL);
13364 }
13365
13366 if (sec->dofs_size < sec->dofs_entsize) {
13367 dtrace_dof_error(dof, "section entry size exceeds total size");
13368 return (NULL);
13369 }
13370
13371 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13372 dtrace_dof_error(dof, "bad entry size in action description");
13373 return (NULL);
13374 }
13375
13376 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13377 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13378 return (NULL);
13379 }
13380
13381 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13382 desc = (dof_actdesc_t *)(daddr +
13383 (uintptr_t)sec->dofs_offset + offs);
13384 kind = (dtrace_actkind_t)desc->dofa_kind;
13385
13386 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13387 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13388 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13389 {
13390 dof_sec_t *strtab;
13391 char *str, *fmt;
13392 uint64_t i;
13393
13394 /*
13395 * The argument to these actions is an index into the
13396 * DOF string table. For printf()-like actions, this
13397 * is the format string. For print(), this is the
13398 * CTF type of the expression result.
13399 */
13400 if ((strtab = dtrace_dof_sect(dof,
13401 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13402 goto err;
13403
13404 str = (char *)((uintptr_t)dof +
13405 (uintptr_t)strtab->dofs_offset);
13406
13407 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13408 if (str[i] == '\0')
13409 break;
13410 }
13411
13412 if (i >= strtab->dofs_size) {
13413 dtrace_dof_error(dof, "bogus format string");
13414 goto err;
13415 }
13416
13417 if (i == desc->dofa_arg) {
13418 dtrace_dof_error(dof, "empty format string");
13419 goto err;
13420 }
13421
13422 i -= desc->dofa_arg;
13423 fmt = kmem_alloc(i + 1, KM_SLEEP);
13424 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13425 arg = (uint64_t)(uintptr_t)fmt;
13426 } else {
13427 if (kind == DTRACEACT_PRINTA) {
13428 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13429 arg = 0;
13430 } else {
13431 arg = desc->dofa_arg;
13432 }
13433 }
13434
13435 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13436 desc->dofa_uarg, arg);
13437
13438 if (last != NULL) {
13439 last->dtad_next = act;
13440 } else {
13441 first = act;
13442 }
13443
13444 last = act;
13445
13446 if (desc->dofa_difo == DOF_SECIDX_NONE)
13447 continue;
13448
13449 if ((difosec = dtrace_dof_sect(dof,
13450 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13451 goto err;
13452
13453 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13454
13455 if (act->dtad_difo == NULL)
13456 goto err;
13457 }
13458
13459 ASSERT(first != NULL);
13460 return (first);
13461
13462 err:
13463 for (act = first; act != NULL; act = next) {
13464 next = act->dtad_next;
13465 dtrace_actdesc_release(act, vstate);
13466 }
13467
13468 return (NULL);
13469 }
13470
13471 static dtrace_ecbdesc_t *
13472 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13473 cred_t *cr)
13474 {
13475 dtrace_ecbdesc_t *ep;
13476 dof_ecbdesc_t *ecb;
13477 dtrace_probedesc_t *desc;
13478 dtrace_predicate_t *pred = NULL;
13479
13480 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13481 dtrace_dof_error(dof, "truncated ECB description");
13482 return (NULL);
13483 }
13484
13485 if (sec->dofs_align != sizeof (uint64_t)) {
13486 dtrace_dof_error(dof, "bad alignment in ECB description");
13487 return (NULL);
13488 }
13489
13490 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13491 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13492
13493 if (sec == NULL)
13494 return (NULL);
13495
13496 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13497 ep->dted_uarg = ecb->dofe_uarg;
13498 desc = &ep->dted_probe;
13499
13500 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13501 goto err;
13502
13503 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13504 if ((sec = dtrace_dof_sect(dof,
13505 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13506 goto err;
13507
13508 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13509 goto err;
13510
13511 ep->dted_pred.dtpdd_predicate = pred;
13512 }
13513
13514 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13515 if ((sec = dtrace_dof_sect(dof,
13516 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13517 goto err;
13518
13519 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13520
13521 if (ep->dted_action == NULL)
13522 goto err;
13523 }
13524
13525 return (ep);
13526
13527 err:
13528 if (pred != NULL)
13529 dtrace_predicate_release(pred, vstate);
13530 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13531 return (NULL);
13532 }
13533
13534 /*
13535 * APPLE NOTE: dyld handles dof relocation.
13536 * Darwin does not need dtrace_dof_relocate()
13537 */
13538
13539 /*
13540 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13541 * header: it should be at the front of a memory region that is at least
13542 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13543 * size. It need not be validated in any other way.
13544 */
13545 static int
13546 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13547 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13548 {
13549 #pragma unused(ubase) /* __APPLE__ */
13550 uint64_t len = dof->dofh_loadsz, seclen;
13551 uintptr_t daddr = (uintptr_t)dof;
13552 dtrace_ecbdesc_t *ep;
13553 dtrace_enabling_t *enab;
13554 uint_t i;
13555
13556 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13557 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13558
13559 /*
13560 * Check the DOF header identification bytes. In addition to checking
13561 * valid settings, we also verify that unused bits/bytes are zeroed so
13562 * we can use them later without fear of regressing existing binaries.
13563 */
13564 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13565 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13566 dtrace_dof_error(dof, "DOF magic string mismatch");
13567 return (-1);
13568 }
13569
13570 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13571 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13572 dtrace_dof_error(dof, "DOF has invalid data model");
13573 return (-1);
13574 }
13575
13576 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13577 dtrace_dof_error(dof, "DOF encoding mismatch");
13578 return (-1);
13579 }
13580
13581 /*
13582 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
13583 */
13584 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13585 dtrace_dof_error(dof, "DOF version mismatch");
13586 return (-1);
13587 }
13588
13589 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13590 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13591 return (-1);
13592 }
13593
13594 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13595 dtrace_dof_error(dof, "DOF uses too many integer registers");
13596 return (-1);
13597 }
13598
13599 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13600 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13601 return (-1);
13602 }
13603
13604 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13605 if (dof->dofh_ident[i] != 0) {
13606 dtrace_dof_error(dof, "DOF has invalid ident byte set");
13607 return (-1);
13608 }
13609 }
13610
13611 if (dof->dofh_flags & ~DOF_FL_VALID) {
13612 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13613 return (-1);
13614 }
13615
13616 if (dof->dofh_secsize < sizeof(dof_sec_t)) {
13617 dtrace_dof_error(dof, "invalid section header size");
13618 return (-1);
13619 }
13620
13621 /*
13622 * Check that the section headers don't exceed the amount of DOF
13623 * data. Note that we cast the section size and number of sections
13624 * to uint64_t's to prevent possible overflow in the multiplication.
13625 */
13626 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13627
13628 if (dof->dofh_secoff > len || seclen > len ||
13629 dof->dofh_secoff + seclen > len) {
13630 dtrace_dof_error(dof, "truncated section headers");
13631 return (-1);
13632 }
13633
13634 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13635 dtrace_dof_error(dof, "misaligned section headers");
13636 return (-1);
13637 }
13638
13639 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13640 dtrace_dof_error(dof, "misaligned section size");
13641 return (-1);
13642 }
13643
13644 /*
13645 * Take an initial pass through the section headers to be sure that
13646 * the headers don't have stray offsets. If the 'noprobes' flag is
13647 * set, do not permit sections relating to providers, probes, or args.
13648 */
13649 for (i = 0; i < dof->dofh_secnum; i++) {
13650 dof_sec_t *sec = (dof_sec_t *)(daddr +
13651 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13652
13653 if (noprobes) {
13654 switch (sec->dofs_type) {
13655 case DOF_SECT_PROVIDER:
13656 case DOF_SECT_PROBES:
13657 case DOF_SECT_PRARGS:
13658 case DOF_SECT_PROFFS:
13659 dtrace_dof_error(dof, "illegal sections "
13660 "for enabling");
13661 return (-1);
13662 }
13663 }
13664
13665 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13666 continue; /* just ignore non-loadable sections */
13667
13668 if (sec->dofs_align & (sec->dofs_align - 1)) {
13669 dtrace_dof_error(dof, "bad section alignment");
13670 return (-1);
13671 }
13672
13673 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13674 dtrace_dof_error(dof, "misaligned section");
13675 return (-1);
13676 }
13677
13678 if (sec->dofs_offset > len || sec->dofs_size > len ||
13679 sec->dofs_offset + sec->dofs_size > len) {
13680 dtrace_dof_error(dof, "corrupt section header");
13681 return (-1);
13682 }
13683
13684 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13685 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13686 dtrace_dof_error(dof, "non-terminating string table");
13687 return (-1);
13688 }
13689 }
13690
13691 /*
13692 * APPLE NOTE: We have no further relocation to perform.
13693 * All dof values are relative offsets.
13694 */
13695
13696 if ((enab = *enabp) == NULL)
13697 enab = *enabp = dtrace_enabling_create(vstate);
13698
13699 for (i = 0; i < dof->dofh_secnum; i++) {
13700 dof_sec_t *sec = (dof_sec_t *)(daddr +
13701 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13702
13703 if (sec->dofs_type != DOF_SECT_ECBDESC)
13704 continue;
13705
13706 /*
13707 * APPLE NOTE: Defend against gcc 4.0 botch on x86.
13708 * Not all paths out of the inlined dtrace_dof_ecbdesc
13709 * are checked for the NULL return value.
13710 * Check for NULL explicitly here.
13711 */
13712 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13713 if (ep == NULL) {
13714 dtrace_enabling_destroy(enab);
13715 *enabp = NULL;
13716 return (-1);
13717 }
13718
13719 dtrace_enabling_add(enab, ep);
13720 }
13721
13722 return (0);
13723 }
13724
13725 /*
13726 * Process DOF for any options. This routine assumes that the DOF has been
13727 * at least processed by dtrace_dof_slurp().
13728 */
13729 static int
13730 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13731 {
13732 uint_t i;
13733 int rval;
13734 uint32_t entsize;
13735 size_t offs;
13736 dof_optdesc_t *desc;
13737
13738 for (i = 0; i < dof->dofh_secnum; i++) {
13739 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13740 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13741
13742 if (sec->dofs_type != DOF_SECT_OPTDESC)
13743 continue;
13744
13745 if (sec->dofs_align != sizeof (uint64_t)) {
13746 dtrace_dof_error(dof, "bad alignment in "
13747 "option description");
13748 return (EINVAL);
13749 }
13750
13751 if ((entsize = sec->dofs_entsize) == 0) {
13752 dtrace_dof_error(dof, "zeroed option entry size");
13753 return (EINVAL);
13754 }
13755
13756 if (entsize < sizeof (dof_optdesc_t)) {
13757 dtrace_dof_error(dof, "bad option entry size");
13758 return (EINVAL);
13759 }
13760
13761 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13762 desc = (dof_optdesc_t *)((uintptr_t)dof +
13763 (uintptr_t)sec->dofs_offset + offs);
13764
13765 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13766 dtrace_dof_error(dof, "non-zero option string");
13767 return (EINVAL);
13768 }
13769
13770 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13771 dtrace_dof_error(dof, "unset option");
13772 return (EINVAL);
13773 }
13774
13775 if ((rval = dtrace_state_option(state,
13776 desc->dofo_option, desc->dofo_value)) != 0) {
13777 dtrace_dof_error(dof, "rejected option");
13778 return (rval);
13779 }
13780 }
13781 }
13782
13783 return (0);
13784 }
13785
13786 /*
13787 * DTrace Consumer State Functions
13788 */
13789 static int
13790 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13791 {
13792 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13793 void *base;
13794 uintptr_t limit;
13795 dtrace_dynvar_t *dvar, *next, *start;
13796 size_t i;
13797
13798 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13799 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13800
13801 bzero(dstate, sizeof (dtrace_dstate_t));
13802
13803 if ((dstate->dtds_chunksize = chunksize) == 0)
13804 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13805
13806 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13807
13808 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13809 size = min_size;
13810
13811 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13812 return (ENOMEM);
13813
13814 dstate->dtds_size = size;
13815 dstate->dtds_base = base;
13816 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13817 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13818
13819 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13820
13821 if (hashsize != 1 && (hashsize & 1))
13822 hashsize--;
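/*
 * Worked example with assumed sizes (illustrative only): if size is
 * 1MB, the chunksize is 256 bytes, and a dtrace_dynhash_t is 16 bytes,
 * then hashsize = 1048576 / 272 = 3855; being odd and != 1, it is
 * decremented to 3854.
 */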
13823
13824 dstate->dtds_hashsize = hashsize;
13825 dstate->dtds_hash = dstate->dtds_base;
13826
13827 /*
13828 * Set all of our hash buckets to point to the single sink, and (if
13829 * it hasn't already been set), set the sink's hash value to be the
13830 * sink sentinel value. The sink is needed for dynamic variable
13831 * lookups to know that they have iterated over an entire, valid hash
13832 * chain.
13833 */
13834 for (i = 0; i < hashsize; i++)
13835 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13836
13837 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13838 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13839
13840 /*
13841 * Determine number of active CPUs. Divide free list evenly among
13842 * active CPUs.
13843 */
13844 start = (dtrace_dynvar_t *)
13845 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13846 limit = (uintptr_t)base + size;
13847
13848 VERIFY((uintptr_t)start < limit);
13849 VERIFY((uintptr_t)start >= (uintptr_t)base);
13850
13851 maxper = (limit - (uintptr_t)start) / (int)NCPU;
13852 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13853
13854 for (i = 0; i < NCPU; i++) {
13855 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13856
13857 /*
13858 * If we don't even have enough chunks to make it once through
13859 * NCPUs, we're just going to allocate everything to the first
13860 * CPU. And if we're on the last CPU, we're going to allocate
13861 * whatever is left over. In either case, we set the limit to
13862 * be the limit of the dynamic variable space.
13863 */
13864 if (maxper == 0 || i == NCPU - 1) {
13865 limit = (uintptr_t)base + size;
13866 start = NULL;
13867 } else {
13868 limit = (uintptr_t)start + maxper;
13869 start = (dtrace_dynvar_t *)limit;
13870 }
13871
13872 VERIFY(limit <= (uintptr_t)base + size);
13873
13874 for (;;) {
13875 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13876 dstate->dtds_chunksize);
13877
13878 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13879 break;
13880
13881 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13882 (uintptr_t)dvar <= (uintptr_t)base + size);
13883 dvar->dtdv_next = next;
13884 dvar = next;
13885 }
13886
13887 if (maxper == 0)
13888 break;
13889 }
13890
13891 return (0);
13892 }
13893
13894 static void
13895 dtrace_dstate_fini(dtrace_dstate_t *dstate)
13896 {
13897 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13898
13899 if (dstate->dtds_base == NULL)
13900 return;
13901
13902 kmem_free(dstate->dtds_base, dstate->dtds_size);
13903 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13904 }
13905
13906 static void
13907 dtrace_vstate_fini(dtrace_vstate_t *vstate)
13908 {
13909 /*
13910 * Logical XOR, where are you?
13911 */
13912 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13913
13914 if (vstate->dtvs_nglobals > 0) {
13915 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13916 sizeof (dtrace_statvar_t *));
13917 }
13918
13919 if (vstate->dtvs_ntlocals > 0) {
13920 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13921 sizeof (dtrace_difv_t));
13922 }
13923
13924 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13925
13926 if (vstate->dtvs_nlocals > 0) {
13927 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13928 sizeof (dtrace_statvar_t *));
13929 }
13930 }
13931
13932 static void
13933 dtrace_state_clean(dtrace_state_t *state)
13934 {
13935 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13936 return;
13937
13938 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13939 dtrace_speculation_clean(state);
13940 }
13941
13942 static void
13943 dtrace_state_deadman(dtrace_state_t *state)
13944 {
13945 hrtime_t now;
13946
13947 dtrace_sync();
13948
13949 now = dtrace_gethrtime();
13950
13951 if (state != dtrace_anon.dta_state &&
13952 now - state->dts_laststatus >= dtrace_deadman_user)
13953 return;
13954
13955 /*
13956 * We must be sure that dts_alive never appears to be less than the
13957 * value upon entry to dtrace_state_deadman(), and because we lack a
13958 * dtrace_cas64(), we cannot store to it atomically. We thus instead
13959 * store INT64_MAX to it, followed by a memory barrier, followed by
13960 * the new value. This assures that dts_alive never appears to be
13961 * less than its true value, regardless of the order in which the
13962 * stores to the underlying storage are issued.
13963 */
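/*
 * Illustrative interleaving (hypothetical values): a reader that samples
 * dts_alive between the two stores below observes INT64_MAX rather than
 * anything older than the pre-entry timestamp, so the deadman check can
 * only over-estimate liveness, never under-estimate it.
 */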
13964 state->dts_alive = INT64_MAX;
13965 dtrace_membar_producer();
13966 state->dts_alive = now;
13967 }
13968
13969 static int
13970 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13971 {
13972 minor_t minor;
13973 major_t major;
13974 char c[30];
13975 dtrace_state_t *state;
13976 dtrace_optval_t *opt;
13977 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13978 unsigned int cpu_it;
13979
13980 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13981 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13982
13983 /* Cause restart */
13984 *new_state = NULL;
13985
13986 if (devp != NULL) {
13987 minor = getminor(*devp);
13988 }
13989 else {
13990 minor = DTRACE_NCLIENTS - 1;
13991 }
13992
13993 state = dtrace_state_allocate(minor);
13994 if (NULL == state) {
13995 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment\n", minor);
13996 return (ERESTART); /* can't reacquire */
13997 }
13998
13999 state->dts_epid = DTRACE_EPIDNONE + 1;
14000
14001 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
14002 state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14003 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14004
14005 if (devp != NULL) {
14006 major = getemajor(*devp);
14007 } else {
14008 major = ddi_driver_major(dtrace_devi);
14009 }
14010
14011 state->dts_dev = makedev(major, minor);
14012
14013 if (devp != NULL)
14014 *devp = state->dts_dev;
14015
14016 /*
14017 * We allocate NCPU buffers. On the one hand, this can be quite
14018 * a bit of memory per instance (nearly 36K on a Starcat). On the
14019 * other hand, it saves an additional memory reference in the probe
14020 * path.
14021 */
14022 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14023 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14024 state->dts_buf_over_limit = 0;
14025
14026 /*
14027 * Allocate and initialise the per-process per-CPU random state.
14028 * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, so the entropy device is
14029 * assumed to be seeded at this point (if seeded from the Fortuna seed file).
14030 */
14031 state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
14032 state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14033 (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
14034 for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
14035 state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14036 /*
14037 * Each CPU is assigned its own non-overlapping subsequence
14038 * of 2^64 values.
14039 */
14040 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
14041 state->dts_rstate[cpu_it]);
14042 }
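/*
 * Sketch of the resulting stream layout: xoroshiro128+ has period
 * 2^128 - 1, and each jump advances the state by 2^64 steps, so CPU 0
 * effectively draws values [0, 2^64), CPU 1 draws [2^64, 2^65), and so
 * on -- the per-CPU sequences cannot collide.
 */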
14043
14044 state->dts_cleaner = CYCLIC_NONE;
14045 state->dts_deadman = CYCLIC_NONE;
14046 state->dts_vstate.dtvs_state = state;
14047
14048 for (i = 0; i < DTRACEOPT_MAX; i++)
14049 state->dts_options[i] = DTRACEOPT_UNSET;
14050
14051 /*
14052 * Set the default options.
14053 */
14054 opt = state->dts_options;
14055 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14056 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14057 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14058 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14059 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14060 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14061 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14062 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14063 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14064 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14065 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14066 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14067 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14068 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14069 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
14070
14071 /*
14072 * Depending on the user credentials, we set flag bits which alter probe
14073 * visibility or the amount of destructiveness allowed. In the case of
14074 * actual anonymous tracing, or the possession of all privileges, all of
14075 * the normal checks are bypassed.
14076 */
14077 #if defined(__APPLE__)
14078 if (cr != NULL) {
14079 kauth_cred_ref(cr);
14080 state->dts_cred.dcr_cred = cr;
14081 }
14082 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14083 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14084 /*
14085 * Allow only proc credentials when DTrace is
14086 * restricted by the current security policy
14087 */
14088 state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
14089 state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14090 }
14091 else {
14092 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14093 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14094 }
14095 }
14096
14097 #else
14098 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14099 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14100 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14101 }
14102 else {
14103 /*
14104 * Set up the credentials for this instantiation. We take a
14105 * hold on the credential to prevent it from disappearing on
14106 * us; this in turn prevents the zone_t referenced by this
14107 * credential from disappearing. This means that we can
14108 * examine the credential and the zone from probe context.
14109 */
14110 crhold(cr);
14111 state->dts_cred.dcr_cred = cr;
14112
14113 /*
14114 * CRA_PROC means "we have *some* privilege for dtrace" and
14115 * unlocks the use of variables like pid, zonename, etc.
14116 */
14117 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14118 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14119 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14120 }
14121
14122 /*
14123 * dtrace_user allows use of syscall and profile providers.
14124 * If the user also has proc_owner and/or proc_zone, we
14125 * extend the scope to include additional visibility and
14126 * destructive power.
14127 */
14128 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14129 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14130 state->dts_cred.dcr_visible |=
14131 DTRACE_CRV_ALLPROC;
14132
14133 state->dts_cred.dcr_action |=
14134 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14135 }
14136
14137 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14138 state->dts_cred.dcr_visible |=
14139 DTRACE_CRV_ALLZONE;
14140
14141 state->dts_cred.dcr_action |=
14142 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14143 }
14144
14145 /*
14146 * If we have all privs in whatever zone this is,
14147 * we can do destructive things to processes which
14148 * have altered credentials.
14149 *
14150 * APPLE NOTE: Darwin doesn't do zones.
14151 * Behave as if zone always has destructive privs.
14152 */
14153
14154 state->dts_cred.dcr_action |=
14155 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14156 }
14157
14158 /*
14159 * Holding the dtrace_kernel privilege also implies that
14160 * the user has the dtrace_user privilege from a visibility
14161 * perspective. But without further privileges, some
14162 * destructive actions are not available.
14163 */
14164 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14165 /*
14166 * Make all probes in all zones visible. However,
14167 * this doesn't mean that all actions become available
14168 * to all zones.
14169 */
14170 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14171 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14172
14173 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14174 DTRACE_CRA_PROC;
14175 /*
14176 * Holding proc_owner means that destructive actions
14177 * for *this* zone are allowed.
14178 */
14179 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14180 state->dts_cred.dcr_action |=
14181 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14182
14183 /*
14184 * Holding proc_zone means that destructive actions
14185 * for this user/group ID in all zones are allowed.
14186 */
14187 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14188 state->dts_cred.dcr_action |=
14189 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14190
14191 /*
14192 * If we have all privs in whatever zone this is,
14193 * we can do destructive things to processes which
14194 * have altered credentials.
14195 *
14196 * APPLE NOTE: Darwin doesn't do zones.
14197 * Behave as if zone always has destructive privs.
14198 */
14199 state->dts_cred.dcr_action |=
14200 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14201 }
14202
14203 /*
14204 * Holding the dtrace_proc privilege gives control over fasttrap
14205 * and pid providers. We need to grant wider destructive
14206 * privileges in the event that the user has proc_owner and/or
14207 * proc_zone.
14208 */
14209 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14210 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14211 state->dts_cred.dcr_action |=
14212 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14213
14214 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14215 state->dts_cred.dcr_action |=
14216 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14217 }
14218 }
14219 #endif
14220
14221 *new_state = state;
14222 return(0); /* Success */
14223 }
14224
14225 static int
14226 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14227 {
14228 dtrace_optval_t *opt = state->dts_options, size;
14229 processorid_t cpu = 0;
14230 size_t limit = buf->dtb_size;
14231 int flags = 0, rval;
14232
14233 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14234 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14235 ASSERT(which < DTRACEOPT_MAX);
14236 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14237 (state == dtrace_anon.dta_state &&
14238 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14239
14240 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14241 return (0);
14242
14243 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14244 cpu = opt[DTRACEOPT_CPU];
14245
14246 if (which == DTRACEOPT_SPECSIZE)
14247 flags |= DTRACEBUF_NOSWITCH;
14248
14249 if (which == DTRACEOPT_BUFSIZE) {
14250 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14251 flags |= DTRACEBUF_RING;
14252
14253 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14254 flags |= DTRACEBUF_FILL;
14255
14256 if (state != dtrace_anon.dta_state ||
14257 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14258 flags |= DTRACEBUF_INACTIVE;
14259 }
14260
14261 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
14262 /*
14263 * The size must be 8-byte aligned. If the size is not 8-byte
14264 * aligned, drop it down by the difference.
14265 */
14266 if (size & (sizeof (uint64_t) - 1))
14267 size -= size & (sizeof (uint64_t) - 1);
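/*
 * E.g. (hypothetical request sizes): 1037 & 7 == 5, so a
 * 1037-byte request drops to 1032, while an already-aligned
 * 1024-byte request is left untouched.
 */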
14268
14269 if (size < state->dts_reserve) {
14270 /*
14271 * Buffers must always be large enough to accommodate
14272 * their prereserved space. We return E2BIG instead
14273 * of ENOMEM in this case to allow for user-level
14274 * software to differentiate the cases.
14275 */
14276 return (E2BIG);
14277 }
14278 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
14279 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
14280
14281 if (rval != ENOMEM) {
14282 opt[which] = size;
14283 return (rval);
14284 }
14285
14286 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14287 return (rval);
14288 }
14289
14290 return (ENOMEM);
14291 }
14292
14293 static int
14294 dtrace_state_buffers(dtrace_state_t *state)
14295 {
14296 dtrace_speculation_t *spec = state->dts_speculations;
14297 int rval, i;
14298
14299 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14300 DTRACEOPT_BUFSIZE)) != 0)
14301 return (rval);
14302
14303 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14304 DTRACEOPT_AGGSIZE)) != 0)
14305 return (rval);
14306
14307 for (i = 0; i < state->dts_nspeculations; i++) {
14308 if ((rval = dtrace_state_buffer(state,
14309 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14310 return (rval);
14311 }
14312
14313 return (0);
14314 }
14315
14316 static void
14317 dtrace_state_prereserve(dtrace_state_t *state)
14318 {
14319 dtrace_ecb_t *ecb;
14320 dtrace_probe_t *probe;
14321
14322 state->dts_reserve = 0;
14323
14324 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14325 return;
14326
14327 /*
14328 * If our buffer policy is a "fill" buffer policy, we need to set the
14329 * prereserved space to be the space required by the END probes.
14330 */
14331 probe = dtrace_probes[dtrace_probeid_end - 1];
14332 ASSERT(probe != NULL);
14333
14334 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14335 if (ecb->dte_state != state)
14336 continue;
14337
14338 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14339 }
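/*
 * Worked example with made-up ECB sizes: two matching END ECBs needing
 * 64 and 32 bytes with 8-byte alignment reserve
 * (64 + 8) + (32 + 8) = 112 bytes, guaranteeing that the END records
 * fit even in a full "fill" buffer.
 */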
14340 }
14341
14342 static int
14343 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14344 {
14345 dtrace_optval_t *opt = state->dts_options, sz, nspec;
14346 dtrace_speculation_t *spec;
14347 dtrace_buffer_t *buf;
14348 cyc_handler_t hdlr;
14349 cyc_time_t when;
14350 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14351 dtrace_icookie_t cookie;
14352
14353 lck_mtx_lock(&cpu_lock);
14354 lck_mtx_lock(&dtrace_lock);
14355
14356 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14357 rval = EBUSY;
14358 goto out;
14359 }
14360
14361 /*
14362 * Before we can perform any checks, we must prime all of the
14363 * retained enablings that correspond to this state.
14364 */
14365 dtrace_enabling_prime(state);
14366
14367 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14368 rval = EACCES;
14369 goto out;
14370 }
14371
14372 dtrace_state_prereserve(state);
14373
14374 /*
14375 * Now what we want to do is try to allocate our speculations.
14376 * We do not automatically resize the number of speculations; if
14377 * this fails, we will fail the operation.
14378 */
14379 nspec = opt[DTRACEOPT_NSPEC];
14380 ASSERT(nspec != DTRACEOPT_UNSET);
14381
14382 if (nspec > INT_MAX) {
14383 rval = ENOMEM;
14384 goto out;
14385 }
14386
14387 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14388
14389 if (spec == NULL) {
14390 rval = ENOMEM;
14391 goto out;
14392 }
14393
14394 state->dts_speculations = spec;
14395 state->dts_nspeculations = (int)nspec;
14396
14397 for (i = 0; i < nspec; i++) {
14398 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14399 rval = ENOMEM;
14400 goto err;
14401 }
14402
14403 spec[i].dtsp_buffer = buf;
14404 }
14405
14406 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14407 if (dtrace_anon.dta_state == NULL) {
14408 rval = ENOENT;
14409 goto out;
14410 }
14411
14412 if (state->dts_necbs != 0) {
14413 rval = EALREADY;
14414 goto out;
14415 }
14416
14417 state->dts_anon = dtrace_anon_grab();
14418 ASSERT(state->dts_anon != NULL);
14419 state = state->dts_anon;
14420
14421 /*
14422 * We want "grabanon" to be set in the grabbed state, so we'll
14423 * copy that option value from the grabbing state into the
14424 * grabbed state.
14425 */
14426 state->dts_options[DTRACEOPT_GRABANON] =
14427 opt[DTRACEOPT_GRABANON];
14428
14429 *cpu = dtrace_anon.dta_beganon;
14430
14431 /*
14432 * If the anonymous state is active (as it almost certainly
14433 * is if the anonymous enabling ultimately matched anything),
14434 * we don't allow any further option processing -- but we
14435 * don't return failure.
14436 */
14437 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14438 goto out;
14439 }
14440
14441 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14442 opt[DTRACEOPT_AGGSIZE] != 0) {
14443 if (state->dts_aggregations == NULL) {
14444 /*
14445 * We're not going to create an aggregation buffer
14446 * because we don't have any ECBs that contain
14447 * aggregations -- set this option to 0.
14448 */
14449 opt[DTRACEOPT_AGGSIZE] = 0;
14450 } else {
14451 /*
14452 * If we have an aggregation buffer, we must also have
14453 * a buffer to use as scratch.
14454 */
14455 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14456 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14457 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14458 }
14459 }
14460 }
14461
14462 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14463 opt[DTRACEOPT_SPECSIZE] != 0) {
14464 if (!state->dts_speculates) {
14465 /*
14466 * We're not going to create speculation buffers
14467 * because we don't have any ECBs that actually
14468 * speculate -- set the speculation size to 0.
14469 */
14470 opt[DTRACEOPT_SPECSIZE] = 0;
14471 }
14472 }
14473
14474 /*
14475 * The bare minimum size for any buffer that we're actually going to
14476 * do anything to is sizeof (uint64_t).
14477 */
14478 sz = sizeof (uint64_t);
14479
14480 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14481 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14482 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14483 /*
14484 * A buffer size has been explicitly set to 0 (or to a size
14485 * that will be adjusted to 0) and we need the space -- we
14486 * need to return failure. We return ENOSPC to differentiate
14487 * it from failing to allocate a buffer due to failure to meet
14488 * the reserve (for which we return E2BIG).
14489 */
14490 rval = ENOSPC;
14491 goto out;
14492 }
14493
14494 if ((rval = dtrace_state_buffers(state)) != 0)
14495 goto err;
14496
14497 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14498 sz = dtrace_dstate_defsize;
14499
14500 do {
14501 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14502
14503 if (rval == 0)
14504 break;
14505
14506 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14507 goto err;
14508 } while (sz >>= 1);
14509
14510 opt[DTRACEOPT_DYNVARSIZE] = sz;
14511
14512 if (rval != 0)
14513 goto err;
14514
14515 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14516 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14517
14518 if (opt[DTRACEOPT_CLEANRATE] == 0)
14519 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14520
14521 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14522 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14523
14524 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14525 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14526
14527 if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
14528 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
14529
14530 if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
14531 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
14532
14533 if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
14534 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
14535
14536 if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
14537 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
14538
14539 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14540 hdlr.cyh_arg = state;
14541 hdlr.cyh_level = CY_LOW_LEVEL;
14542
14543 when.cyt_when = 0;
14544 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14545
14546 state->dts_cleaner = cyclic_add(&hdlr, &when);
14547
14548 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14549 hdlr.cyh_arg = state;
14550 hdlr.cyh_level = CY_LOW_LEVEL;
14551
14552 when.cyt_when = 0;
14553 when.cyt_interval = dtrace_deadman_interval;
14554
14555 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14556 state->dts_deadman = cyclic_add(&hdlr, &when);
14557
14558 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14559
14560 /*
14561 * Now it's time to actually fire the BEGIN probe. We need to disable
14562 * interrupts here both to record the CPU on which we fired the BEGIN
14563 * probe (the data from this CPU will be processed first at user
14564 * level) and to manually activate the buffer for this CPU.
14565 */
14566 cookie = dtrace_interrupt_disable();
14567 *cpu = CPU->cpu_id;
14568 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14569 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14570
14571 dtrace_probe(dtrace_probeid_begin,
14572 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14573 dtrace_interrupt_enable(cookie);
14574 /*
14575 * We may have had an exit action from a BEGIN probe; only change our
14576 * state to ACTIVE if we're still in WARMUP.
14577 */
14578 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14579 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14580
14581 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14582 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14583
14584 /*
14585 * Regardless of whether we're now in ACTIVE or DRAINING, we
14586 * want each CPU to transition its principal buffer out of the
14587 * INACTIVE state. Doing this assures that no CPU will suddenly begin
14588 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14589 * atomically transition from processing none of a state's ECBs to
14590 * processing all of them.
14591 */
14592 dtrace_xcall(DTRACE_CPUALL,
14593 (dtrace_xcall_t)dtrace_buffer_activate, state);
14594 goto out;
14595
14596 err:
14597 dtrace_buffer_free(state->dts_buffer);
14598 dtrace_buffer_free(state->dts_aggbuffer);
14599
14600 if ((nspec = state->dts_nspeculations) == 0) {
14601 ASSERT(state->dts_speculations == NULL);
14602 goto out;
14603 }
14604
14605 spec = state->dts_speculations;
14606 ASSERT(spec != NULL);
14607
14608 for (i = 0; i < state->dts_nspeculations; i++) {
14609 if ((buf = spec[i].dtsp_buffer) == NULL)
14610 break;
14611
14612 dtrace_buffer_free(buf);
14613 kmem_free(buf, bufsize);
14614 }
14615
14616 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14617 state->dts_nspeculations = 0;
14618 state->dts_speculations = NULL;
14619
14620 out:
14621 lck_mtx_unlock(&dtrace_lock);
14622 lck_mtx_unlock(&cpu_lock);
14623
14624 return (rval);
14625 }
14626
14627 static int
14628 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14629 {
14630 dtrace_icookie_t cookie;
14631
14632 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14633
14634 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14635 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14636 return (EINVAL);
14637
14638 /*
14639 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14640 * to be sure that every CPU has seen it. See below for the details
14641 * on why this is done.
14642 */
14643 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14644 dtrace_sync();
14645
14646 /*
14647 * By this point, it is impossible for any CPU to be still processing
14648 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14649 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14650 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14651 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14652 * iff we're in the END probe.
14653 */
14654 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14655 dtrace_sync();
14656 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14657
14658 /*
14659 * Finally, we can release the reserve and call the END probe. We
14660 * disable interrupts across calling the END probe to allow us to
14661 * return the CPU on which we actually called the END probe. This
14662 * allows user-land to be sure that this CPU's principal buffer is
14663 * processed last.
14664 */
14665 state->dts_reserve = 0;
14666
14667 cookie = dtrace_interrupt_disable();
14668 *cpu = CPU->cpu_id;
14669 dtrace_probe(dtrace_probeid_end,
14670 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14671 dtrace_interrupt_enable(cookie);
14672
14673 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14674 dtrace_sync();
14675
14676 return (0);
14677 }
14678
14679 static int
14680 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14681 dtrace_optval_t val)
14682 {
14683 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14684
14685 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14686 return (EBUSY);
14687
14688 if (option >= DTRACEOPT_MAX)
14689 return (EINVAL);
14690
14691 if (option != DTRACEOPT_CPU && val < 0)
14692 return (EINVAL);
14693
14694 switch (option) {
14695 case DTRACEOPT_DESTRUCTIVE:
14696 /*
14697 * Prevent consumers from enabling destructive actions if DTrace
14698 * is running in a restricted environment, or if actions are
14699 * disallowed.
14700 */
14701 if (dtrace_is_restricted() || dtrace_destructive_disallow)
14702 return (EACCES);
14703
14704 state->dts_cred.dcr_destructive = 1;
14705 break;
14706
14707 case DTRACEOPT_BUFSIZE:
14708 case DTRACEOPT_DYNVARSIZE:
14709 case DTRACEOPT_AGGSIZE:
14710 case DTRACEOPT_SPECSIZE:
14711 case DTRACEOPT_STRSIZE:
14712 if (val < 0)
14713 return (EINVAL);
14714
14715 if (val >= LONG_MAX) {
14716 /*
14717 * If this is an otherwise negative value, set it to
14718 * the highest multiple of 128m less than LONG_MAX.
14719 * Technically, we're adjusting the size without
14720 * regard to the buffer resizing policy, but in fact,
14721 * this has no effect -- if we set the buffer size to
14722 * ~LONG_MAX and the buffer policy is ultimately set to
14723 * be "manual", the buffer allocation is guaranteed to
14724 * fail, if only because the allocation requires two
14725 * buffers. (We set the size to the highest
14726 * multiple of 128m because it ensures that the size
14727 * will remain a multiple of a megabyte when
14728 * repeatedly halved -- all the way down to 15m.)
14729 */
14730 val = LONG_MAX - (1 << 27) + 1;
14731 }
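/*
 * Worked out (assuming a 64-bit long): LONG_MAX is 2^63 - 1, so
 * LONG_MAX - (1 << 27) + 1 == 2^63 - 2^27 == (2^36 - 1) * 128m,
 * the largest 128m multiple representable in a long.
 */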
14732 }
14733
14734 state->dts_options[option] = val;
14735
14736 return (0);
14737 }
14738
14739 static void
14740 dtrace_state_destroy(dtrace_state_t *state)
14741 {
14742 dtrace_ecb_t *ecb;
14743 dtrace_vstate_t *vstate = &state->dts_vstate;
14744 minor_t minor = getminor(state->dts_dev);
14745 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14746 dtrace_speculation_t *spec = state->dts_speculations;
14747 int nspec = state->dts_nspeculations;
14748 uint32_t match;
14749
14750 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14751 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14752
14753 /*
14754 * First, retract any retained enablings for this state.
14755 */
14756 dtrace_enabling_retract(state);
14757 ASSERT(state->dts_nretained == 0);
14758
14759 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14760 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14761 /*
14762 * We have managed to come into dtrace_state_destroy() on a
14763 * hot enabling -- almost certainly because of a disorderly
14764 * shutdown of a consumer. (That is, a consumer that is
14765 * exiting without having called dtrace_stop().) In this case,
14766 * we're going to set our activity to be KILLED, and then
14767 * issue a sync to be sure that everyone is out of probe
14768 * context before we start blowing away ECBs.
14769 */
14770 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14771 dtrace_sync();
14772 }
14773
14774 /*
14775 * Release the credential hold we took in dtrace_state_create().
14776 */
14777 if (state->dts_cred.dcr_cred != NULL)
14778 kauth_cred_unref(&state->dts_cred.dcr_cred);
14779
14780 /*
14781 * Now we can safely disable and destroy any enabled probes. Because
14782 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14783 * (especially if they're all enabled), we take two passes through the
14784 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14785 * in the second we disable whatever is left over.
14786 */
14787 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14788 for (i = 0; i < state->dts_necbs; i++) {
14789 if ((ecb = state->dts_ecbs[i]) == NULL)
14790 continue;
14791
14792 if (match && ecb->dte_probe != NULL) {
14793 dtrace_probe_t *probe = ecb->dte_probe;
14794 dtrace_provider_t *prov = probe->dtpr_provider;
14795
14796 if (!(prov->dtpv_priv.dtpp_flags & match))
14797 continue;
14798 }
14799
14800 dtrace_ecb_disable(ecb);
14801 dtrace_ecb_destroy(ecb);
14802 }
14803
14804 if (!match)
14805 break;
14806 }
14807
14808 /*
14809 * Before we free the buffers, perform one more sync to assure that
14810 * every CPU is out of probe context.
14811 */
14812 dtrace_sync();
14813
14814 dtrace_buffer_free(state->dts_buffer);
14815 dtrace_buffer_free(state->dts_aggbuffer);
14816
14817 for (i = 0; i < (int)NCPU; i++) {
14818 kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
14819 }
14820 kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
14821
14822 for (i = 0; i < nspec; i++)
14823 dtrace_buffer_free(spec[i].dtsp_buffer);
14824
14825 if (state->dts_cleaner != CYCLIC_NONE)
14826 cyclic_remove(state->dts_cleaner);
14827
14828 if (state->dts_deadman != CYCLIC_NONE)
14829 cyclic_remove(state->dts_deadman);
14830
14831 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14832 dtrace_vstate_fini(vstate);
14833 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14834
14835 if (state->dts_aggregations != NULL) {
14836 #if DEBUG
14837 for (i = 0; i < state->dts_naggregations; i++)
14838 ASSERT(state->dts_aggregations[i] == NULL);
14839 #endif
14840 ASSERT(state->dts_naggregations > 0);
14841 kmem_free(state->dts_aggregations,
14842 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14843 }
14844
14845 kmem_free(state->dts_buffer, bufsize);
14846 kmem_free(state->dts_aggbuffer, bufsize);
14847
14848 for (i = 0; i < nspec; i++)
14849 kmem_free(spec[i].dtsp_buffer, bufsize);
14850
14851 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14852
14853 dtrace_format_destroy(state);
14854
14855 vmem_destroy(state->dts_aggid_arena);
14856 dtrace_state_free(minor);
14857 }
14858
14859 /*
14860 * DTrace Anonymous Enabling Functions
14861 */
14862
14863 int
14864 dtrace_keep_kernel_symbols(void)
14865 {
14866 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14867 return 0;
14868 }
14869
14870 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14871 return 1;
14872
14873 return 0;
14874 }
14875
14876 static dtrace_state_t *
14877 dtrace_anon_grab(void)
14878 {
14879 dtrace_state_t *state;
14880
14881 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14882
14883 if ((state = dtrace_anon.dta_state) == NULL) {
14884 ASSERT(dtrace_anon.dta_enabling == NULL);
14885 return (NULL);
14886 }
14887
14888 ASSERT(dtrace_anon.dta_enabling != NULL);
14889 ASSERT(dtrace_retained != NULL);
14890
14891 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14892 dtrace_anon.dta_enabling = NULL;
14893 dtrace_anon.dta_state = NULL;
14894
14895 return (state);
14896 }
14897
14898 static void
14899 dtrace_anon_property(void)
14900 {
14901 int i, rv;
14902 dtrace_state_t *state;
14903 dof_hdr_t *dof;
14904 char c[32]; /* enough for "dof-data-" + digits */
14905
14906 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14907 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14908
14909 for (i = 0; ; i++) {
14910 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14911
14912 dtrace_err_verbose = 1;
14913
14914 if ((dof = dtrace_dof_property(c)) == NULL) {
14915 dtrace_err_verbose = 0;
14916 break;
14917 }
14918
14919 #ifdef illumos
14920 /*
14921 * We want to create anonymous state, so we need to transition
14922 * the kernel debugger to indicate that DTrace is active. If
14923 * this fails (e.g. because the debugger has modified text in
14924 * some way), we won't continue with the processing.
14925 */
14926 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14927 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14928 "enabling ignored.");
14929 dtrace_dof_destroy(dof);
14930 break;
14931 }
14932 #endif
14933
14934 /*
14935 * If we haven't allocated an anonymous state, we'll do so now.
14936 */
14937 if ((state = dtrace_anon.dta_state) == NULL) {
14938 rv = dtrace_state_create(NULL, NULL, &state);
14939 dtrace_anon.dta_state = state;
14940 if (rv != 0 || state == NULL) {
14941 /*
14942 * This basically shouldn't happen: the only
14943 * failure mode from dtrace_state_create() is a
14944 * failure of ddi_soft_state_zalloc() that
14945 * itself should never happen. Still, the
14946 * interface allows for a failure mode, and
14947 * we want to fail as gracefully as possible:
14948 * we'll emit an error message and cease
14949 * processing anonymous state in this case.
14950 */
14951 cmn_err(CE_WARN, "failed to create "
14952 "anonymous state");
14953 dtrace_dof_destroy(dof);
14954 break;
14955 }
14956 }
14957
14958 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14959 &dtrace_anon.dta_enabling, 0, B_TRUE);
14960
14961 if (rv == 0)
14962 rv = dtrace_dof_options(dof, state);
14963
14964 dtrace_err_verbose = 0;
14965 dtrace_dof_destroy(dof);
14966
14967 if (rv != 0) {
14968 /*
14969 * This is malformed DOF; chuck any anonymous state
14970 * that we created.
14971 */
14972 ASSERT(dtrace_anon.dta_enabling == NULL);
14973 dtrace_state_destroy(state);
14974 dtrace_anon.dta_state = NULL;
14975 break;
14976 }
14977
14978 ASSERT(dtrace_anon.dta_enabling != NULL);
14979 }
14980
14981 if (dtrace_anon.dta_enabling != NULL) {
14982 int rval;
14983
14984 /*
14985 * dtrace_enabling_retain() can only fail because we are
14986 * trying to retain more enablings than are allowed -- but
14987 * we only have one anonymous enabling, and we are guaranteed
14988 * to be allowed at least one retained enabling; we assert
14989 * that dtrace_enabling_retain() returns success.
14990 */
14991 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14992 ASSERT(rval == 0);
14993
14994 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14995 }
14996 }
14997
14998 /*
14999 * DTrace Helper Functions
15000 */
15001 static void
15002 dtrace_helper_trace(dtrace_helper_action_t *helper,
15003 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15004 {
15005 uint32_t size, next, nnext;
15006 int i;
15007 dtrace_helptrace_t *ent;
15008 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15009
15010 if (!dtrace_helptrace_enabled)
15011 return;
15012
15013 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15014
15015 /*
15016 * What would a tracing framework be without its own tracing
15017 * framework? (Well, a hell of a lot simpler, for starters...)
15018 */
15019 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15020 sizeof (uint64_t) - sizeof (uint64_t);
15021
15022 /*
15023 * Iterate until we can allocate a slot in the trace buffer.
15024 */
15025 do {
15026 next = dtrace_helptrace_next;
15027
15028 if (next + size < dtrace_helptrace_bufsize) {
15029 nnext = next + size;
15030 } else {
15031 nnext = size;
15032 }
15033 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
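/*
 * Worked wrap-around example with assumed values: if
 * dtrace_helptrace_bufsize is 4096 and size is 100, a CAS from
 * next == 4050 computes nnext == size == 100, and the nnext == size
 * test below resets the slot to offset 0.
 */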
15034
15035 /*
15036 * We have our slot; fill it in.
15037 */
15038 if (nnext == size)
15039 next = 0;
15040
15041 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15042 ent->dtht_helper = helper;
15043 ent->dtht_where = where;
15044 ent->dtht_nlocals = vstate->dtvs_nlocals;
15045
15046 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15047 mstate->dtms_fltoffs : -1;
15048 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15049 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15050
15051 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15052 dtrace_statvar_t *svar;
15053
15054 if ((svar = vstate->dtvs_locals[i]) == NULL)
15055 continue;
15056
15057 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15058 ent->dtht_locals[i] =
15059 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15060 }
15061 }
15062
15063 static uint64_t
15064 dtrace_helper(int which, dtrace_mstate_t *mstate,
15065 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15066 {
15067 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15068 uint64_t sarg0 = mstate->dtms_arg[0];
15069 uint64_t sarg1 = mstate->dtms_arg[1];
15070 uint64_t rval = 0;
15071 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15072 dtrace_helper_action_t *helper;
15073 dtrace_vstate_t *vstate;
15074 dtrace_difo_t *pred;
15075 int i, trace = dtrace_helptrace_enabled;
15076
15077 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15078
15079 if (helpers == NULL)
15080 return (0);
15081
15082 if ((helper = helpers->dthps_actions[which]) == NULL)
15083 return (0);
15084
15085 vstate = &helpers->dthps_vstate;
15086 mstate->dtms_arg[0] = arg0;
15087 mstate->dtms_arg[1] = arg1;
15088
15089 /*
15090 * Now iterate over each helper. If its predicate evaluates to 'true',
15091 * we'll call the corresponding actions. Note that the below calls
15092 * to dtrace_dif_emulate() may set faults in machine state. This is
15093 * okay: our caller (the outer dtrace_dif_emulate()) will simply
15094 * overwrite the stored DIF offset with its own (which is the desired behavior).
15095 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15096 * from machine state; this is okay, too.
15097 */
15098 for (; helper != NULL; helper = helper->dtha_next) {
15099 if ((pred = helper->dtha_predicate) != NULL) {
15100 if (trace)
15101 dtrace_helper_trace(helper, mstate, vstate, 0);
15102
15103 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15104 goto next;
15105
15106 if (*flags & CPU_DTRACE_FAULT)
15107 goto err;
15108 }
15109
15110 for (i = 0; i < helper->dtha_nactions; i++) {
15111 if (trace)
15112 dtrace_helper_trace(helper,
15113 mstate, vstate, i + 1);
15114
15115 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15116 mstate, vstate, state);
15117
15118 if (*flags & CPU_DTRACE_FAULT)
15119 goto err;
15120 }
15121
15122 next:
15123 if (trace)
15124 dtrace_helper_trace(helper, mstate, vstate,
15125 DTRACE_HELPTRACE_NEXT);
15126 }
15127
15128 if (trace)
15129 dtrace_helper_trace(helper, mstate, vstate,
15130 DTRACE_HELPTRACE_DONE);
15131
15132 /*
15133 * Restore the arg0 that we saved upon entry.
15134 */
15135 mstate->dtms_arg[0] = sarg0;
15136 mstate->dtms_arg[1] = sarg1;
15137
15138 return (rval);
15139
15140 err:
15141 if (trace)
15142 dtrace_helper_trace(helper, mstate, vstate,
15143 DTRACE_HELPTRACE_ERR);
15144
15145 /*
15146 * Restore the arg0 that we saved upon entry.
15147 */
15148 mstate->dtms_arg[0] = sarg0;
15149 mstate->dtms_arg[1] = sarg1;
15150
15151 return (0);
15152 }
15153
15154 static void
15155 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15156 dtrace_vstate_t *vstate)
15157 {
15158 int i;
15159
15160 if (helper->dtha_predicate != NULL)
15161 dtrace_difo_release(helper->dtha_predicate, vstate);
15162
15163 for (i = 0; i < helper->dtha_nactions; i++) {
15164 ASSERT(helper->dtha_actions[i] != NULL);
15165 dtrace_difo_release(helper->dtha_actions[i], vstate);
15166 }
15167
15168 kmem_free(helper->dtha_actions,
15169 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15170 kmem_free(helper, sizeof (dtrace_helper_action_t));
15171 }
15172
15173 static int
15174 dtrace_helper_destroygen(proc_t* p, int gen)
15175 {
15176 dtrace_helpers_t *help = p->p_dtrace_helpers;
15177 dtrace_vstate_t *vstate;
15178 uint_t i;
15179
15180 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15181 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15182
15183 if (help == NULL || gen > help->dthps_generation)
15184 return (EINVAL);
15185
15186 vstate = &help->dthps_vstate;
15187
15188 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15189 dtrace_helper_action_t *last = NULL, *h, *next;
15190
15191 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15192 next = h->dtha_next;
15193
15194 if (h->dtha_generation == gen) {
15195 if (last != NULL) {
15196 last->dtha_next = next;
15197 } else {
15198 help->dthps_actions[i] = next;
15199 }
15200
15201 dtrace_helper_action_destroy(h, vstate);
15202 } else {
15203 last = h;
15204 }
15205 }
15206 }
15207
15208 /*
15209 * Iterate until we've cleared out all helper providers with the
15210 * given generation number.
15211 */
15212 for (;;) {
15213 dtrace_helper_provider_t *prov = NULL;
15214
15215 /*
15216 * Look for a helper provider with the right generation. We
15217 * have to start back at the beginning of the list each time
15218 * because we drop dtrace_lock. It's unlikely that we'll make
15219 * more than two passes.
15220 */
15221 for (i = 0; i < help->dthps_nprovs; i++) {
15222 prov = help->dthps_provs[i];
15223
15224 if (prov->dthp_generation == gen)
15225 break;
15226 }
15227
15228 /*
15229 * If there were no matches, we're done.
15230 */
15231 if (i == help->dthps_nprovs)
15232 break;
15233
15234 /*
15235 * Move the last helper provider into this slot.
15236 */
15237 help->dthps_nprovs--;
15238 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15239 help->dthps_provs[help->dthps_nprovs] = NULL;
15240
15241 lck_mtx_unlock(&dtrace_lock);
15242
15243 /*
15244 * If we have a meta provider, remove this helper provider.
15245 */
15246 if (dtrace_meta_pid != NULL) {
15247 ASSERT(dtrace_deferred_pid == NULL);
15248 dtrace_helper_provider_remove(&prov->dthp_prov,
15249 p);
15250 }
15251
15252 dtrace_helper_provider_destroy(prov);
15253
15254 lck_mtx_lock(&dtrace_lock);
15255 }
15256
15257 return (0);
15258 }
15259
15260 static int
15261 dtrace_helper_validate(dtrace_helper_action_t *helper)
15262 {
15263 int err = 0, i;
15264 dtrace_difo_t *dp;
15265
15266 if ((dp = helper->dtha_predicate) != NULL)
15267 err += dtrace_difo_validate_helper(dp);
15268
15269 for (i = 0; i < helper->dtha_nactions; i++)
15270 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15271
15272 return (err == 0);
15273 }
15274
15275 static int
15276 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15277 {
15278 dtrace_helpers_t *help;
15279 dtrace_helper_action_t *helper, *last;
15280 dtrace_actdesc_t *act;
15281 dtrace_vstate_t *vstate;
15282 dtrace_predicate_t *pred;
15283 int count = 0, nactions = 0, i;
15284
15285 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15286 return (EINVAL);
15287
15288 help = p->p_dtrace_helpers;
15289 last = help->dthps_actions[which];
15290 vstate = &help->dthps_vstate;
15291
15292 for (count = 0; last != NULL; last = last->dtha_next) {
15293 count++;
15294 if (last->dtha_next == NULL)
15295 break;
15296 }
15297
15298 /*
15299 * If we already have dtrace_helper_actions_max helper actions for this
15300 * helper action type, we'll refuse to add a new one.
15301 */
15302 if (count >= dtrace_helper_actions_max)
15303 return (ENOSPC);
15304
15305 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15306 helper->dtha_generation = help->dthps_generation;
15307
15308 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15309 ASSERT(pred->dtp_difo != NULL);
15310 dtrace_difo_hold(pred->dtp_difo);
15311 helper->dtha_predicate = pred->dtp_difo;
15312 }
15313
15314 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15315 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15316 goto err;
15317
15318 if (act->dtad_difo == NULL)
15319 goto err;
15320
15321 nactions++;
15322 }
15323
15324 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15325 (helper->dtha_nactions = nactions), KM_SLEEP);
15326
15327 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15328 dtrace_difo_hold(act->dtad_difo);
15329 helper->dtha_actions[i++] = act->dtad_difo;
15330 }
15331
15332 if (!dtrace_helper_validate(helper))
15333 goto err;
15334
15335 if (last == NULL) {
15336 help->dthps_actions[which] = helper;
15337 } else {
15338 last->dtha_next = helper;
15339 }
15340
15341 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15342 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15343 dtrace_helptrace_next = 0;
15344 }
15345
15346 return (0);
15347 err:
15348 dtrace_helper_action_destroy(helper, vstate);
15349 return (EINVAL);
15350 }
15351
15352 static void
15353 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15354 dof_helper_t *dofhp)
15355 {
15356 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15357 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15358
15359 lck_mtx_lock(&dtrace_lock);
15360
15361 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15362 /*
15363 * If the dtrace module is loaded but not attached, or if
15364 * there isn't a meta provider registered to deal with
15365 * these provider descriptions, we need to postpone creating
15366 * the actual providers until later.
15367 */
15368
15369 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15370 dtrace_deferred_pid != help) {
15371 help->dthps_deferred = 1;
15372 help->dthps_pid = p->p_pid;
15373 help->dthps_next = dtrace_deferred_pid;
15374 help->dthps_prev = NULL;
15375 if (dtrace_deferred_pid != NULL)
15376 dtrace_deferred_pid->dthps_prev = help;
15377 dtrace_deferred_pid = help;
15378 }
15379
15380 lck_mtx_unlock(&dtrace_lock);
15381
15382 } else if (dofhp != NULL) {
15383 /*
15384 * If the dtrace module is loaded and we have a particular
15385 * helper provider description, pass that off to the
15386 * meta provider.
15387 */
15388
15389 lck_mtx_unlock(&dtrace_lock);
15390
15391 dtrace_helper_provide(dofhp, p);
15392
15393 } else {
15394 /*
15395 * Otherwise, just pass all the helper provider descriptions
15396 * off to the meta provider.
15397 */
15398
15399 uint_t i;
15400 lck_mtx_unlock(&dtrace_lock);
15401
15402 for (i = 0; i < help->dthps_nprovs; i++) {
15403 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15404 p);
15405 }
15406 }
15407 }
15408
15409 static int
15410 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15411 {
15412 dtrace_helpers_t *help;
15413 dtrace_helper_provider_t *hprov, **tmp_provs;
15414 uint_t tmp_maxprovs, i;
15415
15416 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15417 help = p->p_dtrace_helpers;
15418 ASSERT(help != NULL);
15419
15420 /*
15421 * If we already have dtrace_helper_providers_max helper providers,
15422 * we'll refuse to add a new one.
15423 */
15424 if (help->dthps_nprovs >= dtrace_helper_providers_max)
15425 return (ENOSPC);
15426
15427 /*
15428 * Check to make sure this isn't a duplicate.
15429 */
15430 for (i = 0; i < help->dthps_nprovs; i++) {
15431 if (dofhp->dofhp_addr ==
15432 help->dthps_provs[i]->dthp_prov.dofhp_addr)
15433 return (EALREADY);
15434 }
15435
15436 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15437 hprov->dthp_prov = *dofhp;
15438 hprov->dthp_ref = 1;
15439 hprov->dthp_generation = gen;
15440
15441 /*
15442 * Allocate a bigger table for helper providers if it's already full.
15443 */
15444 if (help->dthps_maxprovs == help->dthps_nprovs) {
15445 tmp_maxprovs = help->dthps_maxprovs;
15446 tmp_provs = help->dthps_provs;
15447
15448 if (help->dthps_maxprovs == 0)
15449 help->dthps_maxprovs = 2;
15450 else
15451 help->dthps_maxprovs *= 2;
15452 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15453 help->dthps_maxprovs = dtrace_helper_providers_max;
15454
15455 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15456
15457 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15458 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15459
15460 if (tmp_provs != NULL) {
15461 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15462 sizeof (dtrace_helper_provider_t *));
15463 kmem_free(tmp_provs, tmp_maxprovs *
15464 sizeof (dtrace_helper_provider_t *));
15465 }
15466 }
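/*
 * Growth sketch: dthps_maxprovs doubles 0 -> 2 -> 4 -> 8 -> ... as
 * providers are added, clamped at dtrace_helper_providers_max, with
 * the old pointer table (if any) copied forward and freed above.
 */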
15467
15468 help->dthps_provs[help->dthps_nprovs] = hprov;
15469 help->dthps_nprovs++;
15470
15471 return (0);
15472 }
15473
15474 static void
15475 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15476 {
15477 lck_mtx_lock(&dtrace_lock);
15478
15479 if (--hprov->dthp_ref == 0) {
15480 dof_hdr_t *dof;
15481 lck_mtx_unlock(&dtrace_lock);
15482 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15483 dtrace_dof_destroy(dof);
15484 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15485 } else {
15486 lck_mtx_unlock(&dtrace_lock);
15487 }
15488 }
15489
15490 static int
15491 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15492 {
15493 uintptr_t daddr = (uintptr_t)dof;
15494 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15495 dof_provider_t *provider;
15496 dof_probe_t *probe;
15497 uint8_t *arg;
15498 char *strtab, *typestr;
15499 dof_stridx_t typeidx;
15500 size_t typesz;
15501 uint_t nprobes, j, k;
15502
15503 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15504
15505 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15506 dtrace_dof_error(dof, "misaligned section offset");
15507 return (-1);
15508 }
15509
15510 /*
15511 * The section needs to be large enough to contain the DOF provider
15512 * structure appropriate for the given version.
15513 */
15514 if (sec->dofs_size <
15515 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15516 offsetof(dof_provider_t, dofpv_prenoffs) :
15517 sizeof (dof_provider_t))) {
15518 dtrace_dof_error(dof, "provider section too small");
15519 return (-1);
15520 }
15521
15522 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15523 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15524 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15525 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15526 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15527
15528 if (str_sec == NULL || prb_sec == NULL ||
15529 arg_sec == NULL || off_sec == NULL)
15530 return (-1);
15531
15532 enoff_sec = NULL;
15533
15534 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15535 provider->dofpv_prenoffs != DOF_SECT_NONE &&
15536 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15537 provider->dofpv_prenoffs)) == NULL)
15538 return (-1);
15539
15540 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15541
15542 if (provider->dofpv_name >= str_sec->dofs_size ||
15543 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15544 dtrace_dof_error(dof, "invalid provider name");
15545 return (-1);
15546 }
15547
15548 if (prb_sec->dofs_entsize == 0 ||
15549 prb_sec->dofs_entsize > prb_sec->dofs_size) {
15550 dtrace_dof_error(dof, "invalid entry size");
15551 return (-1);
15552 }
15553
15554 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15555 dtrace_dof_error(dof, "misaligned entry size");
15556 return (-1);
15557 }
15558
15559 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15560 dtrace_dof_error(dof, "invalid entry size");
15561 return (-1);
15562 }
15563
15564 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15565 dtrace_dof_error(dof, "misaligned section offset");
15566 return (-1);
15567 }
15568
15569 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15570 dtrace_dof_error(dof, "invalid entry size");
15571 return (-1);
15572 }
15573
15574 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15575
15576 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15577
15578 /*
15579 * Take a pass through the probes to check for errors.
15580 */
15581 for (j = 0; j < nprobes; j++) {
15582 probe = (dof_probe_t *)(uintptr_t)(daddr +
15583 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15584
15585 if (probe->dofpr_func >= str_sec->dofs_size) {
15586 dtrace_dof_error(dof, "invalid function name");
15587 return (-1);
15588 }
15589
15590 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15591 dtrace_dof_error(dof, "function name too long");
15592 return (-1);
15593 }
15594
15595 if (probe->dofpr_name >= str_sec->dofs_size ||
15596 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15597 dtrace_dof_error(dof, "invalid probe name");
15598 return (-1);
15599 }
15600
15601 /*
15602 * The offset count must not wrap the index, and the offsets
15603 * must also not overflow the section's data.
15604 */
15605 if (probe->dofpr_offidx + probe->dofpr_noffs <
15606 probe->dofpr_offidx ||
15607 (probe->dofpr_offidx + probe->dofpr_noffs) *
15608 off_sec->dofs_entsize > off_sec->dofs_size) {
15609 dtrace_dof_error(dof, "invalid probe offset");
15610 return (-1);
15611 }
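/*
 * A concrete instance of the wrap check above (a sketch, assuming the
 * 32-bit unsigned DOF fields): dofpr_offidx = 0xfffffffc with
 * dofpr_noffs = 8 sums to 4, which is less than dofpr_offidx. The
 * addition wrapped, so the probe is rejected before the scaled
 * comparison against dofs_size could be defeated by overflow.
 */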
15612
15613 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15614 /*
15615 * If there's no is-enabled offset section, make sure
15616 * there aren't any is-enabled offsets. Otherwise
15617 * perform the same checks as for probe offsets
15618 * (immediately above).
15619 */
15620 if (enoff_sec == NULL) {
15621 if (probe->dofpr_enoffidx != 0 ||
15622 probe->dofpr_nenoffs != 0) {
15623 dtrace_dof_error(dof, "is-enabled "
15624 "offsets with null section");
15625 return (-1);
15626 }
15627 } else if (probe->dofpr_enoffidx +
15628 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15629 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15630 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15631 dtrace_dof_error(dof, "invalid is-enabled "
15632 "offset");
15633 return (-1);
15634 }
15635
15636 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15637 dtrace_dof_error(dof, "zero probe and "
15638 "is-enabled offsets");
15639 return (-1);
15640 }
15641 } else if (probe->dofpr_noffs == 0) {
15642 dtrace_dof_error(dof, "zero probe offsets");
15643 return (-1);
15644 }
15645
15646 if (probe->dofpr_argidx + probe->dofpr_xargc <
15647 probe->dofpr_argidx ||
15648 (probe->dofpr_argidx + probe->dofpr_xargc) *
15649 arg_sec->dofs_entsize > arg_sec->dofs_size) {
15650 dtrace_dof_error(dof, "invalid args");
15651 return (-1);
15652 }
15653
15654 typeidx = probe->dofpr_nargv;
15655 typestr = strtab + probe->dofpr_nargv;
15656 for (k = 0; k < probe->dofpr_nargc; k++) {
15657 if (typeidx >= str_sec->dofs_size) {
15658 dtrace_dof_error(dof, "bad "
15659 "native argument type");
15660 return (-1);
15661 }
15662
15663 typesz = strlen(typestr) + 1;
15664 if (typesz > DTRACE_ARGTYPELEN) {
15665 dtrace_dof_error(dof, "native "
15666 "argument type too long");
15667 return (-1);
15668 }
15669 typeidx += typesz;
15670 typestr += typesz;
15671 }
15672
15673 typeidx = probe->dofpr_xargv;
15674 typestr = strtab + probe->dofpr_xargv;
15675 for (k = 0; k < probe->dofpr_xargc; k++) {
15676 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15677 dtrace_dof_error(dof, "bad "
15678 "native argument index");
15679 return (-1);
15680 }
15681
15682 if (typeidx >= str_sec->dofs_size) {
15683 dtrace_dof_error(dof, "bad "
15684 "translated argument type");
15685 return (-1);
15686 }
15687
15688 typesz = strlen(typestr) + 1;
15689 if (typesz > DTRACE_ARGTYPELEN) {
15690 dtrace_dof_error(dof, "translated argument "
15691 "type too long");
15692 return (-1);
15693 }
15694
15695 typeidx += typesz;
15696 typestr += typesz;
15697 }
15698 }
15699
15700 return (0);
15701 }
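/*
 * A rough map of what the validation above walks (section names are
 * real DOF section types; the layout shown is illustrative):
 *
 *   dof_provider_t
 *     dofpv_strtab   -> DOF_SECT_STRTAB   provider/probe/type strings
 *     dofpv_probes   -> DOF_SECT_PROBES   array of dof_probe_t
 *     dofpv_prargs   -> DOF_SECT_PRARGS   uint8_t argument mappings
 *     dofpv_proffs   -> DOF_SECT_PROFFS   uint32_t probe offsets
 *     dofpv_prenoffs -> DOF_SECT_PRENOFFS is-enabled offsets (post-v1)
 *
 * Every index and count in each dof_probe_t is checked against the
 * size of the section it references before any probe is created.
 */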
15702
15703 static int
15704 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15705 {
15706 dtrace_helpers_t *help;
15707 dtrace_vstate_t *vstate;
15708 dtrace_enabling_t *enab = NULL;
15709 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15710 uintptr_t daddr = (uintptr_t)dof;
15711
15712 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15713 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15714
15715 if ((help = p->p_dtrace_helpers) == NULL)
15716 help = dtrace_helpers_create(p);
15717
15718 vstate = &help->dthps_vstate;
15719
15720 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15721 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15722 dtrace_dof_destroy(dof);
15723 return (rv);
15724 }
15725
15726 /*
15727 * Look for helper providers and validate their descriptions.
15728 */
15729 if (dhp != NULL) {
15730 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15731 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15732 dof->dofh_secoff + i * dof->dofh_secsize);
15733
15734 if (sec->dofs_type != DOF_SECT_PROVIDER)
15735 continue;
15736
15737 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15738 dtrace_enabling_destroy(enab);
15739 dtrace_dof_destroy(dof);
15740 return (-1);
15741 }
15742
15743 nprovs++;
15744 }
15745 }
15746
15747 /*
15748 * Now we need to walk through the ECB descriptions in the enabling.
15749 */
15750 for (i = 0; i < enab->dten_ndesc; i++) {
15751 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15752 dtrace_probedesc_t *desc = &ep->dted_probe;
15753
15754 /* APPLE NOTE: Darwin employs size-bounded string operations. */
15755 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15756 continue;
15757
15758 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15759 continue;
15760
15761 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15762 continue;
15763
15764 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15765 ep)) != 0) {
15766 /*
15767 * Adding this helper action failed -- we are now going
15768 * to rip out the entire generation and return failure.
15769 */
15770 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15771 dtrace_enabling_destroy(enab);
15772 dtrace_dof_destroy(dof);
15773 return (-1);
15774 }
15775
15776 nhelpers++;
15777 }
15778
15779 if (nhelpers < enab->dten_ndesc)
15780 dtrace_dof_error(dof, "unmatched helpers");
15781
15782 gen = help->dthps_generation++;
15783 dtrace_enabling_destroy(enab);
15784
15785 if (dhp != NULL && nprovs > 0) {
15786 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15787 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15788 lck_mtx_unlock(&dtrace_lock);
15789 dtrace_helper_provider_register(p, help, dhp);
15790 lck_mtx_lock(&dtrace_lock);
15791
15792 destroy = 0;
15793 }
15794 }
15795
15796 if (destroy)
15797 dtrace_dof_destroy(dof);
15798
15799 return (gen);
15800 }
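/*
 * In outline, dtrace_helper_slurp() above proceeds as follows (a
 * summary of the code, not normative documentation):
 *
 *   1. dtrace_dof_slurp() parses the dof into an enabling.
 *   2. Each DOF_SECT_PROVIDER section is run through
 *      dtrace_helper_provider_validate().
 *   3. Each dtrace:helper:ustack ECB description becomes a helper
 *      action via dtrace_helper_action_add(); any failure rips out
 *      the whole generation.
 *   4. If providers were found, dtrace_helper_provider_add() and
 *      dtrace_helper_provider_register() take over the dof, in which
 *      case it is retained (destroy == 0) rather than destroyed.
 */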
15801
15802 /*
15803 * APPLE NOTE: DTrace lazy dof implementation
15804 *
15805 * DTrace user static probes (USDT probes) and helper actions are loaded
15806 * in a process by processing dof sections. The dof sections are passed
15807 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15808 * expensive to process dof for a process that will never use it. There
15809 * is a memory cost (allocating the providers/probes), and a cpu cost
15810 * (creating the providers/probes).
15811 *
15812 * To reduce this cost, we use "lazy dof". The normal procedure for
15813 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15814 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15815 * used, each process retains the dof_ioctl_data_t block, instead of
15816 * copying in the data it points to.
15817 *
15818 * The dof_ioctl_data_t blocks are managed as if they were the actual
15819 * processed dof; on fork the block is copied to the child, on exec and
15820 * exit the block is freed.
15821 *
15822 * If the process loads library(s) containing additional dof, the
15823 * new dof_ioctl_data_t is merged with the existing block.
15824 *
15825 * There are a few catches that make this slightly more difficult.
15826 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15827 * identifier value for each dof in the block. In non-lazy dof terms,
15828 * this is the generation that dof was loaded in. If we hand back
15829 * a UID for a lazy dof, that same UID must be able to unload the
15830 * dof once it has become non-lazy. To meet this requirement, the
15831 * code that loads lazy dof requires that the UID's for dof(s) in
15832 * the lazy dof be sorted in ascending order. It is okay to skip
15833 * UIDs, e.g., 1 -> 5 -> 6 is legal.
15834 *
15835 * Once a process has become non-lazy, it will stay non-lazy. All
15836 * future dof operations for that process will be non-lazy, even
15837 * if the dof mode transitions back to lazy.
15838 *
15839 * Always do lazy dof checks before non-lazy (i.e., in fork, exit, and exec).
15840 * That way if the lazy check fails due to transitioning to non-lazy, the
15841 * right thing is done with the newly faulted in dof.
15842 */
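/*
 * A worked example of the generation rule (hypothetical values): a
 * process already holding lazy dofs with generations { 1, 2 } registers
 * a block of two more. The next generation is the last existing
 * entry's dofhp_dof + 1, so the incoming dofs are stamped 3 and 4 and
 * the merged block is { 1, 2, 3, 4 } -- strictly ascending, with gaps
 * permitted.
 */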
15843
15844 /*
15845 * This method is a bit squicky. It must handle:
15846 *
15847 * dof should not be lazy.
15848 * dof should have been handled lazily, but there was an error.
15849 * dof was handled lazily, and needs to be freed.
15850 * dof was handled lazily, and must not be freed.
15851 *
15852 *
15853 * Returns EACCES if dof should be handled non-lazily.
15854 *
15855 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15856 *
15857 * If the dofs data is claimed by this method, dofs_claimed will be set.
15858 * Callers should not free claimed dofs.
15859 */
15860 static int
15861 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15862 {
15863 ASSERT(p);
15864 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15865
15866 int rval = 0;
15867 *dofs_claimed = 0;
15868
15869 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15870
15871 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15872 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15873
15874 /*
15875 * Any existing helpers force non-lazy behavior.
15876 */
15877 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15878 dtrace_sprlock(p);
15879
15880 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15881 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15882 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15883
15884 /*
15885 * Range check...
15886 */
15887 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15888 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15889 rval = EINVAL;
15890 goto unlock;
15891 }
15892
15893 /*
15894 * Each dof being added must be assigned a unique generation.
15895 */
15896 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15897 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15898 /*
15899 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15900 */
15901 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15902 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15903 }
15904
15905
15906 if (existing_dofs) {
15907 /*
15908 * Merge the existing and incoming dofs
15909 */
15910 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15911 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15912
15913 bcopy(&existing_dofs->dofiod_helpers[0],
15914 &merged_dofs->dofiod_helpers[0],
15915 sizeof(dof_helper_t) * existing_dofs_count);
15916 bcopy(&incoming_dofs->dofiod_helpers[0],
15917 &merged_dofs->dofiod_helpers[existing_dofs_count],
15918 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15919
15920 merged_dofs->dofiod_count = merged_dofs_count;
15921
15922 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15923
15924 p->p_dtrace_lazy_dofs = merged_dofs;
15925 } else {
15926 /*
15927 * Claim the incoming dofs
15928 */
15929 *dofs_claimed = 1;
15930 p->p_dtrace_lazy_dofs = incoming_dofs;
15931 }
15932
15933 #if DEBUG
15934 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15935 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15936 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15937 }
15938 #endif /* DEBUG */
15939
15940 unlock:
15941 dtrace_sprunlock(p);
15942 } else {
15943 rval = EACCES;
15944 }
15945
15946 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15947
15948 return rval;
15949 }
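/*
 * A sketch of the expected caller pattern (hypothetical code; the real
 * call site lives in the helper-device ioctl path):
 *
 *	int claimed = 0;
 *	int err = dtrace_lazy_dofs_add(p, dofs, &claimed);
 *	if (err == EACCES)
 *		err = <take the non-lazy path instead>;
 *	if (!claimed)
 *		kmem_free(dofs, DOF_IOCTL_DATA_T_SIZE(dofs->dofiod_count));
 *
 * The claim rule is the point: once dofs_claimed is set, the block is
 * owned by the process and the caller must not free it.
 */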
15950
15951 /*
15952 * Returns:
15953 *
15954 * EINVAL: lazy dof is enabled, but the requested generation was not found.
15955 * EACCES: This removal needs to be handled non-lazily.
15956 */
15957 static int
15958 dtrace_lazy_dofs_remove(proc_t *p, int generation)
15959 {
15960 int rval = EINVAL;
15961
15962 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15963
15964 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15965 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15966
15967 /*
15968 * Any existing helpers force non-lazy behavior.
15969 */
15970 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15971 dtrace_sprlock(p);
15972
15973 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15974
15975 if (existing_dofs) {
15976 int index, existing_dofs_count = existing_dofs->dofiod_count;
15977 for (index=0; index<existing_dofs_count; index++) {
15978 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15979 dof_ioctl_data_t* removed_dofs = NULL;
15980
15981 /*
15982 * If there is only 1 dof, we'll delete it and swap in NULL.
15983 */
15984 if (existing_dofs_count > 1) {
15985 int removed_dofs_count = existing_dofs_count - 1;
15986 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15987
15988 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15989 removed_dofs->dofiod_count = removed_dofs_count;
15990
15991 /*
15992 * copy the remaining data.
15993 */
15994 if (index > 0) {
15995 bcopy(&existing_dofs->dofiod_helpers[0],
15996 &removed_dofs->dofiod_helpers[0],
15997 index * sizeof(dof_helper_t));
15998 }
15999
16000 if (index < existing_dofs_count-1) {
16001 bcopy(&existing_dofs->dofiod_helpers[index+1],
16002 &removed_dofs->dofiod_helpers[index],
16003 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
16004 }
16005 }
16006
16007 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16008
16009 p->p_dtrace_lazy_dofs = removed_dofs;
16010
16011 rval = KERN_SUCCESS;
16012
16013 break;
16014 }
16015 }
16016
16017 #if DEBUG
16018 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16019 if (all_dofs) {
16020 unsigned int i;
16021 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16022 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16023 }
16024 }
16025 #endif
16026
16027 }
16028 dtrace_sprunlock(p);
16029 } else {
16030 rval = EACCES;
16031 }
16032
16033 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16034
16035 return rval;
16036 }
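/*
 * Removal compacts the block around the matched index. For example
 * (hypothetical generations): removing generation 2 from { 1, 2, 3 }
 * bcopy()s entry 0 into slot 0 and entry 2 into slot 1 of a freshly
 * allocated two-entry block, yielding { 1, 3 }. Removing the only
 * entry of a one-entry block skips the allocation entirely and leaves
 * p_dtrace_lazy_dofs NULL.
 */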
16037
16038 void
16039 dtrace_lazy_dofs_destroy(proc_t *p)
16040 {
16041 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16042 dtrace_sprlock(p);
16043
16044 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16045
16046 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16047 p->p_dtrace_lazy_dofs = NULL;
16048
16049 dtrace_sprunlock(p);
16050 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16051
16052 if (lazy_dofs) {
16053 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16054 }
16055 }
16056
16057 static int
16058 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
16059 {
16060 #pragma unused(ignored)
16061 /*
16062 * Okay to NULL test without taking the sprlock.
16063 */
16064 return p->p_dtrace_lazy_dofs != NULL;
16065 }
16066
16067 static void
16068 dtrace_lazy_dofs_process(proc_t *p) {
16069 /*
16070 * It is possible this process may exit during our attempt to
16071 * fault in the dof. We could fix this by holding locks longer,
16072 * but the errors are benign.
16073 */
16074 dtrace_sprlock(p);
16075
16076
16077 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16078 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16079
16080 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16081 p->p_dtrace_lazy_dofs = NULL;
16082
16083 dtrace_sprunlock(p);
16084 lck_mtx_lock(&dtrace_meta_lock);
16085 /*
16086 * Process each dof_helper_t
16087 */
16088 if (lazy_dofs != NULL) {
16089 unsigned int i;
16090 int rval;
16091
16092 for (i=0; i<lazy_dofs->dofiod_count; i++) {
16093 /*
16094 * When loading lazy dof, we depend on the generations being sorted in ascending order.
16095 */
16096 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
16097
16098 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16099
16100 /*
16101 * We stored the generation in dofhp_dof. Save it, and restore the original value.
16102 */
16103 int generation = dhp->dofhp_dof;
16104 dhp->dofhp_dof = dhp->dofhp_addr;
16105
16106 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
16107
16108 if (dof != NULL) {
16109 dtrace_helpers_t *help;
16110
16111 lck_mtx_lock(&dtrace_lock);
16112
16113 /*
16114 * This must be done with the dtrace_lock held
16115 */
16116 if ((help = p->p_dtrace_helpers) == NULL)
16117 help = dtrace_helpers_create(p);
16118
16119 /*
16120 * If the generation value has been bumped, someone snuck in
16121 * when we released the dtrace lock. We have to dump this generation;
16122 * there is no safe way to load it.
16123 */
16124 if (help->dthps_generation <= generation) {
16125 help->dthps_generation = generation;
16126
16127 /*
16128 * dtrace_helper_slurp() takes responsibility for the dof --
16129 * it may free it now or it may save it and free it later.
16130 */
16131 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16132 dtrace_dof_error(NULL, "returned value did not match expected generation");
16133 }
16134 }
16135
16136 lck_mtx_unlock(&dtrace_lock);
16137 }
16138 }
16139 lck_mtx_unlock(&dtrace_meta_lock);
16140 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16141 } else {
16142 lck_mtx_unlock(&dtrace_meta_lock);
16143 }
16144 }
16145
16146 static int
16147 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
16148 {
16149 #pragma unused(ignored)
16150
16151 dtrace_lazy_dofs_process(p);
16152
16153 return PROC_RETURNED;
16154 }
16155
16156 #define DTRACE_LAZY_DOFS_DUPLICATED 1
16157
16158 static int
16159 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
16160 {
16161 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16162 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16163 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16164
16165 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16166 dtrace_sprlock(parent);
16167
16168 /*
16169 * We need to make sure that the transition to lazy dofs -> helpers
16170 * was atomic for our parent
16171 */
16172 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
16173 /*
16174 * In theory we should hold the child sprlock, but this is safe...
16175 */
16176 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16177
16178 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16179 dof_ioctl_data_t* child_dofs = NULL;
16180 if (parent_dofs) {
16181 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16182 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16183 bcopy(parent_dofs, child_dofs, parent_dofs_size);
16184 }
16185
16186 dtrace_sprunlock(parent);
16187
16188 if (child_dofs) {
16189 dtrace_sprlock(child);
16190 child->p_dtrace_lazy_dofs = child_dofs;
16191 dtrace_sprunlock(child);
16192 /*
16193 * We process the DOF at this point if the mode is set to
16194 * LAZY_OFF. This can happen if DTrace is still processing the
16195 * DOF of another process (which can happen because the
16196 * protected pager can have a huge latency)
16197 * but has not yet processed our parent.
16198 */
16199 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16200 dtrace_lazy_dofs_process(child);
16201 }
16202 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16203
16204 return DTRACE_LAZY_DOFS_DUPLICATED;
16205 }
16206 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16207
16208 return 0;
16209 }
16210
16211 static dtrace_helpers_t *
16212 dtrace_helpers_create(proc_t *p)
16213 {
16214 dtrace_helpers_t *help;
16215
16216 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16217 ASSERT(p->p_dtrace_helpers == NULL);
16218
16219 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16220 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16221 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16222
16223 p->p_dtrace_helpers = help;
16224 dtrace_helpers++;
16225
16226 return (help);
16227 }
16228
16229 static void
16230 dtrace_helpers_destroy(proc_t* p)
16231 {
16232 dtrace_helpers_t *help;
16233 dtrace_vstate_t *vstate;
16234 uint_t i;
16235
16236 lck_mtx_lock(&dtrace_meta_lock);
16237 lck_mtx_lock(&dtrace_lock);
16238
16239 ASSERT(p->p_dtrace_helpers != NULL);
16240 ASSERT(dtrace_helpers > 0);
16241
16242 help = p->p_dtrace_helpers;
16243 vstate = &help->dthps_vstate;
16244
16245 /*
16246 * We're now going to lose the help from this process.
16247 */
16248 p->p_dtrace_helpers = NULL;
16249 dtrace_sync();
16250
16251 /*
16252 * Destroy the helper actions.
16253 */
16254 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16255 dtrace_helper_action_t *h, *next;
16256
16257 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16258 next = h->dtha_next;
16259 dtrace_helper_action_destroy(h, vstate);
16261 }
16262 }
16263
16264 lck_mtx_unlock(&dtrace_lock);
16265
16266 /*
16267 * Destroy the helper providers.
16268 */
16269 if (help->dthps_maxprovs > 0) {
16270 if (dtrace_meta_pid != NULL) {
16271 ASSERT(dtrace_deferred_pid == NULL);
16272
16273 for (i = 0; i < help->dthps_nprovs; i++) {
16274 dtrace_helper_provider_remove(
16275 &help->dthps_provs[i]->dthp_prov, p);
16276 }
16277 } else {
16278 lck_mtx_lock(&dtrace_lock);
16279 ASSERT(help->dthps_deferred == 0 ||
16280 help->dthps_next != NULL ||
16281 help->dthps_prev != NULL ||
16282 help == dtrace_deferred_pid);
16283
16284 /*
16285 * Remove the helper from the deferred list.
16286 */
16287 if (help->dthps_next != NULL)
16288 help->dthps_next->dthps_prev = help->dthps_prev;
16289 if (help->dthps_prev != NULL)
16290 help->dthps_prev->dthps_next = help->dthps_next;
16291 if (dtrace_deferred_pid == help) {
16292 dtrace_deferred_pid = help->dthps_next;
16293 ASSERT(help->dthps_prev == NULL);
16294 }
16295
16296 lck_mtx_unlock(&dtrace_lock);
16297 }
16298
16299
16300 for (i = 0; i < help->dthps_nprovs; i++) {
16301 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16302 }
16303
16304 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16305 sizeof (dtrace_helper_provider_t *));
16306 }
16307
16308 lck_mtx_lock(&dtrace_lock);
16309
16310 dtrace_vstate_fini(&help->dthps_vstate);
16311 kmem_free(help->dthps_actions,
16312 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16313 kmem_free(help, sizeof (dtrace_helpers_t));
16314
16315 --dtrace_helpers;
16316 lck_mtx_unlock(&dtrace_lock);
16317 lck_mtx_unlock(&dtrace_meta_lock);
16318 }
16319
16320 static void
16321 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16322 {
16323 dtrace_helpers_t *help, *newhelp;
16324 dtrace_helper_action_t *helper, *new, *last;
16325 dtrace_difo_t *dp;
16326 dtrace_vstate_t *vstate;
16327 uint_t i;
16328 int j, sz, hasprovs = 0;
16329
16330 lck_mtx_lock(&dtrace_meta_lock);
16331 lck_mtx_lock(&dtrace_lock);
16332 ASSERT(from->p_dtrace_helpers != NULL);
16333 ASSERT(dtrace_helpers > 0);
16334
16335 help = from->p_dtrace_helpers;
16336 newhelp = dtrace_helpers_create(to);
16337 ASSERT(to->p_dtrace_helpers != NULL);
16338
16339 newhelp->dthps_generation = help->dthps_generation;
16340 vstate = &newhelp->dthps_vstate;
16341
16342 /*
16343 * Duplicate the helper actions.
16344 */
16345 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16346 if ((helper = help->dthps_actions[i]) == NULL)
16347 continue;
16348
16349 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16350 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16351 KM_SLEEP);
16352 new->dtha_generation = helper->dtha_generation;
16353
16354 if ((dp = helper->dtha_predicate) != NULL) {
16355 dp = dtrace_difo_duplicate(dp, vstate);
16356 new->dtha_predicate = dp;
16357 }
16358
16359 new->dtha_nactions = helper->dtha_nactions;
16360 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16361 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16362
16363 for (j = 0; j < new->dtha_nactions; j++) {
16364 dtrace_difo_t *dpj = helper->dtha_actions[j];
16365
16366 ASSERT(dpj != NULL);
16367 dpj = dtrace_difo_duplicate(dpj, vstate);
16368 new->dtha_actions[j] = dpj;
16369 }
16370
16371 if (last != NULL) {
16372 last->dtha_next = new;
16373 } else {
16374 newhelp->dthps_actions[i] = new;
16375 }
16376
16377 last = new;
16378 }
16379 }
16380
16381 /*
16382 * Duplicate the helper providers and register them with the
16383 * DTrace framework.
16384 */
16385 if (help->dthps_nprovs > 0) {
16386 newhelp->dthps_nprovs = help->dthps_nprovs;
16387 newhelp->dthps_maxprovs = help->dthps_nprovs;
16388 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16389 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16390 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16391 newhelp->dthps_provs[i] = help->dthps_provs[i];
16392 newhelp->dthps_provs[i]->dthp_ref++;
16393 }
16394
16395 hasprovs = 1;
16396 }
16397
16398 lck_mtx_unlock(&dtrace_lock);
16399
16400 if (hasprovs)
16401 dtrace_helper_provider_register(to, newhelp, NULL);
16402
16403 lck_mtx_unlock(&dtrace_meta_lock);
16404 }
16405
16406 /*
16407 * DTrace Process functions
16408 */
16409
16410 void
16411 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
16412 {
16413 /*
16414 * This code applies to new processes who are copying the task
16415 * and thread state and address spaces of their parent process.
16416 */
16417 if (!spawn) {
16418 /*
16419 * APPLE NOTE: Solaris does a sprlock() and drops the
16420 * proc_lock here. We're cheating a bit and only taking
16421 * the p_dtrace_sprlock lock. A full sprlock would
16422 * task_suspend the parent.
16423 */
16424 dtrace_sprlock(parent_proc);
16425
16426 /*
16427 * Remove all DTrace tracepoints from the child process. We
16428 * need to do this _before_ duplicating USDT providers since
16429 * any associated probes may be immediately enabled.
16430 */
16431 if (parent_proc->p_dtrace_count > 0) {
16432 dtrace_fasttrap_fork(parent_proc, child_proc);
16433 }
16434
16435 dtrace_sprunlock(parent_proc);
16436
16437 /*
16438 * Duplicate any lazy dof(s). This must be done while NOT
16439 * holding the parent sprlock! Lock ordering is
16440 * dtrace_dof_mode_lock, then sprlock. It is imperative we
16441 * always call dtrace_lazy_dofs_duplicate, rather than null
16442 * check and call if !NULL. If we NULL test, during lazy dof
16443 * faulting we can race with the faulting code and proceed
16444 * from here to beyond the helpers copy. The lazy dof
16445 * faulting will then fail to copy the helpers to the child
16446 * process. We return if we duplicated lazy dofs, as a process
16447 * can only have one of the two at a time; this avoids a race between
16448 * a dtrace client and dtrace_proc_fork where a process would
16449 * end up with both lazy dofs and helpers.
16450 */
16451 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16452 return;
16453 }
16454
16455 /*
16456 * Duplicate any helper actions and providers if they haven't
16457 * already.
16458 */
16459 #if !defined(__APPLE__)
16460 /*
16461 * The SFORKING flag
16462 * we set above informs the code that enables USDT probes that
16463 * sprlock() may fail because the child is being forked.
16464 */
16465 #endif
16466 /*
16467 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16468 * never fails to find the child. We do not set SFORKING.
16469 */
16470 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16471 (*dtrace_helpers_fork)(parent_proc, child_proc);
16472 }
16473 }
16474 }
16475
16476 void
16477 dtrace_proc_exec(proc_t *p)
16478 {
16479 /*
16480 * Invalidate any predicate evaluation already cached for this thread by DTrace.
16481 * That's because we've just stored to p_comm and DTrace refers to that when it
16482 * evaluates the "execname" special variable. uid and gid may have changed as well.
16483 */
16484 dtrace_set_thread_predcache(current_thread(), 0);
16485
16486 /*
16487 * Free any outstanding lazy dof entries. It is imperative we
16488 * always call dtrace_lazy_dofs_destroy, rather than null check
16489 * and call if !NULL. If we NULL test, during lazy dof faulting
16490 * we can race with the faulting code and proceed from here to
16491 * beyond the helpers cleanup. The lazy dof faulting will then
16492 * install new helpers which no longer belong to this process!
16493 */
16494 dtrace_lazy_dofs_destroy(p);
16495
16496
16497 /*
16498 * Clean up any DTrace helpers for the process.
16499 */
16500 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16501 (*dtrace_helpers_cleanup)(p);
16502 }
16503
16504 /*
16505 * Cleanup the DTrace provider associated with this process.
16506 */
16507 proc_lock(p);
16508 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16509 (*dtrace_fasttrap_exec_ptr)(p);
16510 }
16511 proc_unlock(p);
16512 }
16513
16514 void
16515 dtrace_proc_exit(proc_t *p)
16516 {
16517 /*
16518 * Free any outstanding lazy dof entries. It is imperative we
16519 * always call dtrace_lazy_dofs_destroy, rather than null check
16520 * and call if !NULL. If we NULL test, during lazy dof faulting
16521 * we can race with the faulting code and proceed from here to
16522 * beyond the helpers cleanup. The lazy dof faulting will then
16523 * install new helpers which will never be cleaned up, and leak.
16524 */
16525 dtrace_lazy_dofs_destroy(p);
16526
16527 /*
16528 * Clean up any DTrace helper actions or probes for the process.
16529 */
16530 if (p->p_dtrace_helpers != NULL) {
16531 (*dtrace_helpers_cleanup)(p);
16532 }
16533
16534 /*
16535 * Clean up any DTrace probes associated with this process.
16536 */
16537 /*
16538 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16539 * call this after dtrace_helpers_cleanup()
16540 */
16541 proc_lock(p);
16542 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16543 (*dtrace_fasttrap_exit_ptr)(p);
16544 }
16545 proc_unlock(p);
16546 }
16547
16548 /*
16549 * DTrace Hook Functions
16550 */
16551
16552 /*
16553 * APPLE NOTE: dtrace_modctl_* routines for kext support.
16554 * Used to manipulate the modctl list within dtrace xnu.
16555 */
16556
16557 modctl_t *dtrace_modctl_list;
16558
16559 static void
16560 dtrace_modctl_add(struct modctl * newctl)
16561 {
16562 struct modctl *nextp, *prevp;
16563
16564 ASSERT(newctl != NULL);
16565 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16566
16567 // Insert the new module at the front of the list.
16568
16569 newctl->mod_next = dtrace_modctl_list;
16570 dtrace_modctl_list = newctl;
16571
16572 /*
16573 * If a module exists with the same name, then that module
16574 * must have been unloaded with enabled probes. We will move
16575 * the unloaded module to the new module's stale chain and
16576 * then stop traversing the list.
16577 */
16578
16579 prevp = newctl;
16580 nextp = newctl->mod_next;
16581
16582 while (nextp != NULL) {
16583 if (nextp->mod_loaded) {
16584 /* This is a loaded module. Keep traversing. */
16585 prevp = nextp;
16586 nextp = nextp->mod_next;
16587 continue;
16588 }
16589 else {
16590 /* Found an unloaded module */
16591 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16592 /* Names don't match. Keep traversing. */
16593 prevp = nextp;
16594 nextp = nextp->mod_next;
16595 continue;
16596 }
16597 else {
16598 /* We found a stale entry, move it. We're done. */
16599 prevp->mod_next = nextp->mod_next;
16600 newctl->mod_stale = nextp;
16601 nextp->mod_next = NULL;
16602 break;
16603 }
16604 }
16605 }
16606 }
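/*
 * Example (an illustrative scenario): suppose kext "com.example.foo"
 * was unloaded while it still had enabled probes, leaving its modctl
 * on the list with mod_loaded == 0. When the kext loads again, the
 * new modctl is pushed on the front, and the traversal above finds
 * the stale entry by name, unlinks it, and parks it on the new
 * entry's mod_stale chain, where dtrace_modctl_remove() will
 * eventually free it.
 */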
16607
16608 static modctl_t *
16609 dtrace_modctl_lookup(struct kmod_info * kmod)
16610 {
16611 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16612
16613 struct modctl * ctl;
16614
16615 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16616 if (ctl->mod_id == kmod->id)
16617 return(ctl);
16618 }
16619 return (NULL);
16620 }
16621
16622 /*
16623 * This routine is called from dtrace_module_unloaded().
16624 * It removes a modctl structure and its stale chain
16625 * from the kext shadow list.
16626 */
16627 static void
16628 dtrace_modctl_remove(struct modctl * ctl)
16629 {
16630 ASSERT(ctl != NULL);
16631 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16632 modctl_t *prevp, *nextp, *curp;
16633
16634 // Remove stale chain first
16635 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16636 nextp = curp->mod_stale;
16637 /* There should NEVER be user symbols allocated at this point */
16638 ASSERT(curp->mod_user_symbols == NULL);
16639 kmem_free(curp, sizeof(modctl_t));
16640 }
16641
16642 prevp = NULL;
16643 curp = dtrace_modctl_list;
16644
16645 while (curp != ctl) {
16646 prevp = curp;
16647 curp = curp->mod_next;
16648 }
16649
16650 if (prevp != NULL) {
16651 prevp->mod_next = ctl->mod_next;
16652 }
16653 else {
16654 dtrace_modctl_list = ctl->mod_next;
16655 }
16656
16657 /* There should NEVER be user symbols allocated at this point */
16658 ASSERT(ctl->mod_user_symbols == NULL);
16659
16660 kmem_free (ctl, sizeof(modctl_t));
16661 }
16662
16663 /*
16664 * APPLE NOTE: The kext loader will call dtrace_module_loaded
16665 * when the kext is loaded in memory, but before calling the
16666 * kext's start routine.
16667 *
16668 * Return 0 on success
16669 * Return -1 on failure
16670 */
16671
16672 static int
16673 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16674 {
16675 dtrace_provider_t *prv;
16676
16677 /*
16678 * If kernel symbols have been disabled, return immediately.
16679 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode; it is safe to test without holding locks.
16680 */
16681 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16682 return 0;
16683
16684 struct modctl *ctl = NULL;
16685 if (!kmod || kmod->address == 0 || kmod->size == 0)
16686 return(-1);
16687
16688 lck_mtx_lock(&dtrace_provider_lock);
16689 lck_mtx_lock(&mod_lock);
16690
16691 /*
16692 * Have we seen this kext before?
16693 */
16694
16695 ctl = dtrace_modctl_lookup(kmod);
16696
16697 if (ctl != NULL) {
16698 /* bail... we already have this kext in the modctl list */
16699 lck_mtx_unlock(&mod_lock);
16700 lck_mtx_unlock(&dtrace_provider_lock);
16701 if (dtrace_err_verbose)
16702 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16703 return(-1);
16704 }
16705 else {
16706 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16707 if (ctl == NULL) {
16708 if (dtrace_err_verbose)
16709 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16710 lck_mtx_unlock(&mod_lock);
16711 lck_mtx_unlock(&dtrace_provider_lock);
16712 return (-1);
16713 }
16714 ctl->mod_next = NULL;
16715 ctl->mod_stale = NULL;
16716 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16717 ctl->mod_loadcnt = kmod->id;
16718 ctl->mod_nenabled = 0;
16719 ctl->mod_address = kmod->address;
16720 ctl->mod_size = kmod->size;
16721 ctl->mod_id = kmod->id;
16722 ctl->mod_loaded = 1;
16723 ctl->mod_flags = 0;
16724 ctl->mod_user_symbols = NULL;
16725
16726 /*
16727 * Find the UUID for this module, if it has one
16728 */
16729 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16730 struct load_command* load_cmd = (struct load_command *)&header[1];
16731 uint32_t i;
16732 for (i = 0; i < header->ncmds; i++) {
16733 if (load_cmd->cmd == LC_UUID) {
16734 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16735 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16736 ctl->mod_flags |= MODCTL_HAS_UUID;
16737 break;
16738 }
16739 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16740 }
16741
16742 if (ctl->mod_address == g_kernel_kmod_info.address) {
16743 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16744 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16745 }
16746 /*
16747 * Static kexts have a UUID that is not used for symbolication, as all their
16748 * symbols are in the kernel.
16749 */
16750 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16751 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16752 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16753 }
16754 }
16755 dtrace_modctl_add(ctl);
16756
16757 /*
16758 * We must hold the dtrace_lock to safely test the non-permanent dtrace_kernel_symbol_mode(s).
16759 */
16760 lck_mtx_lock(&dtrace_lock);
16761
16762 /*
16763 * DTrace must decide if it will instrument modules lazily via
16764 * userspace symbols (default mode), or instrument immediately via
16765 * kernel symbols (non-default mode)
16766 *
16767 * When in default/lazy mode, DTrace will only support modules
16768 * built with a valid UUID.
16769 *
16770 * Overriding the default can be done explicitly in one of
16771 * the following two ways.
16772 *
16773 * A module can force symbols from kernel space using the plist key,
16774 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16775 * we fall through and instrument this module now.
16776 *
16777 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16778 * from kernel space (see dtrace_impl.h). If this system state is set
16779 * to a non-userspace mode, we fall through and instrument the module now.
16780 */
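/*
 * The same rules in tabular form (a restatement, not new policy):
 *
 *   dtrace_kernel_symbol_mode   KMOD_DTRACE_FORCE_INIT   action
 *   FROM_USERSPACE              clear                    defer (lazy)
 *   FROM_USERSPACE              set                      instrument now
 *   any other mode              either                   instrument now
 */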
16781
16782 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16783 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16784 {
16785 /* We will instrument the module lazily -- this is the default */
16786 lck_mtx_unlock(&dtrace_lock);
16787 lck_mtx_unlock(&mod_lock);
16788 lck_mtx_unlock(&dtrace_provider_lock);
16789 return 0;
16790 }
16791
16792 /* We will instrument the module immediately using kernel symbols */
16793 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16794
16795 lck_mtx_unlock(&dtrace_lock);
16796
16797 /*
16798 * We're going to call each provider's per-module provide operation
16799 * specifying only this module.
16800 */
16801 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16802 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16803
16804 /*
16805 * APPLE NOTE: The contract with the kext loader is that once this function
16806 * has completed, it may delete kernel symbols at will.
16807 * We must set this while still holding the mod_lock.
16808 */
16809 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16810
16811 lck_mtx_unlock(&mod_lock);
16812 lck_mtx_unlock(&dtrace_provider_lock);
16813
16814 /*
16815 * If we have any retained enablings, we need to match against them.
16816 * Enabling probes requires that cpu_lock be held, and we cannot hold
16817 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16818 * module. (In particular, this happens when loading scheduling
16819 * classes.) So if we have any retained enablings, we need to dispatch
16820 * our task queue to do the match for us.
16821 */
16822 lck_mtx_lock(&dtrace_lock);
16823
16824 if (dtrace_retained == NULL) {
16825 lck_mtx_unlock(&dtrace_lock);
16826 return 0;
16827 }
16828
16829 /* APPLE NOTE!
16830 *
16831 * The cpu_lock mentioned above is only held by dtrace code; Apple's xnu never actually
16832 * holds it for any reason. Thus the comment above is invalid: we can directly invoke
16833 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16834 * the delay call as well.
16835 */
16836 lck_mtx_unlock(&dtrace_lock);
16837
16838 dtrace_enabling_matchall();
16839
16840 return 0;
16841 }
16842
16843 /*
16844 * Return 0 on success
16845 * Return -1 on failure
16846 */
16847 static int
16848 dtrace_module_unloaded(struct kmod_info *kmod)
16849 {
16850 dtrace_probe_t template, *probe, *first, *next;
16851 dtrace_provider_t *prov;
16852 struct modctl *ctl = NULL;
16853 struct modctl *syncctl = NULL;
16854 struct modctl *nextsyncctl = NULL;
16855 int syncmode = 0;
16856
16857 lck_mtx_lock(&dtrace_provider_lock);
16858 lck_mtx_lock(&mod_lock);
16859 lck_mtx_lock(&dtrace_lock);
16860
16861 if (kmod == NULL) {
16862 syncmode = 1;
16863 }
16864 else {
16865 ctl = dtrace_modctl_lookup(kmod);
16866 if (ctl == NULL)
16867 {
16868 lck_mtx_unlock(&dtrace_lock);
16869 lck_mtx_unlock(&mod_lock);
16870 lck_mtx_unlock(&dtrace_provider_lock);
16871 return (-1);
16872 }
16873 ctl->mod_loaded = 0;
16874 ctl->mod_address = 0;
16875 ctl->mod_size = 0;
16876 }
16877
16878 if (dtrace_bymod == NULL) {
16879 /*
16880 * The DTrace module is loaded (obviously) but not attached;
16881 * we don't have any work to do.
16882 */
16883 if (ctl != NULL)
16884 (void)dtrace_modctl_remove(ctl);
16885 lck_mtx_unlock(&dtrace_lock);
16886 lck_mtx_unlock(&mod_lock);
16887 lck_mtx_unlock(&dtrace_provider_lock);
16888 return(0);
16889 }
16890
16891 /* Syncmode set means we target and traverse the entire modctl list. */
16892 if (syncmode)
16893 nextsyncctl = dtrace_modctl_list;
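/*
 * Sync mode (kmod == NULL) sweeps the entire shadow list: each pass
 * through syncloop below selects the next modctl whose mod_address is
 * 0 -- i.e., a stale, unloaded kext -- and reclaims its probes, until
 * no such entries remain.
 */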
16894
16895 syncloop:
16896 if (syncmode)
16897 {
16898 /* find a stale modctl struct */
16899 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16900 if (syncctl->mod_address == 0)
16901 break;
16902 }
16903 if (syncctl==NULL)
16904 {
16905 /* We have no more work to do */
16906 lck_mtx_unlock(&dtrace_lock);
16907 lck_mtx_unlock(&mod_lock);
16908 lck_mtx_unlock(&dtrace_provider_lock);
16909 return(0);
16910 }
16911 else {
16912 /* keep track of next syncctl in case this one is removed */
16913 nextsyncctl = syncctl->mod_next;
16914 ctl = syncctl;
16915 }
16916 }
16917
16918 template.dtpr_mod = ctl->mod_modname;
16919
16920 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16921 probe != NULL; probe = probe->dtpr_nextmod) {
16922 if (probe->dtpr_ecb != NULL) {
16923 /*
16924 * This shouldn't _actually_ be possible -- we're
16925 * unloading a module that has an enabled probe in it.
16926 * (It's normally up to the provider to make sure that
16927 * this can't happen.) However, because dtps_enable()
16928 * doesn't have a failure mode, there can be an
16929 * enable/unload race. Upshot: we don't want to
16930 * assert, but we're not going to disable the
16931 * probe, either.
16932 */
16933
16934
16935 if (syncmode) {
16936 /* We're syncing, let's look at next in list */
16937 goto syncloop;
16938 }
16939
16940 lck_mtx_unlock(&dtrace_lock);
16941 lck_mtx_unlock(&mod_lock);
16942 lck_mtx_unlock(&dtrace_provider_lock);
16943
16944 if (dtrace_err_verbose) {
16945 cmn_err(CE_WARN, "unloaded module '%s' had "
16946 "enabled probes", ctl->mod_modname);
16947 }
16948 return(-1);
16949 }
16950 }
16951
16952 probe = first;
16953
16954 for (first = NULL; probe != NULL; probe = next) {
16955 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16956
16957 dtrace_probes[probe->dtpr_id - 1] = NULL;
16958 probe->dtpr_provider->dtpv_probe_count--;
16959
16960 next = probe->dtpr_nextmod;
16961 dtrace_hash_remove(dtrace_byprov, probe);
16962 dtrace_hash_remove(dtrace_bymod, probe);
16963 dtrace_hash_remove(dtrace_byfunc, probe);
16964 dtrace_hash_remove(dtrace_byname, probe);
16965
16966 if (first == NULL) {
16967 first = probe;
16968 probe->dtpr_nextmod = NULL;
16969 } else {
16970 probe->dtpr_nextmod = first;
16971 first = probe;
16972 }
16973 }
16974
16975 /*
16976 * We've removed all of the module's probes from the hash chains and
16977 * from the probe array. Now issue a dtrace_sync() to be sure that
16978 * everyone has cleared out from any probe array processing.
16979 */
16980 dtrace_sync();
16981
16982 for (probe = first; probe != NULL; probe = first) {
16983 first = probe->dtpr_nextmod;
16984 prov = probe->dtpr_provider;
16985 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16986 probe->dtpr_arg);
16987 dtrace_strunref(probe->dtpr_mod);
16988 dtrace_strunref(probe->dtpr_func);
16989 dtrace_strunref(probe->dtpr_name);
16990 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16991
16992 zfree(dtrace_probe_t_zone, probe);
16993 }
16994
16995 dtrace_modctl_remove(ctl);
16996
16997 if (syncmode)
16998 goto syncloop;
16999
17000 lck_mtx_unlock(&dtrace_lock);
17001 lck_mtx_unlock(&mod_lock);
17002 lck_mtx_unlock(&dtrace_provider_lock);
17003
17004 return(0);
17005 }
17006
17007 void
17008 dtrace_suspend(void)
17009 {
17010 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
17011 }
17012
17013 void
17014 dtrace_resume(void)
17015 {
17016 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
17017 }
17018
17019 static int
17020 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
17021 {
17022 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17023 lck_mtx_lock(&dtrace_lock);
17024
17025 switch (what) {
17026 case CPU_CONFIG: {
17027 dtrace_state_t *state;
17028 dtrace_optval_t *opt, rs, c;
17029
17030 /*
17031 * For now, we only allocate a new buffer for anonymous state.
17032 */
17033 if ((state = dtrace_anon.dta_state) == NULL)
17034 break;
17035
17036 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17037 break;
17038
17039 opt = state->dts_options;
17040 c = opt[DTRACEOPT_CPU];
17041
17042 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17043 break;
17044
17045 /*
17046 * Regardless of what the actual policy is, we're going to
17047 * temporarily set our resize policy to be manual. We're
17048 * also going to temporarily set our CPU option to denote
17049 * the newly configured CPU.
17050 */
17051 rs = opt[DTRACEOPT_BUFRESIZE];
17052 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17053 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17054
17055 (void) dtrace_state_buffers(state);
17056
17057 opt[DTRACEOPT_BUFRESIZE] = rs;
17058 opt[DTRACEOPT_CPU] = c;
17059
17060 break;
17061 }
17062
17063 case CPU_UNCONFIG:
17064 /*
17065 * We don't free the buffer in the CPU_UNCONFIG case. (The
17066 * buffer will be freed when the consumer exits.)
17067 */
17068 break;
17069
17070 default:
17071 break;
17072 }
17073
17074 lck_mtx_unlock(&dtrace_lock);
17075 return (0);
17076 }
17077
17078 static void
17079 dtrace_cpu_setup_initial(processorid_t cpu)
17080 {
17081 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17082 }
17083
17084 static void
17085 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17086 {
17087 if (dtrace_toxranges >= dtrace_toxranges_max) {
17088 int osize, nsize;
17089 dtrace_toxrange_t *range;
17090
17091 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17092
17093 if (osize == 0) {
17094 ASSERT(dtrace_toxrange == NULL);
17095 ASSERT(dtrace_toxranges_max == 0);
17096 dtrace_toxranges_max = 1;
17097 } else {
17098 dtrace_toxranges_max <<= 1;
17099 }
17100
17101 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17102 range = kmem_zalloc(nsize, KM_SLEEP);
17103
17104 if (dtrace_toxrange != NULL) {
17105 ASSERT(osize != 0);
17106 bcopy(dtrace_toxrange, range, osize);
17107 kmem_free(dtrace_toxrange, osize);
17108 }
17109
17110 dtrace_toxrange = range;
17111 }
17112
17113 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17114 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17115
17116 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17117 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17118 dtrace_toxranges++;
17119 }
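/*
 * The growth policy mirrors dtrace_helper_provider_add(): capacity
 * doubles on exhaustion (1, 2, 4, ...), the old ranges are bcopy()'d
 * into the new table, and the old table is freed. For instance,
 * adding a third range to a full table of capacity 2 first grows it
 * to 4 and then appends.
 */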
17120
17121 /*
17122 * DTrace Driver Cookbook Functions
17123 */
17124 /*ARGSUSED*/
17125 static int
17126 dtrace_attach(dev_info_t *devi)
17127 {
17128 dtrace_provider_id_t id;
17129 dtrace_state_t *state = NULL;
17130 dtrace_enabling_t *enab;
17131
17132 lck_mtx_lock(&cpu_lock);
17133 lck_mtx_lock(&dtrace_provider_lock);
17134 lck_mtx_lock(&dtrace_lock);
17135
17136 /* Darwin uses the BSD cloning device driver to automagically obtain a minor device number. */
17137 dtrace_devi = devi;
17138
17139 dtrace_modload = dtrace_module_loaded;
17140 dtrace_modunload = dtrace_module_unloaded;
17141 dtrace_cpu_init = dtrace_cpu_setup_initial;
17142 dtrace_helpers_cleanup = dtrace_helpers_destroy;
17143 dtrace_helpers_fork = dtrace_helpers_duplicate;
17144 dtrace_cpustart_init = dtrace_suspend;
17145 dtrace_cpustart_fini = dtrace_resume;
17146 dtrace_debugger_init = dtrace_suspend;
17147 dtrace_debugger_fini = dtrace_resume;
17148
17149 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17150
17151 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17152
17153 dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
17154 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
17155
17156 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
17157 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
17158 NULL, NULL, NULL, NULL, NULL, 0);
17159
17160 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17161
17162 dtrace_nprobes = dtrace_nprobes_default;
17163 dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
17164 KM_SLEEP);
17165
17166 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
17167 0, /* unused */
17168 offsetof(dtrace_probe_t, dtpr_nextprov),
17169 offsetof(dtrace_probe_t, dtpr_prevprov));
17170
17171 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
17172 offsetof(dtrace_probe_t, dtpr_mod),
17173 offsetof(dtrace_probe_t, dtpr_nextmod),
17174 offsetof(dtrace_probe_t, dtpr_prevmod));
17175
17176 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
17177 offsetof(dtrace_probe_t, dtpr_func),
17178 offsetof(dtrace_probe_t, dtpr_nextfunc),
17179 offsetof(dtrace_probe_t, dtpr_prevfunc));
17180
17181 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
17182 offsetof(dtrace_probe_t, dtpr_name),
17183 offsetof(dtrace_probe_t, dtpr_nextname),
17184 offsetof(dtrace_probe_t, dtpr_prevname));
17185
17186 if (dtrace_retain_max < 1) {
17187 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
17188 "setting to 1", dtrace_retain_max);
17189 dtrace_retain_max = 1;
17190 }
17191
17192 /*
17193 * Now discover our toxic ranges.
17194 */
17195 dtrace_toxic_ranges(dtrace_toxrange_add);
17196
17197 /*
17198 * Before we register ourselves as a provider to our own framework,
17199 * we would like to assert that dtrace_provider is NULL -- but that's
17200 * not true if we were loaded as a dependency of a DTrace provider.
17201 * Once we've registered, we can assert that dtrace_provider is our
17202 * pseudo provider.
17203 */
17204 (void) dtrace_register("dtrace", &dtrace_provider_attr,
17205 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
17206
17207 ASSERT(dtrace_provider != NULL);
17208 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17209
17210 #if defined (__x86_64__)
17211 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17212 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
17213 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17214 dtrace_provider, NULL, NULL, "END", 0, NULL);
17215 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17216 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
17217 #elif (defined(__arm__) || defined(__arm64__))
17218 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17219 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
17220 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17221 dtrace_provider, NULL, NULL, "END", 1, NULL);
17222 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17223 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
17224 #else
17225 #error Unknown Architecture
17226 #endif
17227
17228 dtrace_anon_property();
17229 lck_mtx_unlock(&cpu_lock);
17230
17231 /*
17232 * If DTrace helper tracing is enabled, we need to allocate the
17233 * trace buffer and initialize the values.
17234 */
17235 if (dtrace_helptrace_enabled) {
17236 ASSERT(dtrace_helptrace_buffer == NULL);
17237 dtrace_helptrace_buffer =
17238 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17239 dtrace_helptrace_next = 0;
17240 }
17241
17242 /*
17243 * If there are already providers, we must ask them to provide their
17244 * probes, and then match any anonymous enabling against them. Note
17245 * that there should be no other retained enablings at this time:
17246 * the only retained enablings at this time should be the anonymous
17247 * enabling.
17248 */
17249 if (dtrace_anon.dta_enabling != NULL) {
17250 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17251
17252 /*
17253 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
17254 */
17255 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17256 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17257 }
17258
17259 dtrace_enabling_provide(NULL);
17260 state = dtrace_anon.dta_state;
17261
17262 /*
17263 * We couldn't hold cpu_lock across the above call to
17264 * dtrace_enabling_provide(), but we must hold it to actually
17265 * enable the probes. We have to drop all of our locks, pick
17266 * up cpu_lock, and regain our locks before matching the
17267 * retained anonymous enabling.
17268 */
17269 lck_mtx_unlock(&dtrace_lock);
17270 lck_mtx_unlock(&dtrace_provider_lock);
17271
17272 lck_mtx_lock(&cpu_lock);
17273 lck_mtx_lock(&dtrace_provider_lock);
17274 lck_mtx_lock(&dtrace_lock);
17275
17276 if ((enab = dtrace_anon.dta_enabling) != NULL)
17277 (void) dtrace_enabling_match(enab, NULL, NULL);
17278
17279 lck_mtx_unlock(&cpu_lock);
17280 }
17281
17282 lck_mtx_unlock(&dtrace_lock);
17283 lck_mtx_unlock(&dtrace_provider_lock);
17284
17285 if (state != NULL) {
17286 /*
17287 * If we created any anonymous state, set it going now.
17288 */
17289 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17290 }
17291
17292 return (DDI_SUCCESS);
17293 }
17294
17295 /*ARGSUSED*/
17296 static int
17297 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17298 {
17299 #pragma unused(flag, otyp)
17300 dtrace_state_t *state;
17301 uint32_t priv;
17302 uid_t uid;
17303 zoneid_t zoneid;
17304 int rv;
17305
17306 /* APPLE: Darwin puts Helper on its own major device. */
17307
17308 /*
17309 * If no DTRACE_PRIV_* bits are set in the credential, then the
17310 * caller lacks sufficient permission to do anything with DTrace.
17311 */
17312 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17313 if (priv == DTRACE_PRIV_NONE)
17314 return (EACCES);
17315
17316 /*
17317 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17318 * It certainly can't be later than now!
17319 */
17320 fasttrap_init();
17321
17322 /*
17323 * Ask all providers to provide all their probes.
17324 */
17325 lck_mtx_lock(&dtrace_provider_lock);
17326 dtrace_probe_provide(NULL, NULL);
17327 lck_mtx_unlock(&dtrace_provider_lock);
17328
17329 lck_mtx_lock(&cpu_lock);
17330 lck_mtx_lock(&dtrace_lock);
17331 dtrace_opens++;
17332 dtrace_membar_producer();
17333
17334 #ifdef illumos
17335 /*
17336 * If the kernel debugger is active (that is, if the kernel debugger
17337 * modified text in some way), we won't allow the open.
17338 */
17339 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17340 dtrace_opens--;
17341 lck_mtx_unlock(&dtrace_lock);
17342 lck_mtx_unlock(&cpu_lock);
17343 return (EBUSY);
17344 }
17345 #endif
17346
17347 rv = dtrace_state_create(devp, cred_p, &state);
17348 lck_mtx_unlock(&cpu_lock);
17349
17350 if (rv != 0 || state == NULL) {
17351 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17352 #ifdef illumos
17353 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17354 #endif
17355 }
17356 lck_mtx_unlock(&dtrace_lock);
17357 /* propagate EAGAIN or ERESTART */
17358 return (rv);
17359 }
17360
17361 lck_mtx_unlock(&dtrace_lock);
17362
17363 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17364
17365 /*
17366 * If we are currently lazy, transition states.
17367 *
17368 * Unlike dtrace_close, we do not need to check the
17369 * value of dtrace_opens, as any positive value (and
17370 * we count as 1) means we transition states.
17371 */
17372 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17373 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17374 /*
17375 * We do not need to hold the exclusive lock while processing
17376 * DOF on processes, but we do need to make sure the mode does
17377 * not get changed back to DTRACE_DOF_MODE_LAZY_ON during that
17378 * stage (which should not happen anyway, since that transition
17379 * only occurs in dtrace_close). There is no way incomplete USDT
17380 * probes can be activated by any DTrace client here, since every
17381 * client must call dtrace_open and block on dtrace_dof_mode_lock.
17382 */
17383 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
17384 /*
17385 * Iterate all existing processes and load lazy dofs.
17386 */
17387 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17388 dtrace_lazy_dofs_proc_iterate_doit,
17389 NULL,
17390 dtrace_lazy_dofs_proc_iterate_filter,
17391 NULL);
17392
17393 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
17394 }
17395 else {
17396 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17397 }
17398
17399
17400 /*
17401 * Update kernel symbol state.
17402 *
17403 * We must own the provider and dtrace locks.
17404 *
17405 * NOTE! It may appear that there is a race in setting this value so
17406 * late, after dtrace_probe_provide. However, any kext loaded after the
17407 * call to probe provide and before we set LAZY_OFF will be marked as
17408 * eligible for symbols from userspace. The same dtrace consumer that is
17409 * currently calling dtrace_open() (this call!) will get a list of kexts
17410 * needing symbols and fill them in, thus closing the race window.
17411 *
17412 * We want to set this value only once it is certain to succeed, as
17413 * this significantly reduces the complexity of error exits.
17414 */
17415 lck_mtx_lock(&dtrace_lock);
17416 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17417 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17418 }
17419 lck_mtx_unlock(&dtrace_lock);
17420
17421 return (0);
17422 }
17423
17424 /*ARGSUSED*/
17425 static int
17426 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17427 {
17428 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17429 minor_t minor = getminor(dev);
17430 dtrace_state_t *state;
17431
17432 /* APPLE NOTE: Darwin puts Helper on its own major device. */
17433 state = dtrace_state_get(minor);
17434
17435 lck_mtx_lock(&cpu_lock);
17436 lck_mtx_lock(&dtrace_lock);
17437
17438 if (state->dts_anon) {
17439 /*
17440 * There is anonymous state. Destroy that first.
17441 */
17442 ASSERT(dtrace_anon.dta_state == NULL);
17443 dtrace_state_destroy(state->dts_anon);
17444 }
17445
17446 dtrace_state_destroy(state);
17447 ASSERT(dtrace_opens > 0);
17448
17449 /*
17450 * Only relinquish control of the kernel debugger interface when there
17451 * are no consumers and no anonymous enablings.
17452 */
17453 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17454 #ifdef illumos
17455 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17456 #endif
17457 }
17458
17459 lck_mtx_unlock(&dtrace_lock);
17460 lck_mtx_unlock(&cpu_lock);
17461
17462 /*
17463 * Lock ordering requires the dof mode lock be taken before
17464 * the dtrace_lock.
17465 */
17466 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17467 lck_mtx_lock(&dtrace_lock);
17468
17469 if (dtrace_opens == 0) {
17470 /*
17471 * If we are currently lazy-off, and this is the last close, transition to
17472 * lazy state.
17473 */
17474 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17475 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17476 }
17477
17478 /*
17479 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17480 */
17481 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17482 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17483 }
17484 }
17485
17486 lck_mtx_unlock(&dtrace_lock);
17487 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17488
17489 /*
17490 * Kext probes may be retained past the end of the kext's lifespan. The
17491 * probes are kept until the last reference to them has been removed.
17492 * Since closing an active dtrace context is likely to drop that last
17493 * reference, let's take a shot at cleaning out the orphaned probes now.
17494 */
17495 dtrace_module_unloaded(NULL);
17496
17497 return (0);
17498 }
17499
17500 /*ARGSUSED*/
17501 static int
17502 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17503 {
17504 #pragma unused(rv)
17505 /*
17506 * Safe to check this outside the dof mode lock
17507 */
17508 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17509 return KERN_SUCCESS;
17510
17511 switch (cmd) {
17512 #if defined (__arm64__)
17513 case DTRACEHIOC_ADDDOF_U32:
17514 case DTRACEHIOC_ADDDOF_U64:
17515 #else
17516 case DTRACEHIOC_ADDDOF:
17517 #endif /* __arm64__*/
17518 {
17519 dof_helper_t *dhp = NULL;
17520 size_t dof_ioctl_data_size;
17521 dof_ioctl_data_t* multi_dof;
17522 unsigned int i;
17523 int rval = 0;
17524 user_addr_t user_address = *(user_addr_t*)arg;
17525 uint64_t dof_count;
17526 int multi_dof_claimed = 0;
17527 proc_t* p = current_proc();
17528
17529 /*
17530 * If this is a restricted process and dtrace is restricted,
17531 * do not allow DOFs to be registered
17532 */
17533 if (dtrace_is_restricted() &&
17534 !dtrace_are_restrictions_relaxed() &&
17535 !dtrace_can_attach_to_proc(current_proc())) {
17536 return (EACCES);
17537 }
17538
17539 /*
17540 * Read the number of DOF sections being passed in.
17541 */
17542 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17543 &dof_count,
17544 sizeof(dof_count))) {
17545 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
17546 return (EFAULT);
17547 }
17548
17549 /*
17550 * Range check the count.
17551 */
17552 if (dof_count == 0 || dof_count > 1024) {
17553 dtrace_dof_error(NULL, "dofiod_count is not valid");
17554 return (EINVAL);
17555 }
17556
17557 /*
17558 * Allocate a correctly sized structure and copyin the data.
17559 */
17560 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17561 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17562 return (ENOMEM);
17563
17564 /* NOTE! We can no longer exit this method via return */
17565 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
17566 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
17567 rval = EFAULT;
17568 goto cleanup;
17569 }
17570
17571 /*
17572 * Check that the count didn't change between the first copyin and the second.
17573 */
17574 if (multi_dof->dofiod_count != dof_count) {
17575 rval = EINVAL;
17576 goto cleanup;
17577 }
17578
17579 /*
17580 * Try to process lazily first.
17581 */
17582 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
17583
17584 /*
17585 * If rval is EACCES, we must be non-lazy.
17586 */
17587 if (rval == EACCES) {
17588 rval = 0;
17589 /*
17590 * Process each dof_helper_t
17591 */
17592 i = 0;
17593 do {
17594 dhp = &multi_dof->dofiod_helpers[i];
17595
17596 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
17597
17598 if (dof != NULL) {
17599 lck_mtx_lock(&dtrace_meta_lock);
17600 lck_mtx_lock(&dtrace_lock);
17601
17602 /*
17603 * dtrace_helper_slurp() takes responsibility for the dof --
17604 * it may free it now or it may save it and free it later.
17605 */
17606 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
17607 rval = EINVAL;
17608 }
17609
17610 lck_mtx_unlock(&dtrace_lock);
17611 lck_mtx_unlock(&dtrace_meta_lock);
17612 }
17613 } while (++i < multi_dof->dofiod_count && rval == 0);
17614 }
17615
17616 /*
17617 * We need to copyout the multi_dof struct, because it contains
17618 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
17619 *
17620 * This could certainly be better optimized.
17621 */
17622 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
17623 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
17624 /* Don't overwrite pre-existing error code */
17625 if (rval == 0) rval = EFAULT;
17626 }
17627
17628 cleanup:
17629 /*
17630 * If we had to allocate struct memory, free it.
17631 */
17632 if (multi_dof != NULL && !multi_dof_claimed) {
17633 kmem_free(multi_dof, dof_ioctl_data_size);
17634 }
17635
17636 return rval;
17637 }
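/*
 * Illustrative sketch (not part of the original source): the two copyins
 * above guard against a caller racing to change the count between the size
 * computation and the full copy. The general shape of the defense, with
 * hypothetical names, is:
 */
#if 0
	uint64_t count;

	/* 1. Copy in only the count field and range check it. */
	if (copyin(uaddr + offsetof(example_t, count), &count, sizeof (count)))
		return (EFAULT);
	if (count == 0 || count > EXAMPLE_MAX)
		return (EINVAL);

	/* 2. Allocate from the validated count and copy in everything. */
	buf = kmem_alloc(EXAMPLE_SIZE(count), KM_SLEEP);
	if (copyin(uaddr, buf, EXAMPLE_SIZE(count)) != 0) {
		rval = EFAULT;
		goto cleanup;
	}

	/* 3. Re-check: the count must not have changed underneath us. */
	if (buf->count != count) {
		rval = EINVAL;
		goto cleanup;
	}
#endif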
17638
17639 case DTRACEHIOC_REMOVE: {
17640 int generation = *(int*)arg;
17641 proc_t* p = current_proc();
17642
17643 /*
17644 * Try lazy first.
17645 */
17646 int rval = dtrace_lazy_dofs_remove(p, generation);
17647
17648 /*
17649 * EACCES means non-lazy
17650 */
17651 if (rval == EACCES) {
17652 lck_mtx_lock(&dtrace_meta_lock);
17653 lck_mtx_lock(&dtrace_lock);
17654 rval = dtrace_helper_destroygen(p, generation);
17655 lck_mtx_unlock(&dtrace_lock);
17656 lck_mtx_unlock(&dtrace_meta_lock);
17657 }
17658
17659 return (rval);
17660 }
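/*
 * Illustrative sketch (not part of the original source): the generation id
 * that DTRACEHIOC_ADDDOF copies back in dofhp_dof above is the handle a
 * process later hands to DTRACEHIOC_REMOVE, assuming the ioctl is declared
 * to copy in an int, as the *(int *)arg read above implies:
 */
#if 0
	int gen = saved_generation;	/* hypothetical: kept from the ADDDOF copyout */

	if (ioctl(helper_fd, DTRACEHIOC_REMOVE, &gen) == -1)
		perror("DTRACEHIOC_REMOVE");
#endif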
17661
17662 default:
17663 break;
17664 }
17665
17666 return ENOTTY;
17667 }
17668
17669 /*ARGSUSED*/
17670 static int
17671 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
17672 {
17673 #pragma unused(md)
17674 minor_t minor = getminor(dev);
17675 dtrace_state_t *state;
17676 int rval;
17677
17678 /* Darwin puts Helper on its own major device. */
17679
17680 state = dtrace_state_get(minor);
17681
17682 if (state->dts_anon) {
17683 ASSERT(dtrace_anon.dta_state == NULL);
17684 state = state->dts_anon;
17685 }
17686
17687 switch (cmd) {
17688 case DTRACEIOC_PROVIDER: {
17689 dtrace_providerdesc_t pvd;
17690 dtrace_provider_t *pvp;
17691
17692 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17693 return (EFAULT);
17694
17695 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17696 lck_mtx_lock(&dtrace_provider_lock);
17697
17698 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17699 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17700 break;
17701 }
17702
17703 lck_mtx_unlock(&dtrace_provider_lock);
17704
17705 if (pvp == NULL)
17706 return (ESRCH);
17707
17708 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17709 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17710 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17711 return (EFAULT);
17712
17713 return (0);
17714 }
17715
17716 case DTRACEIOC_EPROBE: {
17717 dtrace_eprobedesc_t epdesc;
17718 dtrace_ecb_t *ecb;
17719 dtrace_action_t *act;
17720 void *buf;
17721 size_t size;
17722 uintptr_t dest;
17723 int nrecs;
17724
17725 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17726 return (EFAULT);
17727
17728 lck_mtx_lock(&dtrace_lock);
17729
17730 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17731 lck_mtx_unlock(&dtrace_lock);
17732 return (EINVAL);
17733 }
17734
17735 if (ecb->dte_probe == NULL) {
17736 lck_mtx_unlock(&dtrace_lock);
17737 return (EINVAL);
17738 }
17739
17740 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17741 epdesc.dtepd_uarg = ecb->dte_uarg;
17742 epdesc.dtepd_size = ecb->dte_size;
17743
17744 nrecs = epdesc.dtepd_nrecs;
17745 epdesc.dtepd_nrecs = 0;
17746 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17747 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17748 continue;
17749
17750 epdesc.dtepd_nrecs++;
17751 }
17752
17753 /*
17754 * Now that we have the size, we need to allocate a temporary
17755 * buffer in which to store the complete description. We need
17756 * the temporary buffer to be able to drop dtrace_lock()
17757 * across the copyout(), below.
17758 */
17759 size = sizeof (dtrace_eprobedesc_t) +
17760 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17761
17762 buf = kmem_alloc(size, KM_SLEEP);
17763 dest = (uintptr_t)buf;
17764
17765 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17766 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17767
17768 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17769 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17770 continue;
17771
17772 if (nrecs-- == 0)
17773 break;
17774
17775 bcopy(&act->dta_rec, (void *)dest,
17776 sizeof (dtrace_recdesc_t));
17777 dest += sizeof (dtrace_recdesc_t);
17778 }
17779
17780 lck_mtx_unlock(&dtrace_lock);
17781
17782 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17783 kmem_free(buf, size);
17784 return (EFAULT);
17785 }
17786
17787 kmem_free(buf, size);
17788 return (0);
17789 }
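/*
 * Illustrative sketch (not part of the original source): copyout() can fault
 * and block, so it must not run under dtrace_lock. This case and
 * DTRACEIOC_AGGDESC below both follow the same snapshot-then-copy shape
 * (helper names here are hypothetical):
 */
#if 0
	lck_mtx_lock(&dtrace_lock);
	size = example_compute_size();		/* sized while locked */
	buf = kmem_alloc(size, KM_SLEEP);
	example_fill_snapshot(buf);		/* filled while locked */
	lck_mtx_unlock(&dtrace_lock);

	/* No locks held: safe to touch pageable user memory. */
	if (copyout(buf, arg, size) != 0) {
		kmem_free(buf, size);
		return (EFAULT);
	}
	kmem_free(buf, size);
#endif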
17790
17791 case DTRACEIOC_AGGDESC: {
17792 dtrace_aggdesc_t aggdesc;
17793 dtrace_action_t *act;
17794 dtrace_aggregation_t *agg;
17795 int nrecs;
17796 uint32_t offs;
17797 dtrace_recdesc_t *lrec;
17798 void *buf;
17799 size_t size;
17800 uintptr_t dest;
17801
17802 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17803 return (EFAULT);
17804
17805 lck_mtx_lock(&dtrace_lock);
17806
17807 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17808 lck_mtx_unlock(&dtrace_lock);
17809 return (EINVAL);
17810 }
17811
17812 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17813
17814 nrecs = aggdesc.dtagd_nrecs;
17815 aggdesc.dtagd_nrecs = 0;
17816
17817 offs = agg->dtag_base;
17818 lrec = &agg->dtag_action.dta_rec;
17819 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17820
17821 for (act = agg->dtag_first; ; act = act->dta_next) {
17822 ASSERT(act->dta_intuple ||
17823 DTRACEACT_ISAGG(act->dta_kind));
17824
17825 /*
17826 * If this action has a record size of zero, it
17827 * denotes an argument to the aggregating action.
17828 * Because the presence of this record doesn't (or
17829 * shouldn't) affect the way the data is interpreted,
17830 * we don't copy it out to save user-level the
17831 * confusion of dealing with a zero-length record.
17832 */
17833 if (act->dta_rec.dtrd_size == 0) {
17834 ASSERT(agg->dtag_hasarg);
17835 continue;
17836 }
17837
17838 aggdesc.dtagd_nrecs++;
17839
17840 if (act == &agg->dtag_action)
17841 break;
17842 }
17843
17844 /*
17845 * Now that we have the size, we need to allocate a temporary
17846 * buffer in which to store the complete description. We need
17847 * the temporary buffer to be able to drop dtrace_lock()
17848 * across the copyout(), below.
17849 */
17850 size = sizeof (dtrace_aggdesc_t) +
17851 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17852
17853 buf = kmem_alloc(size, KM_SLEEP);
17854 dest = (uintptr_t)buf;
17855
17856 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17857 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17858
17859 for (act = agg->dtag_first; ; act = act->dta_next) {
17860 dtrace_recdesc_t rec = act->dta_rec;
17861
17862 /*
17863 * See the comment in the above loop for why we pass
17864 * over zero-length records.
17865 */
17866 if (rec.dtrd_size == 0) {
17867 ASSERT(agg->dtag_hasarg);
17868 continue;
17869 }
17870
17871 if (nrecs-- == 0)
17872 break;
17873
17874 rec.dtrd_offset -= offs;
17875 bcopy(&rec, (void *)dest, sizeof (rec));
17876 dest += sizeof (dtrace_recdesc_t);
17877
17878 if (act == &agg->dtag_action)
17879 break;
17880 }
17881
17882 lck_mtx_unlock(&dtrace_lock);
17883
17884 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17885 kmem_free(buf, size);
17886 return (EFAULT);
17887 }
17888
17889 kmem_free(buf, size);
17890 return (0);
17891 }
17892
17893 case DTRACEIOC_ENABLE: {
17894 dof_hdr_t *dof;
17895 dtrace_enabling_t *enab = NULL;
17896 dtrace_vstate_t *vstate;
17897 int err = 0;
17898
17899 *rv = 0;
17900
17901 /*
17902 * If a NULL argument has been passed, we take this as our
17903 * cue to reevaluate our enablings.
17904 */
17905 if (arg == 0) {
17906 dtrace_enabling_matchall();
17907
17908 return (0);
17909 }
17910
17911 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17912 return (rval);
17913
17914 lck_mtx_lock(&cpu_lock);
17915 lck_mtx_lock(&dtrace_lock);
17916 vstate = &state->dts_vstate;
17917
17918 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17919 lck_mtx_unlock(&dtrace_lock);
17920 lck_mtx_unlock(&cpu_lock);
17921 dtrace_dof_destroy(dof);
17922 return (EBUSY);
17923 }
17924
17925 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17926 lck_mtx_unlock(&dtrace_lock);
17927 lck_mtx_unlock(&cpu_lock);
17928 dtrace_dof_destroy(dof);
17929 return (EINVAL);
17930 }
17931
17932 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17933 dtrace_enabling_destroy(enab);
17934 lck_mtx_unlock(&dtrace_lock);
17935 lck_mtx_unlock(&cpu_lock);
17936 dtrace_dof_destroy(dof);
17937 return (rval);
17938 }
17939
17940 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
17941 err = dtrace_enabling_retain(enab);
17942 } else {
17943 dtrace_enabling_destroy(enab);
17944 }
17945
17946 lck_mtx_unlock(&dtrace_lock);
17947 lck_mtx_unlock(&cpu_lock);
17948 dtrace_dof_destroy(dof);
17949
17950 return (err);
17951 }
17952
17953 case DTRACEIOC_REPLICATE: {
17954 dtrace_repldesc_t desc;
17955 dtrace_probedesc_t *match = &desc.dtrpd_match;
17956 dtrace_probedesc_t *create = &desc.dtrpd_create;
17957 int err;
17958
17959 if (copyin(arg, &desc, sizeof (desc)) != 0)
17960 return (EFAULT);
17961
17962 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17963 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17964 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17965 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17966
17967 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17968 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17969 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17970 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17971
17972 lck_mtx_lock(&dtrace_lock);
17973 err = dtrace_enabling_replicate(state, match, create);
17974 lck_mtx_unlock(&dtrace_lock);
17975
17976 return (err);
17977 }
17978
17979 case DTRACEIOC_PROBEMATCH:
17980 case DTRACEIOC_PROBES: {
17981 dtrace_probe_t *probe = NULL;
17982 dtrace_probedesc_t desc;
17983 dtrace_probekey_t pkey;
17984 dtrace_id_t i;
17985 int m = 0;
17986 uint32_t priv;
17987 uid_t uid;
17988 zoneid_t zoneid;
17989
17990 if (copyin(arg, &desc, sizeof (desc)) != 0)
17991 return (EFAULT);
17992
17993 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17994 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17995 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17996 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17997
17998 /*
17999 * Before we attempt to match this probe, we want to give
18000 * all providers the opportunity to provide it.
18001 */
18002 if (desc.dtpd_id == DTRACE_IDNONE) {
18003 lck_mtx_lock(&dtrace_provider_lock);
18004 dtrace_probe_provide(&desc, NULL);
18005 lck_mtx_unlock(&dtrace_provider_lock);
18006 desc.dtpd_id++;
18007 }
18008
18009 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18010
18011 lck_mtx_lock(&dtrace_lock);
18012
18013 if (cmd == DTRACEIOC_PROBEMATCH) {
18014 dtrace_probekey(&desc, &pkey);
18015 pkey.dtpk_id = DTRACE_IDNONE;
18016
18017 /* Quiet compiler warning */
18018 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18019 if ((probe = dtrace_probes[i - 1]) != NULL &&
18020 (m = dtrace_match_probe(probe, &pkey,
18021 priv, uid, zoneid)) != 0)
18022 break;
18023 }
18024
18025 if (m < 0) {
18026 lck_mtx_unlock(&dtrace_lock);
18027 return (EINVAL);
18028 }
18029 dtrace_probekey_release(&pkey);
18030
18031 } else {
18032 /* Quiet compiler warning */
18033 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18034 if ((probe = dtrace_probes[i - 1]) != NULL &&
18035 dtrace_match_priv(probe, priv, uid, zoneid))
18036 break;
18037 }
18038 }
18039
18040 if (probe == NULL) {
18041 lck_mtx_unlock(&dtrace_lock);
18042 return (ESRCH);
18043 }
18044
18045 dtrace_probe_description(probe, &desc);
18046 lck_mtx_unlock(&dtrace_lock);
18047
18048 if (copyout(&desc, arg, sizeof (desc)) != 0)
18049 return (EFAULT);
18050
18051 return (0);
18052 }
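/*
 * Illustrative sketch (not part of the original source): a consumer can walk
 * every probe visible to it by starting at DTRACE_IDNONE and resuming the
 * scan one past each returned dtpd_id until the ioctl fails with ESRCH.
 * This assumes an open /dev/dtrace descriptor and the pointer-to-pointer
 * argument convention implemented by _dtrace_ioctl() below:
 */
#if 0
	dtrace_probedesc_t desc;
	void *udesc = &desc;

	bzero(&desc, sizeof (desc));
	desc.dtpd_id = DTRACE_IDNONE;

	while (ioctl(fd, DTRACEIOC_PROBES, &udesc) == 0) {
		printf("%u %s:%s:%s:%s\n", desc.dtpd_id, desc.dtpd_provider,
		    desc.dtpd_mod, desc.dtpd_func, desc.dtpd_name);
		desc.dtpd_id++;		/* resume after this probe */
	}
#endif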
18053
18054 case DTRACEIOC_PROBEARG: {
18055 dtrace_argdesc_t desc;
18056 dtrace_probe_t *probe;
18057 dtrace_provider_t *prov;
18058
18059 if (copyin(arg, &desc, sizeof (desc)) != 0)
18060 return (EFAULT);
18061
18062 if (desc.dtargd_id == DTRACE_IDNONE)
18063 return (EINVAL);
18064
18065 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18066 return (EINVAL);
18067
18068 lck_mtx_lock(&dtrace_provider_lock);
18069 lck_mtx_lock(&mod_lock);
18070 lck_mtx_lock(&dtrace_lock);
18071
18072 /* Quiet compiler warning */
18073 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18074 lck_mtx_unlock(&dtrace_lock);
18075 lck_mtx_unlock(&mod_lock);
18076 lck_mtx_unlock(&dtrace_provider_lock);
18077 return (EINVAL);
18078 }
18079
18080 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18081 lck_mtx_unlock(&dtrace_lock);
18082 lck_mtx_unlock(&mod_lock);
18083 lck_mtx_unlock(&dtrace_provider_lock);
18084 return (EINVAL);
18085 }
18086
18087 lck_mtx_unlock(&dtrace_lock);
18088
18089 prov = probe->dtpr_provider;
18090
18091 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18092 /*
18093 * There isn't any typed information for this probe.
18094 * Set the argument number to DTRACE_ARGNONE.
18095 */
18096 desc.dtargd_ndx = DTRACE_ARGNONE;
18097 } else {
18098 desc.dtargd_native[0] = '\0';
18099 desc.dtargd_xlate[0] = '\0';
18100 desc.dtargd_mapping = desc.dtargd_ndx;
18101
18102 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18103 probe->dtpr_id, probe->dtpr_arg, &desc);
18104 }
18105
18106 lck_mtx_unlock(&mod_lock);
18107 lck_mtx_unlock(&dtrace_provider_lock);
18108
18109 if (copyout(&desc, arg, sizeof (desc)) != 0)
18110 return (EFAULT);
18111
18112 return (0);
18113 }
18114
18115 case DTRACEIOC_GO: {
18116 processorid_t cpuid;
18117 rval = dtrace_state_go(state, &cpuid);
18118
18119 if (rval != 0)
18120 return (rval);
18121
18122 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18123 return (EFAULT);
18124
18125 return (0);
18126 }
18127
18128 case DTRACEIOC_STOP: {
18129 processorid_t cpuid;
18130
18131 lck_mtx_lock(&dtrace_lock);
18132 rval = dtrace_state_stop(state, &cpuid);
18133 lck_mtx_unlock(&dtrace_lock);
18134
18135 if (rval != 0)
18136 return (rval);
18137
18138 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18139 return (EFAULT);
18140
18141 return (0);
18142 }
18143
18144 case DTRACEIOC_DOFGET: {
18145 dof_hdr_t hdr, *dof;
18146 uint64_t len;
18147
18148 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18149 return (EFAULT);
18150
18151 lck_mtx_lock(&dtrace_lock);
18152 dof = dtrace_dof_create(state);
18153 lck_mtx_unlock(&dtrace_lock);
18154
18155 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18156 rval = copyout(dof, arg, len);
18157 dtrace_dof_destroy(dof);
18158
18159 return (rval == 0 ? 0 : EFAULT);
18160 }
18161
18162 case DTRACEIOC_SLEEP: {
18163 int64_t time;
18164 uint64_t abstime;
18165 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18166
18167 if (copyin(arg, &time, sizeof(time)) != 0)
18168 return (EFAULT);
18169
18170 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
18171 clock_absolutetime_interval_to_deadline(abstime, &abstime);
18172
18173 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
18174 if (state->dts_buf_over_limit > 0) {
18175 clear_wait(current_thread(), THREAD_INTERRUPTED);
18176 rvalue = DTRACE_WAKE_BUF_LIMIT;
18177 } else {
18178 thread_block(THREAD_CONTINUE_NULL);
18179 if (state->dts_buf_over_limit > 0) {
18180 rvalue = DTRACE_WAKE_BUF_LIMIT;
18181 }
18182 }
18183 }
18184
18185 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
18186 return (EFAULT);
18187
18188 return (0);
18189 }
18190
18191 case DTRACEIOC_SIGNAL: {
18192 wakeup(state);
18193 return (0);
18194 }
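/*
 * Illustrative sketch (not part of the original source): a consumer parks in
 * DTRACEIOC_SLEEP between buffer switches; the same user buffer carries the
 * timeout in and the wake reason out, and DTRACEIOC_SIGNAL from another
 * thread cuts the sleep short:
 */
#if 0
	uint64_t buf = 1000000000ULL;	/* sleep up to one second, in nanoseconds */
	void *ubuf = &buf;

	if (ioctl(fd, DTRACEIOC_SLEEP, &ubuf) == 0 &&
	    buf == DTRACE_WAKE_BUF_LIMIT) {
		/* a principal buffer crossed its limit: drain it early */
	}
#endif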
18195
18196 case DTRACEIOC_AGGSNAP:
18197 case DTRACEIOC_BUFSNAP: {
18198 dtrace_bufdesc_t desc;
18199 caddr_t cached;
18200 boolean_t over_limit;
18201 dtrace_buffer_t *buf;
18202
18203 if (copyin(arg, &desc, sizeof (desc)) != 0)
18204 return (EFAULT);
18205
18206 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18207 return (EINVAL);
18208
18209 lck_mtx_lock(&dtrace_lock);
18210
18211 if (cmd == DTRACEIOC_BUFSNAP) {
18212 buf = &state->dts_buffer[desc.dtbd_cpu];
18213 } else {
18214 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18215 }
18216
18217 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18218 size_t sz = buf->dtb_offset;
18219
18220 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18221 lck_mtx_unlock(&dtrace_lock);
18222 return (EBUSY);
18223 }
18224
18225 /*
18226 * If this buffer has already been consumed, we're
18227 * going to indicate that there's nothing left here
18228 * to consume.
18229 */
18230 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18231 lck_mtx_unlock(&dtrace_lock);
18232
18233 desc.dtbd_size = 0;
18234 desc.dtbd_drops = 0;
18235 desc.dtbd_errors = 0;
18236 desc.dtbd_oldest = 0;
18237 sz = sizeof (desc);
18238
18239 if (copyout(&desc, arg, sz) != 0)
18240 return (EFAULT);
18241
18242 return (0);
18243 }
18244
18245 /*
18246 * If this is a ring buffer that has wrapped, we want
18247 * to copy the whole thing out.
18248 */
18249 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18250 dtrace_buffer_polish(buf);
18251 sz = buf->dtb_size;
18252 }
18253
18254 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18255 lck_mtx_unlock(&dtrace_lock);
18256 return (EFAULT);
18257 }
18258
18259 desc.dtbd_size = sz;
18260 desc.dtbd_drops = buf->dtb_drops;
18261 desc.dtbd_errors = buf->dtb_errors;
18262 desc.dtbd_oldest = buf->dtb_xamot_offset;
18263 desc.dtbd_timestamp = dtrace_gethrtime();
18264
18265 lck_mtx_unlock(&dtrace_lock);
18266
18267 if (copyout(&desc, arg, sizeof (desc)) != 0)
18268 return (EFAULT);
18269
18270 buf->dtb_flags |= DTRACEBUF_CONSUMED;
18271
18272 return (0);
18273 }
18274
18275 if (buf->dtb_tomax == NULL) {
18276 ASSERT(buf->dtb_xamot == NULL);
18277 lck_mtx_unlock(&dtrace_lock);
18278 return (ENOENT);
18279 }
18280
18281 cached = buf->dtb_tomax;
18282 over_limit = buf->dtb_cur_limit == buf->dtb_size;
18283
18284 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18285
18286 dtrace_xcall(desc.dtbd_cpu,
18287 (dtrace_xcall_t)dtrace_buffer_switch, buf);
18288
18289 state->dts_errors += buf->dtb_xamot_errors;
18290
18291 /*
18292 * If the buffers did not actually switch, then the cross call
18293 * did not take place -- presumably because the given CPU is
18294 * not in the ready set. If this is the case, we'll return
18295 * ENOENT.
18296 */
18297 if (buf->dtb_tomax == cached) {
18298 ASSERT(buf->dtb_xamot != cached);
18299 lck_mtx_unlock(&dtrace_lock);
18300 return (ENOENT);
18301 }
18302
18303 ASSERT(cached == buf->dtb_xamot);
18304 /*
18305 * At this point we know the buffers have switched, so we
18306 * can decrement the over-limit count if the old buffer was over
18307 * its limit. The new buffer might already be over its limit
18308 * too, but we don't care, since we're guaranteed not to be
18309 * checking the buffer over-limit count at this point.
18310 */
18311 if (over_limit) {
18312 uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18313 #pragma unused(old)
18314
18315 /*
18316 * Verify that we didn't underflow the value
18317 */
18318 ASSERT(old != 0);
18319 }
18320
18321 /*
18322 * We have our snapshot; now copy it out.
18323 */
18324 if (dtrace_buffer_copyout(buf->dtb_xamot,
18325 (user_addr_t)desc.dtbd_data,
18326 buf->dtb_xamot_offset) != 0) {
18327 lck_mtx_unlock(&dtrace_lock);
18328 return (EFAULT);
18329 }
18330
18331 desc.dtbd_size = buf->dtb_xamot_offset;
18332 desc.dtbd_drops = buf->dtb_xamot_drops;
18333 desc.dtbd_errors = buf->dtb_xamot_errors;
18334 desc.dtbd_oldest = 0;
18335 desc.dtbd_timestamp = buf->dtb_switched;
18336
18337 lck_mtx_unlock(&dtrace_lock);
18338
18339 /*
18340 * Finally, copy out the buffer description.
18341 */
18342 if (copyout(&desc, arg, sizeof (desc)) != 0)
18343 return (EFAULT);
18344
18345 return (0);
18346 }
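/*
 * Illustrative sketch (not part of the original source): each per-CPU
 * principal buffer is really a pair -- dtb_tomax (active) and dtb_xamot
 * (inactive) -- and dtrace_buffer_switch() swaps them on the target CPU
 * itself via dtrace_xcall(), so no probe can be mid-record during the swap:
 */
#if 0
	caddr_t before = buf->dtb_tomax;

	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);

	if (buf->dtb_tomax == before) {
		/* The CPU was not in the ready set; the cross call never ran. */
		return (ENOENT);
	}
	/* dtb_xamot now holds a quiesced snapshot, safe to copy out. */
#endif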
18347
18348 case DTRACEIOC_CONF: {
18349 dtrace_conf_t conf;
18350
18351 bzero(&conf, sizeof (conf));
18352 conf.dtc_difversion = DIF_VERSION;
18353 conf.dtc_difintregs = DIF_DIR_NREGS;
18354 conf.dtc_diftupregs = DIF_DTR_NREGS;
18355 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18356
18357 if (copyout(&conf, arg, sizeof (conf)) != 0)
18358 return (EFAULT);
18359
18360 return (0);
18361 }
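/*
 * Illustrative sketch (not part of the original source): DTRACEIOC_CONF is a
 * pure copyout, which makes it a convenient smoke test for a freshly opened
 * /dev/dtrace descriptor (again assuming the pointer-to-pointer convention
 * of _dtrace_ioctl() below):
 */
#if 0
	dtrace_conf_t conf;
	void *uconf = &conf;

	if (ioctl(fd, DTRACEIOC_CONF, &uconf) == 0) {
		printf("DIF version %u, %u int regs, %u tuple regs\n",
		    conf.dtc_difversion, conf.dtc_difintregs,
		    conf.dtc_diftupregs);
	}
#endif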
18362
18363 case DTRACEIOC_STATUS: {
18364 dtrace_status_t stat;
18365 dtrace_dstate_t *dstate;
18366 int i, j;
18367 uint64_t nerrs;
18368
18369 /*
18370 * See the comment in dtrace_state_deadman() for the reason
18371 * for setting dts_laststatus to INT64_MAX before setting
18372 * it to the correct value.
18373 */
18374 state->dts_laststatus = INT64_MAX;
18375 dtrace_membar_producer();
18376 state->dts_laststatus = dtrace_gethrtime();
18377
18378 bzero(&stat, sizeof (stat));
18379
18380 lck_mtx_lock(&dtrace_lock);
18381
18382 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18383 lck_mtx_unlock(&dtrace_lock);
18384 return (ENOENT);
18385 }
18386
18387 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18388 stat.dtst_exiting = 1;
18389
18390 nerrs = state->dts_errors;
18391 dstate = &state->dts_vstate.dtvs_dynvars;
18392
18393 for (i = 0; i < (int)NCPU; i++) {
18394 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18395
18396 stat.dtst_dyndrops += dcpu->dtdsc_drops;
18397 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18398 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18399
18400 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18401 stat.dtst_filled++;
18402
18403 nerrs += state->dts_buffer[i].dtb_errors;
18404
18405 for (j = 0; j < state->dts_nspeculations; j++) {
18406 dtrace_speculation_t *spec;
18407 dtrace_buffer_t *buf;
18408
18409 spec = &state->dts_speculations[j];
18410 buf = &spec->dtsp_buffer[i];
18411 stat.dtst_specdrops += buf->dtb_xamot_drops;
18412 }
18413 }
18414
18415 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18416 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18417 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18418 stat.dtst_dblerrors = state->dts_dblerrors;
18419 stat.dtst_killed =
18420 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18421 stat.dtst_errors = nerrs;
18422
18423 lck_mtx_unlock(&dtrace_lock);
18424
18425 if (copyout(&stat, arg, sizeof (stat)) != 0)
18426 return (EFAULT);
18427
18428 return (0);
18429 }
18430
18431 case DTRACEIOC_FORMAT: {
18432 dtrace_fmtdesc_t fmt;
18433 char *str;
18434 int len;
18435
18436 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18437 return (EFAULT);
18438
18439 lck_mtx_lock(&dtrace_lock);
18440
18441 if (fmt.dtfd_format == 0 ||
18442 fmt.dtfd_format > state->dts_nformats) {
18443 lck_mtx_unlock(&dtrace_lock);
18444 return (EINVAL);
18445 }
18446
18447 /*
18448 * Format strings are allocated contiguously and they are
18449 * never freed; if a format index is less than the number
18450 * of formats, we can assert that the format map is non-NULL
18451 * and that the format for the specified index is non-NULL.
18452 */
18453 ASSERT(state->dts_formats != NULL);
18454 str = state->dts_formats[fmt.dtfd_format - 1]->dtf_str;
18455 ASSERT(str != NULL);
18456
18457 len = strlen(str) + 1;
18458
18459 if (len > fmt.dtfd_length) {
18460 fmt.dtfd_length = len;
18461
18462 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18463 lck_mtx_unlock(&dtrace_lock);
18464 return (EINVAL);
18465 }
18466 } else {
18467 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18468 lck_mtx_unlock(&dtrace_lock);
18469 return (EINVAL);
18470 }
18471 }
18472
18473 lck_mtx_unlock(&dtrace_lock);
18474 return (0);
18475 }
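/*
 * Illustrative sketch (not part of the original source): the length check
 * above is a two-phase negotiation. If the caller's buffer is too small,
 * only dtfd_length is updated (the ioctl still returns 0), so the consumer
 * compares lengths and retries with a larger buffer (types simplified):
 */
#if 0
	char small[64];
	dtrace_fmtdesc_t fmt;
	void *ufmt = &fmt;

	bzero(&fmt, sizeof (fmt));
	fmt.dtfd_format = 1;			/* first registered format */
	fmt.dtfd_string = small;
	fmt.dtfd_length = sizeof (small);

	if (ioctl(fd, DTRACEIOC_FORMAT, &ufmt) == 0 &&
	    fmt.dtfd_length > (int)sizeof (small)) {
		/* Too small: dtfd_length now holds the required size. */
		fmt.dtfd_string = malloc(fmt.dtfd_length);
		(void) ioctl(fd, DTRACEIOC_FORMAT, &ufmt);
	}
#endif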
18476
18477 case DTRACEIOC_MODUUIDSLIST: {
18478 size_t module_uuids_list_size;
18479 dtrace_module_uuids_list_t* uuids_list;
18480 uint64_t dtmul_count;
18481
18482 /*
18483 * Security restrictions make this operation illegal: when they are
18484 * in effect, DTrace must refuse to provide any fbt probes.
18485 */
18486 if (dtrace_fbt_probes_restricted()) {
18487 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18488 return (EPERM);
18489 }
18490
18491 /*
18492 * Fail if the kernel symbol mode makes this operation illegal.
18493 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to
18494 * check for them without holding the dtrace_lock.
18495 */
18496 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18497 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18498 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18499 return (EPERM);
18500 }
18501
18502 /*
18503 * Read the number of module UUID entries being passed in.
18504 */
18505 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18506 &dtmul_count,
18507 sizeof(dtmul_count))) {
18508 cmn_err(CE_WARN, "failed to copyin dtmul_count");
18509 return (EFAULT);
18510 }
18511
18512 /*
18513 * Range check the count. More than 2k kexts is probably an error.
18514 */
18515 if (dtmul_count > 2048) {
18516 cmn_err(CE_WARN, "dtmul_count is not valid");
18517 return (EINVAL);
18518 }
18519
18520 /*
18521 * For all queries, we return EINVAL when the user specified
18522 * count does not match the actual number of modules we find
18523 * available.
18524 *
18525 * If the user specified count is zero, then this serves as a
18526 * simple query to count the available modules in need of symbols.
18527 */
18528
18529 rval = 0;
18530
18531 if (dtmul_count == 0)
18532 {
18533 lck_mtx_lock(&mod_lock);
18534 struct modctl* ctl = dtrace_modctl_list;
18535 while (ctl) {
18536 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18537 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18538 dtmul_count++;
18539 rval = EINVAL;
18540 }
18541 ctl = ctl->mod_next;
18542 }
18543 lck_mtx_unlock(&mod_lock);
18544
18545 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
18546 return (EFAULT);
18547 else
18548 return (rval);
18549 }
18550
18551 /*
18552 * If we reach this point, then we have a request for full list data.
18553 * Allocate a correctly sized structure and copyin the data.
18554 */
18555 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18556 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18557 return (ENOMEM);
18558
18559 /* NOTE! We can no longer exit this method via return */
18560 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
18561 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18562 rval = EFAULT;
18563 goto moduuidslist_cleanup;
18564 }
18565
18566 /*
18567 * Check that the count didn't change between the first copyin and the second.
18568 */
18569 if (uuids_list->dtmul_count != dtmul_count) {
18570 rval = EINVAL;
18571 goto moduuidslist_cleanup;
18572 }
18573
18574 /*
18575 * Build the list of UUIDs that need symbols.
18576 */
18577 lck_mtx_lock(&mod_lock);
18578
18579 dtmul_count = 0;
18580
18581 struct modctl* ctl = dtrace_modctl_list;
18582 while (ctl) {
18583 /*
18584 * We assume that userspace symbols will be "better" than kernel-level symbols,
18585 * as userspace can search for dSYMs and symbolicated binaries. Even if kernel
18586 * syms are available, add user syms if the module might use them.
18587 */
18588 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18589 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18590 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18591 if (dtmul_count++ < uuids_list->dtmul_count) {
18592 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
18593 }
18594 }
18595 ctl = ctl->mod_next;
18596 }
18597
18598 lck_mtx_unlock(&mod_lock);
18599
18600 if (uuids_list->dtmul_count < dtmul_count)
18601 rval = EINVAL;
18602
18603 uuids_list->dtmul_count = dtmul_count;
18604
18605 /*
18606 * Copyout the symbols list (or at least the count!)
18607 */
18608 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
18609 cmn_err(CE_WARN, "failed copyout of dtrace_module_uuids_list_t");
18610 rval = EFAULT;
18611 }
18612
18613 moduuidslist_cleanup:
18614 /*
18615 * If we had to allocate struct memory, free it.
18616 */
18617 if (uuids_list != NULL) {
18618 kmem_free(uuids_list, module_uuids_list_size);
18619 }
18620
18621 return rval;
18622 }
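/*
 * Illustrative sketch (not part of the original source): passing a
 * dtmul_count of zero turns this ioctl into a pure "how many?" query (which
 * deliberately fails with EINVAL when modules are waiting), so a consumer
 * sizes its buffer in two steps:
 */
#if 0
	dtrace_module_uuids_list_t query = { .dtmul_count = 0 };
	dtrace_module_uuids_list_t *list;
	void *ulist = &query;

	/* 1. Query: the count of kexts needing symbols is copied back. */
	(void) ioctl(fd, DTRACEIOC_MODUUIDSLIST, &ulist);

	/* 2. Fetch: allocate for that count and ask again. */
	list = calloc(1, DTRACE_MODULE_UUIDS_LIST_SIZE(query.dtmul_count));
	list->dtmul_count = query.dtmul_count;
	ulist = list;
	if (ioctl(fd, DTRACEIOC_MODUUIDSLIST, &ulist) == 0) {
		/* list->dtmul_uuid[0 .. dtmul_count - 1] need symbols */
	}
#endif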
18623
18624 case DTRACEIOC_PROVMODSYMS: {
18625 size_t module_symbols_size;
18626 dtrace_module_symbols_t* module_symbols;
18627 uint64_t dtmodsyms_count;
18628
18629 /*
18630 * Security restrictions make this operation illegal: when they are
18631 * in effect, DTrace must refuse to provide any fbt probes.
18632 */
18633 if (dtrace_fbt_probes_restricted()) {
18634 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_PROVMODSYMS");
18635 return (EPERM);
18636 }
18637
18638 /*
18639 * Fail if the kernel symbol mode makes this operation illegal.
18640 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to
18641 * check for them without holding the dtrace_lock.
18642 */
18643 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18644 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18645 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18646 return (EPERM);
18647 }
18648
18649 /*
18650 * Read the number of module symbols structs being passed in.
18651 */
18652 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18653 &dtmodsyms_count,
18654 sizeof(dtmodsyms_count))) {
18655 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18656 return (EFAULT);
18657 }
18658
18659 /*
18660 * Range check the count. How much data can we pass around?
18661 * FIX ME!
18662 */
18663 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
18664 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
18665 return (EINVAL);
18666 }
18667
18668 /*
18669 * Allocate a correctly sized structure and copyin the data.
18670 */
18671 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18672 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18673 return (ENOMEM);
18674
18675 rval = 0;
18676
18677 /* NOTE! We can no longer exit this method via return */
18678 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18679 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18680 rval = EFAULT;
18681 goto module_symbols_cleanup;
18682 }
18683
18684 /*
18685 * Check that the count didn't change between the first copyin and the second.
18686 */
18687 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18688 rval = EINVAL;
18689 goto module_symbols_cleanup;
18690 }
18691
18692 /*
18693 * Find the modctl to add symbols to.
18694 */
18695 lck_mtx_lock(&dtrace_provider_lock);
18696 lck_mtx_lock(&mod_lock);
18697
18698 struct modctl* ctl = dtrace_modctl_list;
18699 while (ctl) {
18700 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18701 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18702 dtrace_provider_t *prv;
18703 ctl->mod_user_symbols = module_symbols;
18704
18705 /*
18706 * We're going to call each provider's per-module provide operation,
18707 * specifying only this module.
18708 */
18709 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18710 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18711 /*
18712 * We gave every provider a chance to provide with the user syms; go ahead and clear them.
18713 */
18714 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18715 }
18716 ctl = ctl->mod_next;
18717 }
18718
18719 lck_mtx_unlock(&mod_lock);
18720 lck_mtx_unlock(&dtrace_provider_lock);
18721
18722 module_symbols_cleanup:
18723 /*
18724 * If we had to allocate struct memory, free it.
18725 */
18726 if (module_symbols != NULL) {
18727 kmem_free(module_symbols, module_symbols_size);
18728 }
18729
18730 return rval;
18731 }
18732
18733 case DTRACEIOC_PROCWAITFOR: {
18734 dtrace_procdesc_t pdesc = {
18735 .p_name = {0},
18736 .p_pid = -1
18737 };
18738
18739 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18740 goto proc_waitfor_error;
18741
18742 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18743 goto proc_waitfor_error;
18744
18745 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18746 goto proc_waitfor_error;
18747
18748 return 0;
18749
18750 proc_waitfor_error:
18751 /* The process was suspended; revert this, since the client will not do it. */
18752 if (pdesc.p_pid != -1) {
18753 proc_t *proc = proc_find(pdesc.p_pid);
18754 if (proc != PROC_NULL) {
18755 task_pidresume(proc->task);
18756 proc_rele(proc);
18757 }
18758 }
18759
18760 return rval;
18761 }
18762
18763 default:
18764 break;
18765 }
18766
18767 return (ENOTTY);
18768 }
18769
18770 /*
18771 * APPLE NOTE: dtrace_detach not implemented
18772 */
18773 #if !defined(__APPLE__)
18774 /*ARGSUSED*/
18775 static int
18776 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18777 {
18778 dtrace_state_t *state;
18779
18780 switch (cmd) {
18781 case DDI_DETACH:
18782 break;
18783
18784 case DDI_SUSPEND:
18785 return (DDI_SUCCESS);
18786
18787 default:
18788 return (DDI_FAILURE);
18789 }
18790
18791 lck_mtx_lock(&cpu_lock);
18792 lck_mtx_lock(&dtrace_provider_lock);
18793 lck_mtx_lock(&dtrace_lock);
18794
18795 ASSERT(dtrace_opens == 0);
18796
18797 if (dtrace_helpers > 0) {
18798 lck_mtx_unlock(&dtrace_lock);
18799 lck_mtx_unlock(&dtrace_provider_lock);
18800 lck_mtx_unlock(&cpu_lock);
18801 return (DDI_FAILURE);
18802 }
18803
18804 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18805 lck_mtx_unlock(&dtrace_lock);
18806 lck_mtx_unlock(&dtrace_provider_lock);
18807 lck_mtx_unlock(&cpu_lock);
18808 return (DDI_FAILURE);
18809 }
18810
18811 dtrace_provider = NULL;
18812
18813 if ((state = dtrace_anon_grab()) != NULL) {
18814 /*
18815 * If there were ECBs on this state, the provider should
18816 * not have been allowed to detach; assert that there are
18817 * none.
18818 */
18819 ASSERT(state->dts_necbs == 0);
18820 dtrace_state_destroy(state);
18821
18822 /*
18823 * If we're being detached with anonymous state, we need to
18824 * indicate to the kernel debugger that DTrace is now inactive.
18825 */
18826 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18827 }
18828
18829 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18830 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18831 dtrace_cpu_init = NULL;
18832 dtrace_helpers_cleanup = NULL;
18833 dtrace_helpers_fork = NULL;
18834 dtrace_cpustart_init = NULL;
18835 dtrace_cpustart_fini = NULL;
18836 dtrace_debugger_init = NULL;
18837 dtrace_debugger_fini = NULL;
18838 dtrace_kreloc_init = NULL;
18839 dtrace_kreloc_fini = NULL;
18840 dtrace_modload = NULL;
18841 dtrace_modunload = NULL;
18842
18843 lck_mtx_unlock(&cpu_lock);
18844
18845 if (dtrace_helptrace_enabled) {
18846 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18847 dtrace_helptrace_buffer = NULL;
18848 }
18849
18850 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18851 dtrace_probes = NULL;
18852 dtrace_nprobes = 0;
18853
18854 dtrace_hash_destroy(dtrace_strings);
18855 dtrace_hash_destroy(dtrace_byprov);
18856 dtrace_hash_destroy(dtrace_bymod);
18857 dtrace_hash_destroy(dtrace_byfunc);
18858 dtrace_hash_destroy(dtrace_byname);
18859 dtrace_strings = NULL;
18860 dtrace_byprov = NULL;
18861 dtrace_bymod = NULL;
18862 dtrace_byfunc = NULL;
18863 dtrace_byname = NULL;
18864
18865 kmem_cache_destroy(dtrace_state_cache);
18866 vmem_destroy(dtrace_arena);
18867
18868 if (dtrace_toxrange != NULL) {
18869 kmem_free(dtrace_toxrange,
18870 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18871 dtrace_toxrange = NULL;
18872 dtrace_toxranges = 0;
18873 dtrace_toxranges_max = 0;
18874 }
18875
18876 ddi_remove_minor_node(dtrace_devi, NULL);
18877 dtrace_devi = NULL;
18878
18879 ddi_soft_state_fini(&dtrace_softstate);
18880
18881 ASSERT(dtrace_vtime_references == 0);
18882 ASSERT(dtrace_opens == 0);
18883 ASSERT(dtrace_retained == NULL);
18884
18885 lck_mtx_unlock(&dtrace_lock);
18886 lck_mtx_unlock(&dtrace_provider_lock);
18887
18888 #ifdef illumos
18889 /*
18890 * We don't destroy the task queue until after we have dropped our
18891 * locks (taskq_destroy() may block on running tasks). To prevent
18892 * attempting to do work after we have effectively detached but before
18893 * the task queue has been destroyed, all tasks dispatched via the
18894 * task queue must check that DTrace is still attached before
18895 * performing any operation.
18896 */
18897 taskq_destroy(dtrace_taskq);
18898 dtrace_taskq = NULL;
18899 #endif
18900
18901 return (DDI_SUCCESS);
18902 }
18903 #endif /* __APPLE__ */
18904
18905 d_open_t _dtrace_open, helper_open;
18906 d_close_t _dtrace_close, helper_close;
18907 d_ioctl_t _dtrace_ioctl, helper_ioctl;
18908
18909 int
18910 _dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
18911 {
18912 #pragma unused(p)
18913 dev_t locdev = dev;
18914
18915 return dtrace_open( &locdev, flags, devtype, CRED());
18916 }
18917
18918 int
18919 helper_open(dev_t dev, int flags, int devtype, struct proc *p)
18920 {
18921 #pragma unused(dev,flags,devtype,p)
18922 return 0;
18923 }
18924
18925 int
18926 _dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
18927 {
18928 #pragma unused(p)
18929 return dtrace_close( dev, flags, devtype, CRED());
18930 }
18931
18932 int
18933 helper_close(dev_t dev, int flags, int devtype, struct proc *p)
18934 {
18935 #pragma unused(dev,flags,devtype,p)
18936 return 0;
18937 }
18938
18939 int
18940 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18941 {
18942 #pragma unused(p)
18943 int err, rv = 0;
18944 user_addr_t uaddrp;
18945
18946 if (proc_is64bit(p))
18947 uaddrp = *(user_addr_t *)data;
18948 else
18949 uaddrp = (user_addr_t) *(uint32_t *)data;
18950
18951 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
18952
18953 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18954 if (err != 0) {
18955 ASSERT( (err & 0xfffff000) == 0 );
18956 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18957 } else if (rv != 0) {
18958 ASSERT( (rv & 0xfff00000) == 0 );
18959 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18960 } else
18961 return 0;
18962 }
18963
18964 int
18965 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18966 {
18967 #pragma unused(dev,fflag,p)
18968 int err, rv = 0;
18969
18970 err = dtrace_ioctl_helper(cmd, data, &rv);
18971 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18972 if (err != 0) {
18973 ASSERT( (err & 0xfffff000) == 0 );
18974 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18975 } else if (rv != 0) {
18976 ASSERT( (rv & 0xfff00000) == 0 );
18977 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18978 } else
18979 return 0;
18980 }
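/*
 * Illustrative sketch (not part of the original source): given the 12/20-bit
 * split used by both shims above, a caller can recover whether an errno came
 * from the ioctl path (err) or from the Solaris-style out-parameter (rv):
 */
#if 0
	if (ioctl(fd, cmd, &uarg) == -1) {
		if (errno < 4096) {
			int err = errno;	/* low 12 bits: a real error code */
		} else {
			int rv = errno >> 12;	/* high 20 bits: the rv value */
		}
	}
#endif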
18981
18982 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
18983
18984 /*
18985 * A struct describing which functions will get invoked for certain
18986 * actions.
18987 */
18988 static struct cdevsw helper_cdevsw =
18989 {
18990 helper_open, /* open */
18991 helper_close, /* close */
18992 eno_rdwrt, /* read */
18993 eno_rdwrt, /* write */
18994 helper_ioctl, /* ioctl */
18995 (stop_fcn_t *)nulldev, /* stop */
18996 (reset_fcn_t *)nulldev, /* reset */
18997 NULL, /* tty's */
18998 eno_select, /* select */
18999 eno_mmap, /* mmap */
19000 eno_strat, /* strategy */
19001 eno_getc, /* getc */
19002 eno_putc, /* putc */
19003 0 /* type */
19004 };
19005
19006 static int helper_majdevno = 0;
19007
19008 static int gDTraceInited = 0;
19009
19010 void
19011 helper_init( void )
19012 {
19013 /*
19014 * Once the "helper" is initialized, it can take ioctl calls that use locks
19015 * and zones initialized in dtrace_init. Make certain dtrace_init was called
19016 * before us.
19017 */
19018
19019 if (!gDTraceInited) {
19020 panic("helper_init before dtrace_init\n");
19021 }
19022
19023 if (0 >= helper_majdevno)
19024 {
19025 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19026
19027 if (helper_majdevno < 0) {
19028 printf("helper_init: failed to allocate a major number!\n");
19029 return;
19030 }
19031
19032 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19033 DTRACEMNR_HELPER, 0 )) {
19034 printf("helper_init: failed to devfs_make_node for helper!\n");
19035 return;
19036 }
19037 } else
19038 panic("helper_init: called twice!\n");
19039 }
19040
19041 #undef HELPER_MAJOR
19042
19043 static int
19044 dtrace_clone_func(dev_t dev, int action)
19045 {
19046 #pragma unused(dev)
19047
19048 if (action == DEVFS_CLONE_ALLOC) {
19049 return dtrace_state_reserve();
19050 }
19051 else if (action == DEVFS_CLONE_FREE) {
19052 return 0;
19053 }
19054 else return -1;
19055 }
19056
19057 void dtrace_ast(void);
19058
19059 void
19060 dtrace_ast(void)
19061 {
19062 int i;
19063 uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
19064 if (clients == 0)
19065 return;
19066 /*
19067 * We disable preemption here to be sure that we won't get
19068 * preempted by a woken thread of higher priority before we
19069 * have issued all of the wakeups.
19070 */
19071 disable_preemption();
19072 for (i = 0; i < DTRACE_NCLIENTS; i++) {
19073 if (clients & (1 << i)) {
19074 dtrace_state_t *state = dtrace_state_get(i);
19075 if (state) {
19076 wakeup(state);
19077 }
19078
19079 }
19080 }
19081 enable_preemption();
19082 }
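/*
 * Illustrative sketch (not part of the original source): dtrace_wake_clients
 * is a bitmask with one bit per minor device, so the producer side (not
 * shown here) would mark a consumer for wakeup roughly like this before
 * posting the AST:
 */
#if 0
	/* minor is the consumer's minor number, 0 <= minor < DTRACE_NCLIENTS */
	os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
#endif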
19083
19084
19085 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
19086
19087 static struct cdevsw dtrace_cdevsw =
19088 {
19089 _dtrace_open, /* open */
19090 _dtrace_close, /* close */
19091 eno_rdwrt, /* read */
19092 eno_rdwrt, /* write */
19093 _dtrace_ioctl, /* ioctl */
19094 (stop_fcn_t *)nulldev, /* stop */
19095 (reset_fcn_t *)nulldev, /* reset */
19096 NULL, /* tty's */
19097 eno_select, /* select */
19098 eno_mmap, /* mmap */
19099 eno_strat, /* strategy */
19100 eno_getc, /* getc */
19101 eno_putc, /* putc */
19102 0 /* type */
19103 };
19104
19105 lck_attr_t* dtrace_lck_attr;
19106 lck_grp_attr_t* dtrace_lck_grp_attr;
19107 lck_grp_t* dtrace_lck_grp;
19108
19109 static int gMajDevNo;
19110
19111 void dtrace_early_init (void)
19112 {
19113 dtrace_restriction_policy_load();
19114
19115 /*
19116 * See dtrace_impl.h for a description of kernel symbol modes.
19117 * The default is to wait for symbols from userspace (lazy symbols).
19118 */
19119 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19120 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19121 }
19122 }
19123
19124 void
19125 dtrace_init( void )
19126 {
19127 if (0 == gDTraceInited) {
19128 int i, ncpu;
19129 size_t size = sizeof(dtrace_buffer_memory_maxsize);
19130
19131 /*
19132 * DTrace allocates buffers based on the maximum number
19133 * of enabled cpus. This call avoids any race when finding
19134 * that count.
19135 */
19136 ASSERT(dtrace_max_cpus == 0);
19137 ncpu = dtrace_max_cpus = ml_get_max_cpus();
19138
19139 /*
19140 * Retrieve the size of physical memory in order to define the
19141 * maximum size of the state buffer memory. If we cannot retrieve
19142 * this value, we'll assume 1 GB of memory per CPU; that's still
19143 * better than raising a kernel panic.
19144 */
19145 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19146 &size, NULL, 0))
19147 {
19148 dtrace_buffer_memory_maxsize = (uint64_t)ncpu * 1024ULL * 1024 * 1024; /* widen to avoid 32-bit overflow */
19149 printf("dtrace_init: failed to retrieve hw.memsize, defaulted to %llu bytes\n",
19150 dtrace_buffer_memory_maxsize);
19151 }
19152
19153 /*
19154 * Finally, divide by three to prevent DTrace from eating too
19155 * much memory.
19156 */
19157 dtrace_buffer_memory_maxsize /= 3;
19158 ASSERT(dtrace_buffer_memory_maxsize > 0);
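/*
 * Worked example (not part of the original source): with 16 GB reported by
 * hw.memsize the cap becomes 16 GB / 3, roughly 5.3 GB; on the fallback
 * path with, say, 8 CPUs it is (8 * 1 GB) / 3, roughly 2.6 GB.
 */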
19159
19160 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19161
19162 if (gMajDevNo < 0) {
19163 printf("dtrace_init: failed to allocate a major number!\n");
19164 gDTraceInited = 0;
19165 return;
19166 }
19167
19168 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19169 dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
19170 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19171 gDTraceInited = 0;
19172 return;
19173 }
19174
19175 /*
19176 * Allocate the dtrace_probe_t zone
19177 */
19178 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
19179 1024 * sizeof(dtrace_probe_t),
19180 sizeof(dtrace_probe_t),
19181 "dtrace.dtrace_probe_t");
19182
19183 /*
19184 * Create the dtrace lock group and attrs.
19185 */
19186 dtrace_lck_attr = lck_attr_alloc_init();
19187 dtrace_lck_grp_attr = lck_grp_attr_alloc_init();
19188 dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);
19189
19190 /*
19191 * We have to initialize all locks explicitly
19192 */
19193 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
19194 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
19195 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
19196 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
19197 #if DEBUG
19198 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
19199 #endif
19200 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
19201
19202 /*
19203 * The cpu_core structure consists of per-CPU state available in any context.
19204 * On some architectures, this may mean that the page(s) containing the
19205 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19206 * is up to the platform to assure that this is performed properly. Note that
19207 * the structure is sized to avoid false sharing.
19208 */
19209 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
19210 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
19211 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
19212
19213 /*
19214 * Initialize the CPU offline/online hooks.
19215 */
19216 dtrace_install_cpu_hooks();
19217
19218 dtrace_modctl_list = NULL;
19219
19220 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19221 for (i = 0; i < ncpu; ++i) {
19222 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
19223 }
19224
19225 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19226 for (i = 0; i < ncpu; ++i) {
19227 cpu_list[i].cpu_id = (processorid_t)i;
19228 cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19229 LIST_INIT(&cpu_list[i].cpu_cyc_list);
19230 lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
19231 }
19232
19233 lck_mtx_lock(&cpu_lock);
19234 for (i = 0; i < ncpu; ++i)
19235 /* FIXME: track CPU configuration */
19236 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19237 lck_mtx_unlock(&cpu_lock);
19238
19239 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19240
19241 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
19242 offsetof(dtrace_string_t, dtst_str),
19243 offsetof(dtrace_string_t, dtst_next),
19244 offsetof(dtrace_string_t, dtst_prev));
19245
19246 dtrace_isa_init();
19247 /*
19248 * See dtrace_impl.h for a description of dof modes.
19249 * The default is lazy dof.
19250 *
19251 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19252 * makes no sense...
19253 */
19254 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19255 #if CONFIG_EMBEDDED
19256 /* Disable DOF mode by default for performance reasons */
19257 dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
19258 #else
19259 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19260 #endif
19261 }
19262
19263 /*
19264 * Sanity check of dof mode value.
19265 */
19266 switch (dtrace_dof_mode) {
19267 case DTRACE_DOF_MODE_NEVER:
19268 case DTRACE_DOF_MODE_LAZY_ON:
19269 /* valid modes, but nothing else we need to do */
19270 break;
19271
19272 case DTRACE_DOF_MODE_LAZY_OFF:
19273 case DTRACE_DOF_MODE_NON_LAZY:
19274 /* Cannot wait for a dtrace_open to init fasttrap */
19275 fasttrap_init();
19276 break;
19277
19278 default:
19279 /* Invalid, clamp to non lazy */
19280 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19281 fasttrap_init();
19282 break;
19283 }
19284
19285 #if CONFIG_DTRACE
19286 if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
19287 commpage_update_dof(true);
19288 #endif
19289
19290 gDTraceInited = 1;
19291
19292 } else
19293 panic("dtrace_init: called twice!\n");
19294 }
19295
19296 void
19297 dtrace_postinit(void)
19298 {
19299 /*
19300 * Called from bsd_init after all providers' *_init() routines have been
19301 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19302 * to go.
19303 */
19304 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
19305
19306 /*
19307 * Add the mach_kernel to the module list for lazy processing
19308 */
19309 struct kmod_info fake_kernel_kmod;
19310 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19311
19312 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19313 fake_kernel_kmod.id = 1;
19314 fake_kernel_kmod.address = g_kernel_kmod_info.address;
19315 fake_kernel_kmod.size = g_kernel_kmod_info.size;
19316
19317 if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
19318 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19319 }
19320
19321 (void)OSKextRegisterKextsWithDTrace();
19322 }
19323 #undef DTRACE_MAJOR
19324
19325 /*
19326 * Routines used to register interest in cpus being added to or removed
19327 * from the system.
19328 */
19329 void
19330 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19331 {
19332 #pragma unused(ignore1,ignore2)
19333 }
19334
19335 void
19336 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19337 {
19338 #pragma unused(ignore1,ignore2)
19339 }