/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * DTrace - Dynamic Tracing for Solaris
 *
 * This is the implementation of the Solaris Dynamic Tracing framework
 * (DTrace). The user-visible interface to DTrace is described at length in
 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
 * library, the in-kernel DTrace framework, and the DTrace providers are
 * described in the block comments in the <sys/dtrace.h> header file. The
 * internal architecture of DTrace is described in the block comments in the
 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
 * implementation very much assume mastery of all of these sources; if one has
 * an unanswered question about the implementation, one should consult them
 * first.
 *
 * The functions here are ordered roughly as follows:
 *
 *   - Probe context functions
 *   - Probe hashing functions
 *   - Non-probe context utility functions
 *   - Matching functions
 *   - Provider-to-Framework API functions
 *   - Probe management functions
 *   - DIF object functions
 *   - Format functions
 *   - Predicate functions
 *   - ECB functions
 *   - Buffer functions
 *   - Enabling functions
 *   - DOF functions
 *   - Anonymous enabling functions
 *   - Process functions
 *   - Consumer state functions
 *   - Helper functions
 *   - Hook functions
 *   - Driver cookbook functions
 *
 * Each group of functions begins with a block comment labelled the "DTrace
 * [Group] Functions", allowing one to find each block by searching forward
 * on capital-f functions.
 */
#include <sys/errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/conf.h>
#include <sys/random.h>
#include <sys/systm.h>
#include <sys/dtrace_impl.h>
#include <sys/param.h>
#include <sys/proc_internal.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <miscfs/devfs/devfs.h>
#include <sys/malloc.h>
#include <sys/kernel_types.h>
#include <sys/uio_internal.h>
#include <sys/kauth.h>
#include <vm/pmap.h>
#include <sys/user.h>
#include <mach/exception_types.h>
#include <sys/signalvar.h>
#include <mach/task.h>
#include <kern/zalloc.h>
#include <kern/ast.h>
#include <kern/sched_prim.h>
#include <kern/task.h>
#include <netinet/in.h>
#include <libkern/sysctl.h>
#include <sys/kdebug.h>

#if MONOTONIC
#include <kern/monotonic.h>
#include <machine/monotonic.h>
#endif /* MONOTONIC */

#include "dtrace_xoroshiro128_plus.h"

#include <IOKit/IOPlatformExpert.h>

#include <kern/cpu_data.h>

extern addr64_t kvtophys(vm_offset_t va);

extern uint32_t pmap_find_phys(void *, uint64_t);
extern boolean_t pmap_valid_page(uint32_t);
extern void OSKextRegisterKextsWithDTrace(void);
extern kmod_info_t g_kernel_kmod_info;
extern void commpage_update_dof(boolean_t enabled);

/* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
#define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */

#define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */

extern void dtrace_suspend(void);
extern void dtrace_resume(void);
extern void dtrace_early_init(void);
extern int dtrace_keep_kernel_symbols(void);
extern void dtrace_init(void);
extern void helper_init(void);
extern void fasttrap_init(void);

static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
extern void dtrace_lazy_dofs_destroy(proc_t *);
extern void dtrace_postinit(void);

extern void dtrace_proc_fork(proc_t*, proc_t*, int);
extern void dtrace_proc_exec(proc_t*);
extern void dtrace_proc_exit(proc_t*);

/*
 * DTrace Tunable Variables
 *
 * The following variables may be dynamically tuned by using sysctl(8), the
 * variables being stored in the kern.dtrace namespace. For example:
 *	sysctl kern.dtrace.dof_maxsize=1048576	# 1M
 *
 * In general, the only variables that one should be tuning this way are those
 * that affect system-wide DTrace behavior, and for which the default behavior
 * is undesirable. Most of these variables are tunable on a per-consumer
 * basis using DTrace options, and need not be tuned on a system-wide basis.
 * When tuning these variables, avoid pathological values; while some attempt
 * is made to verify the integrity of these variables, they are not considered
 * part of the supported interface to DTrace, and they are therefore not
 * checked comprehensively.
 */
uint64_t	dtrace_buffer_memory_maxsize = 0;	/* initialized in dtrace_init */
uint64_t	dtrace_buffer_memory_inuse = 0;
int		dtrace_destructive_disallow = 1;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t		dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t	dtrace_dof_maxsize = (512 * 1024);
dtrace_optval_t	dtrace_statvar_maxsize = (16 * 1024);
dtrace_optval_t	dtrace_statvar_maxsize_max = (16 * 10 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 32;
dtrace_optval_t	dtrace_helper_providers_max = 64;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_strsize_min = 8;
dtrace_optval_t	dtrace_strsize_max = 65536;
dtrace_optval_t	dtrace_cleanrate_default = 990099000;		/* 1.01 hz */
dtrace_optval_t	dtrace_cleanrate_min = 20000000;		/* 50 hz */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	/* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;
dtrace_optval_t	dtrace_stackframes_default = 20;
dtrace_optval_t	dtrace_ustackframes_default = 20;
dtrace_optval_t	dtrace_jstackframes_default = 50;
dtrace_optval_t	dtrace_jstackstrsize_default = 512;
dtrace_optval_t	dtrace_buflimit_default = 75;
dtrace_optval_t	dtrace_buflimit_min = 1;
dtrace_optval_t	dtrace_buflimit_max = 99;
size_t		dtrace_nprobes_default = 4;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;

/*
 * DTrace External Variables
 *
 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 * available to DTrace consumers via the backtick (`) syntax. One of these,
 * dtrace_zero, is made deliberately so: it is provided as a source of
 * well-known, zero-filled memory. While this variable is not documented,
 * it is used by some translators as an implementation detail.
 */
const char	dtrace_zero[256] = { 0 };	/* zero-filled memory */
unsigned int	dtrace_max_cpus = 0;		/* number of enabled cpus */
/*
 * DTrace Internal Variables
 */
static dev_info_t	*dtrace_devi;		/* device info */
static vmem_t		*dtrace_arena;		/* probe ID arena */
static dtrace_probe_t	**dtrace_probes;	/* array of all probes */
static int		dtrace_nprobes;		/* number of probes */
static dtrace_provider_t *dtrace_provider;	/* provider list */
static dtrace_meta_t	*dtrace_meta_pid;	/* user-land meta provider */
static int		dtrace_opens;		/* number of opens */
static int		dtrace_helpers;		/* number of helpers */
static dtrace_hash_t	*dtrace_strings;
static dtrace_hash_t	*dtrace_byprov;		/* probes hashed by provider */
static dtrace_hash_t	*dtrace_bymod;		/* probes hashed by module */
static dtrace_hash_t	*dtrace_byfunc;		/* probes hashed by function */
static dtrace_hash_t	*dtrace_byname;		/* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange;	/* toxic range array */
static int		dtrace_toxranges;	/* number of toxic ranges */
static int		dtrace_toxranges_max;	/* size of toxic range array */
static dtrace_anon_t	dtrace_anon;		/* anonymous enabling */
static kmem_cache_t	*dtrace_state_cache;	/* cache for dynamic state */
static uint64_t		dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t	*dtrace_panicked;	/* panicking thread */
static dtrace_ecb_t	*dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t	dtrace_probegen;	/* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid;	/* deferred helper list */
static dtrace_enabling_t *dtrace_retained;	/* list of retained enablings */
static dtrace_genid_t	dtrace_retained_gen;	/* current retained enab gen */
static dtrace_dynvar_t	dtrace_dynhash_sink;	/* end of dynamic hash chains */

static int		dtrace_dof_mode;	/* See dtrace_impl.h for a description of Darwin's dof modes. */

/*
 * This doesn't quite fit as an internal variable, as it must be accessed in
 * fbt_provide and sdt_provide. It's clearly not a DTrace tunable variable either...
 */
int		dtrace_kernel_symbol_mode;	/* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
static uint32_t		dtrace_wake_clients;
static uint8_t		dtrace_kerneluuid[16];	/* the 128-bit uuid */

/*
 * To save memory, some common memory allocations are given a
 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
 * which means it would fall into the kalloc.128 bucket. With
 * 20k elements allocated, the space saved is substantial.
 */

static ZONE_DECLARE(dtrace_probe_t_zone, "dtrace.dtrace_probe_t",
    sizeof(dtrace_probe_t), ZC_NONE);

static int dtrace_module_unloaded(struct kmod_info *kmod);

/*
 * DTrace Locking
 * DTrace is protected by three (relatively coarse-grained) locks:
 *
 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 *     including enabling state, probes, ECBs, consumer state, helper state,
 *     etc. Importantly, dtrace_lock is _not_ required when in probe context;
 *     probe context is lock-free -- synchronization is handled via the
 *     dtrace_sync() cross call mechanism.
 *
 * (2) dtrace_provider_lock is required when manipulating provider state, or
 *     when provider state must be held constant.
 *
 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 *     when meta provider state must be held constant.
 *
 * The lock ordering between these three locks is dtrace_meta_lock before
 * dtrace_provider_lock before dtrace_lock. (In particular, there are
 * several places where dtrace_provider_lock is held by the framework as it
 * calls into the providers -- which then call back into the framework,
 * grabbing dtrace_lock.)
 *
 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 * role as a coarse-grained lock; it is acquired before both of these locks.
 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 * acquired _between_ dtrace_provider_lock and dtrace_lock.
 */
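
/*
 * Illustrative sketch (not actual framework code): a hypothetical path that
 * needed all three DTrace locks would take them in the documented order and
 * drop them in reverse:
 *
 *	lck_mtx_lock(&dtrace_meta_lock);
 *	lck_mtx_lock(&dtrace_provider_lock);
 *	lck_mtx_lock(&dtrace_lock);
 *	...
 *	lck_mtx_unlock(&dtrace_lock);
 *	lck_mtx_unlock(&dtrace_provider_lock);
 *	lck_mtx_unlock(&dtrace_meta_lock);
 */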

/*
 * APPLE NOTE:
 *
 * For porting purposes, all kmutex_t vars have been changed
 * to lck_mtx_t, which require explicit initialization.
 *
 * kmutex_t becomes lck_mtx_t
 * mutex_enter() becomes lck_mtx_lock()
 * mutex_exit() becomes lck_mtx_unlock()
 *
 * Lock asserts are changed like this:
 *
 * ASSERT(MUTEX_HELD(&cpu_lock));
 *	becomes:
 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 *
 */
static lck_mtx_t	dtrace_lock;		/* probe state lock */
static lck_mtx_t	dtrace_provider_lock;	/* provider state lock */
static lck_mtx_t	dtrace_meta_lock;	/* meta-provider state lock */
static lck_rw_t		dtrace_dof_mode_lock;	/* dof mode lock */

/*
 * DTrace Provider Variables
 *
 * These are the variables relating to DTrace as a provider (that is, the
 * provider of the BEGIN, END, and ERROR probes).
 */
static dtrace_pattr_t	dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};

static void
dtrace_provide_nullop(void *arg, const dtrace_probedesc_t *desc)
{
#pragma unused(arg, desc)
}

static void
dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
{
#pragma unused(arg, ctl)
}

static int
dtrace_enable_nullop(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg, id, parg)
	return (0);
}

static void
dtrace_disable_nullop(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg, id, parg)
}

static void
dtrace_suspend_nullop(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg, id, parg)
}

static void
dtrace_resume_nullop(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg, id, parg)
}

static void
dtrace_destroy_nullop(void *arg, dtrace_id_t id, void *parg)
{
#pragma unused(arg, id, parg)
}


static dtrace_pops_t dtrace_provider_ops = {
	.dtps_provide = dtrace_provide_nullop,
	.dtps_provide_module = dtrace_provide_module_nullop,
	.dtps_enable = dtrace_enable_nullop,
	.dtps_disable = dtrace_disable_nullop,
	.dtps_suspend = dtrace_suspend_nullop,
	.dtps_resume = dtrace_resume_nullop,
	.dtps_getargdesc = NULL,
	.dtps_getargval = NULL,
	.dtps_usermode = NULL,
	.dtps_destroy = dtrace_destroy_nullop,
};

static dtrace_id_t	dtrace_probeid_begin;	/* special BEGIN probe */
static dtrace_id_t	dtrace_probeid_end;	/* special END probe */
dtrace_id_t		dtrace_probeid_error;	/* special ERROR probe */

/*
 * DTrace Helper Tracing Variables
 */
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char	*dtrace_helptrace_buffer;
size_t	 dtrace_helptrace_bufsize = 512 * 1024;

#if DEBUG
int	dtrace_helptrace_enabled = 1;
#else
int	dtrace_helptrace_enabled = 0;
#endif

#if defined (__arm64__)
/*
 * The ioctl for adding helper DOF is based on the
 * size of a user_addr_t. We need to recognize both
 * U32 and U64 as the same action.
 */
#define DTRACEHIOC_ADDDOF_U32	_IOW('h', 4, user32_addr_t)
#define DTRACEHIOC_ADDDOF_U64	_IOW('h', 4, user64_addr_t)
#endif /* __arm64__ */
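
/*
 * Hedged sketch (assumed handler shape, not the actual ioctl code): with the
 * two definitions above, a handler can collapse both encodings into a single
 * action, e.g. by treating them as aliases for the generic DTRACEHIOC_ADDDOF
 * case (assuming the generic definition from <sys/dtrace.h>):
 *
 *	switch (cmd) {
 *	case DTRACEHIOC_ADDDOF_U32:
 *	case DTRACEHIOC_ADDDOF_U64:
 *		// handle identically to DTRACEHIOC_ADDDOF
 *		...
 *	}
 */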

/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
#if DEBUG
static dtrace_errhash_t	dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static lck_mtx_t dtrace_errlock;
#endif

/*
 * DTrace Macros and Constants
 *
 * These are various macros that are useful in various spots in the
 * implementation, along with a few random constants that have no meaning
 * outside of the implementation. There is no real structure to this cpp
 * mishmash -- but is there ever?
 */

#define DTRACE_GETSTR(hash, elm)	\
	(hash->dth_getstr(elm, hash->dth_stroffs))

#define DTRACE_HASHSTR(hash, elm)	\
	dtrace_hash_str(DTRACE_GETSTR(hash, elm))

#define DTRACE_HASHNEXT(hash, elm)	\
	(void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)

#define DTRACE_HASHPREV(hash, elm)	\
	(void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)

#define DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(DTRACE_GETSTR(hash, lhs),	\
	    DTRACE_GETSTR(hash, rhs)) == 0)

#define DTRACE_AGGHASHSIZE_SLEW	17

#define DTRACE_V4MAPPED_OFFSET	(sizeof (uint32_t) * 3)

/*
 * The key for a thread-local variable consists of the lower 61 bits of the
 * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 * equal to a variable identifier. This is necessary (but not sufficient) to
 * assure that global associative arrays never collide with thread-local
 * variables. To guarantee that they cannot collide, we must also define the
 * order for keying dynamic variables. That order is:
 *
 *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 */
#if defined (__x86_64__)
/* FIXME: two function calls!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	ASSERT(intr < (1 << 3)); \
	(where) = ((thr + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#elif defined(__arm__)
/* FIXME: three function calls!!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	uint_t pid = (uint_t)dtrace_proc_selfpid(); \
	ASSERT(intr < (1 << 3)); \
	(where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#elif defined (__arm64__)
/* FIXME: two function calls!! */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = (uintptr_t)current_thread(); \
	ASSERT(intr < (1 << 3)); \
	(where) = ((thr + DIF_VARIABLE_MAX) & \
	    (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#error Unknown architecture
#endif

#define DT_BSWAP_8(x)	((x) & 0xff)
#define DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
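
/*
 * For example, working through the definitions above by hand:
 *	DT_BSWAP_16(0x1234)     == 0x3412
 *	DT_BSWAP_32(0x12345678) == 0x78563412
 */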

#define DT_MASK_LO 0x00000000FFFFFFFFULL

#define DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);


#define DTRACE_ALIGNCHECK(addr, size, flags)				\
	if (addr & (MIN(size,4) - 1)) {					\
		*flags |= CPU_DTRACE_BADALIGN;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}

#define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
do {									\
	if ((remp) != NULL) {						\
		*(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);	\
	}								\
} while (0)


/*
 * Test whether a range of memory starting at testaddr of size testsz falls
 * within the range of memory described by addr, sz. We take care to avoid
 * problems with overflow and underflow of the unsigned quantities, and
 * disallow all negative sizes. Ranges of size 0 are allowed.
 */
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	((testaddr) - (baseaddr) < (basesz) && \
	(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
	(testaddr) + (testsz) >= (testaddr))
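
/*
 * Worked example of the overflow handling (values chosen for illustration):
 * with baseaddr = 0xffffffffffffff00, basesz = 0x80,
 * testaddr = 0xfffffffffffffff0 and testsz = 0x100, a naive check such as
 * "testaddr + testsz <= baseaddr + basesz" wraps on the left-hand side
 * (0xfffffffffffffff0 + 0x100 == 0xf0) and would wrongly accept the request.
 * Here, the first clause rejects any testaddr below baseaddr because the
 * unsigned subtraction wraps to a huge value, and the third clause rejects
 * any (testaddr + testsz) that overflows -- so the example above fails both.
 */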

/*
 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
 * alloc_sz on the righthand side of the comparison in order to avoid overflow
 * or underflow in the comparison with it. This is simpler than the INRANGE
 * check above, because we know that the dtms_scratch_ptr is valid in the
 * range. Allocations of size zero are allowed.
 */
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
	((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
	(mstate)->dtms_scratch_ptr >= (alloc_sz))

#define RECOVER_LABEL(bits) dtraceLoadRecover##bits:

#if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
#define DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t dtrace_load##bits(uintptr_t addr);			\
									\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval = 0;					\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	{								\
	volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;	\
	*flags |= CPU_DTRACE_NOFAULT;					\
	recover = dtrace_sign_and_set_thread_recover(current_thread(), recover);	\
	/*CSTYLED*/							\
	/*								\
	 * PR6394061 - avoid device memory that is unpredictably	\
	 * mapped and unmapped						\
	 */								\
	if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))	\
		rval = *((volatile uint##bits##_t *)addr);		\
	else {								\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	RECOVER_LABEL(bits);						\
	(void)dtrace_set_thread_recover(current_thread(), recover);	\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
	}								\
									\
	return (rval);							\
}
#else /* all other architectures */
#error Unknown Architecture
#endif

#ifdef __LP64__
#define dtrace_loadptr	dtrace_load64
#else
#define dtrace_loadptr	dtrace_load32
#endif

#define DTRACE_DYNHASH_FREE	0
#define DTRACE_DYNHASH_SINK	1
#define DTRACE_DYNHASH_VALID	2

#define DTRACE_MATCH_FAIL	-1
#define DTRACE_MATCH_NEXT	0
#define DTRACE_MATCH_DONE	1
#define DTRACE_ANCHORED(probe)	((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN	64

#define DTRACE_FLAGS2FLT(flags)						\
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :		\
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :		\
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :		\
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :		\
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :		\
	((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW :		\
	((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN :		\
	((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH :	\
	((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK :		\
	DTRACEFLT_UNKNOWN)

#define DTRACEACT_ISSTRING(act)						\
	((act)->dta_kind == DTRACEACT_DIFEXPR &&			\
	(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)


static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
    dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
    dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
    dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
static int dtrace_canload_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);
static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
    dtrace_mstate_t *, dtrace_vstate_t *);

/*
 * DTrace sysctl handlers
 *
 * These declarations and functions are used for deeper DTrace configuration.
 * Most of them are not set on a per-consumer basis and may impact other
 * DTrace consumers. Correctness is not verified for all of these variables,
 * so be careful about the values you use.
 */

SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");

static int
sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	int changed, error;
	int value = *(int *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value != 0 && value != 1)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_err_verbose = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.err_verbose
 *
 * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_err_verbose, 0,
    sysctl_dtrace_err_verbose, "I", "dtrace error verbose");

static int
sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	uint64_t value = *(uint64_t *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= dtrace_buffer_memory_inuse)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_buffer_memory_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.buffer_memory_maxsize
 *
 * Set the maximum amount of memory, in bytes, that the state buffers of all
 * DTrace consumers may use. By default the limit is PHYS_MEM / 3 for *all*
 * consumers. Attempting to set a zero or negative value, or a value less
 * than or equal to dtrace_buffer_memory_inuse, will result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_buffer_memory_maxsize, 0,
    sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");

/*
 * kern.dtrace.buffer_memory_inuse
 *
 * Current state buffer memory used, in bytes, by all the DTrace consumers.
 * This value is read-only.
 */
SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
    &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");

static int
sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	size_t value = *(size_t*) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_difo_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.difo_maxsize
 *
 * Set the DIFO max size in bytes; see the definition of dtrace_difo_maxsize
 * for the default value. Attempting to set a zero or negative size will
 * result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_difo_maxsize, 0,
    sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");

static int
sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	dtrace_optval_t value = *(dtrace_optval_t *) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);

	if (value >= dtrace_copy_maxsize())
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_dof_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.dof_maxsize
 *
 * Set the DOF max size in bytes; see the definition of dtrace_dof_maxsize
 * for the default value. Attempting to set a zero or negative size will
 * result in a failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_dof_maxsize, 0,
    sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");

static int
sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2, req)
	int changed, error;
	dtrace_optval_t value = *(dtrace_optval_t*) arg1;

	error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
	if (error || !changed)
		return (error);

	if (value <= 0)
		return (ERANGE);
	if (value > dtrace_statvar_maxsize_max)
		return (ERANGE);

	lck_mtx_lock(&dtrace_lock);
	dtrace_statvar_maxsize = value;
	lck_mtx_unlock(&dtrace_lock);

	return (0);
}

/*
 * kern.dtrace.global_maxsize
 *
 * Set the static variable max size in bytes; see the definition of
 * dtrace_statvar_maxsize for the default value. Attempting to set a zero or
 * negative size, or one above dtrace_statvar_maxsize_max, will result in a
 * failure.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
    &dtrace_statvar_maxsize, 0,
    sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");

/*
 * kern.dtrace.provide_private_probes
 *
 * Controls whether providers must provide their private probes. This is
 * kept for compatibility; private probes are now always provided.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
    CTLFLAG_RD | CTLFLAG_LOCKED,
    (int *)NULL, 1, "provider must provide the private probes");

/*
 * kern.dtrace.dof_mode
 *
 * Returns the current DOF mode.
 * This value is read-only.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
    &dtrace_dof_mode, 0, "dtrace dof mode");

/*
 * DTrace Probe Context Functions
 *
 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 * As a result, functions called from probe context may only call other DTrace
 * support functions -- they may not interact at all with the system at large.
 * (Note that the ASSERT macro is made probe-context safe by redefining it in
 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 * loads are to be performed from probe context, they _must_ be in terms of
 * the safe dtrace_load*() variants.
 *
 * Some functions in this block are not actually called from probe context;
 * for these functions, there will be a comment above the function reading
 * "Note: not called from probe context."
 */
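
/*
 * A minimal sketch (hypothetical helper, not part of the framework) of the
 * safe-load rule above: probe context must go through the dtrace_load*()
 * variants, which honor toxic ranges and fault recovery, rather than
 * dereferencing a pointer directly.
 */
#if 0	/* illustration only */
static uint64_t
dtrace_example_deref(uintptr_t addr)
{
	/* WRONG in probe context: may fault on unmapped or toxic memory. */
	/* return (*(uint64_t *)addr); */

	/* Right: the safe variant flags BADADDR/BADALIGN and returns 0. */
	return (dtrace_load64(addr));
}
#endif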

int
dtrace_assfail(const char *a, const char *f, int l)
{
	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}

/*
 * Atomically increment a specified error counter from probe context.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most counters stored to in probe context are per-CPU counters.
	 * However, there are some error conditions that are sufficiently
	 * arcane that they don't merit per-CPU storage. If these counters
	 * are incremented concurrently on different CPUs, scalability will be
	 * adversely affected -- but we don't expect them to be white-hot in a
	 * correctly constructed enabling...
	 */
	uint32_t oval, nval;

	do {
		oval = *counter;

		if ((nval = oval + 1) == 0) {
			/*
			 * If the counter would wrap, set it to 1 -- assuring
			 * that the counter is never zero when we have seen
			 * errors. (The counter must be 32-bits because we
			 * aren't guaranteed a 64-bit compare&swap operation.)
			 * To save this code both the infamy of being fingered
			 * by a priggish news story and the indignity of being
			 * the target of a neo-puritan witch trial, we're
			 * carefully avoiding any colorful description of the
			 * likelihood of this condition -- but suffice it to
			 * say that it is only slightly more likely than the
			 * overflow of predicate cache IDs, as discussed in
			 * dtrace_predicate_create().
			 */
			nval = 1;
		}
	} while (dtrace_cas32(counter, oval, nval) != oval);
}

/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)

static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
	if (dest < mstate->dtms_scratch_base)
		return (0);

	if (dest + size < dest)
		return (0);

	if (dest + size > mstate->dtms_scratch_ptr)
		return (0);

	return (1);
}

static int
dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
    dtrace_statvar_t **svars, int nsvars)
{
	int i;

	size_t maxglobalsize, maxlocalsize;

	maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
	maxlocalsize = (maxglobalsize) * NCPU;

	if (nsvars == 0)
		return (0);

	for (i = 0; i < nsvars; i++) {
		dtrace_statvar_t *svar = svars[i];
		uint8_t scope;
		size_t size;

		if (svar == NULL || (size = svar->dtsv_size) == 0)
			continue;

		scope = svar->dtsv_var.dtdv_scope;

		/**
		 * We verify that our size is valid in the spirit of providing
		 * defense in depth: we want to prevent attackers from using
		 * DTrace to escalate an orthogonal kernel heap corruption bug
		 * into the ability to store to arbitrary locations in memory.
		 */
		VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
		    (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));

		if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
			DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
			    svar->dtsv_size);
			return (1);
		}
	}

	return (0);
}

/*
 * Check to see if the address is within a memory region to which a store may
 * be issued. This includes the DTrace scratch areas, and any DTrace variable
 * region. The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}
/*
 * Implementation of dtrace_canstore which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	/*
	 * First, check to see if the address is in scratch space...
	 */
	if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
	    mstate->dtms_scratch_size)) {
		DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
		    mstate->dtms_scratch_size);
		return (1);
	}
	/*
	 * Now check to see if it's a dynamic variable. This check will pick
	 * up both thread-local variables and any global dynamically-allocated
	 * variables.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
	    vstate->dtvs_dynvars.dtds_size)) {
		dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
		uintptr_t base = (uintptr_t)dstate->dtds_base +
		    (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
		uintptr_t chunkoffs;
		dtrace_dynvar_t *dvar;

		/*
		 * Before we assume that we can store here, we need to make
		 * sure that it isn't in our metadata -- storing to our
		 * dynamic variable metadata would corrupt our state. For
		 * the range to not include any dynamic variable metadata,
		 * it must:
		 *
		 *	(1) Start above the hash table that is at the base of
		 *	the dynamic variable space
		 *
		 *	(2) Have a starting chunk offset that is beyond the
		 *	dtrace_dynvar_t that is at the base of every chunk
		 *
		 *	(3) Not span a chunk boundary
		 *
		 *	(4) Not be in the tuple space of a dynamic variable
		 *
		 */
		if (addr < base)
			return (0);

		chunkoffs = (addr - base) % dstate->dtds_chunksize;

		if (chunkoffs < sizeof (dtrace_dynvar_t))
			return (0);

		if (chunkoffs + sz > dstate->dtds_chunksize)
			return (0);

		dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);

		if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
			return (0);

		if (chunkoffs < sizeof (dtrace_dynvar_t) +
		    ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
			return (0);

		return (1);
	}

	/*
	 * Finally, check the static local and global variables. These checks
	 * take the longest, so we perform them last.
	 */
	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_locals, vstate->dtvs_nlocals))
		return (1);

	if (dtrace_canstore_statvar(addr, sz, remain,
	    vstate->dtvs_globals, vstate->dtvs_nglobals))
		return (1);

	return (0);
}


/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 */
int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}

/*
 * Implementation of dtrace_canload which communicates the upper bound of the
 * allowed memory region.
 */
static int
dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * You can obviously read that which you can store.
	 */
	if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
		return (1);

	/*
	 * We're allowed to read from our own string table.
	 */
	if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
	    mstate->dtms_difo->dtdo_strlen)) {
		DTRACE_RANGE_REMAIN(remain, addr,
		    mstate->dtms_difo->dtdo_strtab,
		    mstate->dtms_difo->dtdo_strlen);
		return (1);
	}

	DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
	*illval = addr;
	return (0);
}

/*
 * Convenience routine to check to see if a given string is within a memory
 * region in which a load may be issued given the user's privilege level;
 * this exists so that we don't need to issue unnecessary dtrace_strlen()
 * calls in the event that the user has all privileges.
 */
static int
dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t rsize;

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
		return (1);
	}

	/*
	 * Even if the caller is uninterested in querying the remaining valid
	 * range, it is required to ensure that the access is allowed.
	 */
	if (remain == NULL) {
		remain = &rsize;
	}
	if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
		size_t strsz;
		/*
		 * Perform the strlen after determining the length of the
		 * memory region which is accessible. This prevents timing
		 * information from being used to find NULs in memory which is
		 * not accessible to the caller.
		 */
		strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
		    MIN(sz, *remain));
		if (strsz <= *remain) {
			return (1);
		}
	}

	return (0);
}

/*
 * Convenience routine to check to see if a given variable is within a memory
 * region in which a load may be issued given the user's privilege level.
 */
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	size_t sz;
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	/*
	 * Calculate the max size before performing any checks since even
	 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
	 * return the max length via 'remain'.
	 */
	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_state_t *state = vstate->dtvs_state;

		if (state != NULL) {
			sz = state->dts_options[DTRACEOPT_STRSIZE];
		} else {
			/*
			 * In helper context, we have a NULL state; fall back
			 * to using the system-wide default for the string size
			 * in this case.
			 */
			sz = dtrace_strsize_default;
		}
	} else {
		sz = type->dtdt_size;
	}

	/*
	 * If we hold the privilege to read from kernel memory, then
	 * everything is readable.
	 */
	if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
		DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
		return (1);
	}

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
		    vstate));
	}
	return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
	    vstate));
}

#define isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define islower(ch)	((ch) >= 'a' && (ch) <= 'z')
#define isspace(ch)	(((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
			((ch) == '\t') || ((ch) == '\f'))
#define isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
			((ch) >= 'A' && (ch) <= 'F'))
#define lisalnum(x)	\
	(isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))

#define DIGIT(x)	\
	(isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')

/*
 * Convert a string to a signed integer using safe loads.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;

	/*
	 * Consume any whitespace preceding digits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}


/*
 * Compare two strings using safe loads.
 */
static int
dtrace_strncmp(const char *s1, const char *s2, size_t limit)
{
	uint8_t c1, c2;
	volatile uint16_t *flags;

	if (s1 == s2 || limit == 0)
		return (0);

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	do {
		if (s1 == NULL) {
			c1 = '\0';
		} else {
			c1 = dtrace_load8((uintptr_t)s1++);
		}

		if (s2 == NULL) {
			c2 = '\0';
		} else {
			c2 = dtrace_load8((uintptr_t)s2++);
		}

		if (c1 != c2)
			return (c1 - c2);
	} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));

	return (0);
}

/*
 * Compute strlen(s) for a string using safe memory accesses. The additional
 * len parameter is used to specify a maximum length to ensure completion.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	uint_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}

/*
 * Check if an address falls within a toxic region.
 */
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
	uintptr_t taddr, tsize;
	int i;

	for (i = 0; i < dtrace_toxranges; i++) {
		taddr = dtrace_toxrange[i].dtt_base;
		tsize = dtrace_toxrange[i].dtt_limit - taddr;

		if (kaddr - taddr < tsize) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
			return (1);
		}

		if (taddr - kaddr < size) {
			DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
			return (1);
		}
	}

	return (0);
}

/*
 * Copy src to dst using safe memory accesses. The src is assumed to be unsafe
 * memory specified by the DIF program. The dst is assumed to be safe memory
 * that we can store to directly because it is managed by DTrace. As with
 * standard bcopy, overlapping copies are handled properly.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst;
		const uint8_t *s2 = src;

		if (s1 <= s2) {
			do {
				*s1++ = dtrace_load8((uintptr_t)s2++);
			} while (--len != 0);
		} else {
			s2 += len;
			s1 += len;

			do {
				*--s1 = dtrace_load8((uintptr_t)--s2);
			} while (--len != 0);
		}
	}
}

/*
 * Copy src to dst using safe memory accesses, up to either the specified
 * length, or the point that a nul byte is encountered. The src is assumed to
 * be unsafe memory specified by the DIF program. The dst is assumed to be
 * safe memory that we can store to directly because it is managed by DTrace.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	if (len != 0) {
		uint8_t *s1 = dst, c;
		const uint8_t *s2 = src;

		do {
			*s1++ = c = dtrace_load8((uintptr_t)s2++);
		} while (--len != 0 && c != '\0');
	}
}

/*
 * Copy src to dst, deriving the size and type from the specified (BYREF)
 * variable type. The src is assumed to be unsafe memory specified by the DIF
 * program. The dst is assumed to be DTrace variable memory that is of the
 * specified type; we assume that we can store to it directly.
 */
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
{
	ASSERT(type->dtdt_flags & DIF_TF_BYREF);

	if (type->dtdt_kind == DIF_TYPE_STRING) {
		dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
	} else {
		dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
	}
}

/*
 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
 * unsafe memory specified by the DIF program. The s2 data is assumed to be
 * safe memory that we can access directly because it is managed by DTrace.
 */
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
	volatile uint16_t *flags;

	flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;

	if (s1 == s2)
		return (0);

	if (s1 == NULL || s2 == NULL)
		return (1);

	if (s1 != s2 && len != 0) {
		const uint8_t *ps1 = s1;
		const uint8_t *ps2 = s2;

		do {
			if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
				return (1);
		} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
	}
	return (0);
}

/*
 * Zero the specified region using a simple byte-by-byte loop. Note that this
 * is for safe DTrace-managed memory only.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uchar_t *cp;

	for (cp = dst; len != 0; len--)
		*cp++ = 0;
}

static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t result[2];

	result[0] = addend1[0] + addend2[0];
	result[1] = addend1[1] + addend2[1] +
	    (result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);

	sum[0] = result[0];
	sum[1] = result[1];
}

/*
 * Shift the 128-bit value in a by b. If b is positive, shift left.
 * If b is negative, shift right.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	uint64_t mask;

	if (b == 0)
		return;

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			a[0] >>= b;
			mask = 1LL << (64 - b);
			mask -= 1;
			a[0] |= ((a[1] & mask) << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			a[1] <<= b;
			mask = a[0] >> (64 - b);
			a[1] |= mask;
			a[0] <<= b;
		}
	}
}

/*
 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
 * use native multiplication on those, and then re-combine into the
 * resulting 128-bit value.
 *
 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
 *     hi1 * hi2 << 64 +
 *     hi1 * lo2 << 32 +
 *     hi2 * lo1 << 32 +
 *     lo1 * lo2
 */
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
	uint64_t hi1, hi2, lo1, lo2;
	uint64_t tmp[2];

	hi1 = factor1 >> 32;
	hi2 = factor2 >> 32;

	lo1 = factor1 & DT_MASK_LO;
	lo2 = factor2 & DT_MASK_LO;

	product[0] = lo1 * lo2;
	product[1] = hi1 * hi2;

	tmp[0] = hi1 * lo2;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);

	tmp[0] = hi2 * lo1;
	tmp[1] = 0;
	dtrace_shift_128(tmp, 32);
	dtrace_add_128(product, tmp, product);
}
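
/*
 * A quick self-check sketch for the decomposition above. This is an
 * illustration only (hence #if 0): it assumes a compiler with a native
 * __uint128_t, which the probe-context code deliberately does not rely on.
 */
#if 0
static void
dtrace_multiply_128_check(uint64_t f1, uint64_t f2)
{
	uint64_t product[2];
	__uint128_t expected = (__uint128_t)f1 * f2;

	dtrace_multiply_128(f1, f2, product);
	ASSERT(product[0] == (uint64_t)expected);		/* low 64 bits */
	ASSERT(product[1] == (uint64_t)(expected >> 64));	/* high 64 bits */
}
#endif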

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials
 */
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	if ((cr = dtrace_CRED()) != NULL &&
	    posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
	    posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
	    posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
	    posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
	    posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
	    posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
		return (1);

	return (0);
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
#pragma unused(cr, s_cr, state) /* __APPLE__ */

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	return 1; /* APPLE NOTE: Darwin doesn't do zones. */
}

/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
	return 1; /* Darwin omits "No Core Dump" flag. */
}

static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
	int action = state->dts_cred.dcr_action;

	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
		goto bad;

	if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
	    dtrace_priv_proc_common_zone(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
	    dtrace_priv_proc_common_user(state) == 0)
		goto bad;

	if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
	    dtrace_priv_proc_common_nocd() == 0)
		goto bad;

	return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
		goto bad;

	if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
		goto bad;

	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
		return (1);

	if (dtrace_priv_proc_common_zone(state) &&
	    dtrace_priv_proc_common_user(state) &&
	    dtrace_priv_proc_common_nocd())
		return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_proc(dtrace_state_t *state)
{
	if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
		goto bad;

	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
		goto bad;

	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
		return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

/*
 * The P_LNOATTACH check is an Apple specific check.
 * We need a version of dtrace_priv_proc() that omits
 * that check for PID and EXECNAME accesses
 */
static int
dtrace_priv_proc_relaxed(dtrace_state_t *state)
{

	if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
		return (1);

	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;

	return (0);
}

static int
dtrace_priv_kernel(dtrace_state_t *state)
{
	if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
		goto bad;

	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
		return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
	if (dtrace_is_restricted())
		goto bad;

	if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
		return (1);

bad:
	cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;

	return (0);
}

/*
 * Note: not called from probe context. This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 */
static void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	dtrace_dstate_percpu_t *dcpu;
	int i, work = 0;

	for (i = 0; i < (int)NCPU; i++) {
		dcpu = &dstate->dtds_percpu[i];

		ASSERT(dcpu->dtdsc_rinsing == NULL);

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		/*
		 * If the clean list is non-NULL, then we're not going to do
		 * any work for this CPU -- it means that there has not been
		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
		 * since the last time we cleaned house.
		 */
		if (dcpu->dtdsc_clean != NULL)
			continue;

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar(): if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			dcpu->dtdsc_rinsing = dirty;
			dtrace_membar_producer();
1836 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1837 dirty, NULL) != dirty);
1838 }
1839
1840 if (!work) {
1841 /*
1842 * We have no work to do; we can simply return.
1843 */
1844 return;
1845 }
1846
1847 dtrace_sync();
1848
1849 for (i = 0; i < (int)NCPU; i++) {
1850 dcpu = &dstate->dtds_percpu[i];
1851
1852 if (dcpu->dtdsc_rinsing == NULL)
1853 continue;
1854
1855 /*
1856 * We are now guaranteed that no hash chain contains a pointer
1857 * into this dirty list; we can make it clean.
1858 */
1859 ASSERT(dcpu->dtdsc_clean == NULL);
1860 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1861 dcpu->dtdsc_rinsing = NULL;
1862 }
1863
1864 /*
1865 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1866 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1867 * This prevents a race whereby a CPU incorrectly decides that
1868 * the state should be something other than DTRACE_DSTATE_CLEAN
1869 * after dtrace_dynvar_clean() has completed.
1870 */
1871 dtrace_sync();
1872
1873 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1874 }
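
/*
 * Viewed as list surgery, the pass above rotates each CPU's lists in two
 * phases (a sketch, with D1 and D2 denoting freed dynamic variables):
 *
 *     before:          dirty: D1 -> D2    rinsing: -           clean: -
 *     after 1st loop:  dirty: -           rinsing: D1 -> D2    clean: -
 *     (dtrace_sync())
 *     after 2nd loop:  dirty: -           rinsing: -           clean: D1 -> D2
 *
 * The intervening dtrace_sync() guarantees that no probe context still
 * holds a pointer into the list by the time it becomes eligible to be
 * moved to dtdsc_free by an allocating dtrace_dynvar().
 */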
1875
1876 /*
1877 * Depending on the value of the op parameter, this function looks-up,
1878 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1879 * allocation is requested, this function will return a pointer to a
1880 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1881 * variable can be allocated. If NULL is returned, the appropriate counter
1882 * will be incremented.
1883 */
1884 static dtrace_dynvar_t *
1885 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1886 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1887 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1888 {
1889 uint64_t hashval = DTRACE_DYNHASH_VALID;
1890 dtrace_dynhash_t *hash = dstate->dtds_hash;
1891 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1892 processorid_t me = CPU->cpu_id, cpu = me;
1893 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1894 size_t bucket, ksize;
1895 size_t chunksize = dstate->dtds_chunksize;
1896 uintptr_t kdata, lock, nstate;
1897 uint_t i;
1898
1899 ASSERT(nkeys != 0);
1900
1901 /*
1902 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1903 * algorithm. For the by-value portions, we perform the algorithm in
1904 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1905 * bit, and seems to have only a minute effect on distribution. For
1906 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1907 * over each referenced byte. It's painful to do this, but it's much
1908 * better than pathological hash distribution. The efficacy of the
1909 * hashing algorithm (and a comparison with other algorithms) may be
1910 * found by running the ::dtrace_dynstat MDB dcmd.
1911 */
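/*
 * For reference, the classic byte-at-a-time form of the hash that the
 * loop below adapts is (a sketch):
 *
 *     hash += data[i]; hash += hash << 10; hash ^= hash >> 6;    (per byte)
 *     hash += hash << 3; hash ^= hash >> 11; hash += hash << 15; (finalize)
 *
 * The by-value case below simply feeds 16-bit slices of the key where the
 * classic form feeds bytes; the finalization steps appear after the loop.
 */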
1912 for (i = 0; i < nkeys; i++) {
1913 if (key[i].dttk_size == 0) {
1914 uint64_t val = key[i].dttk_value;
1915
1916 hashval += (val >> 48) & 0xffff;
1917 hashval += (hashval << 10);
1918 hashval ^= (hashval >> 6);
1919
1920 hashval += (val >> 32) & 0xffff;
1921 hashval += (hashval << 10);
1922 hashval ^= (hashval >> 6);
1923
1924 hashval += (val >> 16) & 0xffff;
1925 hashval += (hashval << 10);
1926 hashval ^= (hashval >> 6);
1927
1928 hashval += val & 0xffff;
1929 hashval += (hashval << 10);
1930 hashval ^= (hashval >> 6);
1931 } else {
1932 /*
1933 * This is incredibly painful, but it beats the hell
1934 * out of the alternative.
1935 */
1936 uint64_t j, size = key[i].dttk_size;
1937 uintptr_t base = (uintptr_t)key[i].dttk_value;
1938
1939 if (!dtrace_canload(base, size, mstate, vstate))
1940 break;
1941
1942 for (j = 0; j < size; j++) {
1943 hashval += dtrace_load8(base + j);
1944 hashval += (hashval << 10);
1945 hashval ^= (hashval >> 6);
1946 }
1947 }
1948 }
1949
1950 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1951 return (NULL);
1952
1953 hashval += (hashval << 3);
1954 hashval ^= (hashval >> 11);
1955 hashval += (hashval << 15);
1956
1957 /*
1958 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1959 * comes out to be one of our two sentinel hash values. If this
1960 * actually happens, we set the hashval to be a value known to be a
1961 * non-sentinel value.
1962 */
1963 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1964 hashval = DTRACE_DYNHASH_VALID;
1965
1966 /*
1967 * Yes, it's painful to do a divide here. If the cycle count becomes
1968 * important here, tricks can be pulled to reduce it. (However, it's
1969 * critical that hash collisions be kept to an absolute minimum;
1970 * they're much more painful than a divide.) It's better to have a
1971 * solution that generates few collisions and still keeps things
1972 * relatively simple.
1973 */
1974 bucket = hashval % dstate->dtds_hashsize;
1975
1976 if (op == DTRACE_DYNVAR_DEALLOC) {
1977 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1978
1979 for (;;) {
1980 while ((lock = *lockp) & 1)
1981 continue;
1982
1983 if (dtrace_casptr((void *)(uintptr_t)lockp,
1984 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1985 break;
1986 }
1987
1988 dtrace_membar_producer();
1989 }
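
/*
 * The lock word doubles as a version counter: an odd value means the
 * bucket is locked for deallocation, and each lock/unlock pair increments
 * the word by two in total. Lookups never block on it; they snapshot the
 * word, walk the chain, and retry if it changed underneath them, roughly:
 *
 *     lock = hash[bucket].dtdh_lock;      (snapshot)
 *     ... walk the chain ...
 *     if (hash[bucket].dtdh_lock != lock)
 *         goto top;                       (chain mutated; retry)
 */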
1990
1991 top:
1992 prev = NULL;
1993 lock = hash[bucket].dtdh_lock;
1994
1995 dtrace_membar_consumer();
1996
1997 start = hash[bucket].dtdh_chain;
1998 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1999 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
2000 op != DTRACE_DYNVAR_DEALLOC));
2001
2002 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
2003 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
2004 dtrace_key_t *dkey = &dtuple->dtt_key[0];
2005
2006 if (dvar->dtdv_hashval != hashval) {
2007 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
2008 /*
2009 * We've reached the sink, and therefore the
2010 * end of the hash chain; we can kick out of
2011 * the loop knowing that we have seen a valid
2012 * snapshot of state.
2013 */
2014 ASSERT(dvar->dtdv_next == NULL);
2015 ASSERT(dvar == &dtrace_dynhash_sink);
2016 break;
2017 }
2018
2019 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2020 /*
2021 * We've gone off the rails: somewhere along
2022 * the line, one of the members of this hash
2023 * chain was deleted. Note that we could also
2024 * detect this by simply letting this loop run
2025 * to completion, as we would eventually hit
2026 * the end of the dirty list. However, we
2027 * want to avoid running the length of the
2028 * dirty list unnecessarily (it might be quite
2029 * long), so we catch this as early as
2030 * possible by detecting the hash marker. In
2031 * this case, we simply set dvar to NULL and
2032 * break; the conditional after the loop will
2033 * send us back to top.
2034 */
2035 dvar = NULL;
2036 break;
2037 }
2038
2039 goto next;
2040 }
2041
2042 if (dtuple->dtt_nkeys != nkeys)
2043 goto next;
2044
2045 for (i = 0; i < nkeys; i++, dkey++) {
2046 if (dkey->dttk_size != key[i].dttk_size)
2047 goto next; /* size or type mismatch */
2048
2049 if (dkey->dttk_size != 0) {
2050 if (dtrace_bcmp(
2051 (void *)(uintptr_t)key[i].dttk_value,
2052 (void *)(uintptr_t)dkey->dttk_value,
2053 dkey->dttk_size))
2054 goto next;
2055 } else {
2056 if (dkey->dttk_value != key[i].dttk_value)
2057 goto next;
2058 }
2059 }
2060
2061 if (op != DTRACE_DYNVAR_DEALLOC)
2062 return (dvar);
2063
2064 ASSERT(dvar->dtdv_next == NULL ||
2065 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2066
2067 if (prev != NULL) {
2068 ASSERT(hash[bucket].dtdh_chain != dvar);
2069 ASSERT(start != dvar);
2070 ASSERT(prev->dtdv_next == dvar);
2071 prev->dtdv_next = dvar->dtdv_next;
2072 } else {
2073 if (dtrace_casptr(&hash[bucket].dtdh_chain,
2074 start, dvar->dtdv_next) != start) {
2075 /*
2076 * We have failed to atomically swing the
2077 * hash table head pointer, presumably because
2078 * of a conflicting allocation on another CPU.
2079 * We need to reread the hash chain and try
2080 * again.
2081 */
2082 goto top;
2083 }
2084 }
2085
2086 dtrace_membar_producer();
2087
2088 /*
2089 * Now set the hash value to indicate that it's free.
2090 */
2091 ASSERT(hash[bucket].dtdh_chain != dvar);
2092 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2093
2094 dtrace_membar_producer();
2095
2096 /*
2097 * Set the next pointer to point at the dirty list, and
2098 * atomically swing the dirty pointer to the newly freed dvar.
2099 */
2100 do {
2101 next = dcpu->dtdsc_dirty;
2102 dvar->dtdv_next = next;
2103 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2104
2105 /*
2106 * Finally, unlock this hash bucket.
2107 */
2108 ASSERT(hash[bucket].dtdh_lock == lock);
2109 ASSERT(lock & 1);
2110 hash[bucket].dtdh_lock++;
2111
2112 return (NULL);
2113 next:
2114 prev = dvar;
2115 continue;
2116 }
2117
2118 if (dvar == NULL) {
2119 /*
2120 * If dvar is NULL, it is because we went off the rails:
2121 * one of the elements that we traversed in the hash chain
2122 * was deleted while we were traversing it. In this case,
2123 * we assert that we aren't doing a dealloc (deallocs lock
2124 * the hash bucket to prevent themselves from racing with
2125 * one another), and retry the hash chain traversal.
2126 */
2127 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2128 goto top;
2129 }
2130
2131 if (op != DTRACE_DYNVAR_ALLOC) {
2132 /*
2133 * If we are not to allocate a new variable, we want to
2134 * return NULL now. Before we return, check that the value
2135 * of the lock word hasn't changed. If it has, we may have
2136 * seen an inconsistent snapshot.
2137 */
2138 if (op == DTRACE_DYNVAR_NOALLOC) {
2139 if (hash[bucket].dtdh_lock != lock)
2140 goto top;
2141 } else {
2142 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2143 ASSERT(hash[bucket].dtdh_lock == lock);
2144 ASSERT(lock & 1);
2145 hash[bucket].dtdh_lock++;
2146 }
2147
2148 return (NULL);
2149 }
2150
2151 /*
2152 * We need to allocate a new dynamic variable. The size we need is the
2153 * size of a dtrace_dynvar_t plus the size of nkeys dtrace_key_t's plus the
2154 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2155 * the size of any referred-to data (dsize). We then round the final
2156 * size up to the chunksize for allocation.
2157 */
2158 for (ksize = 0, i = 0; i < nkeys; i++)
2159 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
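
/*
 * For example, a two-key tuple with one by-value key (dttk_size == 0) and
 * one 13-byte by-reference key yields ksize = P2ROUNDUP(13, 8) == 16; the
 * space demanded below is then sizeof (dtrace_dynvar_t) (which embeds one
 * dtrace_key_t) plus one additional dtrace_key_t plus the 16 bytes of key
 * data plus dsize -- all of which must fit within chunksize.
 */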
2160
2161 /*
2162 * This should be pretty much impossible, but could happen if, say,
2163 * strange DIF specified the tuple. Ideally, this should be an
2164 * assertion and not an error condition -- but that requires that the
2165 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2166 * bullet-proof. (That is, it must not be able to be fooled by
2167 * malicious DIF.) Given the lack of backwards branches in DIF,
2168 * solving this would presumably not amount to solving the Halting
2169 * Problem -- but it still seems awfully hard.
2170 */
2171 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2172 ksize + dsize > chunksize) {
2173 dcpu->dtdsc_drops++;
2174 return (NULL);
2175 }
2176
2177 nstate = DTRACE_DSTATE_EMPTY;
2178
2179 do {
2180 retry:
2181 free = dcpu->dtdsc_free;
2182
2183 if (free == NULL) {
2184 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2185 void *rval;
2186
2187 if (clean == NULL) {
2188 /*
2189 * We're out of dynamic variable space on
2190 * this CPU. Unless we have tried all CPUs,
2191 * we'll try to allocate from a different
2192 * CPU.
2193 */
2194 switch (dstate->dtds_state) {
2195 case DTRACE_DSTATE_CLEAN: {
2196 void *sp = &dstate->dtds_state;
2197
2198 if (++cpu >= (int)NCPU)
2199 cpu = 0;
2200
2201 if (dcpu->dtdsc_dirty != NULL &&
2202 nstate == DTRACE_DSTATE_EMPTY)
2203 nstate = DTRACE_DSTATE_DIRTY;
2204
2205 if (dcpu->dtdsc_rinsing != NULL)
2206 nstate = DTRACE_DSTATE_RINSING;
2207
2208 dcpu = &dstate->dtds_percpu[cpu];
2209
2210 if (cpu != me)
2211 goto retry;
2212
2213 (void) dtrace_cas32(sp,
2214 DTRACE_DSTATE_CLEAN, nstate);
2215
2216 /*
2217 * To increment the correct bean
2218 * counter, take another lap.
2219 */
2220 goto retry;
2221 }
2222
2223 case DTRACE_DSTATE_DIRTY:
2224 dcpu->dtdsc_dirty_drops++;
2225 break;
2226
2227 case DTRACE_DSTATE_RINSING:
2228 dcpu->dtdsc_rinsing_drops++;
2229 break;
2230
2231 case DTRACE_DSTATE_EMPTY:
2232 dcpu->dtdsc_drops++;
2233 break;
2234 }
2235
2236 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2237 return (NULL);
2238 }
2239
2240 /*
2241 * The clean list appears to be non-empty. We want to
2242 * move the clean list to the free list; we start by
2243 * moving the clean pointer aside.
2244 */
2245 if (dtrace_casptr(&dcpu->dtdsc_clean,
2246 clean, NULL) != clean) {
2247 /*
2248 * We are in one of two situations:
2249 *
2250 * (a) The clean list was switched to the
2251 * free list by another CPU.
2252 *
2253 * (b) The clean list was added to by the
2254 * cleansing cyclic.
2255 *
2256 * In either of these situations, we can
2257 * just reattempt the free list allocation.
2258 */
2259 goto retry;
2260 }
2261
2262 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2263
2264 /*
2265 * Now we'll move the clean list to the free list.
2266 * It's impossible for this to fail: the only way
2267 * the free list can be updated is through this
2268 * code path, and only one CPU can own the clean list.
2269 * Thus, it would only be possible for this to fail if
2270 * this code were racing with dtrace_dynvar_clean().
2271 * (That is, if dtrace_dynvar_clean() updated the clean
2272 * list, and we ended up racing to update the free
2273 * list.) This race is prevented by the dtrace_sync()
2274 * in dtrace_dynvar_clean() -- which flushes the
2275 * owners of the clean lists out before resetting
2276 * the clean lists.
2277 */
2278 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2279 ASSERT(rval == NULL);
2280 goto retry;
2281 }
2282
2283 dvar = free;
2284 new_free = dvar->dtdv_next;
2285 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2286
2287 /*
2288 * We have now allocated a new chunk. We copy the tuple keys into the
2289 * tuple array and copy any referenced key data into the data space
2290 * following the tuple array. As we do this, we relocate dttk_value
2291 * in the final tuple to point to the key data address in the chunk.
2292 */
2293 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2294 dvar->dtdv_data = (void *)(kdata + ksize);
2295 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2296
2297 for (i = 0; i < nkeys; i++) {
2298 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2299 size_t kesize = key[i].dttk_size;
2300
2301 if (kesize != 0) {
2302 dtrace_bcopy(
2303 (const void *)(uintptr_t)key[i].dttk_value,
2304 (void *)kdata, kesize);
2305 dkey->dttk_value = kdata;
2306 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2307 } else {
2308 dkey->dttk_value = key[i].dttk_value;
2309 }
2310
2311 dkey->dttk_size = kesize;
2312 }
2313
2314 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2315 dvar->dtdv_hashval = hashval;
2316 dvar->dtdv_next = start;
2317
2318 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2319 return (dvar);
2320
2321 /*
2322 * The cas has failed. Either another CPU is adding an element to
2323 * this hash chain, or another CPU is deleting an element from this
2324 * hash chain. The simplest way to deal with both of these cases
2325 * (though not necessarily the most efficient) is to free our
2326 * allocated block and tail-call ourselves. Note that the free is
2327 * to the dirty list and _not_ to the free list. This is to prevent
2328 * races with allocators, above.
2329 */
2330 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2331
2332 dtrace_membar_producer();
2333
2334 do {
2335 free = dcpu->dtdsc_dirty;
2336 dvar->dtdv_next = free;
2337 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2338
2339 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2340 }
2341
2342 /*ARGSUSED*/
2343 static void
2344 dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
2345 {
2346 #pragma unused(arg) /* __APPLE__ */
2347 if ((int64_t)nval < (int64_t)*oval)
2348 *oval = nval;
2349 }
2350
2351 /*ARGSUSED*/
2352 static void
2353 dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
2354 {
2355 #pragma unused(arg) /* __APPLE__ */
2356 if ((int64_t)nval > (int64_t)*oval)
2357 *oval = nval;
2358 }
2359
2360 static void
2361 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2362 {
2363 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2364 int64_t val = (int64_t)nval;
2365
2366 if (val < 0) {
2367 for (i = 0; i < zero; i++) {
2368 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2369 quanta[i] += incr;
2370 return;
2371 }
2372 }
2373 } else {
2374 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2375 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2376 quanta[i - 1] += incr;
2377 return;
2378 }
2379 }
2380
2381 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2382 return;
2383 }
2384
2385 ASSERT(0);
2386 }
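
/*
 * Illustratively -- assuming the usual power-of-two bucket labels, i.e.
 * DTRACE_QUANTIZE_BUCKETVAL(i) running ..., -4, -2, -1, 0, 1, 2, 4, ... --
 * a value of 13 fails the "val < BUCKETVAL(i)" test until the bucket
 * labelled 16, so quanta[i - 1] (the bucket labelled 8, covering values
 * 8 through 15) is incremented. Negative values scan upward from the most
 * negative bucket and land in the first bucket whose label is >= val.
 */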
2387
2388 static void
2389 dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
2390 {
2391 uint64_t arg = *lquanta++;
2392 int32_t base = DTRACE_LQUANTIZE_BASE(arg);
2393 uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
2394 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
2395 int32_t val = (int32_t)nval, level;
2396
2397 ASSERT(step != 0);
2398 ASSERT(levels != 0);
2399
2400 if (val < base) {
2401 /*
2402 * This is an underflow.
2403 */
2404 lquanta[0] += incr;
2405 return;
2406 }
2407
2408 level = (val - base) / step;
2409
2410 if (level < levels) {
2411 lquanta[level + 1] += incr;
2412 return;
2413 }
2414
2415 /*
2416 * This is an overflow.
2417 */
2418 lquanta[levels + 1] += incr;
2419 }
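
/*
 * For example, the D action lquantize(x, 0, 100, 10) encodes base 0,
 * step 10 and levels 10: a value of 37 computes level = (37 - 0) / 10 == 3
 * and increments lquanta[4]; any negative value increments the underflow
 * bucket lquanta[0]; and any value >= 100 increments the overflow bucket
 * lquanta[levels + 1] == lquanta[11].
 */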
2420
2421 static int
2422 dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
2423 int16_t nsteps, int64_t value)
2424 {
2425 int64_t this = 1, last, next;
2426 int base = 1, order;
2427
2428 for (order = 0; order < low; ++order)
2429 this *= factor;
2430
2431 /*
2432 * If our value is less than our factor taken to the power of the
2433 * low order of magnitude, it goes into the zeroth bucket.
2434 */
2435 if (value < this)
2436 return 0;
2437 else
2438 last = this;
2439
2440 for (this *= factor; order <= high; ++order) {
2441 int nbuckets = this > nsteps ? nsteps : this;
2442
2443 /*
2444 * We should not generally get log/linear quantizations
2445 * with a high magnitude that allows 64-bits to
2446 * overflow, but we nonetheless protect against this
2447 * by explicitly checking for overflow, and clamping
2448 * our value accordingly.
2449 */
2450 next = this * factor;
2451 if (next < this) {
2452 value = this - 1;
2453 }
2454
2455 /*
2456 * If our value lies within this order of magnitude,
2457 * determine its position by taking the offset within
2458 * the order of magnitude, dividing by the bucket
2459 * width, and adding to our (accumulated) base.
2460 */
2461 if (value < this) {
2462 return (base + (value - last) / (this / nbuckets));
2463 }
2464
2465 base += nbuckets - (nbuckets / factor);
2466 last = this;
2467 this = next;
2468 }
2469
2470 /*
2471 * Our value is greater than or equal to our factor taken to the
2472 * power of one plus the high magnitude -- return the top bucket.
2473 */
2474 return base;
2475 }
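
/*
 * Worked example, with factor = 10, low = 0, high = 2 and nsteps = 10:
 * bucket 0 holds values below 10^0; buckets 1-9 hold 1 through 9; buckets
 * 10-18 hold [10, 100) in steps of 10; buckets 19-27 hold [100, 1000) in
 * steps of 100; and bucket 28 holds everything >= 1000. A value of 374
 * therefore returns 19 + (374 - 100) / 100 == 21, the bucket covering
 * [300, 400). Each order of magnitude contributes nsteps - nsteps / factor
 * buckets because its first nsteps / factor steps overlap the previous
 * order.
 */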
2476
2477 static void
2478 dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
2479 {
2480 uint64_t arg = *llquanta++;
2481 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
2482 uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
2483 uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
2484 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
2485
2486 llquanta[dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, nval)] += incr;
2487 }
2488
2489 /*ARGSUSED*/
2490 static void
2491 dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
2492 {
2493 #pragma unused(arg) /* __APPLE__ */
2494 data[0]++;
2495 data[1] += nval;
2496 }
2497
2498 /*ARGSUSED*/
2499 static void
2500 dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
2501 {
2502 #pragma unused(arg) /* __APPLE__ */
2503 int64_t snval = (int64_t)nval;
2504 uint64_t tmp[2];
2505
2506 data[0]++;
2507 data[1] += nval;
2508
2509 /*
2510 * What we want to say here is:
2511 *
2512 * data[2] += nval * nval;
2513 *
2514 * But given that nval is 64-bit, we could easily overflow, so
2515 * we do this as 128-bit arithmetic.
2516 */
2517 if (snval < 0)
2518 snval = -snval;
2519
2520 dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
2521 dtrace_add_128(data + 2, tmp, data + 2);
2522 }
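
/*
 * The running state is therefore data[0] = n, data[1] = sum(x), and
 * data[2..3] = a 128-bit sum(x^2); the consumer can recover the standard
 * deviation as sqrt(sum(x^2) / n - (sum(x) / n)^2) with no risk of the
 * squared terms having overflowed in the kernel.
 */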
2523
2524 /*ARGSUSED*/
2525 static void
2526 dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
2527 {
2528 #pragma unused(nval, arg) /* __APPLE__ */
2529 *oval = *oval + 1;
2530 }
2531
2532 /*ARGSUSED*/
2533 static void
2534 dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
2535 {
2536 #pragma unused(arg) /* __APPLE__ */
2537 *oval += nval;
2538 }
2539
2540 /*
2541 * Aggregate given the tuple in the principal data buffer, and the aggregating
2542 * action denoted by the specified dtrace_aggregation_t. The aggregation
2543 * buffer is specified as the buf parameter. This routine does not return
2544 * failure; if there is no space in the aggregation buffer, the data will be
2545 * dropped, and a corresponding counter incremented.
2546 */
2547 __attribute__((noinline))
2548 static void
2549 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2550 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2551 {
2552 #pragma unused(arg)
2553 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2554 uint32_t i, ndx, size, fsize;
2555 uint32_t align = sizeof (uint64_t) - 1;
2556 dtrace_aggbuffer_t *agb;
2557 dtrace_aggkey_t *key;
2558 uint32_t hashval = 0, limit, isstr;
2559 caddr_t tomax, data, kdata;
2560 dtrace_actkind_t action;
2561 dtrace_action_t *act;
2562 uintptr_t offs;
2563
2564 if (buf == NULL)
2565 return;
2566
2567 if (!agg->dtag_hasarg) {
2568 /*
2569 * Currently, only quantize() and lquantize() take additional
2570 * arguments, and they have the same semantics: an increment
2571 * value that defaults to 1 when not present. If additional
2572 * aggregating actions take arguments, the setting of the
2573 * default argument value will presumably have to become more
2574 * sophisticated...
2575 */
2576 arg = 1;
2577 }
2578
2579 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2580 size = rec->dtrd_offset - agg->dtag_base;
2581 fsize = size + rec->dtrd_size;
2582
2583 ASSERT(dbuf->dtb_tomax != NULL);
2584 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2585
2586 if ((tomax = buf->dtb_tomax) == NULL) {
2587 dtrace_buffer_drop(buf);
2588 return;
2589 }
2590
2591 /*
2592 * The metastructure is always at the bottom of the buffer.
2593 */
2594 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2595 sizeof (dtrace_aggbuffer_t));
2596
2597 if (buf->dtb_offset == 0) {
2598 /*
2599 * We just kludge up approximately 1/8th of the size to be
2600 * buckets. If this guess ends up being routinely
2601 * off-the-mark, we may need to dynamically readjust this
2602 * based on past performance.
2603 */
2604 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2605
2606 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2607 (uintptr_t)tomax || hashsize == 0) {
2608 /*
2609 * We've been given a ludicrously small buffer;
2610 * increment our drop count and leave.
2611 */
2612 dtrace_buffer_drop(buf);
2613 return;
2614 }
2615
2616 /*
2617 * And now, a pathetic attempt to try to get an odd (or
2618 * perchance, a prime) hash size for better hash distribution.
2619 */
2620 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2621 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2622
2623 agb->dtagb_hashsize = hashsize;
2624 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2625 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2626 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2627
2628 for (i = 0; i < agb->dtagb_hashsize; i++)
2629 agb->dtagb_hash[i] = NULL;
2630 }
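
/*
 * The resulting buffer layout is thus (low addresses first):
 *
 *     tomax ......... record data (dtak_data), growing up via dtb_offset
 *     dtagb_free .... dtrace_aggkey_t's, allocated growing down
 *     dtagb_hash .... the hash bucket array (dtagb_hashsize pointers)
 *     end of buffer . the dtrace_aggbuffer_t metastructure itself
 *
 * Key structures and record data thus grow toward one another, and the
 * free-space check below fails an allocation before they can collide.
 */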
2631
2632 ASSERT(agg->dtag_first != NULL);
2633 ASSERT(agg->dtag_first->dta_intuple);
2634
2635 /*
2636 * Calculate the hash value based on the key. Note that we _don't_
2637 * include the aggid in the hashing (but we will store it as part of
2638 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2639 * algorithm: a simple, quick algorithm that has no known funnels, and
2640 * gets good distribution in practice. The efficacy of the hashing
2641 * algorithm (and a comparison with other algorithms) may be found by
2642 * running the ::dtrace_aggstat MDB dcmd.
2643 */
2644 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2645 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2646 limit = i + act->dta_rec.dtrd_size;
2647 ASSERT(limit <= size);
2648 isstr = DTRACEACT_ISSTRING(act);
2649
2650 for (; i < limit; i++) {
2651 hashval += data[i];
2652 hashval += (hashval << 10);
2653 hashval ^= (hashval >> 6);
2654
2655 if (isstr && data[i] == '\0')
2656 break;
2657 }
2658 }
2659
2660 hashval += (hashval << 3);
2661 hashval ^= (hashval >> 11);
2662 hashval += (hashval << 15);
2663
2664 /*
2665 * Yes, the divide here is expensive -- but it's generally the least
2666 * of the performance issues given the amount of data that we iterate
2667 * over to compute hash values, compare data, etc.
2668 */
2669 ndx = hashval % agb->dtagb_hashsize;
2670
2671 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2672 ASSERT((caddr_t)key >= tomax);
2673 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2674
2675 if (hashval != key->dtak_hashval || key->dtak_size != size)
2676 continue;
2677
2678 kdata = key->dtak_data;
2679 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2680
2681 for (act = agg->dtag_first; act->dta_intuple;
2682 act = act->dta_next) {
2683 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2684 limit = i + act->dta_rec.dtrd_size;
2685 ASSERT(limit <= size);
2686 isstr = DTRACEACT_ISSTRING(act);
2687
2688 for (; i < limit; i++) {
2689 if (kdata[i] != data[i])
2690 goto next;
2691
2692 if (isstr && data[i] == '\0')
2693 break;
2694 }
2695 }
2696
2697 if (action != key->dtak_action) {
2698 /*
2699 * We are aggregating on the same value in the same
2700 * aggregation with two different aggregating actions.
2701 * (This should have been picked up in the compiler,
2702 * so we may be dealing with errant or devious DIF.)
2703 * This is an error condition; we indicate as much,
2704 * and return.
2705 */
2706 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2707 return;
2708 }
2709
2710 /*
2711 * This is a hit: we need to apply the aggregator to
2712 * the value at this key.
2713 */
2714 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2715 return;
2716 next:
2717 continue;
2718 }
2719
2720 /*
2721 * We didn't find it. We need to allocate some zero-filled space,
2722 * link it into the hash table appropriately, and apply the aggregator
2723 * to the (zero-filled) value.
2724 */
2725 offs = buf->dtb_offset;
2726 while (offs & (align - 1))
2727 offs += sizeof (uint32_t);
2728
2729 /*
2730 * If we don't have enough room to both allocate a new key _and_
2731 * its associated data, increment the drop count and return.
2732 */
2733 if ((uintptr_t)tomax + offs + fsize >
2734 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2735 dtrace_buffer_drop(buf);
2736 return;
2737 }
2738
2739 /*CONSTCOND*/
2740 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2741 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2742 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2743
2744 key->dtak_data = kdata = tomax + offs;
2745 buf->dtb_offset = offs + fsize;
2746
2747 /*
2748 * Now copy the data across.
2749 */
2750 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2751
2752 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2753 kdata[i] = data[i];
2754
2755 /*
2756 * Because strings are not zeroed out by default, we need to iterate
2757 * looking for actions that store strings, and we need to explicitly
2758 * pad these strings out with zeroes.
2759 */
2760 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2761 int nul;
2762
2763 if (!DTRACEACT_ISSTRING(act))
2764 continue;
2765
2766 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2767 limit = i + act->dta_rec.dtrd_size;
2768 ASSERT(limit <= size);
2769
2770 for (nul = 0; i < limit; i++) {
2771 if (nul) {
2772 kdata[i] = '\0';
2773 continue;
2774 }
2775
2776 if (data[i] != '\0')
2777 continue;
2778
2779 nul = 1;
2780 }
2781 }
2782
2783 for (i = size; i < fsize; i++)
2784 kdata[i] = 0;
2785
2786 key->dtak_hashval = hashval;
2787 key->dtak_size = size;
2788 key->dtak_action = action;
2789 key->dtak_next = agb->dtagb_hash[ndx];
2790 agb->dtagb_hash[ndx] = key;
2791
2792 /*
2793 * Finally, apply the aggregator.
2794 */
2795 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2796 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2797 }
2798
2799 /*
2800 * Given consumer state, this routine finds a speculation in the INACTIVE
2801 * state and transitions it into the ACTIVE state. If there is no speculation
2802 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2803 * incremented -- it is up to the caller to take appropriate action.
2804 */
2805 static int
2806 dtrace_speculation(dtrace_state_t *state)
2807 {
2808 int i = 0;
2809 dtrace_speculation_state_t current;
2810 uint32_t *stat = &state->dts_speculations_unavail, count;
2811
2812 while (i < state->dts_nspeculations) {
2813 dtrace_speculation_t *spec = &state->dts_speculations[i];
2814
2815 current = spec->dtsp_state;
2816
2817 if (current != DTRACESPEC_INACTIVE) {
2818 if (current == DTRACESPEC_COMMITTINGMANY ||
2819 current == DTRACESPEC_COMMITTING ||
2820 current == DTRACESPEC_DISCARDING)
2821 stat = &state->dts_speculations_busy;
2822 i++;
2823 continue;
2824 }
2825
2826 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2827 current, DTRACESPEC_ACTIVE) == current)
2828 return (i + 1);
2829 }
2830
2831 /*
2832 * We couldn't find a speculation. If we found as much as a single
2833 * busy speculation buffer, we'll attribute this failure as "busy"
2834 * instead of "unavail".
2835 */
2836 do {
2837 count = *stat;
2838 } while (dtrace_cas32(stat, count, count + 1) != count);
2839
2840 return (0);
2841 }
2842
2843 /*
2844 * This routine commits an active speculation. If the specified speculation
2845 * is not in a valid state to perform a commit(), this routine will silently do
2846 * nothing. The state of the specified speculation is transitioned according
2847 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2848 */
2849 static void
2850 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2851 dtrace_specid_t which)
2852 {
2853 dtrace_speculation_t *spec;
2854 dtrace_buffer_t *src, *dest;
2855 uintptr_t daddr, saddr, dlimit, slimit;
2856 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2857 intptr_t offs;
2858 uint64_t timestamp;
2859
2860 if (which == 0)
2861 return;
2862
2863 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2864 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2865 return;
2866 }
2867
2868 spec = &state->dts_speculations[which - 1];
2869 src = &spec->dtsp_buffer[cpu];
2870 dest = &state->dts_buffer[cpu];
2871
2872 do {
2873 current = spec->dtsp_state;
2874
2875 if (current == DTRACESPEC_COMMITTINGMANY)
2876 break;
2877
2878 switch (current) {
2879 case DTRACESPEC_INACTIVE:
2880 case DTRACESPEC_DISCARDING:
2881 return;
2882
2883 case DTRACESPEC_COMMITTING:
2884 /*
2885 * This is only possible if we are (a) commit()'ing
2886 * without having done a prior speculate() on this CPU
2887 * and (b) racing with another commit() on a different
2888 * CPU. There's nothing to do -- we just assert that
2889 * our offset is 0.
2890 */
2891 ASSERT(src->dtb_offset == 0);
2892 return;
2893
2894 case DTRACESPEC_ACTIVE:
2895 new = DTRACESPEC_COMMITTING;
2896 break;
2897
2898 case DTRACESPEC_ACTIVEONE:
2899 /*
2900 * This speculation is active on one CPU. If our
2901 * buffer offset is non-zero, we know that the one CPU
2902 * must be us. Otherwise, we are committing on a
2903 * different CPU from the speculate(), and we must
2904 * rely on being asynchronously cleaned.
2905 */
2906 if (src->dtb_offset != 0) {
2907 new = DTRACESPEC_COMMITTING;
2908 break;
2909 }
2910 OS_FALLTHROUGH;
2911
2912 case DTRACESPEC_ACTIVEMANY:
2913 new = DTRACESPEC_COMMITTINGMANY;
2914 break;
2915
2916 default:
2917 ASSERT(0);
2918 }
2919 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2920 current, new) != current);
2921
2922 /*
2923 * We have set the state to indicate that we are committing this
2924 * speculation. Now reserve the necessary space in the destination
2925 * buffer.
2926 */
2927 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2928 sizeof (uint64_t), state, NULL)) < 0) {
2929 dtrace_buffer_drop(dest);
2930 goto out;
2931 }
2932
2933 /*
2934 * We have sufficient space to copy the speculative buffer into the
2935 * primary buffer. First, modify the speculative buffer, filling
2936 * in the timestamp of all entries with the current time. The data
2937 * must have the commit() time rather than the time it was traced,
2938 * so that all entries in the primary buffer are in timestamp order.
2939 */
2940 timestamp = dtrace_gethrtime();
2941 saddr = (uintptr_t)src->dtb_tomax;
2942 slimit = saddr + src->dtb_offset;
2943 while (saddr < slimit) {
2944 size_t size;
2945 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2946
2947 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2948 saddr += sizeof (dtrace_epid_t);
2949 continue;
2950 }
2951
2952 ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
2953 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2954
2955 ASSERT(saddr + size <= slimit);
2956 ASSERT(size >= sizeof(dtrace_rechdr_t));
2957 ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
2958
2959 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2960
2961 saddr += size;
2962 }
2963
2964 /*
2965 * Copy the buffer across. (Note that this is a
2966 * highly suboptimal bcopy(); in the unlikely event that this becomes
2967 * a serious performance issue, a high-performance DTrace-specific
2968 * bcopy() should obviously be invented.)
2969 */
2970 daddr = (uintptr_t)dest->dtb_tomax + offs;
2971 dlimit = daddr + src->dtb_offset;
2972 saddr = (uintptr_t)src->dtb_tomax;
2973
2974 /*
2975 * First, the aligned portion.
2976 */
2977 while (dlimit - daddr >= sizeof (uint64_t)) {
2978 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2979
2980 daddr += sizeof (uint64_t);
2981 saddr += sizeof (uint64_t);
2982 }
2983
2984 /*
2985 * Now any left-over bit...
2986 */
2987 while (dlimit - daddr)
2988 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2989
2990 /*
2991 * Finally, commit the reserved space in the destination buffer.
2992 */
2993 dest->dtb_offset = offs + src->dtb_offset;
2994
2995 out:
2996 /*
2997 * If we're lucky enough to be the only active CPU on this speculation
2998 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2999 */
3000 if (current == DTRACESPEC_ACTIVE ||
3001 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
3002 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
3003 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
3004 #pragma unused(rval) /* __APPLE__ */
3005
3006 ASSERT(rval == DTRACESPEC_COMMITTING);
3007 }
3008
3009 src->dtb_offset = 0;
3010 src->dtb_xamot_drops += src->dtb_drops;
3011 src->dtb_drops = 0;
3012 }
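
/*
 * In summary, commit() drives the speculation state machine as follows
 * (see <sys/dtrace_impl.h> for the full diagram):
 *
 *     ACTIVE               -> COMMITTING     -> INACTIVE (here)
 *     ACTIVEONE (our CPU)  -> COMMITTING     -> INACTIVE (here)
 *     ACTIVEONE (other)    -> COMMITTINGMANY -> INACTIVE (via cleaning)
 *     ACTIVEMANY           -> COMMITTINGMANY -> INACTIVE (via cleaning)
 *
 * where "via cleaning" means the final transition is deferred to the
 * asynchronous dtrace_speculation_clean() path, after every CPU's
 * speculative buffer has been committed.
 */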
3013
3014 /*
3015 * This routine discards an active speculation. If the specified speculation
3016 * is not in a valid state to perform a discard(), this routine will silently
3017 * do nothing. The state of the specified speculation is transitioned
3018 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
3019 */
3020 __attribute__((noinline))
3021 static void
3022 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3023 dtrace_specid_t which)
3024 {
3025 dtrace_speculation_t *spec;
3026 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3027 dtrace_buffer_t *buf;
3028
3029 if (which == 0)
3030 return;
3031
3032 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3033 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3034 return;
3035 }
3036
3037 spec = &state->dts_speculations[which - 1];
3038 buf = &spec->dtsp_buffer[cpu];
3039
3040 do {
3041 current = spec->dtsp_state;
3042
3043 switch (current) {
3044 case DTRACESPEC_INACTIVE:
3045 case DTRACESPEC_COMMITTINGMANY:
3046 case DTRACESPEC_COMMITTING:
3047 case DTRACESPEC_DISCARDING:
3048 return;
3049
3050 case DTRACESPEC_ACTIVE:
3051 case DTRACESPEC_ACTIVEMANY:
3052 new = DTRACESPEC_DISCARDING;
3053 break;
3054
3055 case DTRACESPEC_ACTIVEONE:
3056 if (buf->dtb_offset != 0) {
3057 new = DTRACESPEC_INACTIVE;
3058 } else {
3059 new = DTRACESPEC_DISCARDING;
3060 }
3061 break;
3062
3063 default:
3064 ASSERT(0);
3065 }
3066 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3067 current, new) != current);
3068
3069 buf->dtb_offset = 0;
3070 buf->dtb_drops = 0;
3071 }
3072
3073 /*
3074 * Note: not called from probe context. This function is called
3075 * asynchronously from cross call context to clean any speculations that are
3076 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3077 * transitioned back to the INACTIVE state until all CPUs have cleaned the
3078 * speculation.
3079 */
3080 static void
3081 dtrace_speculation_clean_here(dtrace_state_t *state)
3082 {
3083 dtrace_icookie_t cookie;
3084 processorid_t cpu = CPU->cpu_id;
3085 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3086 dtrace_specid_t i;
3087
3088 cookie = dtrace_interrupt_disable();
3089
3090 if (dest->dtb_tomax == NULL) {
3091 dtrace_interrupt_enable(cookie);
3092 return;
3093 }
3094
3095 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3096 dtrace_speculation_t *spec = &state->dts_speculations[i];
3097 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3098
3099 if (src->dtb_tomax == NULL)
3100 continue;
3101
3102 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3103 src->dtb_offset = 0;
3104 continue;
3105 }
3106
3107 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3108 continue;
3109
3110 if (src->dtb_offset == 0)
3111 continue;
3112
3113 dtrace_speculation_commit(state, cpu, i + 1);
3114 }
3115
3116 dtrace_interrupt_enable(cookie);
3117 }
3118
3119 /*
3120 * Note: not called from probe context. This function is called
3121 * asynchronously (and at a regular interval) to clean any speculations that
3122 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3123 * is work to be done, it cross calls all CPUs to perform that work;
3124 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
3125 * INACTIVE state until they have been cleaned by all CPUs.
3126 */
3127 static void
3128 dtrace_speculation_clean(dtrace_state_t *state)
3129 {
3130 int work = 0;
3131 uint32_t rv;
3132 dtrace_specid_t i;
3133
3134 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3135 dtrace_speculation_t *spec = &state->dts_speculations[i];
3136
3137 ASSERT(!spec->dtsp_cleaning);
3138
3139 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3140 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3141 continue;
3142
3143 work++;
3144 spec->dtsp_cleaning = 1;
3145 }
3146
3147 if (!work)
3148 return;
3149
3150 dtrace_xcall(DTRACE_CPUALL,
3151 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3152
3153 /*
3154 * We now know that all CPUs have committed or discarded their
3155 * speculation buffers, as appropriate. We can now set the state
3156 * to inactive.
3157 */
3158 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3159 dtrace_speculation_t *spec = &state->dts_speculations[i];
3160 dtrace_speculation_state_t current, new;
3161
3162 if (!spec->dtsp_cleaning)
3163 continue;
3164
3165 current = spec->dtsp_state;
3166 ASSERT(current == DTRACESPEC_DISCARDING ||
3167 current == DTRACESPEC_COMMITTINGMANY);
3168
3169 new = DTRACESPEC_INACTIVE;
3170
3171 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3172 ASSERT(rv == current);
3173 spec->dtsp_cleaning = 0;
3174 }
3175 }
3176
3177 /*
3178 * Called as part of a speculate() to get the speculative buffer associated
3179 * with a given speculation. Returns NULL if the specified speculation is not
3180 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3181 * the active CPU is not the specified CPU -- the speculation will be
3182 * atomically transitioned into the ACTIVEMANY state.
3183 */
3184 __attribute__((noinline))
3185 static dtrace_buffer_t *
3186 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3187 dtrace_specid_t which)
3188 {
3189 dtrace_speculation_t *spec;
3190 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3191 dtrace_buffer_t *buf;
3192
3193 if (which == 0)
3194 return (NULL);
3195
3196 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3197 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3198 return (NULL);
3199 }
3200
3201 spec = &state->dts_speculations[which - 1];
3202 buf = &spec->dtsp_buffer[cpuid];
3203
3204 do {
3205 current = spec->dtsp_state;
3206
3207 switch (current) {
3208 case DTRACESPEC_INACTIVE:
3209 case DTRACESPEC_COMMITTINGMANY:
3210 case DTRACESPEC_DISCARDING:
3211 return (NULL);
3212
3213 case DTRACESPEC_COMMITTING:
3214 ASSERT(buf->dtb_offset == 0);
3215 return (NULL);
3216
3217 case DTRACESPEC_ACTIVEONE:
3218 /*
3219 * This speculation is currently active on one CPU.
3220 * Check the offset in the buffer; if it's non-zero,
3221 * that CPU must be us (and we leave the state alone).
3222 * If it's zero, assume that we're starting on a new
3223 * CPU -- and change the state to indicate that the
3224 * speculation is active on more than one CPU.
3225 */
3226 if (buf->dtb_offset != 0)
3227 return (buf);
3228
3229 new = DTRACESPEC_ACTIVEMANY;
3230 break;
3231
3232 case DTRACESPEC_ACTIVEMANY:
3233 return (buf);
3234
3235 case DTRACESPEC_ACTIVE:
3236 new = DTRACESPEC_ACTIVEONE;
3237 break;
3238
3239 default:
3240 ASSERT(0);
3241 }
3242 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3243 current, new) != current);
3244
3245 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3246 return (buf);
3247 }
3248
3249 /*
3250 * Return a string. In the event that the user lacks the privilege to access
3251 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3252 * don't fail access checking.
3253 *
3254 * dtrace_dif_variable() uses this routine as a helper for various
3255 * builtin values such as 'execname' and 'probefunc'.
3256 */
3257 static
3258 uintptr_t
3259 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3260 dtrace_mstate_t *mstate)
3261 {
3262 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3263 uintptr_t ret;
3264 size_t strsz;
3265
3266 /*
3267 * The easy case: this probe is allowed to read all of memory, so
3268 * we can just return this as a vanilla pointer.
3269 */
3270 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3271 return (addr);
3272
3273 /*
3274 * This is the tougher case: we copy the string in question from
3275 * kernel memory into scratch memory and return it that way: this
3276 * ensures that we won't trip up when access checking tests the
3277 * BYREF return value.
3278 */
3279 strsz = dtrace_strlen((char *)addr, size) + 1;
3280
3281 if (mstate->dtms_scratch_ptr + strsz >
3282 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3283 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3284 return (0);
3285 }
3286
3287 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3288 strsz);
3289 ret = mstate->dtms_scratch_ptr;
3290 mstate->dtms_scratch_ptr += strsz;
3291 return (ret);
3292 }
3293
3294 /*
3295 * This function implements the DIF emulator's variable lookups. The emulator
3296 * passes a reserved variable identifier and optional built-in array index.
3297 */
3298 static uint64_t
3299 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3300 uint64_t ndx)
3301 {
3302 /*
3303 * If we're accessing one of the uncached arguments, we'll turn this
3304 * into a reference in the args array.
3305 */
3306 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3307 ndx = v - DIF_VAR_ARG0;
3308 v = DIF_VAR_ARGS;
3309 }
3310
3311 switch (v) {
3312 case DIF_VAR_ARGS:
3313 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3314 if (ndx >= sizeof (mstate->dtms_arg) /
3315 sizeof (mstate->dtms_arg[0])) {
3316 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3317 dtrace_vstate_t *vstate = &state->dts_vstate;
3318 dtrace_provider_t *pv;
3319 uint64_t val;
3320
3321 pv = mstate->dtms_probe->dtpr_provider;
3322 if (pv->dtpv_pops.dtps_getargval != NULL)
3323 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3324 mstate->dtms_probe->dtpr_id,
3325 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3326 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3327 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3328 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3329 }
3330
3331 else
3332 val = dtrace_getarg(ndx, aframes, mstate, vstate);
3333
3334 /*
3335 * This is regrettably required to keep the compiler
3336 * from tail-optimizing the call to dtrace_getarg().
3337 * The condition always evaluates to true, but the
3338 * compiler has no way of figuring that out a priori.
3339 * (None of this would be necessary if the compiler
3340 * could be relied upon to _always_ tail-optimize
3341 * the call to dtrace_getarg() -- but it can't.)
3342 */
3343 if (mstate->dtms_probe != NULL)
3344 return (val);
3345
3346 ASSERT(0);
3347 }
3348
3349 return (mstate->dtms_arg[ndx]);
3350
3351 case DIF_VAR_UREGS: {
3352 thread_t thread;
3353
3354 if (!dtrace_priv_proc(state))
3355 return (0);
3356
3357 if ((thread = current_thread()) == NULL) {
3358 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3359 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3360 return (0);
3361 }
3362
3363 return (dtrace_getreg(find_user_regs(thread), ndx));
3364 }
3365
3366 case DIF_VAR_VMREGS: {
3367 uint64_t rval;
3368
3369 if (!dtrace_priv_kernel(state))
3370 return (0);
3371
3372 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3373
3374 rval = dtrace_getvmreg(ndx);
3375
3376 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3377
3378 return (rval);
3379 }
3380
3381 case DIF_VAR_CURTHREAD:
3382 if (!dtrace_priv_kernel(state))
3383 return (0);
3384
3385 return ((uint64_t)(uintptr_t)current_thread());
3386
3387 case DIF_VAR_TIMESTAMP:
3388 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3389 mstate->dtms_timestamp = dtrace_gethrtime();
3390 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3391 }
3392 return (mstate->dtms_timestamp);
3393
3394 case DIF_VAR_VTIMESTAMP:
3395 ASSERT(dtrace_vtime_references != 0);
3396 return (dtrace_get_thread_vtime(current_thread()));
3397
3398 case DIF_VAR_WALLTIMESTAMP:
3399 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3400 mstate->dtms_walltimestamp = dtrace_gethrestime();
3401 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3402 }
3403 return (mstate->dtms_walltimestamp);
3404
3405 case DIF_VAR_MACHTIMESTAMP:
3406 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3407 mstate->dtms_machtimestamp = mach_absolute_time();
3408 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3409 }
3410 return (mstate->dtms_machtimestamp);
3411
3412 case DIF_VAR_MACHCTIMESTAMP:
3413 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHCTIMESTAMP)) {
3414 mstate->dtms_machctimestamp = mach_continuous_time();
3415 mstate->dtms_present |= DTRACE_MSTATE_MACHCTIMESTAMP;
3416 }
3417 return (mstate->dtms_machctimestamp);
3418
3419
3420 case DIF_VAR_CPU:
3421 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3422
3423 case DIF_VAR_IPL:
3424 if (!dtrace_priv_kernel(state))
3425 return (0);
3426 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3427 mstate->dtms_ipl = dtrace_getipl();
3428 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3429 }
3430 return (mstate->dtms_ipl);
3431
3432 case DIF_VAR_EPID:
3433 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3434 return (mstate->dtms_epid);
3435
3436 case DIF_VAR_ID:
3437 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3438 return (mstate->dtms_probe->dtpr_id);
3439
3440 case DIF_VAR_STACKDEPTH:
3441 if (!dtrace_priv_kernel(state))
3442 return (0);
3443 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3444 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3445
3446 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3447 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3448 }
3449 return (mstate->dtms_stackdepth);
3450
3451 case DIF_VAR_USTACKDEPTH:
3452 if (!dtrace_priv_proc(state))
3453 return (0);
3454 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3455 /*
3456 * See comment in DIF_VAR_PID.
3457 */
3458 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3459 CPU_ON_INTR(CPU)) {
3460 mstate->dtms_ustackdepth = 0;
3461 } else {
3462 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3463 mstate->dtms_ustackdepth =
3464 dtrace_getustackdepth();
3465 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3466 }
3467 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3468 }
3469 return (mstate->dtms_ustackdepth);
3470
3471 case DIF_VAR_CALLER:
3472 if (!dtrace_priv_kernel(state))
3473 return (0);
3474 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3475 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3476
3477 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3478 /*
3479 * If this is an unanchored probe, we are
3480 * required to go through the slow path:
3481 * dtrace_caller() only guarantees correct
3482 * results for anchored probes.
3483 */
3484 pc_t caller[2];
3485
3486 dtrace_getpcstack(caller, 2, aframes,
3487 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3488 mstate->dtms_caller = caller[1];
3489 } else if ((mstate->dtms_caller =
3490 dtrace_caller(aframes)) == (uintptr_t)-1) {
3491 /*
3492 * We have failed to do this the quick way;
3493 * we must resort to the slower approach of
3494 * calling dtrace_getpcstack().
3495 */
3496 pc_t caller;
3497
3498 dtrace_getpcstack(&caller, 1, aframes, NULL);
3499 mstate->dtms_caller = caller;
3500 }
3501
3502 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3503 }
3504 return (mstate->dtms_caller);
3505
3506 case DIF_VAR_UCALLER:
3507 if (!dtrace_priv_proc(state))
3508 return (0);
3509
3510 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3511 uint64_t ustack[3];
3512
3513 /*
3514 * dtrace_getupcstack() fills in the first uint64_t
3515 * with the current PID. The second uint64_t will
3516 * be the program counter at user-level. The third
3517 * uint64_t will contain the caller, which is what
3518 * we're after.
3519 */
3520 ustack[2] = 0;
3521 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3522 dtrace_getupcstack(ustack, 3);
3523 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3524 mstate->dtms_ucaller = ustack[2];
3525 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3526 }
3527
3528 return (mstate->dtms_ucaller);
3529
3530 case DIF_VAR_PROBEPROV:
3531 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3532 return (dtrace_dif_varstr(
3533 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3534 state, mstate));
3535
3536 case DIF_VAR_PROBEMOD:
3537 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3538 return (dtrace_dif_varstr(
3539 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3540 state, mstate));
3541
3542 case DIF_VAR_PROBEFUNC:
3543 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3544 return (dtrace_dif_varstr(
3545 (uintptr_t)mstate->dtms_probe->dtpr_func,
3546 state, mstate));
3547
3548 case DIF_VAR_PROBENAME:
3549 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3550 return (dtrace_dif_varstr(
3551 (uintptr_t)mstate->dtms_probe->dtpr_name,
3552 state, mstate));
3553
3554 case DIF_VAR_PID:
3555 if (!dtrace_priv_proc_relaxed(state))
3556 return (0);
3557
3558 /*
3559 * Note that we are assuming that an unanchored probe is
3560 * always due to a high-level interrupt. (And we're assuming
3561 * that there is only a single high level interrupt.)
3562 */
3563 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3564 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3565 return 0;
3566
3567 return ((uint64_t)dtrace_proc_selfpid());
3568
3569 case DIF_VAR_PPID:
3570 if (!dtrace_priv_proc_relaxed(state))
3571 return (0);
3572
3573 /*
3574 * See comment in DIF_VAR_PID.
3575 */
3576 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3577 return (0);
3578
3579 return ((uint64_t)dtrace_proc_selfppid());
3580
3581 case DIF_VAR_TID:
3582 /* We do not need to check for null current_thread() */
3583 return thread_tid(current_thread()); /* globally unique */
3584
3585 case DIF_VAR_PTHREAD_SELF:
3586 if (!dtrace_priv_proc(state))
3587 return (0);
3588
3589 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3590 return 0;
3591
3592 case DIF_VAR_DISPATCHQADDR:
3593 if (!dtrace_priv_proc(state))
3594 return (0);
3595
3596 /* We do not need to check for null current_thread() */
3597 return thread_dispatchqaddr(current_thread());
3598
3599 case DIF_VAR_EXECNAME:
3600 {
3601 char *xname = (char *)mstate->dtms_scratch_ptr;
3602 char *pname = proc_best_name(curproc);
3603 size_t scratch_size = sizeof(proc_name_t);
3604
3605 /* The scratch allocation's lifetime is that of the clause. */
3606 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3607 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3608 return 0;
3609 }
3610
3611 if (!dtrace_priv_proc_relaxed(state))
3612 return (0);
3613
3614 mstate->dtms_scratch_ptr += scratch_size;
3615 strlcpy(xname, pname, scratch_size);
3616
3617 return ((uint64_t)(uintptr_t)xname);
3618 }
3619
3620
3621 case DIF_VAR_ZONENAME:
3622 {
3623 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3624 char *zname = (char *)mstate->dtms_scratch_ptr;
3625 size_t scratch_size = 6 + 1;
3626
3627 if (!dtrace_priv_proc(state))
3628 return (0);
3629
3630 /* The scratch allocation's lifetime is that of the clause. */
3631 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3632 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3633 return 0;
3634 }
3635
3636 mstate->dtms_scratch_ptr += scratch_size;
3637
3638 /* The kernel does not provide zonename; it will always return 'global'. */
3639 strlcpy(zname, "global", scratch_size);
3640
3641 return ((uint64_t)(uintptr_t)zname);
3642 }
3643
3644 #if MONOTONIC
3645 case DIF_VAR_CPUINSTRS:
3646 return mt_cur_cpu_instrs();
3647
3648 case DIF_VAR_CPUCYCLES:
3649 return mt_cur_cpu_cycles();
3650
3651 case DIF_VAR_VINSTRS:
3652 return mt_cur_thread_instrs();
3653
3654 case DIF_VAR_VCYCLES:
3655 return mt_cur_thread_cycles();
3656 #else /* MONOTONIC */
3657 case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
3658 case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
3659 case DIF_VAR_VINSTRS: /* FALLTHROUGH */
3660 case DIF_VAR_VCYCLES: /* FALLTHROUGH */
3661 return 0;
3662 #endif /* !MONOTONIC */
3663
3664 case DIF_VAR_UID:
3665 if (!dtrace_priv_proc_relaxed(state))
3666 return (0);
3667
3668 /*
3669 * See comment in DIF_VAR_PID.
3670 */
3671 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3672 return (0);
3673
3674 return ((uint64_t) dtrace_proc_selfruid());
3675
3676 case DIF_VAR_GID:
3677 if (!dtrace_priv_proc(state))
3678 return (0);
3679
3680 /*
3681 * See comment in DIF_VAR_PID.
3682 */
3683 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3684 return (0);
3685
3686 if (dtrace_CRED() != NULL)
3687 /* Credential does not require lazy initialization. */
3688 return ((uint64_t)kauth_getgid());
3689 else {
3690 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3691 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3692 return -1ULL;
3693 }
3694
3695 case DIF_VAR_ERRNO: {
3696 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3697 if (!dtrace_priv_proc(state))
3698 return (0);
3699
3700 /*
3701 * See comment in DIF_VAR_PID.
3702 */
3703 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3704 return (0);
3705
3706 if (uthread)
3707 return (uint64_t)uthread->t_dtrace_errno;
3708 else {
3709 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3710 return -1ULL;
3711 }
3712 }
3713
3714 default:
3715 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3716 return (0);
3717 }
3718 }
3719
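/*
 * For illustration only (a hypothetical D clause, not part of this file),
 * the built-in variables resolved above surface in D like so:
 *
 *	syscall::open:entry
 *	/execname == "Finder"/
 *	{
 *		printf("pid %d tid %d errno %d", pid, tid, errno);
 *	}
 *
 * The probe and predicate here are illustrative; the privilege and
 * interrupt checks above still apply to each variable individually.
 */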
3720 typedef enum dtrace_json_state {
3721 DTRACE_JSON_REST = 1,
3722 DTRACE_JSON_OBJECT,
3723 DTRACE_JSON_STRING,
3724 DTRACE_JSON_STRING_ESCAPE,
3725 DTRACE_JSON_STRING_ESCAPE_UNICODE,
3726 DTRACE_JSON_COLON,
3727 DTRACE_JSON_COMMA,
3728 DTRACE_JSON_VALUE,
3729 DTRACE_JSON_IDENTIFIER,
3730 DTRACE_JSON_NUMBER,
3731 DTRACE_JSON_NUMBER_FRAC,
3732 DTRACE_JSON_NUMBER_EXP,
3733 DTRACE_JSON_COLLECT_OBJECT
3734 } dtrace_json_state_t;
3735
3736 /*
3737 * This function possesses just enough knowledge about JSON to extract a single
3738 * value from a JSON string and store it in the scratch buffer. It is able
3739 * to extract nested object values, and members of arrays by index.
3740 *
3741 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3742 * be looked up as we descend into the object tree, e.g.:
3743 *
3744 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3745 * with nelems = 5.
3746 *
3747 * The run time of this function must be bounded above by strsize to limit the
3748 * amount of work done in probe context. As such, it is implemented as a
3749 * simple state machine, reading one character at a time using safe loads
3750 * until we find the requested element, hit a parsing error or run off the
3751 * end of the object or string.
3752 *
3753 * As there is no way for a subroutine to return an error without interrupting
3754 * clause execution, we simply return NULL in the event of a missing key or any
3755 * other error condition. Each NULL return in this function is commented with
3756 * the error condition it represents -- parsing or otherwise.
3757 *
3758 * The set of states for the state machine closely matches the JSON
3759 * specification (http://json.org/). Briefly:
3760 *
3761 * DTRACE_JSON_REST:
3762 * Skip whitespace until we find either a top-level Object, moving
3763 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3764 *
3765 * DTRACE_JSON_OBJECT:
3766 * Locate the next key String in an Object. Sets a flag to denote
3767 * the next String as a key string and moves to DTRACE_JSON_STRING.
3768 *
3769 * DTRACE_JSON_COLON:
3770 * Skip whitespace until we find the colon that separates key Strings
3771 * from their values. Once found, move to DTRACE_JSON_VALUE.
3772 *
3773 * DTRACE_JSON_VALUE:
3774 * Detects the type of the next value (String, Number, Identifier, Object
3775 * or Array) and routes to the states that process that type. Here we also
3776 * deal with the element selector list if we are requested to traverse down
3777 * into the object tree.
3778 *
3779 * DTRACE_JSON_COMMA:
3780 * Skip whitespace until we find the comma that separates key-value pairs
3781 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3782 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3783 * states return to this state at the end of their value, unless otherwise
3784 * noted.
3785 *
3786 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3787 * Processes a Number literal from the JSON, including any exponent
3788 * component that may be present. Numbers are returned as strings, which
3789 * may be passed to strtoll() if an integer is required.
3790 *
3791 * DTRACE_JSON_IDENTIFIER:
3792 * Processes a "true", "false" or "null" literal in the JSON.
3793 *
3794 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3795 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3796 * Processes a String literal from the JSON, whether the String denotes
3797 * a key, a value or part of a larger Object. Handles all escape sequences
3798 * present in the specification, including four-digit unicode characters,
3799 * but merely includes the escape sequence without converting it to the
3800 * actual escaped character. If the String is flagged as a key, we
3801 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3802 *
3803 * DTRACE_JSON_COLLECT_OBJECT:
3804 * This state collects an entire Object (or Array), correctly handling
3805 * embedded strings. If the full element selector list matches this nested
3806 * object, we return the Object in full as a string. If not, we use this
3807 * state to skip to the next value at this level and continue processing.
3808 */
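/*
 * A worked example (illustrative, not exhaustive): for the input
 * {"foo": [true, {"bar": 9}]} with the selector foo[1].bar, elemlist is
 * "foo" NUL "1" NUL "bar" NUL and nelems = 3. The machine matches the
 * key "foo", descends into the Array with array_elem = 1, skips the
 * "true" identifier at index 0, descends into the Object at index 1,
 * matches "bar", and returns the Number as the string "9".
 */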
3809 static char *
3810 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3811 char *dest)
3812 {
3813 dtrace_json_state_t state = DTRACE_JSON_REST;
3814 int64_t array_elem = INT64_MIN;
3815 int64_t array_pos = 0;
3816 uint8_t escape_unicount = 0;
3817 boolean_t string_is_key = B_FALSE;
3818 boolean_t collect_object = B_FALSE;
3819 boolean_t found_key = B_FALSE;
3820 boolean_t in_array = B_FALSE;
3821 uint32_t braces = 0, brackets = 0;
3822 char *elem = elemlist;
3823 char *dd = dest;
3824 uintptr_t cur;
3825
3826 for (cur = json; cur < json + size; cur++) {
3827 char cc = dtrace_load8(cur);
3828 if (cc == '\0')
3829 return (NULL);
3830
3831 switch (state) {
3832 case DTRACE_JSON_REST:
3833 if (isspace(cc))
3834 break;
3835
3836 if (cc == '{') {
3837 state = DTRACE_JSON_OBJECT;
3838 break;
3839 }
3840
3841 if (cc == '[') {
3842 in_array = B_TRUE;
3843 array_pos = 0;
3844 array_elem = dtrace_strtoll(elem, 10, size);
3845 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3846 state = DTRACE_JSON_VALUE;
3847 break;
3848 }
3849
3850 /*
3851 * ERROR: expected to find a top-level object or array.
3852 */
3853 return (NULL);
3854 case DTRACE_JSON_OBJECT:
3855 if (isspace(cc))
3856 break;
3857
3858 if (cc == '"') {
3859 state = DTRACE_JSON_STRING;
3860 string_is_key = B_TRUE;
3861 break;
3862 }
3863
3864 /*
3865 * ERROR: either the object did not start with a key
3866 * string, or we've run off the end of the object
3867 * without finding the requested key.
3868 */
3869 return (NULL);
3870 case DTRACE_JSON_STRING:
3871 if (cc == '\\') {
3872 *dd++ = '\\';
3873 state = DTRACE_JSON_STRING_ESCAPE;
3874 break;
3875 }
3876
3877 if (cc == '"') {
3878 if (collect_object) {
3879 /*
3880 * We don't reset the dest here, as
3881 * the string is part of a larger
3882 * object being collected.
3883 */
3884 *dd++ = cc;
3885 collect_object = B_FALSE;
3886 state = DTRACE_JSON_COLLECT_OBJECT;
3887 break;
3888 }
3889 *dd = '\0';
3890 dd = dest; /* reset string buffer */
3891 if (string_is_key) {
3892 if (dtrace_strncmp(dest, elem,
3893 size) == 0)
3894 found_key = B_TRUE;
3895 } else if (found_key) {
3896 if (nelems > 1) {
3897 /*
3898 * We expected an object, not
3899 * this string.
3900 */
3901 return (NULL);
3902 }
3903 return (dest);
3904 }
3905 state = string_is_key ? DTRACE_JSON_COLON :
3906 DTRACE_JSON_COMMA;
3907 string_is_key = B_FALSE;
3908 break;
3909 }
3910
3911 *dd++ = cc;
3912 break;
3913 case DTRACE_JSON_STRING_ESCAPE:
3914 *dd++ = cc;
3915 if (cc == 'u') {
3916 escape_unicount = 0;
3917 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3918 } else {
3919 state = DTRACE_JSON_STRING;
3920 }
3921 break;
3922 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3923 if (!isxdigit(cc)) {
3924 /*
3925 * ERROR: invalid unicode escape, expected
3926 * four valid hexadecimal digits.
3927 */
3928 return (NULL);
3929 }
3930
3931 *dd++ = cc;
3932 if (++escape_unicount == 4)
3933 state = DTRACE_JSON_STRING;
3934 break;
3935 case DTRACE_JSON_COLON:
3936 if (isspace(cc))
3937 break;
3938
3939 if (cc == ':') {
3940 state = DTRACE_JSON_VALUE;
3941 break;
3942 }
3943
3944 /*
3945 * ERROR: expected a colon.
3946 */
3947 return (NULL);
3948 case DTRACE_JSON_COMMA:
3949 if (isspace(cc))
3950 break;
3951
3952 if (cc == ',') {
3953 if (in_array) {
3954 state = DTRACE_JSON_VALUE;
3955 if (++array_pos == array_elem)
3956 found_key = B_TRUE;
3957 } else {
3958 state = DTRACE_JSON_OBJECT;
3959 }
3960 break;
3961 }
3962
3963 /*
3964 * ERROR: either we hit an unexpected character, or
3965 * we reached the end of the object or array without
3966 * finding the requested key.
3967 */
3968 return (NULL);
3969 case DTRACE_JSON_IDENTIFIER:
3970 if (islower(cc)) {
3971 *dd++ = cc;
3972 break;
3973 }
3974
3975 *dd = '\0';
3976 dd = dest; /* reset string buffer */
3977
3978 if (dtrace_strncmp(dest, "true", 5) == 0 ||
3979 dtrace_strncmp(dest, "false", 6) == 0 ||
3980 dtrace_strncmp(dest, "null", 5) == 0) {
3981 if (found_key) {
3982 if (nelems > 1) {
3983 /*
3984 * ERROR: We expected an object,
3985 * not this identifier.
3986 */
3987 return (NULL);
3988 }
3989 return (dest);
3990 } else {
3991 cur--;
3992 state = DTRACE_JSON_COMMA;
3993 break;
3994 }
3995 }
3996
3997 /*
3998 * ERROR: we did not recognise the identifier as one
3999 * of those in the JSON specification.
4000 */
4001 return (NULL);
4002 case DTRACE_JSON_NUMBER:
4003 if (cc == '.') {
4004 *dd++ = cc;
4005 state = DTRACE_JSON_NUMBER_FRAC;
4006 break;
4007 }
4008
4009 if (cc == 'x' || cc == 'X') {
4010 /*
4011 * ERROR: specification explicitly excludes
4012 * hexadecimal or octal numbers.
4013 */
4014 return (NULL);
4015 }
4016
4017 OS_FALLTHROUGH;
4018 case DTRACE_JSON_NUMBER_FRAC:
4019 if (cc == 'e' || cc == 'E') {
4020 *dd++ = cc;
4021 state = DTRACE_JSON_NUMBER_EXP;
4022 break;
4023 }
4024
4025 if (cc == '+' || cc == '-') {
4026 /*
4027 * ERROR: expect sign as part of exponent only.
4028 */
4029 return (NULL);
4030 }
4031 OS_FALLTHROUGH;
4032 case DTRACE_JSON_NUMBER_EXP:
4033 if (isdigit(cc) || cc == '+' || cc == '-') {
4034 *dd++ = cc;
4035 break;
4036 }
4037
4038 *dd = '\0';
4039 dd = dest; /* reset string buffer */
4040 if (found_key) {
4041 if (nelems > 1) {
4042 /*
4043 * ERROR: We expected an object, not
4044 * this number.
4045 */
4046 return (NULL);
4047 }
4048 return (dest);
4049 }
4050
4051 cur--;
4052 state = DTRACE_JSON_COMMA;
4053 break;
4054 case DTRACE_JSON_VALUE:
4055 if (isspace(cc))
4056 break;
4057
4058 if (cc == '{' || cc == '[') {
4059 if (nelems > 1 && found_key) {
4060 in_array = cc == '[' ? B_TRUE : B_FALSE;
4061 /*
4062 * If our element selector directs us
4063 * to descend into this nested object,
4064 * then move to the next selector
4065 * element in the list and restart the
4066 * state machine.
4067 */
4068 while (*elem != '\0')
4069 elem++;
4070 elem++; /* skip the inter-element NUL */
4071 nelems--;
4072 dd = dest;
4073 if (in_array) {
4074 state = DTRACE_JSON_VALUE;
4075 array_pos = 0;
4076 array_elem = dtrace_strtoll(
4077 elem, 10, size);
4078 found_key = array_elem == 0 ?
4079 B_TRUE : B_FALSE;
4080 } else {
4081 found_key = B_FALSE;
4082 state = DTRACE_JSON_OBJECT;
4083 }
4084 break;
4085 }
4086
4087 /*
4088 * Otherwise, we wish to either skip this
4089 * nested object or return it in full.
4090 */
4091 if (cc == '[')
4092 brackets = 1;
4093 else
4094 braces = 1;
4095 *dd++ = cc;
4096 state = DTRACE_JSON_COLLECT_OBJECT;
4097 break;
4098 }
4099
4100 if (cc == '"') {
4101 state = DTRACE_JSON_STRING;
4102 break;
4103 }
4104
4105 if (islower(cc)) {
4106 /*
4107 * Here we deal with true, false and null.
4108 */
4109 *dd++ = cc;
4110 state = DTRACE_JSON_IDENTIFIER;
4111 break;
4112 }
4113
4114 if (cc == '-' || isdigit(cc)) {
4115 *dd++ = cc;
4116 state = DTRACE_JSON_NUMBER;
4117 break;
4118 }
4119
4120 /*
4121 * ERROR: unexpected character at start of value.
4122 */
4123 return (NULL);
4124 case DTRACE_JSON_COLLECT_OBJECT:
4125 if (cc == '\0')
4126 /*
4127 * ERROR: unexpected end of input.
4128 */
4129 return (NULL);
4130
4131 *dd++ = cc;
4132 if (cc == '"') {
4133 collect_object = B_TRUE;
4134 state = DTRACE_JSON_STRING;
4135 break;
4136 }
4137
4138 if (cc == ']') {
4139 if (brackets-- == 0) {
4140 /*
4141 * ERROR: unbalanced brackets.
4142 */
4143 return (NULL);
4144 }
4145 } else if (cc == '}') {
4146 if (braces-- == 0) {
4147 /*
4148 * ERROR: unbalanced braces.
4149 */
4150 return (NULL);
4151 }
4152 } else if (cc == '{') {
4153 braces++;
4154 } else if (cc == '[') {
4155 brackets++;
4156 }
4157
4158 if (brackets == 0 && braces == 0) {
4159 if (found_key) {
4160 *dd = '\0';
4161 return (dest);
4162 }
4163 dd = dest; /* reset string buffer */
4164 state = DTRACE_JSON_COMMA;
4165 }
4166 break;
4167 }
4168 }
4169 return (NULL);
4170 }
4171
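/*
 * In D this is reached via the json() subroutine; a minimal usage sketch
 * (the probe and argument below are hypothetical):
 *
 *	syscall::write:entry
 *	{
 *		this->val = json(copyinstr(arg1), "foo[1].bar");
 *	}
 *
 * A NULL return (missing key or parse error, per the comments above)
 * yields a NULL string in D.
 */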
4172 /*
4173 * Emulate the execution of DIF subroutines invoked by the call opcode.
4174 * Notice that we don't bother validating the proper number of arguments or
4175 * their types in the tuple stack. This isn't needed because all argument
4176 * interpretation is made safe by our load safety -- the worst that can
4177 * happen is that a bogus program can obtain bogus results.
4178 */
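/*
 * As a concrete sketch of the calling convention used below: for a D
 * call such as copyinto(uaddr, len, dest), the arguments arrive in
 * order as tupregs[0].dttk_value, tupregs[1].dttk_value and
 * tupregs[2].dttk_value with nargs == 3, and any result is written to
 * regs[rd].
 */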
4179 static void
4180 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4181 dtrace_key_t *tupregs, int nargs,
4182 dtrace_mstate_t *mstate, dtrace_state_t *state)
4183 {
4184 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4185 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4186 dtrace_vstate_t *vstate = &state->dts_vstate;
4187
4188 #if !defined(__APPLE__)
4189 union {
4190 mutex_impl_t mi;
4191 uint64_t mx;
4192 } m;
4193
4194 union {
4195 krwlock_t ri;
4196 uintptr_t rw;
4197 } r;
4198 #else
4199 /* FIXME: awaits lock/mutex work */
4200 #endif /* __APPLE__ */
4201
4202 switch (subr) {
4203 case DIF_SUBR_RAND:
4204 regs[rd] = dtrace_xoroshiro128_plus_next(
4205 state->dts_rstate[CPU->cpu_id]);
4206 break;
4207
4208 #if !defined(__APPLE__)
4209 case DIF_SUBR_MUTEX_OWNED:
4210 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4211 mstate, vstate)) {
4212 regs[rd] = 0;
4213 break;
4214 }
4215
4216 m.mx = dtrace_load64(tupregs[0].dttk_value);
4217 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4218 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4219 else
4220 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4221 break;
4222
4223 case DIF_SUBR_MUTEX_OWNER:
4224 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4225 mstate, vstate)) {
4226 regs[rd] = 0;
4227 break;
4228 }
4229
4230 m.mx = dtrace_load64(tupregs[0].dttk_value);
4231 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4232 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4233 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4234 else
4235 regs[rd] = 0;
4236 break;
4237
4238 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4239 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4240 mstate, vstate)) {
4241 regs[rd] = 0;
4242 break;
4243 }
4244
4245 m.mx = dtrace_load64(tupregs[0].dttk_value);
4246 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4247 break;
4248
4249 case DIF_SUBR_MUTEX_TYPE_SPIN:
4250 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4251 mstate, vstate)) {
4252 regs[rd] = 0;
4253 break;
4254 }
4255
4256 m.mx = dtrace_load64(tupregs[0].dttk_value);
4257 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4258 break;
4259
4260 case DIF_SUBR_RW_READ_HELD: {
4261 uintptr_t tmp;
4262
4263 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4264 mstate, vstate)) {
4265 regs[rd] = 0;
4266 break;
4267 }
4268
4269 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4270 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4271 break;
4272 }
4273
4274 case DIF_SUBR_RW_WRITE_HELD:
4275 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4276 mstate, vstate)) {
4277 regs[rd] = 0;
4278 break;
4279 }
4280
4281 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4282 regs[rd] = _RW_WRITE_HELD(&r.ri);
4283 break;
4284
4285 case DIF_SUBR_RW_ISWRITER:
4286 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4287 mstate, vstate)) {
4288 regs[rd] = 0;
4289 break;
4290 }
4291
4292 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4293 regs[rd] = _RW_ISWRITER(&r.ri);
4294 break;
4295 #else
4296 /* FIXME: awaits lock/mutex work */
4297 #endif /* __APPLE__ */
4298
4299 case DIF_SUBR_BCOPY: {
4300 /*
4301 * We need to be sure that the destination is in the scratch
4302 * region -- no other region is allowed.
4303 */
4304 uintptr_t src = tupregs[0].dttk_value;
4305 uintptr_t dest = tupregs[1].dttk_value;
4306 size_t size = tupregs[2].dttk_value;
4307
4308 if (!dtrace_inscratch(dest, size, mstate)) {
4309 *flags |= CPU_DTRACE_BADADDR;
4310 *illval = regs[rd];
4311 break;
4312 }
4313
4314 if (!dtrace_canload(src, size, mstate, vstate)) {
4315 regs[rd] = 0;
4316 break;
4317 }
4318
4319 dtrace_bcopy((void *)src, (void *)dest, size);
4320 break;
4321 }
4322
4323 case DIF_SUBR_ALLOCA:
4324 case DIF_SUBR_COPYIN: {
4325 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4326 uint64_t size =
4327 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4328 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4329
4330 /*
4331 * Check whether the user can access kernel memory
4332 */
4333 if (dtrace_priv_kernel(state) == 0) {
4334 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4335 regs[rd] = 0;
4336 break;
4337 }
4338 /*
4339 * This action doesn't require any credential checks since
4340 * probes will not activate in user contexts to which the
4341 * enabling user does not have permissions.
4342 */
4343
4344 /*
4345 * Rounding up the user allocation size could have overflowed
4346 * a large, bogus allocation (like -1ULL) to 0.
4347 */
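/*
 * Concretely (an illustrative case): with size = -1ULL and dest rounded
 * up seven bytes past dtms_scratch_ptr, scratch_size = 7 + (-1ULL)
 * wraps to 6, so only the scratch_size < size test below catches it.
 */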
4348 if (scratch_size < size ||
4349 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4350 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4351 regs[rd] = 0;
4352 break;
4353 }
4354
4355 if (subr == DIF_SUBR_COPYIN) {
4356 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4357 if (dtrace_priv_proc(state))
4358 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4359 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4360 }
4361
4362 mstate->dtms_scratch_ptr += scratch_size;
4363 regs[rd] = dest;
4364 break;
4365 }
4366
4367 case DIF_SUBR_COPYINTO: {
4368 uint64_t size = tupregs[1].dttk_value;
4369 uintptr_t dest = tupregs[2].dttk_value;
4370
4371 /*
4372 * This action doesn't require any credential checks since
4373 * probes will not activate in user contexts to which the
4374 * enabling user does not have permissions.
4375 */
4376 if (!dtrace_inscratch(dest, size, mstate)) {
4377 *flags |= CPU_DTRACE_BADADDR;
4378 *illval = regs[rd];
4379 break;
4380 }
4381
4382 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4383 if (dtrace_priv_proc(state))
4384 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4385 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4386 break;
4387 }
4388
4389 case DIF_SUBR_COPYINSTR: {
4390 uintptr_t dest = mstate->dtms_scratch_ptr;
4391 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4392
4393 if (nargs > 1 && tupregs[1].dttk_value < size)
4394 size = tupregs[1].dttk_value + 1;
4395
4396 /*
4397 * This action doesn't require any credential checks since
4398 * probes will not activate in user contexts to which the
4399 * enabling user does not have permissions.
4400 */
4401 if (!DTRACE_INSCRATCH(mstate, size)) {
4402 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4403 regs[rd] = 0;
4404 break;
4405 }
4406
4407 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4408 if (dtrace_priv_proc(state))
4409 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4410 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4411
4412 ((char *)dest)[size - 1] = '\0';
4413 mstate->dtms_scratch_ptr += size;
4414 regs[rd] = dest;
4415 break;
4416 }
4417
4418 case DIF_SUBR_MSGSIZE:
4419 case DIF_SUBR_MSGDSIZE: {
4420 /* Darwin does not implement SysV STREAMS messages */
4421 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4422 regs[rd] = 0;
4423 break;
4424 }
4425
4426 case DIF_SUBR_PROGENYOF: {
4427 pid_t pid = tupregs[0].dttk_value;
4428 struct proc *p = current_proc();
4429 int rval = 0, lim = nprocs;
4430
4431 while (p && (lim-- > 0)) {
4432 pid_t ppid;
4433
4434 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
4435 if (*flags & CPU_DTRACE_FAULT)
4436 break;
4437
4438 if (ppid == pid) {
4439 rval = 1;
4440 break;
4441 }
4442
4443 if (ppid == 0)
4444 break; /* Can't climb process tree any further. */
4445
4446 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
4447 #if __has_feature(ptrauth_calls)
4448 p = ptrauth_strip(p, ptrauth_key_process_independent_data);
4449 #endif
4450 if (*flags & CPU_DTRACE_FAULT)
4451 break;
4452 }
4453
4454 regs[rd] = rval;
4455 break;
4456 }
4457
4458 case DIF_SUBR_SPECULATION:
4459 regs[rd] = dtrace_speculation(state);
4460 break;
4461
4462
4463 case DIF_SUBR_COPYOUT: {
4464 uintptr_t kaddr = tupregs[0].dttk_value;
4465 user_addr_t uaddr = tupregs[1].dttk_value;
4466 uint64_t size = tupregs[2].dttk_value;
4467
4468 if (!dtrace_destructive_disallow &&
4469 dtrace_priv_proc_control(state) &&
4470 !dtrace_istoxic(kaddr, size) &&
4471 dtrace_canload(kaddr, size, mstate, vstate)) {
4472 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4473 dtrace_copyout(kaddr, uaddr, size, flags);
4474 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4475 }
4476 break;
4477 }
4478
4479 case DIF_SUBR_COPYOUTSTR: {
4480 uintptr_t kaddr = tupregs[0].dttk_value;
4481 user_addr_t uaddr = tupregs[1].dttk_value;
4482 uint64_t size = tupregs[2].dttk_value;
4483 size_t lim;
4484
4485 if (!dtrace_destructive_disallow &&
4486 dtrace_priv_proc_control(state) &&
4487 !dtrace_istoxic(kaddr, size) &&
4488 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4489 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4490 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4491 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4492 }
4493 break;
4494 }
4495
4496 case DIF_SUBR_STRLEN: {
4497 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4498 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4499 size_t lim;
4500
4501 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4502 regs[rd] = 0;
4503 break;
4504 }
4505
4506 regs[rd] = dtrace_strlen((char *)addr, lim);
4507
4508 break;
4509 }
4510
4511 case DIF_SUBR_STRCHR:
4512 case DIF_SUBR_STRRCHR: {
4513 /*
4514 * We're going to iterate over the string looking for the
4515 * specified character. We will iterate until we have reached
4516 * the string length or we have found the character. If this
4517 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4518 * of the specified character instead of the first.
4519 */
4520 uintptr_t addr = tupregs[0].dttk_value;
4521 uintptr_t addr_limit;
4522 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4523 size_t lim;
4524 char c, target = (char)tupregs[1].dttk_value;
4525
4526 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4527 regs[rd] = 0;
4528 break;
4529 }
4530 addr_limit = addr + lim;
4531
4532 for (regs[rd] = 0; addr < addr_limit; addr++) {
4533 if ((c = dtrace_load8(addr)) == target) {
4534 regs[rd] = addr;
4535
4536 if (subr == DIF_SUBR_STRCHR)
4537 break;
4538 }
4539
4540 if (c == '\0')
4541 break;
4542 }
4543
4544 break;
4545 }
4546
4547 case DIF_SUBR_STRSTR:
4548 case DIF_SUBR_INDEX:
4549 case DIF_SUBR_RINDEX: {
4550 /*
4551 * We're going to iterate over the string looking for the
4552 * specified string. We will iterate until we have reached
4553 * the string length or we have found the string. (Yes, this
4554 * is done in the most naive way possible -- but considering
4555 * that the string we're searching for is likely to be
4556 * relatively short, the complexity of Rabin-Karp or similar
4557 * hardly seems merited.)
4558 */
4559 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4560 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4561 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4562 size_t len = dtrace_strlen(addr, size);
4563 size_t sublen = dtrace_strlen(substr, size);
4564 char *limit = addr + len, *orig = addr;
4565 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4566 int inc = 1;
4567
4568 regs[rd] = notfound;
4569
4570 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4571 regs[rd] = 0;
4572 break;
4573 }
4574
4575 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4576 vstate)) {
4577 regs[rd] = 0;
4578 break;
4579 }
4580
4581 /*
4582 * strstr() and index()/rindex() have similar semantics if
4583 * both strings are the empty string: strstr() returns a
4584 * pointer to the (empty) string, and index() and rindex()
4585 * both return index 0 (regardless of any position argument).
4586 */
4587 if (sublen == 0 && len == 0) {
4588 if (subr == DIF_SUBR_STRSTR)
4589 regs[rd] = (uintptr_t)addr;
4590 else
4591 regs[rd] = 0;
4592 break;
4593 }
4594
4595 if (subr != DIF_SUBR_STRSTR) {
4596 if (subr == DIF_SUBR_RINDEX) {
4597 limit = orig - 1;
4598 addr += len;
4599 inc = -1;
4600 }
4601
4602 /*
4603 * Both index() and rindex() take an optional position
4604 * argument that denotes the starting position.
4605 */
4606 if (nargs == 3) {
4607 int64_t pos = (int64_t)tupregs[2].dttk_value;
4608
4609 /*
4610 * If the position argument to index() is
4611 * negative, Perl implicitly clamps it at
4612 * zero. This semantic is a little surprising
4613 * given the special meaning of negative
4614 * positions to similar Perl functions like
4615 * substr(), but it appears to reflect a
4616 * notion that index() can start from a
4617 * negative index and increment its way up to
4618 * the string. Given this notion, Perl's
4619 * rindex() is at least self-consistent in
4620 * that it implicitly clamps positions greater
4621 * than the string length to be the string
4622 * length. Where Perl completely loses
4623 * coherence, however, is when the specified
4624 * substring is the empty string (""). In
4625 * this case, even if the position is
4626 * negative, rindex() returns 0 -- and even if
4627 * the position is greater than the length,
4628 * index() returns the string length. These
4629 * semantics violate the notion that index()
4630 * should never return a value less than the
4631 * specified position and that rindex() should
4632 * never return a value greater than the
4633 * specified position. (One assumes that
4634 * these semantics are artifacts of Perl's
4635 * implementation and not the results of
4636 * deliberate design -- it beggars belief that
4637 * even Larry Wall could desire such oddness.)
4638 * While in the abstract one would wish for
4639 * consistent position semantics across
4640 * substr(), index() and rindex() -- or at the
4641 * very least self-consistent position
4642 * semantics for index() and rindex() -- we
4643 * instead opt to keep with the extant Perl
4644 * semantics, in all their broken glory. (Do
4645 * we have more desire to maintain Perl's
4646 * semantics than Perl does? Probably.)
4647 */
4648 if (subr == DIF_SUBR_RINDEX) {
4649 if (pos < 0) {
4650 if (sublen == 0)
4651 regs[rd] = 0;
4652 break;
4653 }
4654
4655 if ((size_t)pos > len)
4656 pos = len;
4657 } else {
4658 if (pos < 0)
4659 pos = 0;
4660
4661 if ((size_t)pos >= len) {
4662 if (sublen == 0)
4663 regs[rd] = len;
4664 break;
4665 }
4666 }
4667
4668 addr = orig + pos;
4669 }
4670 }
4671
4672 for (regs[rd] = notfound; addr != limit; addr += inc) {
4673 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4674 if (subr != DIF_SUBR_STRSTR) {
4675 /*
4676 * As D index() and rindex() are
4677 * modeled on Perl (and not on awk),
4678 * we return a zero-based (and not a
4679 * one-based) index. (For you Perl
4680 * weenies: no, we're not going to add
4681 * $[ -- and shouldn't you be at a con
4682 * or something?)
4683 */
4684 regs[rd] = (uintptr_t)(addr - orig);
4685 break;
4686 }
4687
4688 ASSERT(subr == DIF_SUBR_STRSTR);
4689 regs[rd] = (uintptr_t)addr;
4690 break;
4691 }
4692 }
4693
4694 break;
4695 }
4696
4697 case DIF_SUBR_STRTOK: {
4698 uintptr_t addr = tupregs[0].dttk_value;
4699 uintptr_t tokaddr = tupregs[1].dttk_value;
4700 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4701 uintptr_t limit, toklimit;
4702 size_t clim;
4703 char *dest = (char *)mstate->dtms_scratch_ptr;
4704 uint8_t c = '\0', tokmap[32]; /* 256 / 8 */
4705 uint64_t i = 0;
4706
4707 /*
4708 * Check both the token buffer and (later) the input buffer,
4709 * since both could be non-scratch addresses.
4710 */
4711 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4712 regs[rd] = 0;
4713 break;
4714 }
4715 toklimit = tokaddr + clim;
4716
4717 if (!DTRACE_INSCRATCH(mstate, size)) {
4718 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4719 regs[rd] = 0;
4720 break;
4721 }
4722
4723 if (addr == 0) {
4724 /*
4725 * If the address specified is NULL, we use our saved
4726 * strtok pointer from the mstate. Note that this
4727 * means that the saved strtok pointer is _only_
4728 * valid within multiple enablings of the same probe --
4729 * it behaves like an implicit clause-local variable.
4730 */
4731 addr = mstate->dtms_strtok;
4732 limit = mstate->dtms_strtok_limit;
4733 } else {
4734 /*
4735 * If the user-specified address is non-NULL we must
4736 * access check it. This is the only time we have
4737 * a chance to do so, since this address may reside
4738 * in the string table of this clause -- future calls
4739 * (when we fetch addr from mstate->dtms_strtok)
4740 * would fail this access check.
4741 */
4742 if (!dtrace_strcanload(addr, size, &clim, mstate,
4743 vstate)) {
4744 regs[rd] = 0;
4745 break;
4746 }
4747 limit = addr + clim;
4748 }
4749
4750 /*
4751 * First, zero the token map, and then process the token
4752 * string -- setting a bit in the map for every character
4753 * found in the token string.
4754 */
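/*
 * For example (illustrative): the token character 'c' (0x63) sets bit
 * (0x63 & 0x7) = 3 of tokmap[0x63 >> 3] = tokmap[12], making the
 * membership tests in the loops below O(1) per character.
 */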
4755 for (i = 0; i < (int)sizeof (tokmap); i++)
4756 tokmap[i] = 0;
4757
4758 for (; tokaddr < toklimit; tokaddr++) {
4759 if ((c = dtrace_load8(tokaddr)) == '\0')
4760 break;
4761
4762 ASSERT((c >> 3) < sizeof (tokmap));
4763 tokmap[c >> 3] |= (1 << (c & 0x7));
4764 }
4765
4766 for (; addr < limit; addr++) {
4767 /*
4768 * We're looking for a character that is _not_
4769 * contained in the token string.
4770 */
4771 if ((c = dtrace_load8(addr)) == '\0')
4772 break;
4773
4774 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4775 break;
4776 }
4777
4778 if (c == '\0') {
4779 /*
4780 * We reached the end of the string without finding
4781 * any character that was not in the token string.
4782 * We return NULL in this case, and we set the saved
4783 * address to NULL as well.
4784 */
4785 regs[rd] = 0;
4786 mstate->dtms_strtok = 0;
4787 mstate->dtms_strtok_limit = 0;
4788 break;
4789 }
4790
4791 /*
4792 * From here on, we're copying into the destination string.
4793 */
4794 for (i = 0; addr < limit && i < size - 1; addr++) {
4795 if ((c = dtrace_load8(addr)) == '\0')
4796 break;
4797
4798 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4799 break;
4800
4801 ASSERT(i < size);
4802 dest[i++] = c;
4803 }
4804
4805 ASSERT(i < size);
4806 dest[i] = '\0';
4807 regs[rd] = (uintptr_t)dest;
4808 mstate->dtms_scratch_ptr += size;
4809 mstate->dtms_strtok = addr;
4810 mstate->dtms_strtok_limit = limit;
4811 break;
4812 }
4813
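/*
 * A usage sketch in D (hypothetical clause): the first call passes the
 * input string, and subsequent calls pass NULL to consume the pointer
 * saved above:
 *
 *	this->part = strtok(this->path, "/");
 *	this->part = strtok(NULL, "/");
 *
 * As noted above, the saved pointer behaves like an implicit
 * clause-local variable and is not valid across clauses.
 */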
4814 case DIF_SUBR_SUBSTR: {
4815 uintptr_t s = tupregs[0].dttk_value;
4816 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4817 char *d = (char *)mstate->dtms_scratch_ptr;
4818 int64_t index = (int64_t)tupregs[1].dttk_value;
4819 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4820 size_t len = dtrace_strlen((char *)s, size);
4821 int64_t i = 0;
4822
4823 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4824 regs[rd] = 0;
4825 break;
4826 }
4827
4828 if (!DTRACE_INSCRATCH(mstate, size)) {
4829 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4830 regs[rd] = 0;
4831 break;
4832 }
4833
4834 if (nargs <= 2)
4835 remaining = (int64_t)size;
4836
4837 if (index < 0) {
4838 index += len;
4839
4840 if (index < 0 && index + remaining > 0) {
4841 remaining += index;
4842 index = 0;
4843 }
4844 }
4845
4846 if ((size_t)index >= len || index < 0) {
4847 remaining = 0;
4848 } else if (remaining < 0) {
4849 remaining += len - index;
4850 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4851 remaining = size - index;
4852 }
4853
4854 for (i = 0; i < remaining; i++) {
4855 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4856 break;
4857 }
4858
4859 d[i] = '\0';
4860
4861 mstate->dtms_scratch_ptr += size;
4862 regs[rd] = (uintptr_t)d;
4863 break;
4864 }
4865
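/*
 * The index handling above mirrors D's substr() semantics;
 * illustratively, substr("hello", -3, 2) maps index -3 to len + (-3)
 * = 2 and copies two bytes, yielding "ll".
 */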
4866 case DIF_SUBR_GETMAJOR:
4867 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4868 break;
4869
4870 case DIF_SUBR_GETMINOR:
4871 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4872 break;
4873
4874 case DIF_SUBR_DDI_PATHNAME: {
4875 /* APPLE NOTE: currently unsupported on Darwin */
4876 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4877 regs[rd] = 0;
4878 break;
4879 }
4880
4881 case DIF_SUBR_STRJOIN: {
4882 char *d = (char *)mstate->dtms_scratch_ptr;
4883 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4884 uintptr_t s1 = tupregs[0].dttk_value;
4885 uintptr_t s2 = tupregs[1].dttk_value;
4886 uint64_t i = 0, j = 0;
4887 size_t lim1, lim2;
4888 char c;
4889
4890 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4891 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4892 regs[rd] = 0;
4893 break;
4894 }
4895
4896 if (!DTRACE_INSCRATCH(mstate, size)) {
4897 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4898 regs[rd] = 0;
4899 break;
4900 }
4901
4902 for (;;) {
4903 if (i >= size) {
4904 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4905 regs[rd] = 0;
4906 break;
4907 }
4908 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4909 if ((d[i++] = c) == '\0') {
4910 i--;
4911 break;
4912 }
4913 }
4914
4915 for (;;) {
4916 if (i >= size) {
4917 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4918 regs[rd] = 0;
4919 break;
4920 }
4921 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4922 if ((d[i++] = c) == '\0')
4923 break;
4924 }
4925
4926 if (i < size) {
4927 mstate->dtms_scratch_ptr += i;
4928 regs[rd] = (uintptr_t)d;
4929 }
4930
4931 break;
4932 }
4933
4934 case DIF_SUBR_STRTOLL: {
4935 uintptr_t s = tupregs[0].dttk_value;
4936 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4937 size_t lim;
4938 int base = 10;
4939
4940 if (nargs > 1) {
4941 if ((base = tupregs[1].dttk_value) <= 1 ||
4942 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4943 *flags |= CPU_DTRACE_ILLOP;
4944 break;
4945 }
4946 }
4947
4948 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
4949 regs[rd] = INT64_MIN;
4950 break;
4951 }
4952
4953 regs[rd] = dtrace_strtoll((char *)s, base, lim);
4954 break;
4955 }
4956
4957 case DIF_SUBR_LLTOSTR: {
4958 int64_t i = (int64_t)tupregs[0].dttk_value;
4959 uint64_t val, digit;
4960 uint64_t size = 65; /* enough room for 2^64 in binary */
4961 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4962 int base = 10;
4963
4964 if (nargs > 1) {
4965 if ((base = tupregs[1].dttk_value) <= 1 ||
4966 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4967 *flags |= CPU_DTRACE_ILLOP;
4968 break;
4969 }
4970 }
4971
4972 val = (base == 10 && i < 0) ? i * -1 : i;
4973
4974 if (!DTRACE_INSCRATCH(mstate, size)) {
4975 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4976 regs[rd] = 0;
4977 break;
4978 }
4979
4980 for (*end-- = '\0'; val; val /= base) {
4981 if ((digit = val % base) <= '9' - '0') {
4982 *end-- = '0' + digit;
4983 } else {
4984 *end-- = 'a' + (digit - ('9' - '0') - 1);
4985 }
4986 }
4987
4988 if (i == 0 && base == 16)
4989 *end-- = '0';
4990
4991 if (base == 16)
4992 *end-- = 'x';
4993
4994 if (i == 0 || base == 8 || base == 16)
4995 *end-- = '0';
4996
4997 if (i < 0 && base == 10)
4998 *end-- = '-';
4999
5000 regs[rd] = (uintptr_t)end + 1;
5001 mstate->dtms_scratch_ptr += size;
5002 break;
5003 }
5004
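/*
 * Illustrative outputs of the formatting above: lltostr(-10) yields
 * "-10"; with the optional base argument, lltostr(255, 16) yields
 * "0xff" and lltostr(8, 8) yields "010".
 */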
5005 case DIF_SUBR_HTONS:
5006 case DIF_SUBR_NTOHS:
5007 #ifdef _BIG_ENDIAN
5008 regs[rd] = (uint16_t)tupregs[0].dttk_value;
5009 #else
5010 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5011 #endif
5012 break;
5013
5014
5015 case DIF_SUBR_HTONL:
5016 case DIF_SUBR_NTOHL:
5017 #ifdef _BIG_ENDIAN
5018 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5019 #else
5020 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5021 #endif
5022 break;
5023
5024
5025 case DIF_SUBR_HTONLL:
5026 case DIF_SUBR_NTOHLL:
5027 #ifdef _BIG_ENDIAN
5028 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5029 #else
5030 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5031 #endif
5032 break;
5033
5034
5035 case DIF_SUBR_DIRNAME:
5036 case DIF_SUBR_BASENAME: {
5037 char *dest = (char *)mstate->dtms_scratch_ptr;
5038 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5039 uintptr_t src = tupregs[0].dttk_value;
5040 int i, j, len = dtrace_strlen((char *)src, size);
5041 int lastbase = -1, firstbase = -1, lastdir = -1;
5042 int start, end;
5043
5044 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5045 regs[rd] = 0;
5046 break;
5047 }
5048
5049 if (!DTRACE_INSCRATCH(mstate, size)) {
5050 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5051 regs[rd] = 0;
5052 break;
5053 }
5054
5055 /*
5056 * The basename and dirname of a zero-length string are
5057 * defined to be ".".
5058 */
5059 if (len == 0) {
5060 len = 1;
5061 src = (uintptr_t)".";
5062 }
5063
5064 /*
5065 * Start from the back of the string, moving back toward the
5066 * front until we see a character that isn't a slash. That
5067 * character is the last character in the basename.
5068 */
5069 for (i = len - 1; i >= 0; i--) {
5070 if (dtrace_load8(src + i) != '/')
5071 break;
5072 }
5073
5074 if (i >= 0)
5075 lastbase = i;
5076
5077 /*
5078 * Starting from the last character in the basename, move
5079 * towards the front until we find a slash. The character
5080 * that we processed immediately before that is the first
5081 * character in the basename.
5082 */
5083 for (; i >= 0; i--) {
5084 if (dtrace_load8(src + i) == '/')
5085 break;
5086 }
5087
5088 if (i >= 0)
5089 firstbase = i + 1;
5090
5091 /*
5092 * Now keep going until we find a non-slash character. That
5093 * character is the last character in the dirname.
5094 */
5095 for (; i >= 0; i--) {
5096 if (dtrace_load8(src + i) != '/')
5097 break;
5098 }
5099
5100 if (i >= 0)
5101 lastdir = i;
5102
5103 ASSERT(!(lastbase == -1 && firstbase != -1));
5104 ASSERT(!(firstbase == -1 && lastdir != -1));
5105
5106 if (lastbase == -1) {
5107 /*
5108 * We didn't find a non-slash character. We know that
5109 * the length is non-zero, so the whole string must be
5110 * slashes. In either the dirname or the basename
5111 * case, we return '/'.
5112 */
5113 ASSERT(firstbase == -1);
5114 firstbase = lastbase = lastdir = 0;
5115 }
5116
5117 if (firstbase == -1) {
5118 /*
5119 * The entire string consists only of a basename
5120 * component. If we're looking for dirname, we need
5121 * to change our string to be just "."; if we're
5122 * looking for a basename, we'll just set the first
5123 * character of the basename to be 0.
5124 */
5125 if (subr == DIF_SUBR_DIRNAME) {
5126 ASSERT(lastdir == -1);
5127 src = (uintptr_t)".";
5128 lastdir = 0;
5129 } else {
5130 firstbase = 0;
5131 }
5132 }
5133
5134 if (subr == DIF_SUBR_DIRNAME) {
5135 if (lastdir == -1) {
5136 /*
5137 * We know that we have a slash in the name --
5138 * or lastdir would be set to 0, above. And
5139 * because lastdir is -1, we know that this
5140 * slash must be the first character. (That
5141 * is, the full string must be of the form
5142 * "/basename".) In this case, the last
5143 * character of the directory name is 0.
5144 */
5145 lastdir = 0;
5146 }
5147
5148 start = 0;
5149 end = lastdir;
5150 } else {
5151 ASSERT(subr == DIF_SUBR_BASENAME);
5152 ASSERT(firstbase != -1 && lastbase != -1);
5153 start = firstbase;
5154 end = lastbase;
5155 }
5156
5157 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
5158 dest[j] = dtrace_load8(src + i);
5159
5160 dest[j] = '\0';
5161 regs[rd] = (uintptr_t)dest;
5162 mstate->dtms_scratch_ptr += size;
5163 break;
5164 }
5165
5166 case DIF_SUBR_CLEANPATH: {
5167 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5168 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5169 uintptr_t src = tupregs[0].dttk_value;
5170 size_t lim;
5171 size_t i = 0, j = 0;
5172
5173 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5174 regs[rd] = 0;
5175 break;
5176 }
5177
5178 if (!DTRACE_INSCRATCH(mstate, size)) {
5179 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5180 regs[rd] = 0;
5181 break;
5182 }
5183
5184 /*
5185 * Move forward, loading each character.
5186 */
5187 do {
5188 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5189 next:
5190 if ((uint64_t)(j + 5) >= size) /* 5 = sizeof ("/..c"), i.e. "/..c" plus NUL */
5191 break;
5192
5193 if (c != '/') {
5194 dest[j++] = c;
5195 continue;
5196 }
5197
5198 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5199
5200 if (c == '/') {
5201 /*
5202 * We have two slashes -- we can just advance
5203 * to the next character.
5204 */
5205 goto next;
5206 }
5207
5208 if (c != '.') {
5209 /*
5210 * This is not "." and it's not ".." -- we can
5211 * just store the "/" and this character and
5212 * drive on.
5213 */
5214 dest[j++] = '/';
5215 dest[j++] = c;
5216 continue;
5217 }
5218
5219 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5220
5221 if (c == '/') {
5222 /*
5223 * This is a "/./" component. We're not going
5224 * to store anything in the destination buffer;
5225 * we're just going to go to the next component.
5226 */
5227 goto next;
5228 }
5229
5230 if (c != '.') {
5231 /*
5232 * This is not ".." -- we can just store the
5233 * "/." and this character and continue
5234 * processing.
5235 */
5236 dest[j++] = '/';
5237 dest[j++] = '.';
5238 dest[j++] = c;
5239 continue;
5240 }
5241
5242 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5243
5244 if (c != '/' && c != '\0') {
5245 /*
5246 * This is not ".." -- it's "..[mumble]".
5247 * We'll store the "/.." and this character
5248 * and continue processing.
5249 */
5250 dest[j++] = '/';
5251 dest[j++] = '.';
5252 dest[j++] = '.';
5253 dest[j++] = c;
5254 continue;
5255 }
5256
5257 /*
5258 * This is "/../" or "/..\0". We need to back up
5259 * our destination pointer until we find a "/".
5260 */
5261 i--;
5262 while (j != 0 && dest[--j] != '/')
5263 continue;
5264
5265 if (c == '\0')
5266 dest[++j] = '/';
5267 } while (c != '\0');
5268
5269 dest[j] = '\0';
5270 regs[rd] = (uintptr_t)dest;
5271 mstate->dtms_scratch_ptr += size;
5272 break;
5273 }
5274
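/*
 * Illustratively, cleanpath("/foo//bar/../baz") collapses the doubled
 * slash and backs up over "bar", yielding "/foo/baz"; a trailing
 * "/.." as in "/foo/bar/.." yields "/foo/" (the trailing slash is
 * restored by the c == '\0' case above).
 */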
5275 case DIF_SUBR_INET_NTOA:
5276 case DIF_SUBR_INET_NTOA6:
5277 case DIF_SUBR_INET_NTOP: {
5278 size_t size;
5279 int af, argi, i;
5280 char *base, *end;
5281
5282 if (subr == DIF_SUBR_INET_NTOP) {
5283 af = (int)tupregs[0].dttk_value;
5284 argi = 1;
5285 } else {
5286 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5287 argi = 0;
5288 }
5289
5290 if (af == AF_INET) {
5291 #if !defined(__APPLE__)
5292 ipaddr_t ip4;
5293 #else
5294 uint32_t ip4;
5295 #endif /* __APPLE__ */
5296 uint8_t *ptr8, val;
5297
5298 /*
5299 * Safely load the IPv4 address.
5300 */
5301 #if !defined(__APPLE__)
5302 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5303 #else
5304 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
5305 mstate, vstate)) {
5306 regs[rd] = 0;
5307 break;
5308 }
5309
5310 dtrace_bcopy(
5311 (void *)(uintptr_t)tupregs[argi].dttk_value,
5312 (void *)(uintptr_t)&ip4, sizeof (ip4));
5313 #endif /* __APPLE__ */
5314 /*
5315 * Check an IPv4 string will fit in scratch.
5316 */
5317 #if !defined(__APPLE__)
5318 size = INET_ADDRSTRLEN;
5319 #else
5320 size = MAX_IPv4_STR_LEN;
5321 #endif /* __APPLE__ */
5322 if (!DTRACE_INSCRATCH(mstate, size)) {
5323 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5324 regs[rd] = 0;
5325 break;
5326 }
5327 base = (char *)mstate->dtms_scratch_ptr;
5328 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5329
5330 /*
5331 * Stringify as a dotted decimal quad.
5332 */
5333 *end-- = '\0';
5334 ptr8 = (uint8_t *)&ip4;
5335 for (i = 3; i >= 0; i--) {
5336 val = ptr8[i];
5337
5338 if (val == 0) {
5339 *end-- = '0';
5340 } else {
5341 for (; val; val /= 10) {
5342 *end-- = '0' + (val % 10);
5343 }
5344 }
5345
5346 if (i > 0)
5347 *end-- = '.';
5348 }
5349 ASSERT(end + 1 >= base);
5350
5351 } else if (af == AF_INET6) {
5352 #if defined(__APPLE__)
5353 #define _S6_un __u6_addr
5354 #define _S6_u8 __u6_addr8
5355 #endif /* __APPLE__ */
5356 struct in6_addr ip6;
5357 int firstzero, tryzero, numzero, v6end;
5358 uint16_t val;
5359 const char digits[] = "0123456789abcdef";
5360
5361 /*
5362 * Stringify using RFC 1884 convention 2 -- 16-bit
5363 * hexadecimal values with a zero-run compression.
5364 * Lower case hexadecimal digits are used.
5365 * e.g., fe80::214:4fff:fe0b:76c8.
5366 * The IPv4 embedded form is returned for inet_ntop,
5367 * just the IPv4 string is returned for inet_ntoa6.
5368 */
5369
5370 if (!dtrace_canload(tupregs[argi].dttk_value,
5371 sizeof(struct in6_addr), mstate, vstate)) {
5372 regs[rd] = 0;
5373 break;
5374 }
5375
5376 /*
5377 * Safely load the IPv6 address.
5378 */
5379 dtrace_bcopy(
5380 (void *)(uintptr_t)tupregs[argi].dttk_value,
5381 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5382
5383 /*
5384 * Check an IPv6 string will fit in scratch.
5385 */
5386 size = INET6_ADDRSTRLEN;
5387 if (!DTRACE_INSCRATCH(mstate, size)) {
5388 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5389 regs[rd] = 0;
5390 break;
5391 }
5392 base = (char *)mstate->dtms_scratch_ptr;
5393 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5394 *end-- = '\0';
5395
5396 /*
5397 * Find the longest run of 16 bit zero values
5398 * for the single allowed zero compression - "::".
5399 */
5400 firstzero = -1;
5401 tryzero = -1;
5402 numzero = 1;
5403 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5404 if (ip6._S6_un._S6_u8[i] == 0 &&
5405 tryzero == -1 && i % 2 == 0) {
5406 tryzero = i;
5407 continue;
5408 }
5409
5410 if (tryzero != -1 &&
5411 (ip6._S6_un._S6_u8[i] != 0 ||
5412 i == sizeof (struct in6_addr) - 1)) {
5413
5414 if (i - tryzero <= numzero) {
5415 tryzero = -1;
5416 continue;
5417 }
5418
5419 firstzero = tryzero;
5420 numzero = i - i % 2 - tryzero;
5421 tryzero = -1;
5422
5423 if (ip6._S6_un._S6_u8[i] == 0 &&
5424 i == sizeof (struct in6_addr) - 1)
5425 numzero += 2;
5426 }
5427 }
5428 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5429
5430 /*
5431 * Check for an IPv4 embedded address.
5432 */
5433 v6end = sizeof (struct in6_addr) - 2;
5434 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5435 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5436 for (i = sizeof (struct in6_addr) - 1;
5437 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5438 ASSERT(end >= base);
5439
5440 val = ip6._S6_un._S6_u8[i];
5441
5442 if (val == 0) {
5443 *end-- = '0';
5444 } else {
5445 for (; val; val /= 10) {
5446 *end-- = '0' + val % 10;
5447 }
5448 }
5449
5450 if (i > (int)DTRACE_V4MAPPED_OFFSET)
5451 *end-- = '.';
5452 }
5453
5454 if (subr == DIF_SUBR_INET_NTOA6)
5455 goto inetout;
5456
5457 /*
5458 * Set v6end to skip the IPv4 address that
5459 * we have already stringified.
5460 */
5461 v6end = 10;
5462 }
5463
5464 /*
5465 * Build the IPv6 string by working through the
5466 * address in reverse.
5467 */
5468 for (i = v6end; i >= 0; i -= 2) {
5469 ASSERT(end >= base);
5470
5471 if (i == firstzero + numzero - 2) {
5472 *end-- = ':';
5473 *end-- = ':';
5474 i -= numzero - 2;
5475 continue;
5476 }
5477
5478 if (i < 14 && i != firstzero - 2)
5479 *end-- = ':';
5480
5481 val = (ip6._S6_un._S6_u8[i] << 8) +
5482 ip6._S6_un._S6_u8[i + 1];
5483
5484 if (val == 0) {
5485 *end-- = '0';
5486 } else {
5487 for (; val; val /= 16) {
5488 *end-- = digits[val % 16];
5489 }
5490 }
5491 }
5492 ASSERT(end + 1 >= base);
5493
5494 #if defined(__APPLE__)
5495 #undef _S6_un
5496 #undef _S6_u8
5497 #endif /* __APPLE__ */
5498 } else {
5499 /*
5500 * The user didn't use AF_INET or AF_INET6.
5501 */
5502 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5503 regs[rd] = 0;
5504 break;
5505 }
5506
5507 inetout: regs[rd] = (uintptr_t)end + 1;
5508 mstate->dtms_scratch_ptr += size;
5509 break;
5510 }
5511
5512 case DIF_SUBR_JSON: {
5513 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5514 uintptr_t json = tupregs[0].dttk_value;
5515 size_t jsonlen = dtrace_strlen((char *)json, size);
5516 uintptr_t elem = tupregs[1].dttk_value;
5517 size_t elemlen = dtrace_strlen((char *)elem, size);
5518
5519 char *dest = (char *)mstate->dtms_scratch_ptr;
5520 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5521 char *ee = elemlist;
5522 int nelems = 1;
5523 uintptr_t cur;
5524
5525 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5526 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5527 regs[rd] = 0;
5528 break;
5529 }
5530
5531 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5532 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5533 regs[rd] = 0;
5534 break;
5535 }
5536
5537 /*
5538 * Read the element selector and split it up into a packed list
5539 * of strings.
5540 */
5541 for (cur = elem; cur < elem + elemlen; cur++) {
5542 char cc = dtrace_load8(cur);
5543
5544 if (cur == elem && cc == '[') {
5545 /*
5546 * If the first element selector key is
5547 * actually an array index then ignore the
5548 * bracket.
5549 */
5550 continue;
5551 }
5552
5553 if (cc == ']')
5554 continue;
5555
5556 if (cc == '.' || cc == '[') {
5557 nelems++;
5558 cc = '\0';
5559 }
5560
5561 *ee++ = cc;
5562 }
5563 *ee++ = '\0';
5564
5565 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5566 nelems, dest)) != 0)
5567 mstate->dtms_scratch_ptr += jsonlen + 1;
5568 break;
5569 }
5570
5571 case DIF_SUBR_TOUPPER:
5572 case DIF_SUBR_TOLOWER: {
5573 uintptr_t src = tupregs[0].dttk_value;
5574 char *dest = (char *)mstate->dtms_scratch_ptr;
5575 char lower, upper, base, c;
5576 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5577 size_t len = dtrace_strlen((char*) src, size);
5578 size_t i = 0;
5579
5580 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
5581 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
5582 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
5583
5584 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5585 regs[rd] = 0;
5586 break;
5587 }
5588
5589 if (!DTRACE_INSCRATCH(mstate, size)) {
5590 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5591 regs[rd] = 0;
5592 break;
5593 }
5594
5595 for (i = 0; i < size - 1; ++i) {
5596 if ((c = dtrace_load8(src + i)) == '\0')
5597 break;
5598 if (c >= lower && c <= upper)
5599 c = base + (c - lower);
5600 dest[i] = c;
5601 }
5602
5603 ASSERT(i < size);
5604
5605 dest[i] = '\0';
5606 regs[rd] = (uintptr_t) dest;
5607 mstate->dtms_scratch_ptr += size;
5608
5609 break;
5610 }
5611
5612 case DIF_SUBR_STRIP:
5613 if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
5614 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5615 break;
5616 }
5617 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5618 (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
5619 break;
5620
5621 #if defined(__APPLE__)
5622 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5623 if (!dtrace_priv_kernel(state)) {
5624 regs[rd] = 0;
5625 } else {
5626 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
5627 }
5628
5629 break;
5630 }
5631
5632 case DIF_SUBR_KDEBUG_TRACE: {
5633 uint32_t debugid;
5634 uintptr_t args[4] = {0};
5635 int i;
5636
5637 if (nargs < 2 || nargs > 5) {
5638 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5639 break;
5640 }
5641
5642 if (dtrace_destructive_disallow ||
5643 !dtrace_priv_kernel_destructive(state)) {
5644 return;
5645 }
5646
5647 debugid = tupregs[0].dttk_value;
5648 for (i = 0; i < nargs - 1; i++)
5649 args[i] = tupregs[i + 1].dttk_value;
5650
5651 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5652
5653 break;
5654 }
5655
5656 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5657 if (nargs != 3) {
5658 break;
5659 }
5660
5661 if (dtrace_destructive_disallow ||
5662 !dtrace_priv_kernel_destructive(state)) {
5663 return;
5664 }
5665
5666 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5667 uint32_t debugid = tupregs[0].dttk_value;
5668 uint64_t str_id = tupregs[1].dttk_value;
5669 uintptr_t src = tupregs[2].dttk_value;
5670 size_t lim;
5671 char buf[size];
5672 char *str = NULL;
5673
5674 if (src != (uintptr_t)0) {
5675 str = buf;
5676 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5677 break;
5678 }
5679 dtrace_strcpy((void*)src, buf, size);
5680 }
5681
5682 (void)kernel_debug_string(debugid, &str_id, str);
5683 regs[rd] = str_id;
5684
5685 break;
5686 }
5687
5688 case DIF_SUBR_MTONS:
5689 absolutetime_to_nanoseconds(tupregs[0].dttk_value, &regs[rd]);
5690
5691 break;
5692 case DIF_SUBR_PHYSMEM_READ: {
5693 #if DEBUG || DEVELOPMENT
5694 if (dtrace_destructive_disallow ||
5695 !dtrace_priv_kernel_destructive(state)) {
5696 return;
5697 }
5698 regs[rd] = dtrace_physmem_read(tupregs[0].dttk_value,
5699 tupregs[1].dttk_value);
5700 #else
5701 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5702 #endif /* DEBUG || DEVELOPMENT */
5703 break;
5704 }
5705 case DIF_SUBR_PHYSMEM_WRITE: {
5706 #if DEBUG || DEVELOPMENT
5707 if (dtrace_destructive_disallow ||
5708 !dtrace_priv_kernel_destructive(state)) {
5709 return;
5710 }
5711
5712 dtrace_physmem_write(tupregs[0].dttk_value,
5713 tupregs[1].dttk_value, (size_t)tupregs[2].dttk_value);
5714 #else
5715 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5716 #endif /* DEBUG || DEVELOPMENT */
5717 break;
5718 }
5719
5720 case DIF_SUBR_KVTOPHYS: {
5721 #if DEBUG || DEVELOPMENT
5722 regs[rd] = kvtophys(tupregs[0].dttk_value);
5723 #else
5724 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5725 #endif /* DEBUG || DEVELOPMENT */
5726 break;
5727 }
5728 #endif /* defined(__APPLE__) */
5729
5730 }
5731 }
5732
5733 /*
5734 * Emulate the execution of DTrace IR instructions specified by the given
5735 * DIF object. This function is deliberately void of assertions as all of
5736 * the necessary checks are handled by a call to dtrace_difo_validate().
5737 */
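/*
 * As an illustrative sketch of the model below: a D comparison such as
 * (x < y) is emulated as a DIF_OP_CMP that sets the condition codes
 * (cc_n, cc_z, cc_v, cc_c) much like a RISC subtract, followed by a
 * conditional branch (e.g. DIF_OP_BL) that tests them to redirect pc.
 */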
5738 static uint64_t
5739 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5740 dtrace_vstate_t *vstate, dtrace_state_t *state)
5741 {
5742 const dif_instr_t *text = difo->dtdo_buf;
5743 const uint_t textlen = difo->dtdo_len;
5744 const char *strtab = difo->dtdo_strtab;
5745 const uint64_t *inttab = difo->dtdo_inttab;
5746
5747 uint64_t rval = 0;
5748 dtrace_statvar_t *svar;
5749 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5750 dtrace_difv_t *v;
5751 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5752 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5753
5754 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5755 uint64_t regs[DIF_DIR_NREGS];
5756 uint64_t *tmp;
5757
5758 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5759 int64_t cc_r;
5760 uint_t pc = 0, id, opc = 0;
5761 uint8_t ttop = 0;
5762 dif_instr_t instr;
5763 uint_t r1, r2, rd;
5764
5765 /*
5766 * We stash the current DIF object into the machine state: we need it
5767 * for subsequent access checking.
5768 */
5769 mstate->dtms_difo = difo;
5770
5771 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5772
5773 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5774 opc = pc;
5775
5776 instr = text[pc++];
5777 r1 = DIF_INSTR_R1(instr);
5778 r2 = DIF_INSTR_R2(instr);
5779 rd = DIF_INSTR_RD(instr);
5780
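/*
 * Illustrative sketch of the decode step above: each DIF instruction is a
 * fixed-width 32-bit word, and the DIF_INSTR_* macros are simple shift/mask
 * field extractors. The layout shown here is an assumption for illustration;
 * the authoritative encoding is in <sys/dtrace.h>.
 */
#if 0
#define	SKETCH_OP(i)	(((i) >> 24) & 0xff)	/* opcode: selects a case below */
#define	SKETCH_R1(i)	(((i) >> 16) & 0xff)	/* first source register */
#define	SKETCH_R2(i)	(((i) >>  8) & 0xff)	/* second source register */
#define	SKETCH_RD(i)	((i) & 0xff)		/* destination register */
#endif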
5781 switch (DIF_INSTR_OP(instr)) {
5782 case DIF_OP_OR:
5783 regs[rd] = regs[r1] | regs[r2];
5784 break;
5785 case DIF_OP_XOR:
5786 regs[rd] = regs[r1] ^ regs[r2];
5787 break;
5788 case DIF_OP_AND:
5789 regs[rd] = regs[r1] & regs[r2];
5790 break;
5791 case DIF_OP_SLL:
5792 regs[rd] = regs[r1] << regs[r2];
5793 break;
5794 case DIF_OP_SRL:
5795 regs[rd] = regs[r1] >> regs[r2];
5796 break;
5797 case DIF_OP_SUB:
5798 regs[rd] = regs[r1] - regs[r2];
5799 break;
5800 case DIF_OP_ADD:
5801 regs[rd] = regs[r1] + regs[r2];
5802 break;
5803 case DIF_OP_MUL:
5804 regs[rd] = regs[r1] * regs[r2];
5805 break;
5806 case DIF_OP_SDIV:
5807 if (regs[r2] == 0) {
5808 regs[rd] = 0;
5809 *flags |= CPU_DTRACE_DIVZERO;
5810 } else {
5811 regs[rd] = (int64_t)regs[r1] /
5812 (int64_t)regs[r2];
5813 }
5814 break;
5815
5816 case DIF_OP_UDIV:
5817 if (regs[r2] == 0) {
5818 regs[rd] = 0;
5819 *flags |= CPU_DTRACE_DIVZERO;
5820 } else {
5821 regs[rd] = regs[r1] / regs[r2];
5822 }
5823 break;
5824
5825 case DIF_OP_SREM:
5826 if (regs[r2] == 0) {
5827 regs[rd] = 0;
5828 *flags |= CPU_DTRACE_DIVZERO;
5829 } else {
5830 regs[rd] = (int64_t)regs[r1] %
5831 (int64_t)regs[r2];
5832 }
5833 break;
5834
5835 case DIF_OP_UREM:
5836 if (regs[r2] == 0) {
5837 regs[rd] = 0;
5838 *flags |= CPU_DTRACE_DIVZERO;
5839 } else {
5840 regs[rd] = regs[r1] % regs[r2];
5841 }
5842 break;
5843
5844 case DIF_OP_NOT:
5845 regs[rd] = ~regs[r1];
5846 break;
5847 case DIF_OP_MOV:
5848 regs[rd] = regs[r1];
5849 break;
5850 case DIF_OP_CMP:
5851 cc_r = regs[r1] - regs[r2];
5852 cc_n = cc_r < 0;
5853 cc_z = cc_r == 0;
5854 cc_v = 0;
5855 cc_c = regs[r1] < regs[r2];
5856 break;
5857 case DIF_OP_TST:
5858 cc_n = cc_v = cc_c = 0;
5859 cc_z = regs[r1] == 0;
5860 break;
5861 case DIF_OP_BA:
5862 pc = DIF_INSTR_LABEL(instr);
5863 break;
5864 case DIF_OP_BE:
5865 if (cc_z)
5866 pc = DIF_INSTR_LABEL(instr);
5867 break;
5868 case DIF_OP_BNE:
5869 if (cc_z == 0)
5870 pc = DIF_INSTR_LABEL(instr);
5871 break;
5872 case DIF_OP_BG:
5873 if ((cc_z | (cc_n ^ cc_v)) == 0)
5874 pc = DIF_INSTR_LABEL(instr);
5875 break;
5876 case DIF_OP_BGU:
5877 if ((cc_c | cc_z) == 0)
5878 pc = DIF_INSTR_LABEL(instr);
5879 break;
5880 case DIF_OP_BGE:
5881 if ((cc_n ^ cc_v) == 0)
5882 pc = DIF_INSTR_LABEL(instr);
5883 break;
5884 case DIF_OP_BGEU:
5885 if (cc_c == 0)
5886 pc = DIF_INSTR_LABEL(instr);
5887 break;
5888 case DIF_OP_BL:
5889 if (cc_n ^ cc_v)
5890 pc = DIF_INSTR_LABEL(instr);
5891 break;
5892 case DIF_OP_BLU:
5893 if (cc_c)
5894 pc = DIF_INSTR_LABEL(instr);
5895 break;
5896 case DIF_OP_BLE:
5897 if (cc_z | (cc_n ^ cc_v))
5898 pc = DIF_INSTR_LABEL(instr);
5899 break;
5900 case DIF_OP_BLEU:
5901 if (cc_c | cc_z)
5902 pc = DIF_INSTR_LABEL(instr);
5903 break;
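/*
 * Illustrative sketch: DIF_OP_CMP computes SPARC-style condition codes once,
 * and each branch above combines them. Because the emulator pins cc_v to
 * zero, the signed predicates reduce to the sign of the wrapped difference
 * and are exact only when regs[r1] - regs[r2] fits in int64_t; the unsigned
 * predicates are always exact. A minimal user-space analogue:
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void
sketch_cc(uint64_t a, uint64_t b)
{
	int64_t cc_r = a - b;		/* wrapping subtract, as in DIF_OP_CMP */
	uint8_t cc_n = cc_r < 0;	/* negative */
	uint8_t cc_z = cc_r == 0;	/* zero */
	uint8_t cc_v = 0;		/* overflow: the emulator leaves it clear */
	uint8_t cc_c = a < b;		/* carry: unsigned borrow */

	/* Unsigned predicates are exact for all inputs: */
	assert(((cc_c | cc_z) == 0) == (a > b));	/* DIF_OP_BGU */
	assert((cc_c == 0) == (a >= b));		/* DIF_OP_BGEU */

	/*
	 * Signed predicates (DIF_OP_BG/BGE/BL/BLE) test cc_n ^ cc_v, which
	 * with cc_v == 0 is just the sign of the wrapped difference --
	 * exact only when a - b does not overflow int64_t.
	 */
	(void)cc_n; (void)cc_v;
}
#endif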
5904 case DIF_OP_RLDSB:
5905 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5906 *flags |= CPU_DTRACE_KPRIV;
5907 *illval = regs[r1];
5908 break;
5909 }
5910 OS_FALLTHROUGH;
5911 case DIF_OP_LDSB:
5912 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5913 break;
5914 case DIF_OP_RLDSH:
5915 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5916 *flags |= CPU_DTRACE_KPRIV;
5917 *illval = regs[r1];
5918 break;
5919 }
5920 OS_FALLTHROUGH;
5921 case DIF_OP_LDSH:
5922 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5923 break;
5924 case DIF_OP_RLDSW:
5925 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5926 *flags |= CPU_DTRACE_KPRIV;
5927 *illval = regs[r1];
5928 break;
5929 }
5930 OS_FALLTHROUGH;
5931 case DIF_OP_LDSW:
5932 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5933 break;
5934 case DIF_OP_RLDUB:
5935 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5936 *flags |= CPU_DTRACE_KPRIV;
5937 *illval = regs[r1];
5938 break;
5939 }
5940 OS_FALLTHROUGH;
5941 case DIF_OP_LDUB:
5942 regs[rd] = dtrace_load8(regs[r1]);
5943 break;
5944 case DIF_OP_RLDUH:
5945 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5946 *flags |= CPU_DTRACE_KPRIV;
5947 *illval = regs[r1];
5948 break;
5949 }
5950 OS_FALLTHROUGH;
5951 case DIF_OP_LDUH:
5952 regs[rd] = dtrace_load16(regs[r1]);
5953 break;
5954 case DIF_OP_RLDUW:
5955 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5956 *flags |= CPU_DTRACE_KPRIV;
5957 *illval = regs[r1];
5958 break;
5959 }
5960 OS_FALLTHROUGH;
5961 case DIF_OP_LDUW:
5962 regs[rd] = dtrace_load32(regs[r1]);
5963 break;
5964 case DIF_OP_RLDX:
5965 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5966 *flags |= CPU_DTRACE_KPRIV;
5967 *illval = regs[r1];
5968 break;
5969 }
5970 OS_FALLTHROUGH;
5971 case DIF_OP_LDX:
5972 regs[rd] = dtrace_load64(regs[r1]);
5973 break;
5974 /*
5975 * Darwin 32-bit kernel may fetch from 64-bit user.
5976 * Do not cast regs to uintptr_t in
5977 * DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB,
5978 * DIF_OP_ULDUH, DIF_OP_ULDUW, or DIF_OP_ULDX.
5979 */
5980 case DIF_OP_ULDSB:
5981 regs[rd] = (int8_t)
5982 dtrace_fuword8(regs[r1]);
5983 break;
5984 case DIF_OP_ULDSH:
5985 regs[rd] = (int16_t)
5986 dtrace_fuword16(regs[r1]);
5987 break;
5988 case DIF_OP_ULDSW:
5989 regs[rd] = (int32_t)
5990 dtrace_fuword32(regs[r1]);
5991 break;
5992 case DIF_OP_ULDUB:
5993 regs[rd] =
5994 dtrace_fuword8(regs[r1]);
5995 break;
5996 case DIF_OP_ULDUH:
5997 regs[rd] =
5998 dtrace_fuword16(regs[r1]);
5999 break;
6000 case DIF_OP_ULDUW:
6001 regs[rd] =
6002 dtrace_fuword32(regs[r1]);
6003 break;
6004 case DIF_OP_ULDX:
6005 regs[rd] =
6006 dtrace_fuword64(regs[r1]);
6007 break;
6008 case DIF_OP_RET:
6009 rval = regs[rd];
6010 pc = textlen;
6011 break;
6012 case DIF_OP_NOP:
6013 break;
6014 case DIF_OP_SETX:
6015 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6016 break;
6017 case DIF_OP_SETS:
6018 regs[rd] = (uint64_t)(uintptr_t)
6019 (strtab + DIF_INSTR_STRING(instr));
6020 break;
6021 case DIF_OP_SCMP: {
6022 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6023 uintptr_t s1 = regs[r1];
6024 uintptr_t s2 = regs[r2];
6025 size_t lim1 = sz, lim2 = sz;
6026
6027 if (s1 != 0 &&
6028 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
6029 break;
6030 if (s2 != 0 &&
6031 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
6032 break;
6033
6034 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
6035 MIN(lim1, lim2));
6036
6037 cc_n = cc_r < 0;
6038 cc_z = cc_r == 0;
6039 cc_v = cc_c = 0;
6040 break;
6041 }
6042 case DIF_OP_LDGA:
6043 regs[rd] = dtrace_dif_variable(mstate, state,
6044 r1, regs[r2]);
6045 break;
6046 case DIF_OP_LDGS:
6047 id = DIF_INSTR_VAR(instr);
6048
6049 if (id >= DIF_VAR_OTHER_UBASE) {
6050 uintptr_t a;
6051
6052 id -= DIF_VAR_OTHER_UBASE;
6053 svar = vstate->dtvs_globals[id];
6054 ASSERT(svar != NULL);
6055 v = &svar->dtsv_var;
6056
6057 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6058 regs[rd] = svar->dtsv_data;
6059 break;
6060 }
6061
6062 a = (uintptr_t)svar->dtsv_data;
6063
6064 if (*(uint8_t *)a == UINT8_MAX) {
6065 /*
6066 * If the 0th byte is set to UINT8_MAX
6067 * then this is to be treated as a
6068 * reference to a NULL variable.
6069 */
6070 regs[rd] = 0;
6071 } else {
6072 regs[rd] = a + sizeof (uint64_t);
6073 }
6074
6075 break;
6076 }
6077
6078 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6079 break;
6080
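/*
 * Illustrative sketch of the by-ref static-variable slot implied by the
 * LDGS case above (and by STGS below): an 8-byte header precedes the data,
 * and byte 0 of the header doubles as a NULL marker.
 *
 *	+------------------------------+---------------------+
 *	| 8-byte header                | variable data bytes |
 *	| byte 0 == UINT8_MAX => NULL  |                     |
 *	+------------------------------+---------------------+
 *	a                              a + sizeof (uint64_t)
 */
#if 0
static uint64_t
sketch_byref_load(uintptr_t a)
{
	if (*(uint8_t *)a == UINT8_MAX)
		return (0);			/* variable holds NULL */
	return (a + sizeof (uint64_t));		/* address of the payload */
}
#endif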
6081 case DIF_OP_STGS:
6082 id = DIF_INSTR_VAR(instr);
6083
6084 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6085 id -= DIF_VAR_OTHER_UBASE;
6086
6087 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6088 svar = vstate->dtvs_globals[id];
6089 ASSERT(svar != NULL);
6090 v = &svar->dtsv_var;
6091
6092 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6093 uintptr_t a = (uintptr_t)svar->dtsv_data;
6094 size_t lim;
6095
6096 ASSERT(a != 0);
6097 ASSERT(svar->dtsv_size != 0);
6098
6099 if (regs[rd] == 0) {
6100 *(uint8_t *)a = UINT8_MAX;
6101 break;
6102 } else {
6103 *(uint8_t *)a = 0;
6104 a += sizeof (uint64_t);
6105 }
6106 if (!dtrace_vcanload(
6107 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6108 &lim, mstate, vstate))
6109 break;
6110
6111 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6112 (void *)a, &v->dtdv_type, lim);
6113 break;
6114 }
6115
6116 svar->dtsv_data = regs[rd];
6117 break;
6118
6119 case DIF_OP_LDTA:
6120 /*
6121 * There are no DTrace built-in thread-local arrays at
6122 * present. This opcode is saved for future work.
6123 */
6124 *flags |= CPU_DTRACE_ILLOP;
6125 regs[rd] = 0;
6126 break;
6127
6128 case DIF_OP_LDLS:
6129 id = DIF_INSTR_VAR(instr);
6130
6131 if (id < DIF_VAR_OTHER_UBASE) {
6132 /*
6133 * For now, this has no meaning.
6134 */
6135 regs[rd] = 0;
6136 break;
6137 }
6138
6139 id -= DIF_VAR_OTHER_UBASE;
6140
6141 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6142 ASSERT(vstate->dtvs_locals != NULL);
6143 svar = vstate->dtvs_locals[id];
6144 ASSERT(svar != NULL);
6145 v = &svar->dtsv_var;
6146
6147 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6148 uintptr_t a = (uintptr_t)svar->dtsv_data;
6149 size_t sz = v->dtdv_type.dtdt_size;
6150
6151 sz += sizeof (uint64_t);
6152 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6153 a += CPU->cpu_id * sz;
6154
6155 if (*(uint8_t *)a == UINT8_MAX) {
6156 /*
6157 * If the 0th byte is set to UINT8_MAX
6158 * then this is to be treated as a
6159 * reference to a NULL variable.
6160 */
6161 regs[rd] = 0;
6162 } else {
6163 regs[rd] = a + sizeof (uint64_t);
6164 }
6165
6166 break;
6167 }
6168
6169 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6170 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6171 regs[rd] = tmp[CPU->cpu_id];
6172 break;
6173
6174 case DIF_OP_STLS:
6175 id = DIF_INSTR_VAR(instr);
6176
6177 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6178 id -= DIF_VAR_OTHER_UBASE;
6179 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6180 ASSERT(vstate->dtvs_locals != NULL);
6181 svar = vstate->dtvs_locals[id];
6182 ASSERT(svar != NULL);
6183 v = &svar->dtsv_var;
6184
6185 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6186 uintptr_t a = (uintptr_t)svar->dtsv_data;
6187 size_t sz = v->dtdv_type.dtdt_size;
6188 size_t lim;
6189
6190 sz += sizeof (uint64_t);
6191 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6192 a += CPU->cpu_id * sz;
6193
6194 if (regs[rd] == 0) {
6195 *(uint8_t *)a = UINT8_MAX;
6196 break;
6197 } else {
6198 *(uint8_t *)a = 0;
6199 a += sizeof (uint64_t);
6200 }
6201
6202 if (!dtrace_vcanload(
6203 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6204 &lim, mstate, vstate))
6205 break;
6206
6207 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6208 (void *)a, &v->dtdv_type, lim);
6209 break;
6210 }
6211
6212 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6213 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6214 tmp[CPU->cpu_id] = regs[rd];
6215 break;
6216
6217 case DIF_OP_LDTS: {
6218 dtrace_dynvar_t *dvar;
6219 dtrace_key_t *key;
6220
6221 id = DIF_INSTR_VAR(instr);
6222 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6223 id -= DIF_VAR_OTHER_UBASE;
6224 v = &vstate->dtvs_tlocals[id];
6225
6226 key = &tupregs[DIF_DTR_NREGS];
6227 key[0].dttk_value = (uint64_t)id;
6228 key[0].dttk_size = 0;
6229 DTRACE_TLS_THRKEY(key[1].dttk_value);
6230 key[1].dttk_size = 0;
6231
6232 dvar = dtrace_dynvar(dstate, 2, key,
6233 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6234 mstate, vstate);
6235
6236 if (dvar == NULL) {
6237 regs[rd] = 0;
6238 break;
6239 }
6240
6241 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6242 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6243 } else {
6244 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6245 }
6246
6247 break;
6248 }
6249
6250 case DIF_OP_STTS: {
6251 dtrace_dynvar_t *dvar;
6252 dtrace_key_t *key;
6253
6254 id = DIF_INSTR_VAR(instr);
6255 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6256 id -= DIF_VAR_OTHER_UBASE;
6257 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6258
6259 key = &tupregs[DIF_DTR_NREGS];
6260 key[0].dttk_value = (uint64_t)id;
6261 key[0].dttk_size = 0;
6262 DTRACE_TLS_THRKEY(key[1].dttk_value);
6263 key[1].dttk_size = 0;
6264 v = &vstate->dtvs_tlocals[id];
6265
6266 dvar = dtrace_dynvar(dstate, 2, key,
6267 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6268 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6269 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6270 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6271
6272 /*
6273 * Given that we're storing to thread-local data,
6274 * we need to flush our predicate cache.
6275 */
6276 dtrace_set_thread_predcache(current_thread(), 0);
6277
6278 if (dvar == NULL)
6279 break;
6280
6281 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6282 size_t lim;
6283
6284 if (!dtrace_vcanload(
6285 (void *)(uintptr_t)regs[rd],
6286 &v->dtdv_type, &lim, mstate, vstate))
6287 break;
6288
6289 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6290 dvar->dtdv_data, &v->dtdv_type, lim);
6291 } else {
6292 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6293 }
6294
6295 break;
6296 }
6297
6298 case DIF_OP_SRA:
6299 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6300 break;
6301
6302 case DIF_OP_CALL:
6303 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6304 regs, tupregs, ttop, mstate, state);
6305 break;
6306
6307 case DIF_OP_PUSHTR:
6308 if (ttop == DIF_DTR_NREGS) {
6309 *flags |= CPU_DTRACE_TUPOFLOW;
6310 break;
6311 }
6312
6313 if (r1 == DIF_TYPE_STRING) {
6314 /*
6315 * If this is a string type and the size is 0,
6316 * we'll use the system-wide default string
6317 * size. Note that we are _not_ looking at
6318 * the value of the DTRACEOPT_STRSIZE option;
6319 * had this been set, we would expect to have
6320 * a non-zero size value in the "pushtr".
6321 */
6322 tupregs[ttop].dttk_size =
6323 dtrace_strlen((char *)(uintptr_t)regs[rd],
6324 regs[r2] ? regs[r2] :
6325 dtrace_strsize_default) + 1;
6326 } else {
6327 if (regs[r2] > LONG_MAX) {
6328 *flags |= CPU_DTRACE_ILLOP;
6329 break;
6330 }
6331 tupregs[ttop].dttk_size = regs[r2];
6332 }
6333
6334 tupregs[ttop++].dttk_value = regs[rd];
6335 break;
6336
6337 case DIF_OP_PUSHTV:
6338 if (ttop == DIF_DTR_NREGS) {
6339 *flags |= CPU_DTRACE_TUPOFLOW;
6340 break;
6341 }
6342
6343 tupregs[ttop].dttk_value = regs[rd];
6344 tupregs[ttop++].dttk_size = 0;
6345 break;
6346
6347 case DIF_OP_POPTS:
6348 if (ttop != 0)
6349 ttop--;
6350 break;
6351
6352 case DIF_OP_FLUSHTS:
6353 ttop = 0;
6354 break;
6355
6356 case DIF_OP_LDGAA:
6357 case DIF_OP_LDTAA: {
6358 dtrace_dynvar_t *dvar;
6359 dtrace_key_t *key = tupregs;
6360 uint_t nkeys = ttop;
6361
6362 id = DIF_INSTR_VAR(instr);
6363 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6364 id -= DIF_VAR_OTHER_UBASE;
6365
6366 key[nkeys].dttk_value = (uint64_t)id;
6367 key[nkeys++].dttk_size = 0;
6368
6369 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6370 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6371 key[nkeys++].dttk_size = 0;
6372 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6373 v = &vstate->dtvs_tlocals[id];
6374 } else {
6375 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6376 v = &vstate->dtvs_globals[id]->dtsv_var;
6377 }
6378
6379 dvar = dtrace_dynvar(dstate, nkeys, key,
6380 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6381 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6382 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6383
6384 if (dvar == NULL) {
6385 regs[rd] = 0;
6386 break;
6387 }
6388
6389 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6390 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6391 } else {
6392 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6393 }
6394
6395 break;
6396 }
6397
6398 case DIF_OP_STGAA:
6399 case DIF_OP_STTAA: {
6400 dtrace_dynvar_t *dvar;
6401 dtrace_key_t *key = tupregs;
6402 uint_t nkeys = ttop;
6403
6404 id = DIF_INSTR_VAR(instr);
6405 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6406 id -= DIF_VAR_OTHER_UBASE;
6407
6408 key[nkeys].dttk_value = (uint64_t)id;
6409 key[nkeys++].dttk_size = 0;
6410
6411 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6412 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6413 key[nkeys++].dttk_size = 0;
6414 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6415 v = &vstate->dtvs_tlocals[id];
6416 } else {
6417 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6418 v = &vstate->dtvs_globals[id]->dtsv_var;
6419 }
6420
6421 dvar = dtrace_dynvar(dstate, nkeys, key,
6422 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6423 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6424 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6425 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6426
6427 if (dvar == NULL)
6428 break;
6429
6430 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6431 size_t lim;
6432
6433 if (!dtrace_vcanload(
6434 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6435 &lim, mstate, vstate))
6436 break;
6437
6438 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6439 dvar->dtdv_data, &v->dtdv_type, lim);
6440 } else {
6441 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6442 }
6443
6444 break;
6445 }
6446
6447 case DIF_OP_ALLOCS: {
6448 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6449 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6450
6451 /*
6452 * Rounding up the user allocation size could have
6453 * overflowed a large, bogus allocation (like -1ULL)
6454 * to 0.
6455 */
6456 if (size < regs[r1] ||
6457 !DTRACE_INSCRATCH(mstate, size)) {
6458 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6459 regs[rd] = 0;
6460 break;
6461 }
6462
6463 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6464 mstate->dtms_scratch_ptr += size;
6465 regs[rd] = ptr;
6466 break;
6467 }
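/*
 * Illustrative sketch of the alloca()-style scratch allocation above: round
 * the scratch pointer up to an 8-byte boundary, charge the caller for the
 * padding plus the request, and treat a wrapped sum (size < request) as an
 * allocation failure. Names here are hypothetical; the real limit check is
 * the DTRACE_INSCRATCH() macro.
 */
#if 0
static uintptr_t
sketch_allocs(uintptr_t *scratch_ptr, uintptr_t scratch_end, uint64_t req)
{
	uintptr_t ptr = (*scratch_ptr + 7) & ~(uintptr_t)7;	/* P2ROUNDUP(_, 8) */
	size_t size = ptr - *scratch_ptr + req;	/* padding + request; may wrap */

	if (size < req || size > scratch_end - *scratch_ptr)
		return (0);	/* wrapped (e.g. req == -1ULL), or no room left */

	*scratch_ptr += size;
	return (ptr);		/* 8-byte aligned block of req bytes */
}
#endif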
6468
6469 case DIF_OP_COPYS:
6470 if (!dtrace_canstore(regs[rd], regs[r2],
6471 mstate, vstate)) {
6472 *flags |= CPU_DTRACE_BADADDR;
6473 *illval = regs[rd];
6474 break;
6475 }
6476
6477 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6478 break;
6479
6480 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6481 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6482 break;
6483
6484 case DIF_OP_STB:
6485 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6486 *flags |= CPU_DTRACE_BADADDR;
6487 *illval = regs[rd];
6488 break;
6489 }
6490 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6491 break;
6492
6493 case DIF_OP_STH:
6494 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6495 *flags |= CPU_DTRACE_BADADDR;
6496 *illval = regs[rd];
6497 break;
6498 }
6499 if (regs[rd] & 1) {
6500 *flags |= CPU_DTRACE_BADALIGN;
6501 *illval = regs[rd];
6502 break;
6503 }
6504 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6505 break;
6506
6507 case DIF_OP_STW:
6508 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6509 *flags |= CPU_DTRACE_BADADDR;
6510 *illval = regs[rd];
6511 break;
6512 }
6513 if (regs[rd] & 3) {
6514 *flags |= CPU_DTRACE_BADALIGN;
6515 *illval = regs[rd];
6516 break;
6517 }
6518 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6519 break;
6520
6521 case DIF_OP_STX:
6522 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6523 *flags |= CPU_DTRACE_BADADDR;
6524 *illval = regs[rd];
6525 break;
6526 }
6527
6528 /*
6529 * Darwin kmem_zalloc(), as called from
6530 * dtrace_difo_init(), guarantees only 4-byte alignment.
6531 */
6532 if (regs[rd] & 3) {
6533 *flags |= CPU_DTRACE_BADALIGN;
6534 *illval = regs[rd];
6535 break;
6536 }
6537 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6538 break;
6539 case DIF_OP_STRIP:
6540 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6541 (void*)regs[r1], r2);
6542 break;
6543 }
6544 }
6545
6546 if (!(*flags & CPU_DTRACE_FAULT))
6547 return (rval);
6548
6549 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6550 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6551
6552 return (0);
6553 }
6554
6555 __attribute__((noinline))
6556 static void
6557 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6558 {
6559 dtrace_probe_t *probe = ecb->dte_probe;
6560 dtrace_provider_t *prov = probe->dtpr_provider;
6561 char c[DTRACE_FULLNAMELEN + 80], *str;
6562 const char *msg = "dtrace: breakpoint action at probe ";
6563 const char *ecbmsg = " (ecb ";
6564 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6565 uintptr_t val = (uintptr_t)ecb;
6566 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6567
6568 if (dtrace_destructive_disallow)
6569 return;
6570
6571 /*
6572 * It's impossible to be taking action on the NULL probe.
6573 */
6574 ASSERT(probe != NULL);
6575
6576 /*
6577 * This is a poor man's (destitute man's?) sprintf(): we want to
6578 * print the provider name, module name, function name and name of
6579 * the probe, along with the hex address of the ECB with the breakpoint
6580 * action -- all of which we must place in the character buffer by
6581 * hand.
6582 */
6583 while (*msg != '\0')
6584 c[i++] = *msg++;
6585
6586 for (str = prov->dtpv_name; *str != '\0'; str++)
6587 c[i++] = *str;
6588 c[i++] = ':';
6589
6590 for (str = probe->dtpr_mod; *str != '\0'; str++)
6591 c[i++] = *str;
6592 c[i++] = ':';
6593
6594 for (str = probe->dtpr_func; *str != '\0'; str++)
6595 c[i++] = *str;
6596 c[i++] = ':';
6597
6598 for (str = probe->dtpr_name; *str != '\0'; str++)
6599 c[i++] = *str;
6600
6601 while (*ecbmsg != '\0')
6602 c[i++] = *ecbmsg++;
6603
6604 while (shift >= 0) {
6605 mask = (uintptr_t)0xf << shift;
6606
6607 if (val >= ((uintptr_t)1 << shift))
6608 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6609 shift -= 4;
6610 }
6611
6612 c[i++] = ')';
6613 c[i] = '\0';
6614
6615 debug_enter(c);
6616 }
6617
6618 __attribute__((noinline))
6619 static void
6620 dtrace_action_panic(dtrace_ecb_t *ecb)
6621 {
6622 dtrace_probe_t *probe = ecb->dte_probe;
6623
6624 /*
6625 * It's impossible to be taking action on the NULL probe.
6626 */
6627 ASSERT(probe != NULL);
6628
6629 if (dtrace_destructive_disallow)
6630 return;
6631
6632 if (dtrace_panicked != NULL)
6633 return;
6634
6635 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6636 return;
6637
6638 /*
6639 * We won the right to panic. (We want to be sure that only one
6640 * thread calls panic() from dtrace_probe(), and that panic() is
6641 * called exactly once.)
6642 */
6643 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6644 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6645 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6646
6647 /*
6648 * APPLE NOTE: this was for an old Mac OS X debug feature
6649 * allowing a return from panic(). Revisit someday.
6650 */
6651 dtrace_panicked = NULL;
6652 }
6653
6654 static void
6655 dtrace_action_raise(uint64_t sig)
6656 {
6657 if (dtrace_destructive_disallow)
6658 return;
6659
6660 if (sig >= NSIG) {
6661 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6662 return;
6663 }
6664
6665 /*
6666 * raise() has a queue depth of 1 -- we ignore all subsequent
6667 * invocations of the raise() action.
6668 */
6669
6670 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6671
6672 if (uthread && uthread->t_dtrace_sig == 0) {
6673 uthread->t_dtrace_sig = sig;
6674 act_set_astbsd(current_thread());
6675 }
6676 }
6677
6678 static void
6679 dtrace_action_stop(void)
6680 {
6681 if (dtrace_destructive_disallow)
6682 return;
6683
6684 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6685 if (uthread) {
6686 /*
6687 * The currently running process will be suspended via
6688 * task_suspend() when it next leaves the kernel.
6689 */
6690 uthread->t_dtrace_stop = 1;
6691 act_set_astbsd(current_thread());
6692 }
6693 }
6694
6695
6696 /*
6697 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6698 * Both activate only when the currently running process next leaves the
6699 * kernel.
6700 */
6701 static void
6702 dtrace_action_pidresume(uint64_t pid)
6703 {
6704 if (dtrace_destructive_disallow)
6705 return;
6706
6707 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6708 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6709 return;
6710 }
6711 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6712
6713 /*
6714 * When the currently running process leaves the kernel, it attempts to
6715 * task_resume the process (denoted by pid), if that pid appears to have
6716 * been stopped by dtrace_action_stop().
6717 * The currently running process has a pidresume() queue depth of 1 --
6718 * subsequent invocations of the pidresume() action are ignored.
6719 */
6720
6721 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6722 uthread->t_dtrace_resumepid = pid;
6723 act_set_astbsd(current_thread());
6724 }
6725 }
6726
6727 __attribute__((noinline))
6728 static void
6729 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6730 {
6731 hrtime_t now;
6732 volatile uint16_t *flags;
6733 dtrace_cpu_t *cpu = CPU;
6734
6735 if (dtrace_destructive_disallow)
6736 return;
6737
6738 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6739
6740 now = dtrace_gethrtime();
6741
6742 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6743 /*
6744 * We need to advance the mark to the current time.
6745 */
6746 cpu->cpu_dtrace_chillmark = now;
6747 cpu->cpu_dtrace_chilled = 0;
6748 }
6749
6750 /*
6751 * Now check to see if the requested chill time would take us over
6752 * the maximum amount of time allowed in the chill interval. (Or
6753 * worse, if the calculation itself induces overflow.)
6754 */
6755 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6756 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6757 *flags |= CPU_DTRACE_ILLOP;
6758 return;
6759 }
6760
6761 while (dtrace_gethrtime() - now < val)
6762 continue;
6763
6764 /*
6765 * Normally, we assure that the value of the variable "timestamp" does
6766 * not change within an ECB. The presence of chill() represents an
6767 * exception to this rule, however.
6768 */
6769 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6770 cpu->cpu_dtrace_chilled += val;
6771 }
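/*
 * Illustrative sketch of the two-part budget test above: the second clause
 * catches the case where adding the requested chill time to the amount
 * already consumed wraps around, which would otherwise slip past the
 * dtrace_chill_max comparison. A minimal unsigned analogue:
 */
#if 0
static int
sketch_chill_over_budget(uint64_t chilled, uint64_t val, uint64_t max)
{
	return (chilled + val > max ||		/* over the per-interval cap */
	    chilled + val < chilled);		/* ...or the sum wrapped around */
}
#endif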
6772
6773 __attribute__((noinline))
6774 static void
6775 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6776 uint64_t *buf, uint64_t arg)
6777 {
6778 int nframes = DTRACE_USTACK_NFRAMES(arg);
6779 int strsize = DTRACE_USTACK_STRSIZE(arg);
6780 uint64_t *pcs = &buf[1], *fps;
6781 char *str = (char *)&pcs[nframes];
6782 int size, offs = 0, i, j;
6783 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6784 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6785 char *sym;
6786
6787 /*
6788 * Should be taking a faster path if string space has not been
6789 * allocated.
6790 */
6791 ASSERT(strsize != 0);
6792
6793 /*
6794 * We will first allocate some temporary space for the frame pointers.
6795 */
6796 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6797 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6798 (nframes * sizeof (uint64_t));
6799
6800 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6801 /*
6802 * Not enough room for our frame pointers -- need to indicate
6803 * that we ran out of scratch space.
6804 */
6805 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6806 return;
6807 }
6808
6809 mstate->dtms_scratch_ptr += size;
6810 saved = mstate->dtms_scratch_ptr;
6811
6812 /*
6813 * Now get a stack with both program counters and frame pointers.
6814 */
6815 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6816 dtrace_getufpstack(buf, fps, nframes + 1);
6817 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6818
6819 /*
6820 * If that faulted, we're cooked.
6821 */
6822 if (*flags & CPU_DTRACE_FAULT)
6823 goto out;
6824
6825 /*
6826 * Now we want to walk up the stack, calling the USTACK helper. For
6827 * each iteration, we restore the scratch pointer.
6828 */
6829 for (i = 0; i < nframes; i++) {
6830 mstate->dtms_scratch_ptr = saved;
6831
6832 if (offs >= strsize)
6833 break;
6834
6835 sym = (char *)(uintptr_t)dtrace_helper(
6836 DTRACE_HELPER_ACTION_USTACK,
6837 mstate, state, pcs[i], fps[i]);
6838
6839 /*
6840 * If we faulted while running the helper, we're going to
6841 * clear the fault and null out the corresponding string.
6842 */
6843 if (*flags & CPU_DTRACE_FAULT) {
6844 *flags &= ~CPU_DTRACE_FAULT;
6845 str[offs++] = '\0';
6846 continue;
6847 }
6848
6849 if (sym == NULL) {
6850 str[offs++] = '\0';
6851 continue;
6852 }
6853
6854 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6855
6856 /*
6857 * Now copy in the string that the helper returned to us.
6858 */
6859 for (j = 0; offs + j < strsize; j++) {
6860 if ((str[offs + j] = sym[j]) == '\0')
6861 break;
6862 }
6863
6864 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6865
6866 offs += j + 1;
6867 }
6868
6869 if (offs >= strsize) {
6870 /*
6871 * If we didn't have room for all of the strings, we don't
6872 * abort processing -- this needn't be a fatal error -- but we
6873 * still want to increment a counter (dts_stkstroverflows) to
6874 * allow this condition to be warned about. (If this is from
6875 * a jstack() action, it is easily tuned via jstackstrsize.)
6876 */
6877 dtrace_error(&state->dts_stkstroverflows);
6878 }
6879
6880 while (offs < strsize)
6881 str[offs++] = '\0';
6882
6883 out:
6884 mstate->dtms_scratch_ptr = old;
6885 }
6886
6887 __attribute__((noinline))
6888 static void
6889 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6890 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6891 {
6892 volatile uint16_t *flags;
6893 uint64_t val = *valp;
6894 size_t valoffs = *valoffsp;
6895
6896 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6897 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6898
6899 /*
6900 * If this is a string, we're going to only load until we find the zero
6901 * byte -- after which we'll store zero bytes.
6902 */
6903 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6904 char c = '\0' + 1;
6905 size_t s;
6906
6907 for (s = 0; s < size; s++) {
6908 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6909 c = dtrace_load8(val++);
6910 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6911 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6912 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6913 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6914 if (*flags & CPU_DTRACE_FAULT)
6915 break;
6916 }
6917
6918 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6919
6920 if (c == '\0' && intuple)
6921 break;
6922 }
6923 } else {
6924 uint8_t c;
6925 while (valoffs < end) {
6926 if (dtkind == DIF_TF_BYREF) {
6927 c = dtrace_load8(val++);
6928 } else if (dtkind == DIF_TF_BYUREF) {
6929 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6930 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6931 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6932 if (*flags & CPU_DTRACE_FAULT)
6933 break;
6934 }
6935
6936 DTRACE_STORE(uint8_t, tomax,
6937 valoffs++, c);
6938 }
6939 }
6940
6941 *valp = val;
6942 *valoffsp = valoffs;
6943 }
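/*
 * Illustrative user-space analogue of the string path above: bytes are
 * loaded only until the first NUL, after which NULs are stored to pad the
 * record out to its fixed size. Initializing c to '\0' + 1 (any non-NUL
 * value) simply guarantees that the first iteration performs a load. Fault
 * handling and the in-tuple early exit are omitted here.
 */
#if 0
static void
sketch_store_string(char *dst, const char *src, size_t size)
{
	char c = '\0' + 1;		/* non-NUL: force the first load */
	size_t s;

	for (s = 0; s < size; s++) {
		if (c != '\0')
			c = *src++;	/* stop loading once NUL is seen */
		dst[s] = c;		/* ...but keep storing NUL padding */
	}
}
#endif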
6944
6945 /*
6946 * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6947 * defined, we also assert that we are not recursing unless the probe ID is an
6948 * error probe.
6949 */
6950 static dtrace_icookie_t
6951 dtrace_probe_enter(dtrace_id_t id)
6952 {
6953 thread_t thread = current_thread();
6954 uint16_t inprobe;
6955
6956 dtrace_icookie_t cookie;
6957
6958 cookie = dtrace_interrupt_disable();
6959
6960 /*
6961 * Unless this is an ERROR probe, we are not allowed to recurse in
6962 * dtrace_probe(). Recursing into a DTrace probe usually means that a
6963 * function is instrumented that should not have been instrumented or
6964 * that the ordering guarantee of the records will be violated,
6965 * resulting in unexpected output. If there is an exception to this
6966 * assertion, a new case should be added.
6967 */
6968 inprobe = dtrace_get_thread_inprobe(thread);
6969 VERIFY(inprobe == 0 ||
6970 id == dtrace_probeid_error);
6971 ASSERT(inprobe < UINT16_MAX);
6972 dtrace_set_thread_inprobe(thread, inprobe + 1);
6973
6974 return (cookie);
6975 }
6976
6977 /*
6978 * Clears the per-thread inprobe flag and enables interrupts.
6979 */
6980 static void
6981 dtrace_probe_exit(dtrace_icookie_t cookie)
6982 {
6983 thread_t thread = current_thread();
6984 uint16_t inprobe = dtrace_get_thread_inprobe(thread);
6985
6986 ASSERT(inprobe > 0);
6987 dtrace_set_thread_inprobe(thread, inprobe - 1);
6988
6989 #if INTERRUPT_MASKED_DEBUG
6990 ml_spin_debug_reset(thread);
6991 #endif /* INTERRUPT_MASKED_DEBUG */
6992
6993 dtrace_interrupt_enable(cookie);
6994 }
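/*
 * Usage sketch for the enter/exit pair above: the cookie captures the prior
 * interrupt state, so every path out of the bracketed region -- including
 * early returns -- must pass back through dtrace_probe_exit(). This mirrors
 * how dtrace_probe() below uses the pair.
 */
#if 0
	dtrace_icookie_t cookie = dtrace_probe_enter(id);

	if (bail_out_early) {			/* hypothetical condition */
		dtrace_probe_exit(cookie);	/* restore interrupt state */
		return;
	}
	/* ... probe-context work with interrupts disabled ... */
	dtrace_probe_exit(cookie);
#endif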
6995
6996 /*
6997 * If you're looking for the epicenter of DTrace, you just found it. This
6998 * is the function called by the provider to fire a probe -- from which all
6999 * subsequent probe-context DTrace activity emanates.
7000 */
7001 void
7002 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
7003 uint64_t arg2, uint64_t arg3, uint64_t arg4)
7004 {
7005 processorid_t cpuid;
7006 dtrace_icookie_t cookie;
7007 dtrace_probe_t *probe;
7008 dtrace_mstate_t mstate;
7009 dtrace_ecb_t *ecb;
7010 dtrace_action_t *act;
7011 intptr_t offs;
7012 size_t size;
7013 int vtime, onintr;
7014 volatile uint16_t *flags;
7015 hrtime_t now;
7016
7017 cookie = dtrace_probe_enter(id);
7018
7019 /* Ensure that probe id is valid. */
7020 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
7021 dtrace_probe_exit(cookie);
7022 return;
7023 }
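/*
 * Illustrative note on the check above: dtrace_id_t is unsigned, so the
 * single comparison id - 1 >= nprobes rejects both id == 0 (which wraps to
 * the maximum value) and ids beyond the probe table, in one test.
 */
#if 0
static int
sketch_id_valid(uint32_t id, uint32_t nprobes)
{
	return (id - 1 < nprobes);	/* id == 0 wraps, so it fails too */
}
#endif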
7024
7025 probe = dtrace_probes[id - 1];
7026 if (probe == NULL) {
7027 dtrace_probe_exit(cookie);
7028 return;
7029 }
7030
7031 cpuid = CPU->cpu_id;
7032 onintr = CPU_ON_INTR(CPU);
7033
7034 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7035 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
7036 /*
7037 * We have hit in the predicate cache; we know that
7038 * this predicate would evaluate to be false.
7039 */
7040 dtrace_probe_exit(cookie);
7041 return;
7042 }
7043
7044 if (panic_quiesce) {
7045 /*
7046 * We don't trace anything if we're panicking.
7047 */
7048 dtrace_probe_exit(cookie);
7049 return;
7050 }
7051
7052 #if !defined(__APPLE__)
7053 now = dtrace_gethrtime();
7054 vtime = dtrace_vtime_references != 0;
7055
7056 if (vtime && curthread->t_dtrace_start)
7057 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7058 #else
7059 /*
7060 * APPLE NOTE: The time spent entering DTrace and arriving
7061 * at this point is attributed to the current thread.
7062 * Instead it should accrue to DTrace. FIXME
7063 */
7064 vtime = dtrace_vtime_references != 0;
7065
7066 if (vtime)
7067 {
7068 int64_t dtrace_accum_time, recent_vtime;
7069 thread_t thread = current_thread();
7070
7071 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
7072
7073 if (dtrace_accum_time >= 0) {
7074 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
7075
7076 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
7077
7078 dtrace_set_thread_vtime(thread, recent_vtime);
7079 }
7080 }
7081
7082 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
7083 #endif /* __APPLE__ */
7084
7085 /*
7086 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
7087 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
7088 * However the provider has no access to ECB context, so passes
7089 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
7090 * Detect that here and cons up a viable state (from the probe_id).
7091 */
7092 if (dtrace_probeid_error == id && 0 == arg0) {
7093 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
7094 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
7095 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7096
7097 if (NULL != ftp_ecb) {
7098 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7099
7100 arg0 = (uint64_t)(uintptr_t)ftp_state;
7101 arg1 = ftp_ecb->dte_epid;
7102 /*
7103 * args[2-4] established by caller.
7104 */
7105 ftp_state->dts_arg_error_illval = -1; /* arg5 */
7106 }
7107 }
7108
7109 mstate.dtms_difo = NULL;
7110 mstate.dtms_probe = probe;
7111 mstate.dtms_strtok = 0;
7112 mstate.dtms_arg[0] = arg0;
7113 mstate.dtms_arg[1] = arg1;
7114 mstate.dtms_arg[2] = arg2;
7115 mstate.dtms_arg[3] = arg3;
7116 mstate.dtms_arg[4] = arg4;
7117
7118 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7119
7120 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7121 dtrace_predicate_t *pred = ecb->dte_predicate;
7122 dtrace_state_t *state = ecb->dte_state;
7123 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7124 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7125 dtrace_vstate_t *vstate = &state->dts_vstate;
7126 dtrace_provider_t *prov = probe->dtpr_provider;
7127 uint64_t tracememsize = 0;
7128 int committed = 0;
7129 caddr_t tomax;
7130
7131 /*
7132 * A little subtlety with the following (seemingly innocuous)
7133 * declaration of the automatic 'val': by looking at the
7134 * code, you might think that it could be declared in the
7135 * action processing loop, below. (That is, it's only used in
7136 * the action processing loop.) However, it must be declared
7137 * out of that scope because in the case of DIF expression
7138 * arguments to aggregating actions, one iteration of the
7139 * action loop will use the last iteration's value.
7140 */
7141 #ifdef lint
7142 uint64_t val = 0;
7143 #else
7144 uint64_t val = 0;
7145 #endif
7146
7147 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7148 *flags &= ~CPU_DTRACE_ERROR;
7149
7150 if (prov == dtrace_provider) {
7151 /*
7152 * If dtrace itself is the provider of this probe,
7153 * we're only going to continue processing the ECB if
7154 * arg0 (the dtrace_state_t) is equal to the ECB's
7155 * creating state. (This prevents disjoint consumers
7156 * from seeing one another's metaprobes.)
7157 */
7158 if (arg0 != (uint64_t)(uintptr_t)state)
7159 continue;
7160 }
7161
7162 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7163 /*
7164 * We're not currently active. If our provider isn't
7165 * the dtrace pseudo provider, we're not interested.
7166 */
7167 if (prov != dtrace_provider)
7168 continue;
7169
7170 /*
7171 * Now we must further check if we are in the BEGIN
7172 * probe. If we are, we will only continue processing
7173 * if we're still in WARMUP -- if one BEGIN enabling
7174 * has invoked the exit() action, we don't want to
7175 * evaluate subsequent BEGIN enablings.
7176 */
7177 if (probe->dtpr_id == dtrace_probeid_begin &&
7178 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7179 ASSERT(state->dts_activity ==
7180 DTRACE_ACTIVITY_DRAINING);
7181 continue;
7182 }
7183 }
7184
7185 if (ecb->dte_cond) {
7186 /*
7187 * If the dte_cond bits indicate that this
7188 * consumer is only allowed to see user-mode firings
7189 * of this probe, call the provider's dtps_usermode()
7190 * entry point to check that the probe was fired
7191 * while in a user context. Skip this ECB if that's
7192 * not the case.
7193 */
7194 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7195 prov->dtpv_pops.dtps_usermode &&
7196 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7197 probe->dtpr_id, probe->dtpr_arg) == 0)
7198 continue;
7199
7200 /*
7201 * This is more subtle than it looks. We have to be
7202 * absolutely certain that CRED() isn't going to
7203 * change out from under us so it's only legit to
7204 * examine that structure if we're in constrained
7205 * situations. Currently, the only time we'll do this
7206 * check is if a non-super-user has enabled the
7207 * profile or syscall providers -- providers that
7208 * allow visibility of all processes. For the
7209 * profile case, the check above will ensure that
7210 * we're examining a user context.
7211 */
7212 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7213 cred_t *cr;
7214 cred_t *s_cr =
7215 ecb->dte_state->dts_cred.dcr_cred;
7216 proc_t *proc;
7217 #pragma unused(proc) /* __APPLE__ */
7218
7219 ASSERT(s_cr != NULL);
7220
7221 /*
7222 * XXX this is hackish, but so is setting a variable
7223 * XXX in a McCarthy OR...
7224 */
7225 if ((cr = dtrace_CRED()) == NULL ||
7226 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
7227 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
7228 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
7229 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
7230 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
7231 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
7232 #if !defined(__APPLE__)
7233 (proc = ttoproc(curthread)) == NULL ||
7234 (proc->p_flag & SNOCD))
7235 #else
7236 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
7237 #endif /* __APPLE__ */
7238 continue;
7239 }
7240
7241 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7242 cred_t *cr;
7243 cred_t *s_cr =
7244 ecb->dte_state->dts_cred.dcr_cred;
7245 #pragma unused(cr, s_cr) /* __APPLE__ */
7246
7247 ASSERT(s_cr != NULL);
7248
7249 #if !defined(__APPLE__)
7250 if ((cr = CRED()) == NULL ||
7251 s_cr->cr_zone->zone_id !=
7252 cr->cr_zone->zone_id)
7253 continue;
7254 #else
7255 /* APPLE NOTE: Darwin doesn't do zones. */
7256 #endif /* __APPLE__ */
7257 }
7258 }
7259
7260 if (now - state->dts_alive > dtrace_deadman_timeout) {
7261 /*
7262 * We seem to be dead. Unless we (a) have kernel
7263 * destructive permissions, (b) have explicitly enabled
7264 * destructive actions, and (c) destructive actions have
7265 * not been disabled, we're going to transition into
7266 * the KILLED state, from which no further processing
7267 * on this state will be performed.
7268 */
7269 if (!dtrace_priv_kernel_destructive(state) ||
7270 !state->dts_cred.dcr_destructive ||
7271 dtrace_destructive_disallow) {
7272 void *activity = &state->dts_activity;
7273 dtrace_activity_t current;
7274
7275 do {
7276 current = state->dts_activity;
7277 } while (dtrace_cas32(activity, current,
7278 DTRACE_ACTIVITY_KILLED) != current);
7279
7280 continue;
7281 }
7282 }
7283
7284 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7285 ecb->dte_alignment, state, &mstate)) < 0)
7286 continue;
7287
7288 tomax = buf->dtb_tomax;
7289 ASSERT(tomax != NULL);
7290
7291 /*
7292 * Build and store the record header corresponding to the ECB.
7293 */
7294 if (ecb->dte_size != 0) {
7295 dtrace_rechdr_t dtrh;
7296
7297 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7298 mstate.dtms_timestamp = dtrace_gethrtime();
7299 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7300 }
7301
7302 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7303
7304 dtrh.dtrh_epid = ecb->dte_epid;
7305 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7306 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7307 }
7308
7309 mstate.dtms_epid = ecb->dte_epid;
7310 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7311
7312 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7313 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7314 else
7315 mstate.dtms_access = 0;
7316
7317 if (pred != NULL) {
7318 dtrace_difo_t *dp = pred->dtp_difo;
7319 uint64_t rval;
7320
7321 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7322
7323 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7324 dtrace_cacheid_t cid = probe->dtpr_predcache;
7325
7326 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7327 /*
7328 * Update the predicate cache...
7329 */
7330 ASSERT(cid == pred->dtp_cacheid);
7331
7332 dtrace_set_thread_predcache(current_thread(), cid);
7333 }
7334
7335 continue;
7336 }
7337 }
7338
7339 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7340 act != NULL; act = act->dta_next) {
7341 size_t valoffs;
7342 dtrace_difo_t *dp;
7343 dtrace_recdesc_t *rec = &act->dta_rec;
7344
7345 size = rec->dtrd_size;
7346 valoffs = offs + rec->dtrd_offset;
7347
7348 if (DTRACEACT_ISAGG(act->dta_kind)) {
7349 uint64_t v = 0xbad;
7350 dtrace_aggregation_t *agg;
7351
7352 agg = (dtrace_aggregation_t *)act;
7353
7354 if ((dp = act->dta_difo) != NULL)
7355 v = dtrace_dif_emulate(dp,
7356 &mstate, vstate, state);
7357
7358 if (*flags & CPU_DTRACE_ERROR)
7359 continue;
7360
7361 /*
7362 * Note that we always pass the expression
7363 * value from the previous iteration of the
7364 * action loop. This value will only be used
7365 * if there is an expression argument to the
7366 * aggregating action, denoted by the
7367 * dtag_hasarg field.
7368 */
7369 dtrace_aggregate(agg, buf,
7370 offs, aggbuf, v, val);
7371 continue;
7372 }
7373
7374 switch (act->dta_kind) {
7375 case DTRACEACT_STOP:
7376 if (dtrace_priv_proc_destructive(state))
7377 dtrace_action_stop();
7378 continue;
7379
7380 case DTRACEACT_BREAKPOINT:
7381 if (dtrace_priv_kernel_destructive(state))
7382 dtrace_action_breakpoint(ecb);
7383 continue;
7384
7385 case DTRACEACT_PANIC:
7386 if (dtrace_priv_kernel_destructive(state))
7387 dtrace_action_panic(ecb);
7388 continue;
7389
7390 case DTRACEACT_STACK:
7391 if (!dtrace_priv_kernel(state))
7392 continue;
7393
7394 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7395 size / sizeof (pc_t), probe->dtpr_aframes,
7396 DTRACE_ANCHORED(probe) ? NULL :
7397 (uint32_t *)(uintptr_t)arg0);
7398 continue;
7399
7400 case DTRACEACT_JSTACK:
7401 case DTRACEACT_USTACK:
7402 if (!dtrace_priv_proc(state))
7403 continue;
7404
7405 /*
7406 * See comment in DIF_VAR_PID.
7407 */
7408 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7409 CPU_ON_INTR(CPU)) {
7410 int depth = DTRACE_USTACK_NFRAMES(
7411 rec->dtrd_arg) + 1;
7412
7413 dtrace_bzero((void *)(tomax + valoffs),
7414 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7415 + depth * sizeof (uint64_t));
7416
7417 continue;
7418 }
7419
7420 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7421 curproc->p_dtrace_helpers != NULL) {
7422 /*
7423 * This is the slow path -- we have
7424 * allocated string space, and we're
7425 * getting the stack of a process that
7426 * has helpers. Call into a separate
7427 * routine to perform this processing.
7428 */
7429 dtrace_action_ustack(&mstate, state,
7430 (uint64_t *)(tomax + valoffs),
7431 rec->dtrd_arg);
7432 continue;
7433 }
7434
7435 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7436 dtrace_getupcstack((uint64_t *)
7437 (tomax + valoffs),
7438 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7439 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7440 continue;
7441
7442 default:
7443 break;
7444 }
7445
7446 dp = act->dta_difo;
7447 ASSERT(dp != NULL);
7448
7449 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7450
7451 if (*flags & CPU_DTRACE_ERROR)
7452 continue;
7453
7454 switch (act->dta_kind) {
7455 case DTRACEACT_SPECULATE: {
7456 dtrace_rechdr_t *dtrh = NULL;
7457
7458 ASSERT(buf == &state->dts_buffer[cpuid]);
7459 buf = dtrace_speculation_buffer(state,
7460 cpuid, val);
7461
7462 if (buf == NULL) {
7463 *flags |= CPU_DTRACE_DROP;
7464 continue;
7465 }
7466
7467 offs = dtrace_buffer_reserve(buf,
7468 ecb->dte_needed, ecb->dte_alignment,
7469 state, NULL);
7470
7471 if (offs < 0) {
7472 *flags |= CPU_DTRACE_DROP;
7473 continue;
7474 }
7475
7476 tomax = buf->dtb_tomax;
7477 ASSERT(tomax != NULL);
7478
7479 if (ecb->dte_size == 0)
7480 continue;
7481
7482 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7483 dtrh = ((void *)(tomax + offs));
7484 dtrh->dtrh_epid = ecb->dte_epid;
7485
7486 /*
7487 * When the speculation is committed, all of
7488 * the records in the speculative buffer will
7489 * have their timestamps set to the commit
7490 * time. Until then, it is set to a sentinel
7491 * value, for debuggability.
7492 */
7493 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7494
7495 continue;
7496 }
7497
7498 case DTRACEACT_CHILL:
7499 if (dtrace_priv_kernel_destructive(state))
7500 dtrace_action_chill(&mstate, val);
7501 continue;
7502
7503 case DTRACEACT_RAISE:
7504 if (dtrace_priv_proc_destructive(state))
7505 dtrace_action_raise(val);
7506 continue;
7507
7508 case DTRACEACT_PIDRESUME: /* __APPLE__ */
7509 if (dtrace_priv_proc_destructive(state))
7510 dtrace_action_pidresume(val);
7511 continue;
7512
7513 case DTRACEACT_COMMIT:
7514 ASSERT(!committed);
7515
7516 /*
7517 * We need to commit our buffer state.
7518 */
7519 if (ecb->dte_size)
7520 buf->dtb_offset = offs + ecb->dte_size;
7521 buf = &state->dts_buffer[cpuid];
7522 dtrace_speculation_commit(state, cpuid, val);
7523 committed = 1;
7524 continue;
7525
7526 case DTRACEACT_DISCARD:
7527 dtrace_speculation_discard(state, cpuid, val);
7528 continue;
7529
7530 case DTRACEACT_DIFEXPR:
7531 case DTRACEACT_LIBACT:
7532 case DTRACEACT_PRINTF:
7533 case DTRACEACT_PRINTA:
7534 case DTRACEACT_SYSTEM:
7535 case DTRACEACT_FREOPEN:
7536 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
7537 case DTRACEACT_TRACEMEM:
7538 break;
7539
7540 case DTRACEACT_TRACEMEM_DYNSIZE:
7541 tracememsize = val;
7542 break;
7543
7544 case DTRACEACT_SYM:
7545 case DTRACEACT_MOD:
7546 if (!dtrace_priv_kernel(state))
7547 continue;
7548 break;
7549
7550 case DTRACEACT_USYM:
7551 case DTRACEACT_UMOD:
7552 case DTRACEACT_UADDR: {
7553 if (!dtrace_priv_proc(state))
7554 continue;
7555
7556 DTRACE_STORE(uint64_t, tomax,
7557 valoffs, (uint64_t)dtrace_proc_selfpid());
7558 DTRACE_STORE(uint64_t, tomax,
7559 valoffs + sizeof (uint64_t), val);
7560
7561 continue;
7562 }
7563
7564 case DTRACEACT_EXIT: {
7565 /*
7566 * For the exit action, we are going to attempt
7567 * to atomically set our activity to be
7568 * draining. If this fails (either because
7569 * another CPU has beat us to the exit action,
7570 * or because our current activity is something
7571 * other than ACTIVE or WARMUP), we will
7572 * continue. This assures that the exit action
7573 * can be successfully recorded at most once
7574 * when we're in the ACTIVE state. If we're
7575 * encountering the exit() action while in
7576 * COOLDOWN, however, we want to honor the new
7577 * status code. (We know that we're the only
7578 * thread in COOLDOWN, so there is no race.)
7579 */
7580 void *activity = &state->dts_activity;
7581 dtrace_activity_t current = state->dts_activity;
7582
7583 if (current == DTRACE_ACTIVITY_COOLDOWN)
7584 break;
7585
7586 if (current != DTRACE_ACTIVITY_WARMUP)
7587 current = DTRACE_ACTIVITY_ACTIVE;
7588
7589 if (dtrace_cas32(activity, current,
7590 DTRACE_ACTIVITY_DRAINING) != current) {
7591 *flags |= CPU_DTRACE_DROP;
7592 continue;
7593 }
7594
7595 break;
7596 }
7597
7598 default:
7599 ASSERT(0);
7600 }
7601
7602 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
7603 uintptr_t end = valoffs + size;
7604
7605 if (tracememsize != 0 &&
7606 valoffs + tracememsize < end)
7607 {
7608 end = valoffs + tracememsize;
7609 tracememsize = 0;
7610 }
7611
7612 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7613 !dtrace_vcanload((void *)(uintptr_t)val,
7614 &dp->dtdo_rtype, NULL, &mstate, vstate))
7615 {
7616 continue;
7617 }
7618
7619 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7620 &val, end, act->dta_intuple,
7621 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7622 DIF_TF_BYREF: DIF_TF_BYUREF);
7623
7624 continue;
7625 }
7626
7627 switch (size) {
7628 case 0:
7629 break;
7630
7631 case sizeof (uint8_t):
7632 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7633 break;
7634 case sizeof (uint16_t):
7635 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7636 break;
7637 case sizeof (uint32_t):
7638 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7639 break;
7640 case sizeof (uint64_t):
7641 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7642 break;
7643 default:
7644 /*
7645 * Any other size should have been returned by
7646 * reference, not by value.
7647 */
7648 ASSERT(0);
7649 break;
7650 }
7651 }
7652
7653 if (*flags & CPU_DTRACE_DROP)
7654 continue;
7655
7656 if (*flags & CPU_DTRACE_FAULT) {
7657 int ndx;
7658 dtrace_action_t *err;
7659
7660 buf->dtb_errors++;
7661
7662 if (probe->dtpr_id == dtrace_probeid_error) {
7663 /*
7664 * There's nothing we can do -- we had an
7665 * error on the error probe. We bump an
7666 * error counter to at least indicate that
7667 * this condition happened.
7668 */
7669 dtrace_error(&state->dts_dblerrors);
7670 continue;
7671 }
7672
7673 if (vtime) {
7674 /*
7675 * Before recursing on dtrace_probe(), we
7676 * need to explicitly clear out our start
7677 * time to prevent it from being accumulated
7678 * into t_dtrace_vtime.
7679 */
7680
7681 /*
7682 * Darwin sets the sign bit on t_dtrace_tracing
7683 * to suspend accumulation to it.
7684 */
7685 dtrace_set_thread_tracing(current_thread(),
7686 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7687
7688 }
7689
7690 /*
7691 * Iterate over the actions to figure out which action
7692 * we were processing when we experienced the error.
7693 * Note that act points _past_ the faulting action; if
7694 * act is ecb->dte_action, the fault was in the
7695 * predicate, if it's ecb->dte_action->dta_next it's
7696 * in action #1, and so on.
7697 */
7698 for (err = ecb->dte_action, ndx = 0;
7699 err != act; err = err->dta_next, ndx++)
7700 continue;
7701
7702 dtrace_probe_error(state, ecb->dte_epid, ndx,
7703 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7704 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7705 cpu_core[cpuid].cpuc_dtrace_illval);
7706
7707 continue;
7708 }
7709
7710 if (!committed)
7711 buf->dtb_offset = offs + ecb->dte_size;
7712 }
7713
7714 /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti is attributed
7715 to the current thread. Instead it should accrue to DTrace. */
7716 if (vtime) {
7717 thread_t thread = current_thread();
7718 int64_t t = dtrace_get_thread_tracing(thread);
7719
7720 if (t >= 0) {
7721 /* Usual case, accumulate time spent here into t_dtrace_tracing */
7722 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7723 } else {
7724 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7725 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7726 }
7727 }
7728
7729 dtrace_probe_exit(cookie);
7730 }
7731
7732 /*
7733 * DTrace Probe Hashing Functions
7734 *
7735 * The functions in this section (and indeed, the functions in the remaining
7736 * sections) are not _called_ from probe context. (Any exceptions to this are
7737 * marked with a "Note:".) Rather, they are called from elsewhere in the
7738 * DTrace framework to look up probes in, add probes to, and remove probes from
7739 * the DTrace probe hashes. (Each probe is hashed by each element of the
7740 * probe tuple -- allowing for fast lookups, regardless of what was
7741 * specified.)
7742 */
7743 static uint_t
7744 dtrace_hash_str(const char *p)
7745 {
7746 unsigned int g;
7747 uint_t hval = 0;
7748
7749 while (*p) {
7750 hval = (hval << 4) + *p++;
7751 if ((g = (hval & 0xf0000000)) != 0)
7752 hval ^= g >> 24;
7753 hval &= ~g;
7754 }
7755 return (hval);
7756 }
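/*
 * Usage sketch: dtrace_hash_str() is a PJW/ELF-style string hash -- each
 * character is shifted into the low nibbles and the top nibble is folded
 * back in so long strings keep mixing. A bucket index is then just the hash
 * masked by the (power-of-two) table size:
 */
#if 0
	uint_t hval = dtrace_hash_str("fbt");	/* equal strings, equal hash */
	int ndx = hval & hash->dth_mask;	/* dth_mask == dth_size - 1 */
#endif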
7757
7758 static const char*
7759 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7760 {
7761 #pragma unused(offs)
7762 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7763 return probe->dtpr_provider->dtpv_name;
7764 }
7765
7766 static const char*
7767 dtrace_strkey_offset(void *elm, uintptr_t offs)
7768 {
7769 return ((char *)((uintptr_t)(elm) + offs));
7770 }
7771
7772 static const char*
7773 dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
7774 {
7775 return *((char **)((uintptr_t)(elm) + offs));
7776 }
7777
7778 static dtrace_hash_t *
7779 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7780 {
7781 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7782
7783 hash->dth_getstr = func;
7784 hash->dth_stroffs = arg;
7785 hash->dth_nextoffs = nextoffs;
7786 hash->dth_prevoffs = prevoffs;
7787
7788 hash->dth_size = 1;
7789 hash->dth_mask = hash->dth_size - 1;
7790
7791 hash->dth_tab = kmem_zalloc(hash->dth_size *
7792 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7793
7794 return (hash);
7795 }
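/*
 * Usage sketch for dtrace_hash_create(): the hash is generic over the
 * element type -- the caller supplies a key-extraction function plus the
 * byte offsets of the element's embedded next/prev links. Field names below
 * are assumed from dtrace_probe_t for illustration; the real creation calls
 * live in the driver attach path.
 */
#if 0
	dtrace_hash_t *byfunc = dtrace_hash_create(
	    dtrace_strkey_deref_offset,			/* key is a char * field */
	    offsetof(dtrace_probe_t, dtpr_func),
	    offsetof(dtrace_probe_t, dtpr_nextfunc),	/* embedded chain links */
	    offsetof(dtrace_probe_t, dtpr_prevfunc));
#endif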
7796
7797 /*
7798 * APPLE NOTE: dtrace_hash_destroy is not used.
7799 * It is called by dtrace_detach which is not
7800 * currently implemented. Revisit someday.
7801 */
7802 #if !defined(__APPLE__)
7803 static void
7804 dtrace_hash_destroy(dtrace_hash_t *hash)
7805 {
7806 #if DEBUG
7807 int i;
7808
7809 for (i = 0; i < hash->dth_size; i++)
7810 ASSERT(hash->dth_tab[i] == NULL);
7811 #endif
7812
7813 kmem_free(hash->dth_tab,
7814 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7815 kmem_free(hash, sizeof (dtrace_hash_t));
7816 }
7817 #endif /* __APPLE__ */
7818
7819 static void
7820 dtrace_hash_resize(dtrace_hash_t *hash)
7821 {
7822 int size = hash->dth_size, i, ndx;
7823 int new_size = hash->dth_size << 1;
7824 int new_mask = new_size - 1;
7825 dtrace_hashbucket_t **new_tab, *bucket, *next;
7826
7827 ASSERT((new_size & new_mask) == 0);
7828
7829 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7830
7831 for (i = 0; i < size; i++) {
7832 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7833 void *elm = bucket->dthb_chain;
7834
7835 ASSERT(elm != NULL);
7836 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7837
7838 next = bucket->dthb_next;
7839 bucket->dthb_next = new_tab[ndx];
7840 new_tab[ndx] = bucket;
7841 }
7842 }
7843
7844 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7845 hash->dth_tab = new_tab;
7846 hash->dth_size = new_size;
7847 hash->dth_mask = new_mask;
7848 }
7849
7850 static void
7851 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7852 {
7853 int hashval = DTRACE_HASHSTR(hash, new);
7854 int ndx = hashval & hash->dth_mask;
7855 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7856 void **nextp, **prevp;
7857
7858 for (; bucket != NULL; bucket = bucket->dthb_next) {
7859 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7860 goto add;
7861 }
7862
7863 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7864 dtrace_hash_resize(hash);
7865 dtrace_hash_add(hash, new);
7866 return;
7867 }
7868
7869 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7870 bucket->dthb_next = hash->dth_tab[ndx];
7871 hash->dth_tab[ndx] = bucket;
7872 hash->dth_nbuckets++;
7873
7874 add:
7875 nextp = DTRACE_HASHNEXT(hash, new);
7876 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7877 *nextp = bucket->dthb_chain;
7878
7879 if (bucket->dthb_chain != NULL) {
7880 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7881 ASSERT(*prevp == NULL);
7882 *prevp = new;
7883 }
7884
7885 bucket->dthb_chain = new;
7886 bucket->dthb_len++;
7887 }
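/*
 * Note the two-level structure that dtrace_hash_add() maintains: the
 * chain of dtrace_hashbucket_t's at a given index holds buckets with
 * distinct keys, while elements whose keys compare equal share one
 * bucket and are linked through their intrusive next/prev pointers.
 * Callers therefore walk all elements with a given key using the
 * idiom below, as dtrace_strref() and dtrace_match() do:
 *
 *	for (elm = dtrace_hash_lookup(hash, template); elm != NULL;
 *	    elm = *(DTRACE_HASHNEXT(hash, elm)))
 *		...;
 */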
7888
7889 static void *
7890 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7891 {
7892 int hashval = dtrace_hash_str(str);
7893 int ndx = hashval & hash->dth_mask;
7894 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7895
7896 for (; bucket != NULL; bucket = bucket->dthb_next) {
7897 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7898 return (bucket->dthb_chain);
7899 }
7900
7901 return (NULL);
7902 }
7903
7904 static dtrace_probe_t *
7905 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7906 {
7907 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7908 }
7909
7910 static int
7911 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7912 {
7913 int hashval = DTRACE_HASHSTR(hash, template);
7914 int ndx = hashval & hash->dth_mask;
7915 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7916
7917 for (; bucket != NULL; bucket = bucket->dthb_next) {
7918 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7919 return (bucket->dthb_len);
7920 }
7921
7922 return (0);
7923 }
7924
7925 static void
7926 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7927 {
7928 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7929 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7930
7931 void **prevp = DTRACE_HASHPREV(hash, elm);
7932 void **nextp = DTRACE_HASHNEXT(hash, elm);
7933
7934 /*
7935 * Find the bucket that we're removing this elm from.
7936 */
7937 for (; bucket != NULL; bucket = bucket->dthb_next) {
7938 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7939 break;
7940 }
7941
7942 ASSERT(bucket != NULL);
7943
7944 if (*prevp == NULL) {
7945 if (*nextp == NULL) {
7946 /*
7947 * The removed element was the only element on this
7948 * bucket; we need to remove the bucket.
7949 */
7950 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7951
7952 ASSERT(bucket->dthb_chain == elm);
7953 ASSERT(b != NULL);
7954
7955 if (b == bucket) {
7956 hash->dth_tab[ndx] = bucket->dthb_next;
7957 } else {
7958 while (b->dthb_next != bucket)
7959 b = b->dthb_next;
7960 b->dthb_next = bucket->dthb_next;
7961 }
7962
7963 ASSERT(hash->dth_nbuckets > 0);
7964 hash->dth_nbuckets--;
7965 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7966 return;
7967 }
7968
7969 bucket->dthb_chain = *nextp;
7970 } else {
7971 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7972 }
7973
7974 if (*nextp != NULL)
7975 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7976 }
7977
7978 /*
7979 * DTrace Utility Functions
7980 *
7981 * These are random utility functions that are _not_ called from probe context.
7982 */
7983 static int
7984 dtrace_badattr(const dtrace_attribute_t *a)
7985 {
7986 return (a->dtat_name > DTRACE_STABILITY_MAX ||
7987 a->dtat_data > DTRACE_STABILITY_MAX ||
7988 a->dtat_class > DTRACE_CLASS_MAX);
7989 }
7990
7991 /*
7992 * Returns a dtrace-managed copy of a string, and will
7993 * deduplicate copies of the same string.
7994 * If the specified string is NULL, returns an empty string
7995 */
7996 static char *
7997 dtrace_strref(const char *str)
7998 {
7999 dtrace_string_t *s = NULL;
8000 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
8001
8002 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8003
8004 if (str == NULL)
8005 str = "";
8006
8007 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
8008 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8009 if (strncmp(str, s->dtst_str, bufsize) != 0) {
8010 continue;
8011 }
8012 ASSERT(s->dtst_refcount != UINT32_MAX);
8013 s->dtst_refcount++;
8014 return s->dtst_str;
8015 }
8016
8017 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
8018 s->dtst_refcount = 1;
8019 (void) strlcpy(s->dtst_str, str, bufsize);
8020
8021 dtrace_hash_add(dtrace_strings, s);
8022
8023 return s->dtst_str;
8024 }
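/*
 * Because strings are interned, two references to the same text yield
 * the same pointer; this is what allows dtrace_match_string() (below)
 * to compare candidates by pointer equality rather than by contents.
 * Every dtrace_strref() must eventually be balanced by a
 * dtrace_strunref() of the returned pointer; the string is freed when
 * its refcount drops to zero:
 *
 *	char *name = dtrace_strref("fbt");
 *	...
 *	dtrace_strunref(name);
 */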
8025
8026 static void
8027 dtrace_strunref(const char *str)
8028 {
8029 ASSERT(str != NULL);
8030 dtrace_string_t *s = NULL;
8031 size_t bufsize = strlen(str) + 1;
8032
8033 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8034
8035 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
8036 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8037 if (strncmp(str, s->dtst_str, bufsize) != 0) {
8038 continue;
8039 }
8040 ASSERT(s->dtst_refcount != 0);
8041 s->dtst_refcount--;
8042 if (s->dtst_refcount == 0) {
8043 dtrace_hash_remove(dtrace_strings, s);
8044 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
8045 }
8046 return;
8047 }
8048 panic("attempt to unref non-existent string %s", str);
8049 }
8050
8051 #define DTRACE_ISALPHA(c) \
8052 (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
8053
8054 static int
8055 dtrace_badname(const char *s)
8056 {
8057 char c;
8058
8059 if (s == NULL || (c = *s++) == '\0')
8060 return (0);
8061
8062 if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
8063 return (1);
8064
8065 while ((c = *s++) != '\0') {
8066 if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
8067 c != '-' && c != '_' && c != '.' && c != '`')
8068 return (1);
8069 }
8070
8071 return (0);
8072 }
8073
8074 static void
8075 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8076 {
8077 uint32_t priv;
8078
8079 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8080 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
8081 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
8082 }
8083 else {
8084 priv = DTRACE_PRIV_ALL;
8085 }
8086 *uidp = 0;
8087 *zoneidp = 0;
8088 } else {
8089 *uidp = crgetuid(cr);
8090 *zoneidp = crgetzoneid(cr);
8091
8092 priv = 0;
8093 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8094 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8095 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8096 priv |= DTRACE_PRIV_USER;
8097 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8098 priv |= DTRACE_PRIV_PROC;
8099 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8100 priv |= DTRACE_PRIV_OWNER;
8101 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8102 priv |= DTRACE_PRIV_ZONEOWNER;
8103 }
8104
8105 *privp = priv;
8106 }
8107
8108 #ifdef DTRACE_ERRDEBUG
8109 static void
8110 dtrace_errdebug(const char *str)
8111 {
8112 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8113 int occupied = 0;
8114
8115 lck_mtx_lock(&dtrace_errlock);
8116 dtrace_errlast = str;
8117 dtrace_errthread = (kthread_t *)current_thread();
8118
8119 while (occupied++ < DTRACE_ERRHASHSZ) {
8120 if (dtrace_errhash[hval].dter_msg == str) {
8121 dtrace_errhash[hval].dter_count++;
8122 goto out;
8123 }
8124
8125 if (dtrace_errhash[hval].dter_msg != NULL) {
8126 hval = (hval + 1) % DTRACE_ERRHASHSZ;
8127 continue;
8128 }
8129
8130 dtrace_errhash[hval].dter_msg = str;
8131 dtrace_errhash[hval].dter_count = 1;
8132 goto out;
8133 }
8134
8135 panic("dtrace: undersized error hash");
8136 out:
8137 lck_mtx_unlock(&dtrace_errlock);
8138 }
8139 #endif
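/*
 * The error hash above is open-addressed: a slot is chosen by hashing
 * the message text, equality is tested by pointer (callers pass
 * compile-time constant strings, so recurrences of a message share an
 * address), and collisions are resolved by linear probing.  A typical
 * call simply counts another occurrence of a recurring message:
 *
 *	dtrace_errdebug("invalid register %u\n");
 */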
8140
8141 /*
8142 * DTrace Matching Functions
8143 *
8144 * These functions are used to match groups of probes, given some elements of
8145 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8146 */
8147 static int
8148 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8149 zoneid_t zoneid)
8150 {
8151 if (priv != DTRACE_PRIV_ALL) {
8152 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8153 uint32_t match = priv & ppriv;
8154
8155 /*
8156 * No PRIV_DTRACE_* privileges...
8157 */
8158 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8159 DTRACE_PRIV_KERNEL)) == 0)
8160 return (0);
8161
8162 /*
8163 * No matching bits, but there were bits to match...
8164 */
8165 if (match == 0 && ppriv != 0)
8166 return (0);
8167
8168 /*
8169 * Need to have permissions to the process, but don't...
8170 */
8171 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8172 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8173 return (0);
8174 }
8175
8176 /*
8177 * Need to be in the same zone unless we possess the
8178 * privilege to examine all zones.
8179 */
8180 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8181 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8182 return (0);
8183 }
8184 }
8185
8186 return (1);
8187 }
8188
8189 /*
8190 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8191 * consists of input pattern strings and an ops-vector to evaluate them.
8192 * This function returns >0 for match, 0 for no match, and <0 for error.
8193 */
8194 static int
8195 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8196 uint32_t priv, uid_t uid, zoneid_t zoneid)
8197 {
8198 dtrace_provider_t *pvp = prp->dtpr_provider;
8199 int rv;
8200
8201 if (pvp->dtpv_defunct)
8202 return (0);
8203
8204 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8205 return (rv);
8206
8207 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8208 return (rv);
8209
8210 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8211 return (rv);
8212
8213 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8214 return (rv);
8215
8216 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8217 return (0);
8218
8219 return (rv);
8220 }
8221
8222 /*
8223 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8224 * interface for matching a glob pattern 'p' to an input string 's'. Unlike
8225 * libc's version, the kernel version only applies to 8-bit ASCII strings.
8226 * In addition, all of the recursion cases except for '*' matching have been
8227 * unwound. For '*', we still implement recursive evaluation, but a depth
8228 * counter is maintained and matching is aborted if we recurse too deep.
8229 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8230 */
8231 static int
8232 dtrace_match_glob(const char *s, const char *p, int depth)
8233 {
8234 const char *olds;
8235 char s1, c;
8236 int gs;
8237
8238 if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8239 return (-1);
8240
8241 if (s == NULL)
8242 s = ""; /* treat NULL as empty string */
8243
8244 top:
8245 olds = s;
8246 s1 = *s++;
8247
8248 if (p == NULL)
8249 return (0);
8250
8251 if ((c = *p++) == '\0')
8252 return (s1 == '\0');
8253
8254 switch (c) {
8255 case '[': {
8256 int ok = 0, notflag = 0;
8257 char lc = '\0';
8258
8259 if (s1 == '\0')
8260 return (0);
8261
8262 if (*p == '!') {
8263 notflag = 1;
8264 p++;
8265 }
8266
8267 if ((c = *p++) == '\0')
8268 return (0);
8269
8270 do {
8271 if (c == '-' && lc != '\0' && *p != ']') {
8272 if ((c = *p++) == '\0')
8273 return (0);
8274 if (c == '\\' && (c = *p++) == '\0')
8275 return (0);
8276
8277 if (notflag) {
8278 if (s1 < lc || s1 > c)
8279 ok++;
8280 else
8281 return (0);
8282 } else if (lc <= s1 && s1 <= c)
8283 ok++;
8284
8285 } else if (c == '\\' && (c = *p++) == '\0')
8286 return (0);
8287
8288 lc = c; /* save left-hand 'c' for next iteration */
8289
8290 if (notflag) {
8291 if (s1 != c)
8292 ok++;
8293 else
8294 return (0);
8295 } else if (s1 == c)
8296 ok++;
8297
8298 if ((c = *p++) == '\0')
8299 return (0);
8300
8301 } while (c != ']');
8302
8303 if (ok)
8304 goto top;
8305
8306 return (0);
8307 }
8308
8309 case '\\':
8310 if ((c = *p++) == '\0')
8311 return (0);
8312 OS_FALLTHROUGH;
8313
8314 default:
8315 if (c != s1)
8316 return (0);
8317 OS_FALLTHROUGH;
8318
8319 case '?':
8320 if (s1 != '\0')
8321 goto top;
8322 return (0);
8323
8324 case '*':
8325 while (*p == '*')
8326 p++; /* consecutive *'s are identical to a single one */
8327
8328 if (*p == '\0')
8329 return (1);
8330
8331 for (s = olds; *s != '\0'; s++) {
8332 if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8333 return (gs);
8334 }
8335
8336 return (0);
8337 }
8338 }
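/*
 * Some illustrative evaluations of the matcher above:
 *
 *	dtrace_match_glob("read", "read", 0)	returns 1  (literal)
 *	dtrace_match_glob("readv", "read*", 0)	returns 1  ('*' suffix)
 *	dtrace_match_glob("read", "r??d", 0)	returns 1  ('?' wildcards)
 *	dtrace_match_glob("read", "[a-z]*", 0)	returns 1  (char class)
 *	dtrace_match_glob("write", "read*", 0)	returns 0  (no match)
 */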
8339
8340 /*ARGSUSED*/
8341 static int
8342 dtrace_match_string(const char *s, const char *p, int depth)
8343 {
8344 #pragma unused(depth) /* __APPLE__ */
8345 return (s != NULL && s == p);
8346 }
8347
8348 /*ARGSUSED*/
8349 static int
8350 dtrace_match_module(const char *s, const char *p, int depth)
8351 {
8352 #pragma unused(depth) /* __APPLE__ */
8353 size_t len;
8354 if (s == NULL || p == NULL)
8355 return (0);
8356
8357 len = strlen(p);
8358
8359 if (strncmp(p, s, len) != 0)
8360 return (0);
8361
8362 if (s[len] == '.' || s[len] == '\0')
8363 return (1);
8364
8365 return (0);
8366 }
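/*
 * Module matching is exact up to a '.' separator: the pattern must be
 * a full prefix of the module name, and the next character must be a
 * '.' or the end of the string.  For instance:
 *
 *	dtrace_match_module("mach_kernel", "mach_kernel", 0)	returns 1
 *	dtrace_match_module("foo.kext", "foo", 0)		returns 1
 *	dtrace_match_module("foobar", "foo", 0)			returns 0
 */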
8367
8368 /*ARGSUSED*/
8369 static int
8370 dtrace_match_nul(const char *s, const char *p, int depth)
8371 {
8372 #pragma unused(s, p, depth) /* __APPLE__ */
8373 return (1); /* always match the empty pattern */
8374 }
8375
8376 /*ARGSUSED*/
8377 static int
8378 dtrace_match_nonzero(const char *s, const char *p, int depth)
8379 {
8380 #pragma unused(p, depth) /* __APPLE__ */
8381 return (s != NULL && s[0] != '\0');
8382 }
8383
8384 static int
8385 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8386 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
8387 {
8388 dtrace_probe_t *probe;
8389 dtrace_provider_t prov_template = {
8390 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8391 };
8392
8393 dtrace_probe_t template = {
8394 .dtpr_provider = &prov_template,
8395 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8396 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8397 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8398 };
8399
8400 dtrace_hash_t *hash = NULL;
8401 int len, rc, best = INT_MAX, nmatched = 0;
8402 dtrace_id_t i;
8403
8404 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8405
8406 /*
8407 * If the probe ID is specified in the key, just lookup by ID and
8408 * invoke the match callback once if a matching probe is found.
8409 */
8410 if (pkp->dtpk_id != DTRACE_IDNONE) {
8411 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8412 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8413 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8414 return (DTRACE_MATCH_FAIL);
8415 nmatched++;
8416 }
8417 return (nmatched);
8418 }
8419
8420 /*
8421 * We want to find the most distinct of the provider name, module name,
8422 * function name, and name. So for each one that is not a glob
8423 * pattern or empty string, we perform a lookup in the corresponding
8424 * hash and use the hash table with the fewest collisions to do our
8425 * search.
8426 */
8427 if (pkp->dtpk_pmatch == &dtrace_match_string &&
8428 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
8429 best = len;
8430 hash = dtrace_byprov;
8431 }
8432
8433 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8434 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8435 best = len;
8436 hash = dtrace_bymod;
8437 }
8438
8439 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8440 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8441 best = len;
8442 hash = dtrace_byfunc;
8443 }
8444
8445 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8446 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8447 best = len;
8448 hash = dtrace_byname;
8449 }
8450
8451 /*
8452 * If we did not select a hash table, iterate over every probe and
8453 * invoke our callback for each one that matches our input probe key.
8454 */
8455 if (hash == NULL) {
8456 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
8457 if ((probe = dtrace_probes[i]) == NULL ||
8458 dtrace_match_probe(probe, pkp, priv, uid,
8459 zoneid) <= 0)
8460 continue;
8461
8462 nmatched++;
8463
8464 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8465 if (rc == DTRACE_MATCH_FAIL)
8466 return (DTRACE_MATCH_FAIL);
8467 break;
8468 }
8469 }
8470
8471 return (nmatched);
8472 }
8473
8474 /*
8475 * If we selected a hash table, iterate over each probe of the same key
8476 * name and invoke the callback for every probe that matches the other
8477 * attributes of our input probe key.
8478 */
8479 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8480 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8481
8482 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8483 continue;
8484
8485 nmatched++;
8486
8487 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8488 if (rc == DTRACE_MATCH_FAIL)
8489 return (DTRACE_MATCH_FAIL);
8490 break;
8491 }
8492 }
8493
8494 return (nmatched);
8495 }
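/*
 * As an example of the heuristic above: for a description such as
 * fbt:mach_kernel:vm_fault:entry, every comparator is
 * dtrace_match_string(), so all four hashes qualify and the one whose
 * chain for its key is shortest (often the function hash) is walked.
 * For :::BEGIN only the name is literal, so the search runs over
 * dtrace_byname; for ::: no field is literal, no hash qualifies, and
 * every entry of dtrace_probes[] must be examined.
 */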
8496
8497 /*
8498 * Return the function pointer dtrace_match_probe() should use to compare the
8499 * specified pattern with a string. For NULL or empty patterns, we select
8500 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8501 * For non-empty non-glob strings, we use dtrace_match_string().
8502 */
8503 static dtrace_probekey_f *
8504 dtrace_probekey_func(const char *p)
8505 {
8506 char c;
8507
8508 if (p == NULL || *p == '\0')
8509 return (&dtrace_match_nul);
8510
8511 while ((c = *p++) != '\0') {
8512 if (c == '[' || c == '?' || c == '*' || c == '\\')
8513 return (&dtrace_match_glob);
8514 }
8515
8516 return (&dtrace_match_string);
8517 }
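/*
 * For example: dtrace_probekey_func(NULL) and dtrace_probekey_func("")
 * both select dtrace_match_nul(); dtrace_probekey_func("read*") selects
 * dtrace_match_glob(); and dtrace_probekey_func("read") selects
 * dtrace_match_string().
 */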
8518
8519 static dtrace_probekey_f *
8520 dtrace_probekey_module_func(const char *p)
8521 {
8522 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8523
8524 dtrace_probekey_f *f = dtrace_probekey_func(p);
8525 if (f == &dtrace_match_string) {
8526 dtrace_probe_t template = {
8527 .dtpr_mod = (char *)(uintptr_t)p,
8528 };
8529 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
8530 return (&dtrace_match_module);
8531 }
8532 return (&dtrace_match_string);
8533 }
8534 return f;
8535 }
8536
8537 /*
8538 * Build a probe comparison key for use with dtrace_match_probe() from the
8539 * given probe description. By convention, a null key only matches anchored
8540 * probes: if each field is the empty string, reset dtpk_fmatch to
8541 * dtrace_match_nonzero().
8542 */
8543 static void
8544 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8545 {
8546
8547 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
8548 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8549
8550 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
8551 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
8552
8553 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
8554 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8555
8556 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
8557 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8558
8559 pkp->dtpk_id = pdp->dtpd_id;
8560
8561 if (pkp->dtpk_id == DTRACE_IDNONE &&
8562 pkp->dtpk_pmatch == &dtrace_match_nul &&
8563 pkp->dtpk_mmatch == &dtrace_match_nul &&
8564 pkp->dtpk_fmatch == &dtrace_match_nul &&
8565 pkp->dtpk_nmatch == &dtrace_match_nul)
8566 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8567 }
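/*
 * The dtrace_match_nonzero() substitution above is what makes the
 * empty description ::: match only anchored probes (those with a
 * non-empty function name) rather than every probe in the system:
 *
 *	pkp->dtpk_fmatch("vm_fault", "", 0)	returns 1  (anchored)
 *	pkp->dtpk_fmatch("", "", 0)		returns 0  (unanchored)
 */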
8568
8569 static void
8570 dtrace_probekey_release(dtrace_probekey_t *pkp)
8571 {
8572 dtrace_strunref(pkp->dtpk_prov);
8573 dtrace_strunref(pkp->dtpk_mod);
8574 dtrace_strunref(pkp->dtpk_func);
8575 dtrace_strunref(pkp->dtpk_name);
8576 }
8577
8578 static int
8579 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
8580 {
8581 if (desc == NULL)
8582 return 1;
8583
8584 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
8585
8586 return func((char*)data, desc->dtpd_provider, 0);
8587 }
8588
8589 /*
8590 * DTrace Provider-to-Framework API Functions
8591 *
8592 * These functions implement much of the Provider-to-Framework API, as
8593 * described in <sys/dtrace.h>. The parts of the API not in this section are
8594 * the functions in the API for probe management (found below), and
8595 * dtrace_probe() itself (found above).
8596 */
8597
8598 /*
8599 * Register the calling provider with the DTrace framework. This should
8600 * generally be called by DTrace providers in their attach(9E) entry point.
8601 */
8602 int
8603 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8604 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8605 {
8606 dtrace_provider_t *provider;
8607
8608 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8609 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8610 "arguments", name ? name : "<NULL>");
8611 return (EINVAL);
8612 }
8613
8614 if (name[0] == '\0' || dtrace_badname(name)) {
8615 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8616 "provider name", name);
8617 return (EINVAL);
8618 }
8619
8620 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8621 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8622 pops->dtps_destroy == NULL ||
8623 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8624 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8625 "provider ops", name);
8626 return (EINVAL);
8627 }
8628
8629 if (dtrace_badattr(&pap->dtpa_provider) ||
8630 dtrace_badattr(&pap->dtpa_mod) ||
8631 dtrace_badattr(&pap->dtpa_func) ||
8632 dtrace_badattr(&pap->dtpa_name) ||
8633 dtrace_badattr(&pap->dtpa_args)) {
8634 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8635 "provider attributes", name);
8636 return (EINVAL);
8637 }
8638
8639 if (priv & ~DTRACE_PRIV_ALL) {
8640 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8641 "privilege attributes", name);
8642 return (EINVAL);
8643 }
8644
8645 if ((priv & DTRACE_PRIV_KERNEL) &&
8646 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8647 pops->dtps_usermode == NULL) {
8648 cmn_err(CE_WARN, "failed to register provider '%s': need "
8649 "dtps_usermode() op for given privilege attributes", name);
8650 return (EINVAL);
8651 }
8652
8653 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8654
8655 provider->dtpv_attr = *pap;
8656 provider->dtpv_priv.dtpp_flags = priv;
8657 if (cr != NULL) {
8658 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8659 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8660 }
8661 provider->dtpv_pops = *pops;
8662
8663 if (pops->dtps_provide == NULL) {
8664 ASSERT(pops->dtps_provide_module != NULL);
8665 provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8666 }
8667
8668 if (pops->dtps_provide_module == NULL) {
8669 ASSERT(pops->dtps_provide != NULL);
8670 provider->dtpv_pops.dtps_provide_module =
8671 dtrace_provide_module_nullop;
8672 }
8673
8674 if (pops->dtps_suspend == NULL) {
8675 ASSERT(pops->dtps_resume == NULL);
8676 provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8677 provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8678 }
8679
8680 provider->dtpv_arg = arg;
8681 *idp = (dtrace_provider_id_t)provider;
8682
8683 if (pops == &dtrace_provider_ops) {
8684 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8685 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8686
8687 provider->dtpv_name = dtrace_strref(name);
8688
8689 ASSERT(dtrace_anon.dta_enabling == NULL);
8690
8691 /*
8692 * We make sure that the DTrace provider is at the head of
8693 * the provider chain.
8694 */
8695 provider->dtpv_next = dtrace_provider;
8696 dtrace_provider = provider;
8697 return (0);
8698 }
8699
8700 lck_mtx_lock(&dtrace_provider_lock);
8701 lck_mtx_lock(&dtrace_lock);
8702
8703 provider->dtpv_name = dtrace_strref(name);
8704
8705 /*
8706 * If there is at least one provider registered, we'll add this
8707 * provider after the first provider.
8708 */
8709 if (dtrace_provider != NULL) {
8710 provider->dtpv_next = dtrace_provider->dtpv_next;
8711 dtrace_provider->dtpv_next = provider;
8712 } else {
8713 dtrace_provider = provider;
8714 }
8715
8716 if (dtrace_retained != NULL) {
8717 dtrace_enabling_provide(provider);
8718
8719 /*
8720 * Now we need to call dtrace_enabling_matchall_with_cond() --
8721 * with a condition matching the provider name we just added,
8722 * which will acquire cpu_lock and dtrace_lock. We therefore need
8723 * to drop all of our locks before calling into it...
8724 */
8725 lck_mtx_unlock(&dtrace_lock);
8726 lck_mtx_unlock(&dtrace_provider_lock);
8727
8728 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8729 dtrace_enabling_matchall_with_cond(&cond);
8730
8731 return (0);
8732 }
8733
8734 lck_mtx_unlock(&dtrace_lock);
8735 lck_mtx_unlock(&dtrace_provider_lock);
8736
8737 return (0);
8738 }
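/*
 * A minimal registration from a provider's attach routine might look
 * like the sketch below; all foo_* identifiers are hypothetical, and
 * dtps_provide_module, dtps_suspend and dtps_resume are left NULL so
 * that the nullops above are substituted:
 *
 *	static dtrace_provider_id_t foo_id;
 *	static dtrace_pops_t foo_pops = {
 *		.dtps_provide = foo_provide,
 *		.dtps_enable = foo_enable,
 *		.dtps_disable = foo_disable,
 *		.dtps_destroy = foo_destroy,
 *	};
 *
 *	(void) dtrace_register("foo", &foo_attr, DTRACE_PRIV_KERNEL,
 *	    NULL, &foo_pops, NULL, &foo_id);
 */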
8739
8740 /*
8741 * Unregister the specified provider from the DTrace framework. This should
8742 * generally be called by DTrace providers in their detach(9E) entry point.
8743 */
8744 int
8745 dtrace_unregister(dtrace_provider_id_t id)
8746 {
8747 dtrace_provider_t *old = (dtrace_provider_t *)id;
8748 dtrace_provider_t *prev = NULL;
8749 int self = 0;
8750 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8751 dtrace_probe_t template = {
8752 .dtpr_provider = old
8753 };
8754
8755 if (old->dtpv_pops.dtps_enable ==
8756 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8757 /*
8758 * If DTrace itself is the provider, we're called with locks
8759 * already held.
8760 */
8761 ASSERT(old == dtrace_provider);
8762 ASSERT(dtrace_devi != NULL);
8763 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8764 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8765 self = 1;
8766
8767 if (dtrace_provider->dtpv_next != NULL) {
8768 /*
8769 * There's another provider here; return failure.
8770 */
8771 return (EBUSY);
8772 }
8773 } else {
8774 lck_mtx_lock(&dtrace_provider_lock);
8775 lck_mtx_lock(&mod_lock);
8776 lck_mtx_lock(&dtrace_lock);
8777 }
8778
8779 /*
8780 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8781 * probes, we refuse to let providers slither away, unless this
8782 * provider has already been explicitly invalidated.
8783 */
8784 if (!old->dtpv_defunct &&
8785 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8786 dtrace_anon.dta_state->dts_necbs > 0))) {
8787 if (!self) {
8788 lck_mtx_unlock(&dtrace_lock);
8789 lck_mtx_unlock(&mod_lock);
8790 lck_mtx_unlock(&dtrace_provider_lock);
8791 }
8792 return (EBUSY);
8793 }
8794
8795 /*
8796 * Attempt to destroy the probes associated with this provider.
8797 */
8798 if (old->dtpv_ecb_count != 0) {
8799 /*
8800 * We have at least one ECB; we can't remove this provider.
8801 */
8802 if (!self) {
8803 lck_mtx_unlock(&dtrace_lock);
8804 lck_mtx_unlock(&mod_lock);
8805 lck_mtx_unlock(&dtrace_provider_lock);
8806 }
8807 return (EBUSY);
8808 }
8809
8810 /*
8811 * All of the probes for this provider are disabled; we can safely
8812 * remove all of them from their hash chains and from the probe array.
8813 */
8814 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8815 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8816 if (probe->dtpr_provider != old)
8817 continue;
8818
8819 dtrace_probes[probe->dtpr_id - 1] = NULL;
8820 old->dtpv_probe_count--;
8821
8822 dtrace_hash_remove(dtrace_bymod, probe);
8823 dtrace_hash_remove(dtrace_byfunc, probe);
8824 dtrace_hash_remove(dtrace_byname, probe);
8825
8826 if (first == NULL) {
8827 first = probe;
8828 probe->dtpr_nextmod = NULL;
8829 } else {
8830 /*
8831 * Use nextmod as the chain of probes to remove
8832 */
8833 probe->dtpr_nextmod = first;
8834 first = probe;
8835 }
8836 }
8837
8838 for (probe = first; probe != NULL; probe = next) {
8839 next = probe->dtpr_nextmod;
8840 dtrace_hash_remove(dtrace_byprov, probe);
8841 }
8842
8843 /*
8844 * The provider's probes have been removed from the hash chains and
8845 * from the probe array. Now issue a dtrace_sync() to be sure that
8846 * everyone has cleared out from any probe array processing.
8847 */
8848 dtrace_sync();
8849
8850 for (probe = first; probe != NULL; probe = next) {
8851 next = probe->dtpr_nextmod;
8852
8853 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8854 probe->dtpr_arg);
8855 dtrace_strunref(probe->dtpr_mod);
8856 dtrace_strunref(probe->dtpr_func);
8857 dtrace_strunref(probe->dtpr_name);
8858 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8859 zfree(dtrace_probe_t_zone, probe);
8860 }
8861
8862 if ((prev = dtrace_provider) == old) {
8863 ASSERT(self || dtrace_devi == NULL);
8864 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8865 dtrace_provider = old->dtpv_next;
8866 } else {
8867 while (prev != NULL && prev->dtpv_next != old)
8868 prev = prev->dtpv_next;
8869
8870 if (prev == NULL) {
8871 panic("attempt to unregister non-existent "
8872 "dtrace provider %p\n", (void *)id);
8873 }
8874
8875 prev->dtpv_next = old->dtpv_next;
8876 }
8877
8878 dtrace_strunref(old->dtpv_name);
8879
8880 if (!self) {
8881 lck_mtx_unlock(&dtrace_lock);
8882 lck_mtx_unlock(&mod_lock);
8883 lck_mtx_unlock(&dtrace_provider_lock);
8884 }
8885
8886 kmem_free(old, sizeof (dtrace_provider_t));
8887
8888 return (0);
8889 }
8890
8891 /*
8892 * Invalidate the specified provider. All subsequent probe lookups for the
8893 * specified provider will fail, but its probes will not be removed.
8894 */
8895 void
8896 dtrace_invalidate(dtrace_provider_id_t id)
8897 {
8898 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8899
8900 ASSERT(pvp->dtpv_pops.dtps_enable !=
8901 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8902
8903 lck_mtx_lock(&dtrace_provider_lock);
8904 lck_mtx_lock(&dtrace_lock);
8905
8906 pvp->dtpv_defunct = 1;
8907
8908 lck_mtx_unlock(&dtrace_lock);
8909 lck_mtx_unlock(&dtrace_provider_lock);
8910 }
8911
8912 /*
8913 * Indicate whether or not DTrace has attached.
8914 */
8915 int
8916 dtrace_attached(void)
8917 {
8918 /*
8919 * dtrace_provider will be non-NULL iff the DTrace driver has
8920 * attached. (It's non-NULL because DTrace is always itself a
8921 * provider.)
8922 */
8923 return (dtrace_provider != NULL);
8924 }
8925
8926 /*
8927 * Remove all the unenabled probes for the given provider. This function is
8928 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8929 * -- just as many of its associated probes as it can.
8930 */
8931 int
8932 dtrace_condense(dtrace_provider_id_t id)
8933 {
8934 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8935 dtrace_probe_t *probe, *first = NULL;
8936 dtrace_probe_t template = {
8937 .dtpr_provider = prov
8938 };
8939
8940 /*
8941 * Make sure this isn't the dtrace provider itself.
8942 */
8943 ASSERT(prov->dtpv_pops.dtps_enable !=
8944 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8945
8946 lck_mtx_lock(&dtrace_provider_lock);
8947 lck_mtx_lock(&dtrace_lock);
8948
8949 /*
8950 * Attempt to destroy the probes associated with this provider.
8951 */
8952 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8953 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8954
8955 if (probe->dtpr_provider != prov)
8956 continue;
8957
8958 if (probe->dtpr_ecb != NULL)
8959 continue;
8960
8961 dtrace_probes[probe->dtpr_id - 1] = NULL;
8962 prov->dtpv_probe_count--;
8963
8964 dtrace_hash_remove(dtrace_bymod, probe);
8965 dtrace_hash_remove(dtrace_byfunc, probe);
8966 dtrace_hash_remove(dtrace_byname, probe);
8967
8968 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8969 probe->dtpr_arg);
8970 dtrace_strunref(probe->dtpr_mod);
8971 dtrace_strunref(probe->dtpr_func);
8972 dtrace_strunref(probe->dtpr_name);
8973 if (first == NULL) {
8974 first = probe;
8975 probe->dtpr_nextmod = NULL;
8976 } else {
8977 /*
8978 * Use nextmod as the chain of probes to remove
8979 */
8980 probe->dtpr_nextmod = first;
8981 first = probe;
8982 }
8983 }
8984
8985 for (probe = first; probe != NULL; probe = first) {
8986 first = probe->dtpr_nextmod;
8987 dtrace_hash_remove(dtrace_byprov, probe);
8988 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8989 zfree(dtrace_probe_t_zone, probe);
8990 }
8991
8992 lck_mtx_unlock(&dtrace_lock);
8993 lck_mtx_unlock(&dtrace_provider_lock);
8994
8995 return (0);
8996 }
8997
8998 /*
8999 * DTrace Probe Management Functions
9000 *
9001 * The functions in this section perform the DTrace probe management,
9002 * including functions to create probes, look up probes, and call into the
9003 * providers to request that probes be provided. Some of these functions are
9004 * in the Provider-to-Framework API; these functions can be identified by the
9005 * fact that they are not declared "static".
9006 */
9007
9008 /*
9009 * Create a probe with the specified module name, function name, and name.
9010 */
9011 dtrace_id_t
9012 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
9013 const char *func, const char *name, int aframes, void *arg)
9014 {
9015 dtrace_probe_t *probe, **probes;
9016 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
9017 dtrace_id_t id;
9018
9019 if (provider == dtrace_provider) {
9020 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9021 } else {
9022 lck_mtx_lock(&dtrace_lock);
9023 }
9024
9025 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
9026 VM_BESTFIT | VM_SLEEP);
9027
9028 probe = zalloc(dtrace_probe_t_zone);
9029 bzero(probe, sizeof (dtrace_probe_t));
9030
9031 probe->dtpr_id = id;
9032 probe->dtpr_gen = dtrace_probegen++;
9033 probe->dtpr_mod = dtrace_strref(mod);
9034 probe->dtpr_func = dtrace_strref(func);
9035 probe->dtpr_name = dtrace_strref(name);
9036 probe->dtpr_arg = arg;
9037 probe->dtpr_aframes = aframes;
9038 probe->dtpr_provider = provider;
9039
9040 dtrace_hash_add(dtrace_byprov, probe);
9041 dtrace_hash_add(dtrace_bymod, probe);
9042 dtrace_hash_add(dtrace_byfunc, probe);
9043 dtrace_hash_add(dtrace_byname, probe);
9044
9045 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
9046 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
9047 size_t nsize = osize * 2;
9048
9049 probes = kmem_zalloc(nsize, KM_SLEEP);
9050
9051 dtrace_probe_t **oprobes = dtrace_probes;
9052
9053 bcopy(oprobes, probes, osize);
9054 dtrace_membar_producer();
9055 dtrace_probes = probes;
9056
9057 dtrace_sync();
9058
9059 /*
9060 * All CPUs are now seeing the new probes array; we can
9061 * safely free the old array.
9062 */
9063 kmem_free(oprobes, osize);
9064 dtrace_nprobes *= 2;
9065
9066 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
9067 }
9068
9069 ASSERT(dtrace_probes[id - 1] == NULL);
9070 dtrace_probes[id - 1] = probe;
9071 provider->dtpv_probe_count++;
9072
9073 if (provider != dtrace_provider)
9074 lck_mtx_unlock(&dtrace_lock);
9075
9076 return (id);
9077 }
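/*
 * Providers typically call this from their dtps_provide() entry point.
 * A hypothetical provider anchoring foo:mach_kernel:vm_fault:entry
 * might do the following (sketch; foo_id and vm_fault_arg are
 * hypothetical):
 *
 *	id = dtrace_probe_create(foo_id, "mach_kernel", "vm_fault",
 *	    "entry", 0, vm_fault_arg);
 *
 * The returned id is handed back to the provider in its dtps_enable(),
 * dtps_disable() and dtps_destroy() ops.
 */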
9078
9079 static dtrace_probe_t *
9080 dtrace_probe_lookup_id(dtrace_id_t id)
9081 {
9082 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9083
9084 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
9085 return (NULL);
9086
9087 return (dtrace_probes[id - 1]);
9088 }
9089
9090 static int
9091 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
9092 {
9093 #pragma unused(arg2)
9094 *((dtrace_id_t *)arg1) = probe->dtpr_id;
9095
9096 return (DTRACE_MATCH_DONE);
9097 }
9098
9099 /*
9100 * Look up a probe based on provider and one or more of module name, function
9101 * name and probe name.
9102 */
9103 dtrace_id_t
9104 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9105 const char *func, const char *name)
9106 {
9107 dtrace_probekey_t pkey;
9108 dtrace_id_t id;
9109 int match;
9110
9111 lck_mtx_lock(&dtrace_lock);
9112
9113 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
9114 pkey.dtpk_pmatch = &dtrace_match_string;
9115 pkey.dtpk_mod = dtrace_strref(mod);
9116 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9117 pkey.dtpk_func = dtrace_strref(func);
9118 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9119 pkey.dtpk_name = dtrace_strref(name);
9120 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9121 pkey.dtpk_id = DTRACE_IDNONE;
9122
9123 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9124 dtrace_probe_lookup_match, &id, NULL);
9125
9126 dtrace_probekey_release(&pkey);
9127
9128 lck_mtx_unlock(&dtrace_lock);
9129
9130 ASSERT(match == 1 || match == 0);
9131 return (match ? id : 0);
9132 }
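/*
 * A common provider idiom is lookup-before-create, which keeps probe
 * creation idempotent when dtps_provide() is invoked repeatedly
 * (sketch; identifiers hypothetical):
 *
 *	if (dtrace_probe_lookup(foo_id, "mach_kernel", "vm_fault",
 *	    "entry") == 0)
 *		(void) dtrace_probe_create(foo_id, "mach_kernel",
 *		    "vm_fault", "entry", 0, vm_fault_arg);
 */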
9133
9134 /*
9135 * Returns the probe argument associated with the specified probe.
9136 */
9137 void *
9138 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9139 {
9140 dtrace_probe_t *probe;
9141 void *rval = NULL;
9142
9143 lck_mtx_lock(&dtrace_lock);
9144
9145 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9146 probe->dtpr_provider == (dtrace_provider_t *)id)
9147 rval = probe->dtpr_arg;
9148
9149 lck_mtx_unlock(&dtrace_lock);
9150
9151 return (rval);
9152 }
9153
9154 /*
9155 * Copy a probe into a probe description.
9156 */
9157 static void
9158 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9159 {
9160 bzero(pdp, sizeof (dtrace_probedesc_t));
9161 pdp->dtpd_id = prp->dtpr_id;
9162
9163 /* APPLE NOTE: Darwin employs size bounded string operation. */
9164 (void) strlcpy(pdp->dtpd_provider,
9165 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9166
9167 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
9168 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
9169 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
9170 }
9171
9172 /*
9173 * Called to indicate that a probe -- or probes -- should be provided by a
9174 * specified provider. If the specified description is NULL, the provider will
9175 * be told to provide all of its probes. (This is done whenever a new
9176 * consumer comes along, or whenever a retained enabling is to be matched.) If
9177 * the specified description is non-NULL, the provider is given the
9178 * opportunity to dynamically provide the specified probe, allowing providers
9179 * to support the creation of probes on-the-fly. (So-called _autocreated_
9180 * probes.) If the provider is NULL, the operations will be applied to all
9181 * providers; if the provider is non-NULL the operations will only be applied
9182 * to the specified provider. The dtrace_provider_lock must be held, and the
9183 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9184 * will need to grab the dtrace_lock when it reenters the framework through
9185 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9186 */
9187 static void
9188 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9189 {
9190 struct modctl *ctl;
9191 int all = 0;
9192
9193 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9194
9195 if (prv == NULL) {
9196 all = 1;
9197 prv = dtrace_provider;
9198 }
9199
9200 do {
9201 /*
9202 * First, call the blanket provide operation.
9203 */
9204 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9205
9206 /*
9207 * Now call the per-module provide operation. We will grab
9208 * mod_lock to prevent the list from being modified. Note
9209 * that this also prevents the mod_busy bits from changing.
9210 * (mod_busy can only be changed with mod_lock held.)
9211 */
9212 lck_mtx_lock(&mod_lock);
9213
9214 ctl = dtrace_modctl_list;
9215 while (ctl) {
9216 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9217 ctl = ctl->mod_next;
9218 }
9219
9220 lck_mtx_unlock(&mod_lock);
9221 } while (all && (prv = prv->dtpv_next) != NULL);
9222 }
9223
9224 /*
9225 * Iterate over each probe, and call the Framework-to-Provider API function
9226 * denoted by offs.
9227 */
9228 static void
9229 dtrace_probe_foreach(uintptr_t offs)
9230 {
9231 dtrace_provider_t *prov;
9232 void (*func)(void *, dtrace_id_t, void *);
9233 dtrace_probe_t *probe;
9234 dtrace_icookie_t cookie;
9235 int i;
9236
9237 /*
9238 * We disable interrupts to walk through the probe array. This is
9239 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9240 * won't see stale data.
9241 */
9242 cookie = dtrace_interrupt_disable();
9243
9244 for (i = 0; i < dtrace_nprobes; i++) {
9245 if ((probe = dtrace_probes[i]) == NULL)
9246 continue;
9247
9248 if (probe->dtpr_ecb == NULL) {
9249 /*
9250 * This probe isn't enabled -- don't call the function.
9251 */
9252 continue;
9253 }
9254
9255 prov = probe->dtpr_provider;
9256 func = *((void(**)(void *, dtrace_id_t, void *))
9257 ((uintptr_t)&prov->dtpv_pops + offs));
9258
9259 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9260 }
9261
9262 dtrace_interrupt_enable(cookie);
9263 }
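/*
 * The offs parameter is the byte offset of the desired op within
 * dtrace_pops_t, which lets this one function fan out to any of them;
 * suspending every enabled probe would look like this sketch:
 *
 *	dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
 */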
9264
9265 static int
9266 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
9267 {
9268 dtrace_probekey_t pkey;
9269 uint32_t priv;
9270 uid_t uid;
9271 zoneid_t zoneid;
9272 int err;
9273
9274 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9275
9276 dtrace_ecb_create_cache = NULL;
9277
9278 if (desc == NULL) {
9279 /*
9280 * If we're passed a NULL description, we're being asked to
9281 * create an ECB with a NULL probe.
9282 */
9283 (void) dtrace_ecb_create_enable(NULL, enab, ep);
9284 return (0);
9285 }
9286
9287 dtrace_probekey(desc, &pkey);
9288 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9289 &priv, &uid, &zoneid);
9290
9291 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
9292
9293 dtrace_probekey_release(&pkey);
9294
9295 return err;
9296 }
9297
9298 /*
9299 * DTrace Helper Provider Functions
9300 */
9301 static void
9302 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9303 {
9304 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9305 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9306 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9307 }
9308
9309 static void
9310 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9311 const dof_provider_t *dofprov, char *strtab)
9312 {
9313 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9314 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9315 dofprov->dofpv_provattr);
9316 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9317 dofprov->dofpv_modattr);
9318 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9319 dofprov->dofpv_funcattr);
9320 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9321 dofprov->dofpv_nameattr);
9322 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9323 dofprov->dofpv_argsattr);
9324 }
9325
9326 static void
9327 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9328 {
9329 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9330 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9331 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9332 dof_provider_t *provider;
9333 dof_probe_t *probe;
9334 uint32_t *off, *enoff;
9335 uint8_t *arg;
9336 char *strtab;
9337 uint_t i, nprobes;
9338 dtrace_helper_provdesc_t dhpv;
9339 dtrace_helper_probedesc_t dhpb;
9340 dtrace_meta_t *meta = dtrace_meta_pid;
9341 dtrace_mops_t *mops = &meta->dtm_mops;
9342 void *parg;
9343
9344 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9345 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9346 provider->dofpv_strtab * dof->dofh_secsize);
9347 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9348 provider->dofpv_probes * dof->dofh_secsize);
9349 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9350 provider->dofpv_prargs * dof->dofh_secsize);
9351 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9352 provider->dofpv_proffs * dof->dofh_secsize);
9353
9354 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9355 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9356 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9357 enoff = NULL;
9358
9359 /*
9360 * See dtrace_helper_provider_validate().
9361 */
9362 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9363 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9364 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9365 provider->dofpv_prenoffs * dof->dofh_secsize);
9366 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9367 }
9368
9369 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9370
9371 /*
9372 * Create the provider.
9373 */
9374 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9375
9376 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9377 return;
9378
9379 meta->dtm_count++;
9380
9381 /*
9382 * Create the probes.
9383 */
9384 for (i = 0; i < nprobes; i++) {
9385 probe = (dof_probe_t *)(uintptr_t)(daddr +
9386 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9387
9388 dhpb.dthpb_mod = dhp->dofhp_mod;
9389 dhpb.dthpb_func = strtab + probe->dofpr_func;
9390 dhpb.dthpb_name = strtab + probe->dofpr_name;
9391 #if !defined(__APPLE__)
9392 dhpb.dthpb_base = probe->dofpr_addr;
9393 #else
9394 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
9395 #endif
9396 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9397 dhpb.dthpb_noffs = probe->dofpr_noffs;
9398 if (enoff != NULL) {
9399 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9400 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9401 } else {
9402 dhpb.dthpb_enoffs = NULL;
9403 dhpb.dthpb_nenoffs = 0;
9404 }
9405 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9406 dhpb.dthpb_nargc = probe->dofpr_nargc;
9407 dhpb.dthpb_xargc = probe->dofpr_xargc;
9408 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9409 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9410
9411 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9412 }
9413
9414 /*
9415 * Since we just created probes, we need to match our enablings
9416 * against them, using a condition that restricts the match to
9417 * probes from this provider.
9418 */
9419 char *prov_name = mops->dtms_provider_name(parg);
9420 ASSERT(prov_name != NULL);
9421 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9422
9423 dtrace_enabling_matchall_with_cond(&cond);
9424 }
9425
9426 static void
9427 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
9428 {
9429 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9430 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9431 uint32_t i;
9432
9433 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9434
9435 for (i = 0; i < dof->dofh_secnum; i++) {
9436 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9437 dof->dofh_secoff + i * dof->dofh_secsize);
9438
9439 if (sec->dofs_type != DOF_SECT_PROVIDER)
9440 continue;
9441
9442 dtrace_helper_provide_one(dhp, sec, p);
9443 }
9444 }
9445
9446 static void
9447 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9448 {
9449 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9450 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9451 dof_sec_t *str_sec;
9452 dof_provider_t *provider;
9453 char *strtab;
9454 dtrace_helper_provdesc_t dhpv;
9455 dtrace_meta_t *meta = dtrace_meta_pid;
9456 dtrace_mops_t *mops = &meta->dtm_mops;
9457
9458 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9459 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9460 provider->dofpv_strtab * dof->dofh_secsize);
9461
9462 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9463
9464 /*
9465 * Create the provider.
9466 */
9467 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9468
9469 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9470
9471 meta->dtm_count--;
9472 }
9473
9474 static void
9475 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
9476 {
9477 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9478 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9479 uint32_t i;
9480
9481 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9482
9483 for (i = 0; i < dof->dofh_secnum; i++) {
9484 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9485 dof->dofh_secoff + i * dof->dofh_secsize);
9486
9487 if (sec->dofs_type != DOF_SECT_PROVIDER)
9488 continue;
9489
9490 dtrace_helper_provider_remove_one(dhp, sec, p);
9491 }
9492 }
9493
9494 /*
9495 * DTrace Meta Provider-to-Framework API Functions
9496 *
9497 * These functions implement the Meta Provider-to-Framework API, as described
9498 * in <sys/dtrace.h>.
9499 */
9500 int
9501 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9502 dtrace_meta_provider_id_t *idp)
9503 {
9504 dtrace_meta_t *meta;
9505 dtrace_helpers_t *help, *next;
9506 uint_t i;
9507
9508 *idp = DTRACE_METAPROVNONE;
9509
9510 /*
9511 * We strictly don't need the name, but we hold onto it for
9512 * debuggability. All hail error queues!
9513 */
9514 if (name == NULL) {
9515 cmn_err(CE_WARN, "failed to register meta-provider: "
9516 "invalid name");
9517 return (EINVAL);
9518 }
9519
9520 if (mops == NULL ||
9521 mops->dtms_create_probe == NULL ||
9522 mops->dtms_provide_proc == NULL ||
9523 mops->dtms_remove_proc == NULL) {
9524 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9525 "invalid ops", name);
9526 return (EINVAL);
9527 }
9528
9529 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9530 meta->dtm_mops = *mops;
9531 meta->dtm_arg = arg;
9532
9533 lck_mtx_lock(&dtrace_meta_lock);
9534 lck_mtx_lock(&dtrace_lock);
9535
9536 if (dtrace_meta_pid != NULL) {
9537 lck_mtx_unlock(&dtrace_lock);
9538 lck_mtx_unlock(&dtrace_meta_lock);
9539 cmn_err(CE_WARN, "failed to register meta-provider %s: "
9540 "user-land meta-provider exists", name);
9541 kmem_free(meta, sizeof (dtrace_meta_t));
9542 return (EINVAL);
9543 }
9544
9545 meta->dtm_name = dtrace_strref(name);
9546
9547 dtrace_meta_pid = meta;
9548 *idp = (dtrace_meta_provider_id_t)meta;
9549
9550 /*
9551 * If there are providers and probes ready to go, pass them
9552 * off to the new meta provider now.
9553 */
9554
9555 help = dtrace_deferred_pid;
9556 dtrace_deferred_pid = NULL;
9557
9558 lck_mtx_unlock(&dtrace_lock);
9559
9560 while (help != NULL) {
9561 for (i = 0; i < help->dthps_nprovs; i++) {
9562 proc_t *p = proc_find(help->dthps_pid);
9563 if (p == PROC_NULL)
9564 continue;
9565 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9566 p);
9567 proc_rele(p);
9568 }
9569
9570 next = help->dthps_next;
9571 help->dthps_next = NULL;
9572 help->dthps_prev = NULL;
9573 help->dthps_deferred = 0;
9574 help = next;
9575 }
9576
9577 lck_mtx_unlock(&dtrace_meta_lock);
9578
9579 return (0);
9580 }
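/*
 * Only one user-land meta-provider may be registered at a time (note
 * the dtrace_meta_pid check above).  A registration might look like
 * this sketch, with all foo_* identifiers hypothetical:
 *
 *	static dtrace_meta_provider_id_t foo_meta_id;
 *	static dtrace_mops_t foo_mops = {
 *		.dtms_create_probe = foo_create_probe,
 *		.dtms_provide_proc = foo_provide_proc,
 *		.dtms_remove_proc = foo_remove_proc,
 *		.dtms_provider_name = foo_provider_name,
 *	};
 *
 *	(void) dtrace_meta_register("foo", &foo_mops, NULL, &foo_meta_id);
 */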
9581
9582 int
9583 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9584 {
9585 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9586
9587 lck_mtx_lock(&dtrace_meta_lock);
9588 lck_mtx_lock(&dtrace_lock);
9589
9590 if (old == dtrace_meta_pid) {
9591 pp = &dtrace_meta_pid;
9592 } else {
9593 panic("attempt to unregister non-existent "
9594 "dtrace meta-provider %p\n", (void *)old);
9595 }
9596
9597 if (old->dtm_count != 0) {
9598 lck_mtx_unlock(&dtrace_lock);
9599 lck_mtx_unlock(&dtrace_meta_lock);
9600 return (EBUSY);
9601 }
9602
9603 *pp = NULL;
9604
9605 dtrace_strunref(old->dtm_name);
9606
9607 lck_mtx_unlock(&dtrace_lock);
9608 lck_mtx_unlock(&dtrace_meta_lock);
9609
9610 kmem_free(old, sizeof (dtrace_meta_t));
9611
9612 return (0);
9613 }
9614
9615
9616 /*
9617 * DTrace DIF Object Functions
9618 */
9619 static int
9620 dtrace_difo_err(uint_t pc, const char *format, ...)
9621 {
9622 if (dtrace_err_verbose) {
9623 va_list alist;
9624
9625 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9626 va_start(alist, format);
9627 (void) vuprintf(format, alist);
9628 va_end(alist);
9629 }
9630
9631 #ifdef DTRACE_ERRDEBUG
9632 dtrace_errdebug(format);
9633 #endif
9634 return (1);
9635 }
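/*
 * Each DIF instruction is a fixed-width word whose fields are decoded
 * with the DIF_INSTR_* accessors used throughout the validator below:
 * DIF_INSTR_OP() yields the opcode, DIF_INSTR_R1()/DIF_INSTR_R2() the
 * source registers, and DIF_INSTR_RD() the destination.  A sketch of
 * the decode for a three-register op:
 *
 *	dif_instr_t instr = dp->dtdo_buf[pc];
 *	uint_t op = DIF_INSTR_OP(instr);
 *	uint_t r1 = DIF_INSTR_R1(instr);
 *	uint_t rd = DIF_INSTR_RD(instr);
 */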
9636
9637 /*
9638 * Validate a DTrace DIF object by checking the IR instructions. The following
9639 * rules are currently enforced by dtrace_difo_validate():
9640 *
9641 * 1. Each instruction must have a valid opcode
9642 * 2. Each register, string, variable, or subroutine reference must be valid
9643 * 3. No instruction can modify register %r0 (must be zero)
9644 * 4. All instruction reserved bits must be set to zero
9645 * 5. The last instruction must be a "ret" instruction
9646 * 6. All branch targets must reference a valid instruction _after_ the branch
9647 */
9648 static int
9649 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9650 cred_t *cr)
9651 {
9652 int err = 0;
9653 uint_t i;
9654
9655 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9656 int kcheckload;
9657 uint_t pc;
9658 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9659
9660 kcheckload = cr == NULL ||
9661 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9662
9663 dp->dtdo_destructive = 0;
9664
9665 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9666 dif_instr_t instr = dp->dtdo_buf[pc];
9667
9668 uint_t r1 = DIF_INSTR_R1(instr);
9669 uint_t r2 = DIF_INSTR_R2(instr);
9670 uint_t rd = DIF_INSTR_RD(instr);
9671 uint_t rs = DIF_INSTR_RS(instr);
9672 uint_t label = DIF_INSTR_LABEL(instr);
9673 uint_t v = DIF_INSTR_VAR(instr);
9674 uint_t subr = DIF_INSTR_SUBR(instr);
9675 uint_t type = DIF_INSTR_TYPE(instr);
9676 uint_t op = DIF_INSTR_OP(instr);
9677
9678 switch (op) {
9679 case DIF_OP_OR:
9680 case DIF_OP_XOR:
9681 case DIF_OP_AND:
9682 case DIF_OP_SLL:
9683 case DIF_OP_SRL:
9684 case DIF_OP_SRA:
9685 case DIF_OP_SUB:
9686 case DIF_OP_ADD:
9687 case DIF_OP_MUL:
9688 case DIF_OP_SDIV:
9689 case DIF_OP_UDIV:
9690 case DIF_OP_SREM:
9691 case DIF_OP_UREM:
9692 case DIF_OP_COPYS:
9693 if (r1 >= nregs)
9694 err += efunc(pc, "invalid register %u\n", r1);
9695 if (r2 >= nregs)
9696 err += efunc(pc, "invalid register %u\n", r2);
9697 if (rd >= nregs)
9698 err += efunc(pc, "invalid register %u\n", rd);
9699 if (rd == 0)
9700 err += efunc(pc, "cannot write to %%r0\n");
9701 break;
9702 case DIF_OP_NOT:
9703 case DIF_OP_MOV:
9704 case DIF_OP_ALLOCS:
9705 if (r1 >= nregs)
9706 err += efunc(pc, "invalid register %u\n", r1);
9707 if (r2 != 0)
9708 err += efunc(pc, "non-zero reserved bits\n");
9709 if (rd >= nregs)
9710 err += efunc(pc, "invalid register %u\n", rd);
9711 if (rd == 0)
9712 err += efunc(pc, "cannot write to %%r0\n");
9713 break;
9714 case DIF_OP_LDSB:
9715 case DIF_OP_LDSH:
9716 case DIF_OP_LDSW:
9717 case DIF_OP_LDUB:
9718 case DIF_OP_LDUH:
9719 case DIF_OP_LDUW:
9720 case DIF_OP_LDX:
9721 if (r1 >= nregs)
9722 err += efunc(pc, "invalid register %u\n", r1);
9723 if (r2 != 0)
9724 err += efunc(pc, "non-zero reserved bits\n");
9725 if (rd >= nregs)
9726 err += efunc(pc, "invalid register %u\n", rd);
9727 if (rd == 0)
9728 err += efunc(pc, "cannot write to %%r0\n");
9729 if (kcheckload)
9730 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9731 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9732 break;
9733 case DIF_OP_RLDSB:
9734 case DIF_OP_RLDSH:
9735 case DIF_OP_RLDSW:
9736 case DIF_OP_RLDUB:
9737 case DIF_OP_RLDUH:
9738 case DIF_OP_RLDUW:
9739 case DIF_OP_RLDX:
9740 if (r1 >= nregs)
9741 err += efunc(pc, "invalid register %u\n", r1);
9742 if (r2 != 0)
9743 err += efunc(pc, "non-zero reserved bits\n");
9744 if (rd >= nregs)
9745 err += efunc(pc, "invalid register %u\n", rd);
9746 if (rd == 0)
9747 err += efunc(pc, "cannot write to %%r0\n");
9748 break;
9749 case DIF_OP_ULDSB:
9750 case DIF_OP_ULDSH:
9751 case DIF_OP_ULDSW:
9752 case DIF_OP_ULDUB:
9753 case DIF_OP_ULDUH:
9754 case DIF_OP_ULDUW:
9755 case DIF_OP_ULDX:
9756 if (r1 >= nregs)
9757 err += efunc(pc, "invalid register %u\n", r1);
9758 if (r2 != 0)
9759 err += efunc(pc, "non-zero reserved bits\n");
9760 if (rd >= nregs)
9761 err += efunc(pc, "invalid register %u\n", rd);
9762 if (rd == 0)
9763 err += efunc(pc, "cannot write to %%r0\n");
9764 break;
9765 case DIF_OP_STB:
9766 case DIF_OP_STH:
9767 case DIF_OP_STW:
9768 case DIF_OP_STX:
9769 if (r1 >= nregs)
9770 err += efunc(pc, "invalid register %u\n", r1);
9771 if (r2 != 0)
9772 err += efunc(pc, "non-zero reserved bits\n");
9773 if (rd >= nregs)
9774 err += efunc(pc, "invalid register %u\n", rd);
9775 if (rd == 0)
9776 err += efunc(pc, "cannot write to 0 address\n");
9777 break;
9778 case DIF_OP_CMP:
9779 case DIF_OP_SCMP:
9780 if (r1 >= nregs)
9781 err += efunc(pc, "invalid register %u\n", r1);
9782 if (r2 >= nregs)
9783 err += efunc(pc, "invalid register %u\n", r2);
9784 if (rd != 0)
9785 err += efunc(pc, "non-zero reserved bits\n");
9786 break;
9787 case DIF_OP_TST:
9788 if (r1 >= nregs)
9789 err += efunc(pc, "invalid register %u\n", r1);
9790 if (r2 != 0 || rd != 0)
9791 err += efunc(pc, "non-zero reserved bits\n");
9792 break;
9793 case DIF_OP_BA:
9794 case DIF_OP_BE:
9795 case DIF_OP_BNE:
9796 case DIF_OP_BG:
9797 case DIF_OP_BGU:
9798 case DIF_OP_BGE:
9799 case DIF_OP_BGEU:
9800 case DIF_OP_BL:
9801 case DIF_OP_BLU:
9802 case DIF_OP_BLE:
9803 case DIF_OP_BLEU:
9804 if (label >= dp->dtdo_len) {
9805 err += efunc(pc, "invalid branch target %u\n",
9806 label);
9807 }
9808 if (label <= pc) {
9809 err += efunc(pc, "backward branch to %u\n",
9810 label);
9811 }
9812 break;
9813 case DIF_OP_RET:
9814 if (r1 != 0 || r2 != 0)
9815 err += efunc(pc, "non-zero reserved bits\n");
9816 if (rd >= nregs)
9817 err += efunc(pc, "invalid register %u\n", rd);
9818 break;
9819 case DIF_OP_NOP:
9820 case DIF_OP_POPTS:
9821 case DIF_OP_FLUSHTS:
9822 if (r1 != 0 || r2 != 0 || rd != 0)
9823 err += efunc(pc, "non-zero reserved bits\n");
9824 break;
9825 case DIF_OP_SETX:
9826 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9827 err += efunc(pc, "invalid integer ref %u\n",
9828 DIF_INSTR_INTEGER(instr));
9829 }
9830 if (rd >= nregs)
9831 err += efunc(pc, "invalid register %u\n", rd);
9832 if (rd == 0)
9833 err += efunc(pc, "cannot write to %%r0\n");
9834 break;
9835 case DIF_OP_SETS:
9836 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9837 err += efunc(pc, "invalid string ref %u\n",
9838 DIF_INSTR_STRING(instr));
9839 }
9840 if (rd >= nregs)
9841 err += efunc(pc, "invalid register %u\n", rd);
9842 if (rd == 0)
9843 err += efunc(pc, "cannot write to %%r0\n");
9844 break;
9845 case DIF_OP_LDGA:
9846 case DIF_OP_LDTA:
9847 if (r1 > DIF_VAR_ARRAY_MAX)
9848 err += efunc(pc, "invalid array %u\n", r1);
9849 if (r2 >= nregs)
9850 err += efunc(pc, "invalid register %u\n", r2);
9851 if (rd >= nregs)
9852 err += efunc(pc, "invalid register %u\n", rd);
9853 if (rd == 0)
9854 err += efunc(pc, "cannot write to %%r0\n");
9855 break;
9856 case DIF_OP_LDGS:
9857 case DIF_OP_LDTS:
9858 case DIF_OP_LDLS:
9859 case DIF_OP_LDGAA:
9860 case DIF_OP_LDTAA:
9861 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9862 err += efunc(pc, "invalid variable %u\n", v);
9863 if (rd >= nregs)
9864 err += efunc(pc, "invalid register %u\n", rd);
9865 if (rd == 0)
9866 err += efunc(pc, "cannot write to %%r0\n");
9867 break;
9868 case DIF_OP_STGS:
9869 case DIF_OP_STTS:
9870 case DIF_OP_STLS:
9871 case DIF_OP_STGAA:
9872 case DIF_OP_STTAA:
9873 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9874 err += efunc(pc, "invalid variable %u\n", v);
9875 if (rs >= nregs)
9876 err += efunc(pc, "invalid register %u\n", rd);
9877 break;
9878 case DIF_OP_CALL:
9879 if (subr > DIF_SUBR_MAX &&
9880 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9881 err += efunc(pc, "invalid subr %u\n", subr);
9882 if (rd >= nregs)
9883 err += efunc(pc, "invalid register %u\n", rd);
9884 if (rd == 0)
9885 err += efunc(pc, "cannot write to %%r0\n");
9886
9887 switch (subr) {
9888 case DIF_SUBR_COPYOUT:
9889 case DIF_SUBR_COPYOUTSTR:
9890 case DIF_SUBR_KDEBUG_TRACE:
9891 case DIF_SUBR_KDEBUG_TRACE_STRING:
9892 case DIF_SUBR_PHYSMEM_READ:
9893 case DIF_SUBR_PHYSMEM_WRITE:
9894 dp->dtdo_destructive = 1;
9895 break;
9896 default:
9897 break;
9898 }
9899 break;
9900 case DIF_OP_PUSHTR:
9901 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9902 err += efunc(pc, "invalid ref type %u\n", type);
9903 if (r2 >= nregs)
9904 err += efunc(pc, "invalid register %u\n", r2);
9905 if (rs >= nregs)
9906 err += efunc(pc, "invalid register %u\n", rs);
9907 break;
9908 case DIF_OP_PUSHTV:
9909 if (type != DIF_TYPE_CTF)
9910 err += efunc(pc, "invalid val type %u\n", type);
9911 if (r2 >= nregs)
9912 err += efunc(pc, "invalid register %u\n", r2);
9913 if (rs >= nregs)
9914 err += efunc(pc, "invalid register %u\n", rs);
9915 break;
9916 case DIF_OP_STRIP:
9917 if (r1 >= nregs)
9918 err += efunc(pc, "invalid register %u\n", r1);
9919 if (!dtrace_is_valid_ptrauth_key(r2))
9920 err += efunc(pc, "invalid key\n");
9921 if (rd >= nregs)
9922 err += efunc(pc, "invalid register %u\n", rd);
9923 if (rd == 0)
9924 err += efunc(pc, "cannot write to %%r0\n");
9925 break;
9926 default:
9927 err += efunc(pc, "invalid opcode %u\n",
9928 DIF_INSTR_OP(instr));
9929 }
9930 }
9931
9932 if (dp->dtdo_len != 0 &&
9933 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9934 err += efunc(dp->dtdo_len - 1,
9935 "expected 'ret' as last DIF instruction\n");
9936 }
9937
9938 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9939 /*
9940 * If we're not returning by reference, the size must be either
9941 * 0 or the size of one of the base types.
9942 */
9943 switch (dp->dtdo_rtype.dtdt_size) {
9944 case 0:
9945 case sizeof (uint8_t):
9946 case sizeof (uint16_t):
9947 case sizeof (uint32_t):
9948 case sizeof (uint64_t):
9949 break;
9950
9951 default:
9952 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9953 }
9954 }
9955
9956 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9957 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9958 dtrace_diftype_t *vt, *et;
9959 uint_t id;
9960 int ndx;
9961
9962 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9963 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9964 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9965 err += efunc(i, "unrecognized variable scope %d\n",
9966 v->dtdv_scope);
9967 break;
9968 }
9969
9970 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9971 v->dtdv_kind != DIFV_KIND_SCALAR) {
9972 err += efunc(i, "unrecognized variable type %d\n",
9973 v->dtdv_kind);
9974 break;
9975 }
9976
9977 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9978 err += efunc(i, "%d exceeds variable id limit\n", id);
9979 break;
9980 }
9981
9982 if (id < DIF_VAR_OTHER_UBASE)
9983 continue;
9984
9985 /*
9986 * For user-defined variables, we need to check that this
9987 * definition is identical to any previous definition that we
9988 * encountered.
9989 */
9990 ndx = id - DIF_VAR_OTHER_UBASE;
9991
9992 switch (v->dtdv_scope) {
9993 case DIFV_SCOPE_GLOBAL:
9994 if (maxglobal == -1 || ndx > maxglobal)
9995 maxglobal = ndx;
9996
9997 if (ndx < vstate->dtvs_nglobals) {
9998 dtrace_statvar_t *svar;
9999
10000 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
10001 existing = &svar->dtsv_var;
10002 }
10003
10004 break;
10005
10006 case DIFV_SCOPE_THREAD:
10007 if (maxtlocal == -1 || ndx > maxtlocal)
10008 maxtlocal = ndx;
10009
10010 if (ndx < vstate->dtvs_ntlocals)
10011 existing = &vstate->dtvs_tlocals[ndx];
10012 break;
10013
10014 case DIFV_SCOPE_LOCAL:
10015 if (maxlocal == -1 || ndx > maxlocal)
10016 maxlocal = ndx;
10017 if (ndx < vstate->dtvs_nlocals) {
10018 dtrace_statvar_t *svar;
10019
10020 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
10021 existing = &svar->dtsv_var;
10022 }
10023
10024 break;
10025 }
10026
10027 vt = &v->dtdv_type;
10028
10029 if (vt->dtdt_flags & DIF_TF_BYREF) {
10030 if (vt->dtdt_size == 0) {
10031 err += efunc(i, "zero-sized variable\n");
10032 break;
10033 }
10034
10035 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
10036 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
10037 vt->dtdt_size > dtrace_statvar_maxsize) {
10038 err += efunc(i, "oversized by-ref static\n");
10039 break;
10040 }
10041 }
10042
10043 if (existing == NULL || existing->dtdv_id == 0)
10044 continue;
10045
10046 ASSERT(existing->dtdv_id == v->dtdv_id);
10047 ASSERT(existing->dtdv_scope == v->dtdv_scope);
10048
10049 if (existing->dtdv_kind != v->dtdv_kind)
10050 err += efunc(i, "%d changed variable kind\n", id);
10051
10052 et = &existing->dtdv_type;
10053
10054 if (vt->dtdt_flags != et->dtdt_flags) {
10055 err += efunc(i, "%d changed variable type flags\n", id);
10056 break;
10057 }
10058
10059 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
10060 err += efunc(i, "%d changed variable type size\n", id);
10061 break;
10062 }
10063 }
10064
10065 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
10066 dif_instr_t instr = dp->dtdo_buf[pc];
10067
10068 uint_t v = DIF_INSTR_VAR(instr);
10069 uint_t op = DIF_INSTR_OP(instr);
10070
10071 switch (op) {
10072 case DIF_OP_LDGS:
10073 case DIF_OP_LDGAA:
10074 case DIF_OP_STGS:
10075 case DIF_OP_STGAA:
10076 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
10077 err += efunc(pc, "invalid variable %u\n", v);
10078 break;
10079 case DIF_OP_LDTS:
10080 case DIF_OP_LDTAA:
10081 case DIF_OP_STTS:
10082 case DIF_OP_STTAA:
10083 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
10084 err += efunc(pc, "invalid variable %u\n", v);
10085 break;
10086 case DIF_OP_LDLS:
10087 case DIF_OP_STLS:
10088 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
10089 err += efunc(pc, "invalid variable %u\n", v);
10090 break;
10091 default:
10092 break;
10093 }
10094 }
10095
10096 return (err);
10097 }
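/*
 * Editorial sketch (hypothetical, not part of xnu): the branch checks in
 * dtrace_difo_validate() reject any target at or before the current pc, so
 * control flow in valid DIF only moves forward and every program terminates
 * in at most dtdo_len steps.  A minimal user-space validator capturing just
 * that invariant -- the SK_* encodings below are simplified stand-ins, not
 * the real <sys/dtrace.h> instruction format:
 */
#if 0	/* illustration only; not compiled into the kernel */
#include <stdint.h>

#define SK_OP(i)	((i) >> 24)		/* assumed: opcode in top byte */
#define SK_LABEL(i)	((i) & 0x00ffffffu)	/* assumed: target in low bits */
#define SK_OP_BA	0x10u			/* assumed "branch always" opcode */

static int
sketch_branches_forward(const uint32_t *text, uint32_t len)
{
	uint32_t pc;

	for (pc = 0; pc < len; pc++) {
		if (SK_OP(text[pc]) != SK_OP_BA)
			continue;
		if (SK_LABEL(text[pc]) >= len || SK_LABEL(text[pc]) <= pc)
			return (0);	/* out of range or backward: reject */
	}
	return (1);	/* forward-only: guaranteed to terminate */
}
#endif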
10098
10099 /*
10100  * Validate a DTrace DIF object for use as a helper.  Helpers
10101  * are much more constrained than normal DIFOs.  Specifically, they may
10102  * not:
10103  *
10104  * 1. Make calls to subroutines other than copyin(), copyinstr() or
10105  *    miscellaneous string routines.
10106 * 2. Access DTrace variables other than the args[] array, and the
10107 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10108 * 3. Have thread-local variables.
10109 * 4. Have dynamic variables.
10110 */
10111 static int
10112 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10113 {
10114 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10115 int err = 0;
10116 uint_t pc;
10117
10118 for (pc = 0; pc < dp->dtdo_len; pc++) {
10119 dif_instr_t instr = dp->dtdo_buf[pc];
10120
10121 uint_t v = DIF_INSTR_VAR(instr);
10122 uint_t subr = DIF_INSTR_SUBR(instr);
10123 uint_t op = DIF_INSTR_OP(instr);
10124
10125 switch (op) {
10126 case DIF_OP_OR:
10127 case DIF_OP_XOR:
10128 case DIF_OP_AND:
10129 case DIF_OP_SLL:
10130 case DIF_OP_SRL:
10131 case DIF_OP_SRA:
10132 case DIF_OP_SUB:
10133 case DIF_OP_ADD:
10134 case DIF_OP_MUL:
10135 case DIF_OP_SDIV:
10136 case DIF_OP_UDIV:
10137 case DIF_OP_SREM:
10138 case DIF_OP_UREM:
10139 case DIF_OP_COPYS:
10140 case DIF_OP_NOT:
10141 case DIF_OP_MOV:
10142 case DIF_OP_RLDSB:
10143 case DIF_OP_RLDSH:
10144 case DIF_OP_RLDSW:
10145 case DIF_OP_RLDUB:
10146 case DIF_OP_RLDUH:
10147 case DIF_OP_RLDUW:
10148 case DIF_OP_RLDX:
10149 case DIF_OP_ULDSB:
10150 case DIF_OP_ULDSH:
10151 case DIF_OP_ULDSW:
10152 case DIF_OP_ULDUB:
10153 case DIF_OP_ULDUH:
10154 case DIF_OP_ULDUW:
10155 case DIF_OP_ULDX:
10156 case DIF_OP_STB:
10157 case DIF_OP_STH:
10158 case DIF_OP_STW:
10159 case DIF_OP_STX:
10160 case DIF_OP_ALLOCS:
10161 case DIF_OP_CMP:
10162 case DIF_OP_SCMP:
10163 case DIF_OP_TST:
10164 case DIF_OP_BA:
10165 case DIF_OP_BE:
10166 case DIF_OP_BNE:
10167 case DIF_OP_BG:
10168 case DIF_OP_BGU:
10169 case DIF_OP_BGE:
10170 case DIF_OP_BGEU:
10171 case DIF_OP_BL:
10172 case DIF_OP_BLU:
10173 case DIF_OP_BLE:
10174 case DIF_OP_BLEU:
10175 case DIF_OP_RET:
10176 case DIF_OP_NOP:
10177 case DIF_OP_POPTS:
10178 case DIF_OP_FLUSHTS:
10179 case DIF_OP_SETX:
10180 case DIF_OP_SETS:
10181 case DIF_OP_LDGA:
10182 case DIF_OP_LDLS:
10183 case DIF_OP_STGS:
10184 case DIF_OP_STLS:
10185 case DIF_OP_PUSHTR:
10186 case DIF_OP_PUSHTV:
10187 break;
10188
10189 case DIF_OP_LDGS:
10190 if (v >= DIF_VAR_OTHER_UBASE)
10191 break;
10192
10193 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10194 break;
10195
10196 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10197 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10198 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10199 v == DIF_VAR_UID || v == DIF_VAR_GID)
10200 break;
10201
10202 err += efunc(pc, "illegal variable %u\n", v);
10203 break;
10204
10205 case DIF_OP_LDTA:
10206 case DIF_OP_LDTS:
10207 case DIF_OP_LDGAA:
10208 case DIF_OP_LDTAA:
10209 err += efunc(pc, "illegal dynamic variable load\n");
10210 break;
10211
10212 case DIF_OP_STTS:
10213 case DIF_OP_STGAA:
10214 case DIF_OP_STTAA:
10215 err += efunc(pc, "illegal dynamic variable store\n");
10216 break;
10217
10218 case DIF_OP_CALL:
10219 switch (subr) {
10220 case DIF_SUBR_ALLOCA:
10221 case DIF_SUBR_BCOPY:
10222 case DIF_SUBR_COPYIN:
10223 case DIF_SUBR_COPYINTO:
10224 case DIF_SUBR_COPYINSTR:
10225 case DIF_SUBR_HTONS:
10226 case DIF_SUBR_HTONL:
10227 case DIF_SUBR_HTONLL:
10228 case DIF_SUBR_INDEX:
10229 case DIF_SUBR_INET_NTOA:
10230 case DIF_SUBR_INET_NTOA6:
10231 case DIF_SUBR_INET_NTOP:
10232 case DIF_SUBR_JSON:
10233 case DIF_SUBR_LLTOSTR:
10234 case DIF_SUBR_NTOHS:
10235 case DIF_SUBR_NTOHL:
10236 case DIF_SUBR_NTOHLL:
10237 case DIF_SUBR_RINDEX:
10238 case DIF_SUBR_STRCHR:
10239 case DIF_SUBR_STRTOLL:
10240 case DIF_SUBR_STRJOIN:
10241 case DIF_SUBR_STRRCHR:
10242 case DIF_SUBR_STRSTR:
10243 break;
10244 default:
10245 err += efunc(pc, "invalid subr %u\n", subr);
10246 }
10247 break;
10248
10249 default:
10250 err += efunc(pc, "invalid opcode %u\n",
10251 DIF_INSTR_OP(instr));
10252 }
10253 }
10254
10255 return (err);
10256 }
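/*
 * Editorial note: the whitelist above is what makes a helper safe to run in
 * arbitrary probe context on behalf of an unprivileged process.  As a
 * hypothetical illustration, a ustack helper along these lines would
 * validate -- it reads only arg0, a clause-local, and copyin():
 *
 *	dtrace:helper:ustack:
 *	{
 *		this->sp = copyin(arg0, sizeof (uint64_t));
 *	}
 *
 * whereas one that stored to a thread-local (self->x = ...) would be
 * rejected with "illegal dynamic variable store", and a call to, say,
 * copyout() with "invalid subr".
 */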
10257
10258 /*
10259 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10260 * basis; 0 if not.
10261 */
10262 static int
10263 dtrace_difo_cacheable(dtrace_difo_t *dp)
10264 {
10265 uint_t i;
10266
10267 if (dp == NULL)
10268 return (0);
10269
10270 for (i = 0; i < dp->dtdo_varlen; i++) {
10271 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10272
10273 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10274 continue;
10275
10276 switch (v->dtdv_id) {
10277 case DIF_VAR_CURTHREAD:
10278 case DIF_VAR_PID:
10279 case DIF_VAR_TID:
10280 case DIF_VAR_EXECNAME:
10281 case DIF_VAR_ZONENAME:
10282 break;
10283
10284 default:
10285 return (0);
10286 }
10287 }
10288
10289 /*
10290 * This DIF object may be cacheable. Now we need to look for any
10291 * array loading instructions, any memory loading instructions, or
10292 * any stores to thread-local variables.
10293 */
10294 for (i = 0; i < dp->dtdo_len; i++) {
10295 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10296
10297 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10298 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10299 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10300 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10301 return (0);
10302 }
10303
10304 return (1);
10305 }
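/*
 * Editorial example: a predicate compiled from /pid == 1234/ touches only
 * DIF_VAR_PID and register arithmetic, so it survives both loops above and
 * is cacheable -- its result cannot change for the life of a thread.  One
 * compiled from /args[0] != 0/ needs DIF_OP_LDGA to index the args[] array,
 * so the second loop returns 0 and the predicate is re-evaluated on every
 * firing.
 */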
10306
10307 static void
10308 dtrace_difo_hold(dtrace_difo_t *dp)
10309 {
10310 uint_t i;
10311
10312 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10313
10314 dp->dtdo_refcnt++;
10315 ASSERT(dp->dtdo_refcnt != 0);
10316
10317 /*
10318 * We need to check this DIF object for references to the variable
10319 * DIF_VAR_VTIMESTAMP.
10320 */
10321 for (i = 0; i < dp->dtdo_varlen; i++) {
10322 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10323
10324 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10325 continue;
10326
10327 if (dtrace_vtime_references++ == 0)
10328 dtrace_vtime_enable();
10329 }
10330 }
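/*
 * Editorial note: the count taken here is given back in
 * dtrace_difo_release() below; the first DIFO to reference
 * DIF_VAR_VTIMESTAMP switches virtual-time accounting on system-wide, and
 * the release that drops dtrace_vtime_references back to zero switches it
 * off again.
 */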
10331
10332 /*
10333 * This routine calculates the dynamic variable chunksize for a given DIF
10334 * object. The calculation is not fool-proof, and can probably be tricked by
10335 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10336 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10337 * if a dynamic variable size exceeds the chunksize.
10338 */
10339 static void
10340 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10341 {
10342 uint64_t sval = 0;
10343 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10344 const dif_instr_t *text = dp->dtdo_buf;
10345 uint_t pc, srd = 0;
10346 uint_t ttop = 0;
10347 size_t size, ksize;
10348 uint_t id, i;
10349
10350 for (pc = 0; pc < dp->dtdo_len; pc++) {
10351 dif_instr_t instr = text[pc];
10352 uint_t op = DIF_INSTR_OP(instr);
10353 uint_t rd = DIF_INSTR_RD(instr);
10354 uint_t r1 = DIF_INSTR_R1(instr);
10355 uint_t nkeys = 0;
10356 uchar_t scope;
10357
10358 dtrace_key_t *key = tupregs;
10359
10360 switch (op) {
10361 case DIF_OP_SETX:
10362 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10363 srd = rd;
10364 continue;
10365
10366 case DIF_OP_STTS:
10367 key = &tupregs[DIF_DTR_NREGS];
10368 key[0].dttk_size = 0;
10369 key[1].dttk_size = 0;
10370 nkeys = 2;
10371 scope = DIFV_SCOPE_THREAD;
10372 break;
10373
10374 case DIF_OP_STGAA:
10375 case DIF_OP_STTAA:
10376 nkeys = ttop;
10377
10378 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10379 key[nkeys++].dttk_size = 0;
10380
10381 key[nkeys++].dttk_size = 0;
10382
10383 if (op == DIF_OP_STTAA) {
10384 scope = DIFV_SCOPE_THREAD;
10385 } else {
10386 scope = DIFV_SCOPE_GLOBAL;
10387 }
10388
10389 break;
10390
10391 case DIF_OP_PUSHTR:
10392 if (ttop == DIF_DTR_NREGS)
10393 return;
10394
10395 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10396 /*
10397 * If the register for the size of the "pushtr"
10398 * is %r0 (or the value is 0) and the type is
10399 * a string, we'll use the system-wide default
10400 * string size.
10401 */
10402 tupregs[ttop++].dttk_size =
10403 dtrace_strsize_default;
10404 } else {
10405 if (srd == 0)
10406 return;
10407
10408 if (sval > LONG_MAX)
10409 return;
10410
10411 tupregs[ttop++].dttk_size = sval;
10412 }
10413
10414 break;
10415
10416 case DIF_OP_PUSHTV:
10417 if (ttop == DIF_DTR_NREGS)
10418 return;
10419
10420 tupregs[ttop++].dttk_size = 0;
10421 break;
10422
10423 case DIF_OP_FLUSHTS:
10424 ttop = 0;
10425 break;
10426
10427 case DIF_OP_POPTS:
10428 if (ttop != 0)
10429 ttop--;
10430 break;
10431 }
10432
10433 sval = 0;
10434 srd = 0;
10435
10436 if (nkeys == 0)
10437 continue;
10438
10439 /*
10440 * We have a dynamic variable allocation; calculate its size.
10441 */
10442 for (ksize = 0, i = 0; i < nkeys; i++)
10443 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10444
10445 size = sizeof (dtrace_dynvar_t);
10446 size += sizeof (dtrace_key_t) * (nkeys - 1);
10447 size += ksize;
10448
10449 /*
10450 * Now we need to determine the size of the stored data.
10451 */
10452 id = DIF_INSTR_VAR(instr);
10453
10454 for (i = 0; i < dp->dtdo_varlen; i++) {
10455 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10456
10457 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10458 size += v->dtdv_type.dtdt_size;
10459 break;
10460 }
10461 }
10462
10463 if (i == dp->dtdo_varlen)
10464 return;
10465
10466 /*
10467 * We have the size. If this is larger than the chunk size
10468 * for our dynamic variable state, reset the chunk size.
10469 */
10470 size = P2ROUNDUP(size, sizeof (uint64_t));
10471
10472 /*
10473 * Before setting the chunk size, check that we're not going
10474 * to set it to a negative value...
10475 */
10476 if (size > LONG_MAX)
10477 return;
10478
10479 /*
10480 * ...and make certain that we didn't badly overflow.
10481 */
10482 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10483 return;
10484
10485 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10486 vstate->dtvs_dynvars.dtds_chunksize = size;
10487 }
10488 }
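/*
 * Editorial worked example of the sizing above, under assumed LP64 sizes
 * and an assumed 256-byte dtrace_strsize_default (both are tunables, not
 * guarantees): for a global associative array a[pid, "..."] the tuple has a
 * zero-sized by-value key (the pid), a string key, and the trailing
 * zero-sized key added for STGAA, so nkeys == 3 and
 *
 *	ksize = 0 + P2ROUNDUP(256, 8) + 0 = 256
 *	size  = sizeof (dtrace_dynvar_t)	(header, one key inline)
 *	      + 2 * sizeof (dtrace_key_t)	(the remaining keys)
 *	      + 256				(key data)
 *	      + dtdv_type.dtdt_size		(the stored value)
 *
 * rounded up to a multiple of 8; dtds_chunksize then ratchets up to the
 * largest such size seen across the DIFO.
 */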
10489
10490 static void
10491 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10492 {
10493 int oldsvars, osz, nsz, otlocals, ntlocals;
10494 uint_t i, id;
10495
10496 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10497 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10498
10499 for (i = 0; i < dp->dtdo_varlen; i++) {
10500 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10501 dtrace_statvar_t *svar;
10502 dtrace_statvar_t ***svarp = NULL;
10503 size_t dsize = 0;
10504 uint8_t scope = v->dtdv_scope;
10505 int *np = (int *)NULL;
10506
10507 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10508 continue;
10509
10510 id -= DIF_VAR_OTHER_UBASE;
10511
10512 switch (scope) {
10513 case DIFV_SCOPE_THREAD:
10514 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10515 dtrace_difv_t *tlocals;
10516
10517 if ((ntlocals = (otlocals << 1)) == 0)
10518 ntlocals = 1;
10519
10520 osz = otlocals * sizeof (dtrace_difv_t);
10521 nsz = ntlocals * sizeof (dtrace_difv_t);
10522
10523 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10524
10525 if (osz != 0) {
10526 bcopy(vstate->dtvs_tlocals,
10527 tlocals, osz);
10528 kmem_free(vstate->dtvs_tlocals, osz);
10529 }
10530
10531 vstate->dtvs_tlocals = tlocals;
10532 vstate->dtvs_ntlocals = ntlocals;
10533 }
10534
10535 vstate->dtvs_tlocals[id] = *v;
10536 continue;
10537
10538 case DIFV_SCOPE_LOCAL:
10539 np = &vstate->dtvs_nlocals;
10540 svarp = &vstate->dtvs_locals;
10541
10542 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10543 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10544 sizeof (uint64_t));
10545 else
10546 dsize = (int)NCPU * sizeof (uint64_t);
10547
10548 break;
10549
10550 case DIFV_SCOPE_GLOBAL:
10551 np = &vstate->dtvs_nglobals;
10552 svarp = &vstate->dtvs_globals;
10553
10554 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10555 dsize = v->dtdv_type.dtdt_size +
10556 sizeof (uint64_t);
10557
10558 break;
10559
10560 default:
10561 ASSERT(0);
10562 }
10563
10564 while (id >= (uint_t)(oldsvars = *np)) {
10565 dtrace_statvar_t **statics;
10566 int newsvars, oldsize, newsize;
10567
10568 if ((newsvars = (oldsvars << 1)) == 0)
10569 newsvars = 1;
10570
10571 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10572 newsize = newsvars * sizeof (dtrace_statvar_t *);
10573
10574 statics = kmem_zalloc(newsize, KM_SLEEP);
10575
10576 if (oldsize != 0) {
10577 bcopy(*svarp, statics, oldsize);
10578 kmem_free(*svarp, oldsize);
10579 }
10580
10581 *svarp = statics;
10582 *np = newsvars;
10583 }
10584
10585 if ((svar = (*svarp)[id]) == NULL) {
10586 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10587 svar->dtsv_var = *v;
10588
10589 if ((svar->dtsv_size = dsize) != 0) {
10590 svar->dtsv_data = (uint64_t)(uintptr_t)
10591 kmem_zalloc(dsize, KM_SLEEP);
10592 }
10593
10594 (*svarp)[id] = svar;
10595 }
10596
10597 svar->dtsv_refcnt++;
10598 }
10599
10600 dtrace_difo_chunksize(dp, vstate);
10601 dtrace_difo_hold(dp);
10602 }
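/*
 * Editorial sketch: the two growth loops above are the same
 * "double-until-it-fits" idiom, spelled out because there is no krealloc
 * here -- allocate the doubled array zeroed, bcopy, free the old one.  In
 * miniature, with user-space analogues standing in for
 * kmem_zalloc/kmem_free (a hypothetical helper, not kernel code):
 */
#if 0	/* illustration only */
#include <stdlib.h>
#include <string.h>

static int
sketch_grow(void **arrayp, int *np, size_t elsize, unsigned need)
{
	while (need >= (unsigned)*np) {
		int n = (*np == 0) ? 1 : (*np << 1);
		void *grown = calloc(n, elsize);

		if (grown == NULL)
			return (-1);	/* KM_SLEEP cannot fail; calloc can */
		if (*np != 0) {
			memcpy(grown, *arrayp, (size_t)*np * elsize);
			free(*arrayp);
		}
		*arrayp = grown;
		*np = n;
	}
	return (0);
}
#endif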
10603
10604 static dtrace_difo_t *
10605 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10606 {
10607 dtrace_difo_t *new;
10608 size_t sz;
10609
10610 ASSERT(dp->dtdo_buf != NULL);
10611 ASSERT(dp->dtdo_refcnt != 0);
10612
10613 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10614
10615 ASSERT(dp->dtdo_buf != NULL);
10616 sz = dp->dtdo_len * sizeof (dif_instr_t);
10617 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10618 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10619 new->dtdo_len = dp->dtdo_len;
10620
10621 if (dp->dtdo_strtab != NULL) {
10622 ASSERT(dp->dtdo_strlen != 0);
10623 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10624 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10625 new->dtdo_strlen = dp->dtdo_strlen;
10626 }
10627
10628 if (dp->dtdo_inttab != NULL) {
10629 ASSERT(dp->dtdo_intlen != 0);
10630 sz = dp->dtdo_intlen * sizeof (uint64_t);
10631 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10632 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10633 new->dtdo_intlen = dp->dtdo_intlen;
10634 }
10635
10636 if (dp->dtdo_vartab != NULL) {
10637 ASSERT(dp->dtdo_varlen != 0);
10638 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10639 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10640 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10641 new->dtdo_varlen = dp->dtdo_varlen;
10642 }
10643
10644 dtrace_difo_init(new, vstate);
10645 return (new);
10646 }
10647
10648 static void
10649 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10650 {
10651 uint_t i;
10652
10653 ASSERT(dp->dtdo_refcnt == 0);
10654
10655 for (i = 0; i < dp->dtdo_varlen; i++) {
10656 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10657 dtrace_statvar_t *svar;
10658 dtrace_statvar_t **svarp = NULL;
10659 uint_t id;
10660 uint8_t scope = v->dtdv_scope;
10661 int *np = NULL;
10662
10663 switch (scope) {
10664 case DIFV_SCOPE_THREAD:
10665 continue;
10666
10667 case DIFV_SCOPE_LOCAL:
10668 np = &vstate->dtvs_nlocals;
10669 svarp = vstate->dtvs_locals;
10670 break;
10671
10672 case DIFV_SCOPE_GLOBAL:
10673 np = &vstate->dtvs_nglobals;
10674 svarp = vstate->dtvs_globals;
10675 break;
10676
10677 default:
10678 ASSERT(0);
10679 }
10680
10681 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10682 continue;
10683
10684 id -= DIF_VAR_OTHER_UBASE;
10685
10686 ASSERT(id < (uint_t)*np);
10687
10688 svar = svarp[id];
10689 ASSERT(svar != NULL);
10690 ASSERT(svar->dtsv_refcnt > 0);
10691
10692 if (--svar->dtsv_refcnt > 0)
10693 continue;
10694
10695 if (svar->dtsv_size != 0) {
10696 ASSERT(svar->dtsv_data != 0);
10697 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10698 svar->dtsv_size);
10699 }
10700
10701 kmem_free(svar, sizeof (dtrace_statvar_t));
10702 svarp[id] = NULL;
10703 }
10704
10705 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10706 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10707 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10708 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10709
10710 kmem_free(dp, sizeof (dtrace_difo_t));
10711 }
10712
10713 static void
10714 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10715 {
10716 uint_t i;
10717
10718 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10719 ASSERT(dp->dtdo_refcnt != 0);
10720
10721 for (i = 0; i < dp->dtdo_varlen; i++) {
10722 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10723
10724 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10725 continue;
10726
10727 ASSERT(dtrace_vtime_references > 0);
10728 if (--dtrace_vtime_references == 0)
10729 dtrace_vtime_disable();
10730 }
10731
10732 if (--dp->dtdo_refcnt == 0)
10733 dtrace_difo_destroy(dp, vstate);
10734 }
10735
10736 /*
10737 * DTrace Format Functions
10738 */
10739
10740 static dtrace_format_t*
10741 dtrace_format_new(char *str)
10742 {
10743 dtrace_format_t *fmt = NULL;
10744 size_t bufsize = strlen(str) + 1;
10745
10746 fmt = kmem_zalloc(sizeof(*fmt) + bufsize, KM_SLEEP);
10747
10748 fmt->dtf_refcount = 1;
10749 (void) strlcpy(fmt->dtf_str, str, bufsize);
10750
10751 return fmt;
10752 }
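/*
 * Editorial sketch: dtrace_format_new() uses the single-allocation
 * flexible-array idiom -- the header and its string live in one
 * kmem_zalloc(), so one kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt)) releases
 * both.  The shape in miniature (hypothetical user-space analogue):
 */
#if 0	/* illustration only */
#include <stdlib.h>
#include <string.h>

struct sk_fmt {
	unsigned long	refcount;
	char		str[];		/* flexible array member */
};

static struct sk_fmt *
sk_fmt_new(const char *s)
{
	size_t bufsize = strlen(s) + 1;
	struct sk_fmt *f = calloc(1, sizeof (*f) + bufsize);

	if (f != NULL) {
		f->refcount = 1;
		memcpy(f->str, s, bufsize);
	}
	return (f);
}
#endif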
10753
10754 static uint16_t
10755 dtrace_format_add(dtrace_state_t *state, char *str)
10756 {
10757 dtrace_format_t **new;
10758 uint16_t ndx;
10759
10760 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10761 if (state->dts_formats[ndx] == NULL) {
10762 state->dts_formats[ndx] = dtrace_format_new(str);
10763 return (ndx + 1);
10764 }
10765 else if (strcmp(state->dts_formats[ndx]->dtf_str, str) == 0) {
10766 VERIFY(state->dts_formats[ndx]->dtf_refcount < UINT64_MAX);
10767 state->dts_formats[ndx]->dtf_refcount++;
10768 return (ndx + 1);
10769 }
10770 }
10771
10772 if (state->dts_nformats == USHRT_MAX) {
10773 /*
10774 * This is only likely if a denial-of-service attack is being
10775 * attempted. As such, it's okay to fail silently here.
10776 */
10777 return (0);
10778 }
10779
10780 /*
10781 * For simplicity, we always resize the formats array to be exactly the
10782 * number of formats.
10783 */
10784 ndx = state->dts_nformats++;
10785 new = kmem_alloc((ndx + 1) * sizeof (*state->dts_formats), KM_SLEEP);
10786
10787 if (state->dts_formats != NULL) {
10788 ASSERT(ndx != 0);
10789 bcopy(state->dts_formats, new, ndx * sizeof (*state->dts_formats));
10790 kmem_free(state->dts_formats, ndx * sizeof (*state->dts_formats));
10791 }
10792
10793 state->dts_formats = new;
10794 state->dts_formats[ndx] = dtrace_format_new(str);
10795
10796 return (ndx + 1);
10797 }
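/*
 * Editorial note: format handles are deliberately 1-based --
 * dtrace_format_add() returns ndx + 1 and reserves 0 for "no format" /
 * failure -- which is why dtrace_format_remove() below indexes
 * dts_formats[format - 1] and why a record's dtrd_format of 0 means the
 * action carries no format string.
 */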
10798
10799 static void
10800 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10801 {
10802 dtrace_format_t *fmt;
10803
10804 ASSERT(state->dts_formats != NULL);
10805 ASSERT(format <= state->dts_nformats);
10806
10807 fmt = state->dts_formats[format - 1];
10808
10809 ASSERT(fmt != NULL);
10810 VERIFY(fmt->dtf_refcount > 0);
10811
10812 fmt->dtf_refcount--;
10813
10814 if (fmt->dtf_refcount == 0) {
10815 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10816 state->dts_formats[format - 1] = NULL;
10817 }
10818 }
10819
10820 static void
10821 dtrace_format_destroy(dtrace_state_t *state)
10822 {
10823 int i;
10824
10825 if (state->dts_nformats == 0) {
10826 ASSERT(state->dts_formats == NULL);
10827 return;
10828 }
10829
10830 ASSERT(state->dts_formats != NULL);
10831
10832 for (i = 0; i < state->dts_nformats; i++) {
10833 dtrace_format_t *fmt = state->dts_formats[i];
10834
10835 if (fmt == NULL)
10836 continue;
10837
10838 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10839 }
10840
10841 kmem_free(state->dts_formats, state->dts_nformats * sizeof (*state->dts_formats));
10842 state->dts_nformats = 0;
10843 state->dts_formats = NULL;
10844 }
10845
10846 /*
10847 * DTrace Predicate Functions
10848 */
10849 static dtrace_predicate_t *
10850 dtrace_predicate_create(dtrace_difo_t *dp)
10851 {
10852 dtrace_predicate_t *pred;
10853
10854 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10855 ASSERT(dp->dtdo_refcnt != 0);
10856
10857 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10858 pred->dtp_difo = dp;
10859 pred->dtp_refcnt = 1;
10860
10861 if (!dtrace_difo_cacheable(dp))
10862 return (pred);
10863
10864 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10865 /*
10866 * This is only theoretically possible -- we have had 2^32
10867 * cacheable predicates on this machine. We cannot allow any
10868 * more predicates to become cacheable: as unlikely as it is,
10869 * there may be a thread caching a (now stale) predicate cache
10870 * ID. (N.B.: the temptation is being successfully resisted to
10871 * have this cmn_err() "Holy shit -- we executed this code!")
10872 */
10873 return (pred);
10874 }
10875
10876 pred->dtp_cacheid = dtrace_predcache_id++;
10877
10878 return (pred);
10879 }
10880
10881 static void
10882 dtrace_predicate_hold(dtrace_predicate_t *pred)
10883 {
10884 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10885 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10886 ASSERT(pred->dtp_refcnt > 0);
10887
10888 pred->dtp_refcnt++;
10889 }
10890
10891 static void
10892 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10893 {
10894 dtrace_difo_t *dp = pred->dtp_difo;
10895 #pragma unused(dp) /* __APPLE__ */
10896
10897 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10898 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10899 ASSERT(pred->dtp_refcnt > 0);
10900
10901 if (--pred->dtp_refcnt == 0) {
10902 dtrace_difo_release(pred->dtp_difo, vstate);
10903 kmem_free(pred, sizeof (dtrace_predicate_t));
10904 }
10905 }
10906
10907 /*
10908 * DTrace Action Description Functions
10909 */
10910 static dtrace_actdesc_t *
10911 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10912 uint64_t uarg, uint64_t arg)
10913 {
10914 dtrace_actdesc_t *act;
10915
10916 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10917 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10918
10919 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10920 act->dtad_kind = kind;
10921 act->dtad_ntuple = ntuple;
10922 act->dtad_uarg = uarg;
10923 act->dtad_arg = arg;
10924 act->dtad_refcnt = 1;
10925
10926 return (act);
10927 }
10928
10929 static void
10930 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10931 {
10932 ASSERT(act->dtad_refcnt >= 1);
10933 act->dtad_refcnt++;
10934 }
10935
10936 static void
10937 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10938 {
10939 dtrace_actkind_t kind = act->dtad_kind;
10940 dtrace_difo_t *dp;
10941
10942 ASSERT(act->dtad_refcnt >= 1);
10943
10944 if (--act->dtad_refcnt != 0)
10945 return;
10946
10947 if ((dp = act->dtad_difo) != NULL)
10948 dtrace_difo_release(dp, vstate);
10949
10950 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10951 char *str = (char *)(uintptr_t)act->dtad_arg;
10952
10953 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10954 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10955
10956 if (str != NULL)
10957 kmem_free(str, strlen(str) + 1);
10958 }
10959
10960 kmem_free(act, sizeof (dtrace_actdesc_t));
10961 }
10962
10963 /*
10964 * DTrace ECB Functions
10965 */
10966 static dtrace_ecb_t *
10967 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10968 {
10969 dtrace_ecb_t *ecb;
10970 dtrace_epid_t epid;
10971
10972 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10973
10974 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10975 ecb->dte_predicate = NULL;
10976 ecb->dte_probe = probe;
10977
10978 /*
10979 * The default size is the size of the default action: recording
10980 * the header.
10981 */
10982 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10983 ecb->dte_alignment = sizeof (dtrace_epid_t);
10984
10985 epid = state->dts_epid++;
10986
10987 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10988 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10989 int necbs = state->dts_necbs << 1;
10990
10991 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10992
10993 if (necbs == 0) {
10994 ASSERT(oecbs == NULL);
10995 necbs = 1;
10996 }
10997
10998 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10999
11000 if (oecbs != NULL)
11001 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
11002
11003 dtrace_membar_producer();
11004 state->dts_ecbs = ecbs;
11005
11006 if (oecbs != NULL) {
11007 /*
11008 * If this state is active, we must dtrace_sync()
11009 * before we can free the old dts_ecbs array: we're
11010 * coming in hot, and there may be active ring
11011 * buffer processing (which indexes into the dts_ecbs
11012 * array) on another CPU.
11013 */
11014 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
11015 dtrace_sync();
11016
11017 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
11018 }
11019
11020 dtrace_membar_producer();
11021 state->dts_necbs = necbs;
11022 }
11023
11024 ecb->dte_state = state;
11025
11026 ASSERT(state->dts_ecbs[epid - 1] == NULL);
11027 dtrace_membar_producer();
11028 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
11029
11030 return (ecb);
11031 }
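/*
 * Editorial note on the ordering above: probe context reads dts_necbs and
 * then indexes dts_ecbs without holding dtrace_lock, so the writer
 * publishes in the reverse order -- fill the new array, barrier, store the
 * array pointer, barrier, then store the larger count.  Any reader that
 * observes the new dts_necbs is therefore guaranteed to observe the new
 * (longer) dts_ecbs array as well.
 */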
11032
11033 static int
11034 dtrace_ecb_enable(dtrace_ecb_t *ecb)
11035 {
11036 dtrace_probe_t *probe = ecb->dte_probe;
11037
11038 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11039 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11040 ASSERT(ecb->dte_next == NULL);
11041
11042 if (probe == NULL) {
11043 /*
11044 * This is the NULL probe -- there's nothing to do.
11045 */
11046 		return (0);
11047 }
11048
11049 probe->dtpr_provider->dtpv_ecb_count++;
11050 if (probe->dtpr_ecb == NULL) {
11051 dtrace_provider_t *prov = probe->dtpr_provider;
11052
11053 /*
11054 * We're the first ECB on this probe.
11055 */
11056 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
11057
11058 if (ecb->dte_predicate != NULL)
11059 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
11060
11061 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
11062 probe->dtpr_id, probe->dtpr_arg));
11063 } else {
11064 /*
11065 * This probe is already active. Swing the last pointer to
11066 * point to the new ECB, and issue a dtrace_sync() to assure
11067 * that all CPUs have seen the change.
11068 */
11069 ASSERT(probe->dtpr_ecb_last != NULL);
11070 probe->dtpr_ecb_last->dte_next = ecb;
11071 probe->dtpr_ecb_last = ecb;
11072 probe->dtpr_predcache = 0;
11073
11074 dtrace_sync();
11075 		return (0);
11076 }
11077 }
11078
11079 static int
11080 dtrace_ecb_resize(dtrace_ecb_t *ecb)
11081 {
11082 dtrace_action_t *act;
11083 uint32_t curneeded = UINT32_MAX;
11084 uint32_t aggbase = UINT32_MAX;
11085
11086 /*
11087 * If we record anything, we always record the dtrace_rechdr_t. (And
11088 * we always record it first.)
11089 */
11090 ecb->dte_size = sizeof (dtrace_rechdr_t);
11091 ecb->dte_alignment = sizeof (dtrace_epid_t);
11092
11093 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11094 dtrace_recdesc_t *rec = &act->dta_rec;
11095 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
11096
11097 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
11098
11099 if (DTRACEACT_ISAGG(act->dta_kind)) {
11100 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11101
11102 ASSERT(rec->dtrd_size != 0);
11103 ASSERT(agg->dtag_first != NULL);
11104 ASSERT(act->dta_prev->dta_intuple);
11105 ASSERT(aggbase != UINT32_MAX);
11106 ASSERT(curneeded != UINT32_MAX);
11107
11108 agg->dtag_base = aggbase;
11109 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11110 rec->dtrd_offset = curneeded;
11111 if (curneeded + rec->dtrd_size < curneeded)
11112 return (EINVAL);
11113 curneeded += rec->dtrd_size;
11114 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11115
11116 aggbase = UINT32_MAX;
11117 curneeded = UINT32_MAX;
11118 } else if (act->dta_intuple) {
11119 if (curneeded == UINT32_MAX) {
11120 /*
11121 * This is the first record in a tuple. Align
11122 * curneeded to be at offset 4 in an 8-byte
11123 * aligned block.
11124 */
11125 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11126 ASSERT(aggbase == UINT32_MAX);
11127
11128 curneeded = P2PHASEUP(ecb->dte_size,
11129 sizeof (uint64_t), sizeof (dtrace_aggid_t));
11130
11131 aggbase = curneeded - sizeof (dtrace_aggid_t);
11132 ASSERT(IS_P2ALIGNED(aggbase,
11133 sizeof (uint64_t)));
11134 }
11135
11136 			curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11137 			rec->dtrd_offset = curneeded;
11138 			if (curneeded + rec->dtrd_size < curneeded)
11139 				return (EINVAL);
11140 			curneeded += rec->dtrd_size;
11141 } else {
11142 /* tuples must be followed by an aggregation */
11143 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11144 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11145 rec->dtrd_offset = ecb->dte_size;
11146 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11147 return (EINVAL);
11148 ecb->dte_size += rec->dtrd_size;
11149 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11150 }
11151 }
11152
11153 if ((act = ecb->dte_action) != NULL &&
11154 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11155 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11156 /*
11157 * If the size is still sizeof (dtrace_rechdr_t), then all
11158 * actions store no data; set the size to 0.
11159 */
11160 ecb->dte_size = 0;
11161 }
11162
11163 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11164 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11165 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11166 return (0);
11167 }
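/*
 * Editorial sketch: the layout math above is all power-of-two arithmetic.
 * The conventional bitwise definitions (shown as SK_* stand-ins with a
 * self-check; the kernel's own macros live elsewhere):
 */
#if 0	/* illustration only */
#include <assert.h>
#include <stdint.h>

/* round x up to the next multiple of the power-of-two a */
#define SK_P2ROUNDUP(x, a)	(-(-(uint32_t)(x) & -(uint32_t)(a)))
/* smallest value >= x that is congruent to ph modulo the power-of-two a */
#define SK_P2PHASEUP(x, a, ph)	\
	((uint32_t)(ph) - (((uint32_t)(ph) - (uint32_t)(x)) & -(uint32_t)(a)))

int
main(void)
{
	assert(SK_P2ROUNDUP(13, 8) == 16);

	/*
	 * P2PHASEUP(size, 8, 4) puts the cursor at "offset 4 in an 8-byte
	 * aligned block", so the 4-byte aggregation ID slot (aggbase) that
	 * precedes the tuple starts exactly on an 8-byte boundary.
	 */
	assert(SK_P2PHASEUP(16, 8, 4) == 20);
	assert(SK_P2PHASEUP(21, 8, 4) == 28);
	return (0);
}
#endif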
11168
11169 static dtrace_action_t *
11170 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11171 {
11172 dtrace_aggregation_t *agg;
11173 size_t size = sizeof (uint64_t);
11174 int ntuple = desc->dtad_ntuple;
11175 dtrace_action_t *act;
11176 dtrace_recdesc_t *frec;
11177 dtrace_aggid_t aggid;
11178 dtrace_state_t *state = ecb->dte_state;
11179
11180 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11181 agg->dtag_ecb = ecb;
11182
11183 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11184
11185 switch (desc->dtad_kind) {
11186 case DTRACEAGG_MIN:
11187 agg->dtag_initial = INT64_MAX;
11188 agg->dtag_aggregate = dtrace_aggregate_min;
11189 break;
11190
11191 case DTRACEAGG_MAX:
11192 agg->dtag_initial = INT64_MIN;
11193 agg->dtag_aggregate = dtrace_aggregate_max;
11194 break;
11195
11196 case DTRACEAGG_COUNT:
11197 agg->dtag_aggregate = dtrace_aggregate_count;
11198 break;
11199
11200 case DTRACEAGG_QUANTIZE:
11201 agg->dtag_aggregate = dtrace_aggregate_quantize;
11202 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11203 sizeof (uint64_t);
11204 break;
11205
11206 case DTRACEAGG_LQUANTIZE: {
11207 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11208 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11209
11210 agg->dtag_initial = desc->dtad_arg;
11211 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11212
11213 if (step == 0 || levels == 0)
11214 goto err;
11215
11216 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11217 break;
11218 }
11219
11220 case DTRACEAGG_LLQUANTIZE: {
11221 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11222 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11223 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11224 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11225 int64_t v;
11226
11227 agg->dtag_initial = desc->dtad_arg;
11228 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11229
11230 if (factor < 2 || low >= high || nsteps < factor)
11231 goto err;
11232
11233 /*
11234 * Now check that the number of steps evenly divides a power
11235 * of the factor. (This assures both integer bucket size and
11236 * linearity within each magnitude.)
11237 */
11238 for (v = factor; v < nsteps; v *= factor)
11239 continue;
11240
11241 if ((v % nsteps) || (nsteps % factor))
11242 goto err;
11243
11244 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11245 break;
11246 }
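	/*
	 * Editorial worked example of the check above: factor = 10,
	 * nsteps = 20 passes -- v walks 10, 100 and stops at 100, with
	 * 100 % 20 == 0 and 20 % 10 == 0 -- while factor = 10, nsteps = 15
	 * fails, since v stops at 100 and 100 % 15 != 0, i.e. the steps
	 * would not divide a decade into integer-width buckets.
	 */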
11247
11248 case DTRACEAGG_AVG:
11249 agg->dtag_aggregate = dtrace_aggregate_avg;
11250 size = sizeof (uint64_t) * 2;
11251 break;
11252
11253 case DTRACEAGG_STDDEV:
11254 agg->dtag_aggregate = dtrace_aggregate_stddev;
11255 size = sizeof (uint64_t) * 4;
11256 break;
11257
11258 case DTRACEAGG_SUM:
11259 agg->dtag_aggregate = dtrace_aggregate_sum;
11260 break;
11261
11262 default:
11263 goto err;
11264 }
11265
11266 agg->dtag_action.dta_rec.dtrd_size = size;
11267
11268 if (ntuple == 0)
11269 goto err;
11270
11271 /*
11272 * We must make sure that we have enough actions for the n-tuple.
11273 */
11274 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11275 if (DTRACEACT_ISAGG(act->dta_kind))
11276 break;
11277
11278 if (--ntuple == 0) {
11279 /*
11280 * This is the action with which our n-tuple begins.
11281 */
11282 agg->dtag_first = act;
11283 goto success;
11284 }
11285 }
11286
11287 /*
11288 * This n-tuple is short by ntuple elements. Return failure.
11289 */
11290 ASSERT(ntuple != 0);
11291 err:
11292 kmem_free(agg, sizeof (dtrace_aggregation_t));
11293 return (NULL);
11294
11295 success:
11296 /*
11297 * If the last action in the tuple has a size of zero, it's actually
11298 * an expression argument for the aggregating action.
11299 */
11300 ASSERT(ecb->dte_action_last != NULL);
11301 act = ecb->dte_action_last;
11302
11303 if (act->dta_kind == DTRACEACT_DIFEXPR) {
11304 ASSERT(act->dta_difo != NULL);
11305
11306 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11307 agg->dtag_hasarg = 1;
11308 }
11309
11310 /*
11311 * We need to allocate an id for this aggregation.
11312 */
11313 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11314 VM_BESTFIT | VM_SLEEP);
11315
11316 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
11317 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11318 dtrace_aggregation_t **aggs;
11319 int naggs = state->dts_naggregations << 1;
11320 int onaggs = state->dts_naggregations;
11321
11322 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
11323
11324 if (naggs == 0) {
11325 ASSERT(oaggs == NULL);
11326 naggs = 1;
11327 }
11328
11329 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11330
11331 if (oaggs != NULL) {
11332 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11333 kmem_free(oaggs, onaggs * sizeof (*aggs));
11334 }
11335
11336 state->dts_aggregations = aggs;
11337 state->dts_naggregations = naggs;
11338 }
11339
11340 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11341 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11342
11343 frec = &agg->dtag_first->dta_rec;
11344 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11345 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11346
11347 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11348 ASSERT(!act->dta_intuple);
11349 act->dta_intuple = 1;
11350 }
11351
11352 return (&agg->dtag_action);
11353 }
11354
11355 static void
11356 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11357 {
11358 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11359 dtrace_state_t *state = ecb->dte_state;
11360 dtrace_aggid_t aggid = agg->dtag_id;
11361
11362 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11363 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11364
11365 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11366 state->dts_aggregations[aggid - 1] = NULL;
11367
11368 kmem_free(agg, sizeof (dtrace_aggregation_t));
11369 }
11370
11371 static int
11372 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11373 {
11374 dtrace_action_t *action, *last;
11375 dtrace_difo_t *dp = desc->dtad_difo;
11376 uint32_t size = 0, align = sizeof (uint8_t), mask;
11377 uint16_t format = 0;
11378 dtrace_recdesc_t *rec;
11379 dtrace_state_t *state = ecb->dte_state;
11380 dtrace_optval_t *opt = state->dts_options;
11381 	dtrace_optval_t nframes = 0, strsize;
11382 uint64_t arg = desc->dtad_arg;
11383
11384 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11385 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11386
11387 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11388 /*
11389 * If this is an aggregating action, there must be neither
11390 * a speculate nor a commit on the action chain.
11391 */
11392 dtrace_action_t *act;
11393
11394 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11395 if (act->dta_kind == DTRACEACT_COMMIT)
11396 return (EINVAL);
11397
11398 if (act->dta_kind == DTRACEACT_SPECULATE)
11399 return (EINVAL);
11400 }
11401
11402 action = dtrace_ecb_aggregation_create(ecb, desc);
11403
11404 if (action == NULL)
11405 return (EINVAL);
11406 } else {
11407 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11408 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11409 dp != NULL && dp->dtdo_destructive)) {
11410 state->dts_destructive = 1;
11411 }
11412
11413 switch (desc->dtad_kind) {
11414 case DTRACEACT_PRINTF:
11415 case DTRACEACT_PRINTA:
11416 case DTRACEACT_SYSTEM:
11417 case DTRACEACT_FREOPEN:
11418 case DTRACEACT_DIFEXPR:
11419 /*
11420 * We know that our arg is a string -- turn it into a
11421 * format.
11422 */
11423 if (arg == 0) {
11424 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11425 desc->dtad_kind == DTRACEACT_DIFEXPR);
11426 format = 0;
11427 } else {
11428 ASSERT(arg != 0);
11429 ASSERT(arg > KERNELBASE);
11430 format = dtrace_format_add(state,
11431 (char *)(uintptr_t)arg);
11432 }
11433
11434 OS_FALLTHROUGH;
11435 case DTRACEACT_LIBACT:
11436 case DTRACEACT_TRACEMEM:
11437 case DTRACEACT_TRACEMEM_DYNSIZE:
11438 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
11439 if (dp == NULL)
11440 return (EINVAL);
11441
11442 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11443 break;
11444
11445 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11446 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11447 return (EINVAL);
11448
11449 size = opt[DTRACEOPT_STRSIZE];
11450 }
11451
11452 break;
11453
11454 case DTRACEACT_STACK:
11455 if ((nframes = arg) == 0) {
11456 nframes = opt[DTRACEOPT_STACKFRAMES];
11457 ASSERT(nframes > 0);
11458 arg = nframes;
11459 }
11460
11461 size = nframes * sizeof (pc_t);
11462 break;
11463
11464 case DTRACEACT_JSTACK:
11465 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11466 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11467
11468 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11469 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11470
11471 arg = DTRACE_USTACK_ARG(nframes, strsize);
11472
11473 OS_FALLTHROUGH;
11474 case DTRACEACT_USTACK:
11475 if (desc->dtad_kind != DTRACEACT_JSTACK &&
11476 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11477 strsize = DTRACE_USTACK_STRSIZE(arg);
11478 nframes = opt[DTRACEOPT_USTACKFRAMES];
11479 ASSERT(nframes > 0);
11480 arg = DTRACE_USTACK_ARG(nframes, strsize);
11481 }
11482
11483 /*
11484 * Save a slot for the pid.
11485 */
11486 size = (nframes + 1) * sizeof (uint64_t);
11487 size += DTRACE_USTACK_STRSIZE(arg);
11488 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11489
11490 break;
11491
11492 case DTRACEACT_SYM:
11493 case DTRACEACT_MOD:
11494 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11495 sizeof (uint64_t)) ||
11496 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11497 return (EINVAL);
11498 break;
11499
11500 case DTRACEACT_USYM:
11501 case DTRACEACT_UMOD:
11502 case DTRACEACT_UADDR:
11503 if (dp == NULL ||
11504 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11505 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11506 return (EINVAL);
11507
11508 /*
11509 * We have a slot for the pid, plus a slot for the
11510 * argument. To keep things simple (aligned with
11511 * bitness-neutral sizing), we store each as a 64-bit
11512 * quantity.
11513 */
11514 size = 2 * sizeof (uint64_t);
11515 break;
11516
11517 case DTRACEACT_STOP:
11518 case DTRACEACT_BREAKPOINT:
11519 case DTRACEACT_PANIC:
11520 break;
11521
11522 case DTRACEACT_CHILL:
11523 case DTRACEACT_DISCARD:
11524 case DTRACEACT_RAISE:
11525 case DTRACEACT_PIDRESUME: /* __APPLE__ */
11526 if (dp == NULL)
11527 return (EINVAL);
11528 break;
11529
11530 case DTRACEACT_EXIT:
11531 if (dp == NULL ||
11532 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11533 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11534 return (EINVAL);
11535 break;
11536
11537 case DTRACEACT_SPECULATE:
11538 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11539 return (EINVAL);
11540
11541 if (dp == NULL)
11542 return (EINVAL);
11543
11544 state->dts_speculates = 1;
11545 break;
11546
11547 case DTRACEACT_COMMIT: {
11548 dtrace_action_t *act = ecb->dte_action;
11549
11550 for (; act != NULL; act = act->dta_next) {
11551 if (act->dta_kind == DTRACEACT_COMMIT)
11552 return (EINVAL);
11553 }
11554
11555 if (dp == NULL)
11556 return (EINVAL);
11557 break;
11558 }
11559
11560 default:
11561 return (EINVAL);
11562 }
11563
11564 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11565 /*
11566 * If this is a data-storing action or a speculate,
11567 * we must be sure that there isn't a commit on the
11568 * action chain.
11569 */
11570 dtrace_action_t *act = ecb->dte_action;
11571
11572 for (; act != NULL; act = act->dta_next) {
11573 if (act->dta_kind == DTRACEACT_COMMIT)
11574 return (EINVAL);
11575 }
11576 }
11577
11578 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11579 action->dta_rec.dtrd_size = size;
11580 }
11581
11582 action->dta_refcnt = 1;
11583 rec = &action->dta_rec;
11584 size = rec->dtrd_size;
11585
11586 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11587 if (!(size & mask)) {
11588 align = mask + 1;
11589 break;
11590 }
11591 }
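	/*
	 * Editorial note: the loop above derives record alignment from
	 * record size -- the largest power of two, capped at 8, that
	 * divides it.  E.g. size 12 yields align 4 (12 & 7 != 0 but
	 * 12 & 3 == 0), size 16 yields align 8, and size 0 leaves the
	 * default align of 1.
	 */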
11592
11593 action->dta_kind = desc->dtad_kind;
11594
11595 if ((action->dta_difo = dp) != NULL)
11596 dtrace_difo_hold(dp);
11597
11598 rec->dtrd_action = action->dta_kind;
11599 rec->dtrd_arg = arg;
11600 rec->dtrd_uarg = desc->dtad_uarg;
11601 rec->dtrd_alignment = (uint16_t)align;
11602 rec->dtrd_format = format;
11603
11604 if ((last = ecb->dte_action_last) != NULL) {
11605 ASSERT(ecb->dte_action != NULL);
11606 action->dta_prev = last;
11607 last->dta_next = action;
11608 } else {
11609 ASSERT(ecb->dte_action == NULL);
11610 ecb->dte_action = action;
11611 }
11612
11613 ecb->dte_action_last = action;
11614
11615 return (0);
11616 }
11617
11618 static void
11619 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11620 {
11621 dtrace_action_t *act = ecb->dte_action, *next;
11622 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11623 dtrace_difo_t *dp;
11624 uint16_t format;
11625
11626 if (act != NULL && act->dta_refcnt > 1) {
11627 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11628 act->dta_refcnt--;
11629 } else {
11630 for (; act != NULL; act = next) {
11631 next = act->dta_next;
11632 ASSERT(next != NULL || act == ecb->dte_action_last);
11633 ASSERT(act->dta_refcnt == 1);
11634
11635 if ((format = act->dta_rec.dtrd_format) != 0)
11636 dtrace_format_remove(ecb->dte_state, format);
11637
11638 if ((dp = act->dta_difo) != NULL)
11639 dtrace_difo_release(dp, vstate);
11640
11641 if (DTRACEACT_ISAGG(act->dta_kind)) {
11642 dtrace_ecb_aggregation_destroy(ecb, act);
11643 } else {
11644 kmem_free(act, sizeof (dtrace_action_t));
11645 }
11646 }
11647 }
11648
11649 ecb->dte_action = NULL;
11650 ecb->dte_action_last = NULL;
11651 ecb->dte_size = 0;
11652 }
11653
11654 static void
11655 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11656 {
11657 /*
11658 * We disable the ECB by removing it from its probe.
11659 */
11660 dtrace_ecb_t *pecb, *prev = NULL;
11661 dtrace_probe_t *probe = ecb->dte_probe;
11662
11663 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11664
11665 if (probe == NULL) {
11666 /*
11667 * This is the NULL probe; there is nothing to disable.
11668 */
11669 return;
11670 }
11671
11672 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11673 if (pecb == ecb)
11674 break;
11675 prev = pecb;
11676 }
11677
11678 ASSERT(pecb != NULL);
11679
11680 if (prev == NULL) {
11681 probe->dtpr_ecb = ecb->dte_next;
11682 } else {
11683 prev->dte_next = ecb->dte_next;
11684 }
11685
11686 if (ecb == probe->dtpr_ecb_last) {
11687 ASSERT(ecb->dte_next == NULL);
11688 probe->dtpr_ecb_last = prev;
11689 }
11690
11691 probe->dtpr_provider->dtpv_ecb_count--;
11692 /*
11693 * The ECB has been disconnected from the probe; now sync to assure
11694 * that all CPUs have seen the change before returning.
11695 */
11696 dtrace_sync();
11697
11698 if (probe->dtpr_ecb == NULL) {
11699 /*
11700 * That was the last ECB on the probe; clear the predicate
11701 * cache ID for the probe, disable it and sync one more time
11702 * to assure that we'll never hit it again.
11703 */
11704 dtrace_provider_t *prov = probe->dtpr_provider;
11705
11706 ASSERT(ecb->dte_next == NULL);
11707 ASSERT(probe->dtpr_ecb_last == NULL);
11708 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11709 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11710 probe->dtpr_id, probe->dtpr_arg);
11711 dtrace_sync();
11712 } else {
11713 /*
11714 * There is at least one ECB remaining on the probe. If there
11715 * is _exactly_ one, set the probe's predicate cache ID to be
11716 * the predicate cache ID of the remaining ECB.
11717 */
11718 ASSERT(probe->dtpr_ecb_last != NULL);
11719 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11720
11721 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11722 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11723
11724 ASSERT(probe->dtpr_ecb->dte_next == NULL);
11725
11726 if (p != NULL)
11727 probe->dtpr_predcache = p->dtp_cacheid;
11728 }
11729
11730 ecb->dte_next = NULL;
11731 }
11732 }
11733
11734 static void
11735 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11736 {
11737 dtrace_state_t *state = ecb->dte_state;
11738 dtrace_vstate_t *vstate = &state->dts_vstate;
11739 dtrace_predicate_t *pred;
11740 dtrace_epid_t epid = ecb->dte_epid;
11741
11742 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11743 ASSERT(ecb->dte_next == NULL);
11744 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11745
11746 if ((pred = ecb->dte_predicate) != NULL)
11747 dtrace_predicate_release(pred, vstate);
11748
11749 dtrace_ecb_action_remove(ecb);
11750
11751 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11752 state->dts_ecbs[epid - 1] = NULL;
11753
11754 kmem_free(ecb, sizeof (dtrace_ecb_t));
11755 }
11756
11757 static dtrace_ecb_t *
11758 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11759 dtrace_enabling_t *enab)
11760 {
11761 dtrace_ecb_t *ecb;
11762 dtrace_predicate_t *pred;
11763 dtrace_actdesc_t *act;
11764 dtrace_provider_t *prov;
11765 dtrace_ecbdesc_t *desc = enab->dten_current;
11766
11767 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11768 ASSERT(state != NULL);
11769
11770 ecb = dtrace_ecb_add(state, probe);
11771 ecb->dte_uarg = desc->dted_uarg;
11772
11773 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11774 dtrace_predicate_hold(pred);
11775 ecb->dte_predicate = pred;
11776 }
11777
11778 if (probe != NULL) {
11779 /*
11780 * If the provider shows more leg than the consumer is old
11781 * enough to see, we need to enable the appropriate implicit
11782 * predicate bits to prevent the ecb from activating at
11783 * revealing times.
11784 *
11785 * Providers specifying DTRACE_PRIV_USER at register time
11786 * are stating that they need the /proc-style privilege
11787 * model to be enforced, and this is what DTRACE_COND_OWNER
11788 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11789 */
11790 prov = probe->dtpr_provider;
11791 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11792 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11793 ecb->dte_cond |= DTRACE_COND_OWNER;
11794
11795 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11796 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11797 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11798
11799 /*
11800 * If the provider shows us kernel innards and the user
11801 * is lacking sufficient privilege, enable the
11802 * DTRACE_COND_USERMODE implicit predicate.
11803 */
11804 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11805 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11806 ecb->dte_cond |= DTRACE_COND_USERMODE;
11807 }
11808
11809 if (dtrace_ecb_create_cache != NULL) {
11810 /*
11811 * If we have a cached ecb, we'll use its action list instead
11812 * of creating our own (saving both time and space).
11813 */
11814 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11815 dtrace_action_t *act_if = cached->dte_action;
11816
11817 if (act_if != NULL) {
11818 ASSERT(act_if->dta_refcnt > 0);
11819 act_if->dta_refcnt++;
11820 ecb->dte_action = act_if;
11821 ecb->dte_action_last = cached->dte_action_last;
11822 ecb->dte_needed = cached->dte_needed;
11823 ecb->dte_size = cached->dte_size;
11824 ecb->dte_alignment = cached->dte_alignment;
11825 }
11826
11827 return (ecb);
11828 }
11829
11830 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11831 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11832 dtrace_ecb_destroy(ecb);
11833 return (NULL);
11834 }
11835 }
11836
11837 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11838 dtrace_ecb_destroy(ecb);
11839 return (NULL);
11840 }
11841
11842 return (dtrace_ecb_create_cache = ecb);
11843 }
11844
11845 static int
11846 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11847 {
11848 dtrace_ecb_t *ecb;
11849 dtrace_enabling_t *enab = arg1;
11850 dtrace_ecbdesc_t *ep = arg2;
11851 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11852
11853 ASSERT(state != NULL);
11854
11855 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11856 /*
11857 * This probe was created in a generation for which this
11858 * enabling has previously created ECBs; we don't want to
11859 * enable it again, so just kick out.
11860 */
11861 return (DTRACE_MATCH_NEXT);
11862 }
11863
11864 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11865 return (DTRACE_MATCH_DONE);
11866
11867 if (dtrace_ecb_enable(ecb) < 0)
11868 return (DTRACE_MATCH_FAIL);
11869
11870 return (DTRACE_MATCH_NEXT);
11871 }
11872
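/*
 * EPIDs and aggregation IDs are 1-based: dts_ecbs and dts_aggregations
 * are indexed with [id - 1], and an id of 0 is never valid.  This leaves
 * 0 free to serve as DTRACE_EPIDNONE, which the buffer code below uses
 * to tag alignment padding.
 */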
11873 static dtrace_ecb_t *
11874 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11875 {
11876 dtrace_ecb_t *ecb;
11877 #pragma unused(ecb) /* __APPLE__ */
11878
11879 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11880
11881 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11882 return (NULL);
11883
11884 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11885 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11886
11887 return (state->dts_ecbs[id - 1]);
11888 }
11889
11890 static dtrace_aggregation_t *
11891 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11892 {
11893 dtrace_aggregation_t *agg;
11894 #pragma unused(agg) /* __APPLE__ */
11895
11896 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11897
11898 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11899 return (NULL);
11900
11901 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11902 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11903 agg->dtag_id == id);
11904
11905 return (state->dts_aggregations[id - 1]);
11906 }
11907
11908 /*
11909 * DTrace Buffer Functions
11910 *
11911 * The following functions manipulate DTrace buffers. Most of these functions
11912 * are called in the context of establishing or processing consumer state;
11913 * exceptions are explicitly noted.
11914 */
11915
11916 /*
11917 * Note: called from cross call context. This function switches the two
11918 * buffers on a given CPU. The atomicity of this operation is assured by
11919 * disabling interrupts while the actual switch takes place; the disabling of
11920 * interrupts serializes the execution with any execution of dtrace_probe() on
11921 * the same CPU.
11922 */
11923 static void
11924 dtrace_buffer_switch(dtrace_buffer_t *buf)
11925 {
11926 caddr_t tomax = buf->dtb_tomax;
11927 caddr_t xamot = buf->dtb_xamot;
11928 dtrace_icookie_t cookie;
11929 hrtime_t now;
11930
11931 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11932 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11933
11934 cookie = dtrace_interrupt_disable();
11935 now = dtrace_gethrtime();
11936 buf->dtb_tomax = xamot;
11937 buf->dtb_xamot = tomax;
11938 buf->dtb_xamot_drops = buf->dtb_drops;
11939 buf->dtb_xamot_offset = buf->dtb_offset;
11940 buf->dtb_xamot_errors = buf->dtb_errors;
11941 buf->dtb_xamot_flags = buf->dtb_flags;
11942 buf->dtb_offset = 0;
11943 buf->dtb_drops = 0;
11944 buf->dtb_errors = 0;
11945 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11946 buf->dtb_interval = now - buf->dtb_switched;
11947 buf->dtb_switched = now;
11948 buf->dtb_cur_limit = buf->dtb_limit;
11949
11950 dtrace_interrupt_enable(cookie);
11951 }
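
/*
 * A minimal sketch (not a code path in this section) of how a snapshot
 * consumer might drive the switch from non-probe context, assuming the
 * usual cross-call pattern:
 *
 *	dtrace_buffer_t *buf = &state->dts_buffer[cpu];
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * The cross call runs dtrace_buffer_switch() on the target CPU, so the
 * interrupt-disable window above excludes dtrace_probe() on that CPU.
 */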
11952
11953 /*
11954 * Note: called from cross call context. This function activates a buffer
11955 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11956 * is guaranteed by the disabling of interrupts.
11957 */
11958 static void
11959 dtrace_buffer_activate(dtrace_state_t *state)
11960 {
11961 dtrace_buffer_t *buf;
11962 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11963
11964 buf = &state->dts_buffer[CPU->cpu_id];
11965
11966 if (buf->dtb_tomax != NULL) {
11967 /*
11968 * We might like to assert that the buffer is marked inactive,
11969 * but this isn't necessarily true: the CPU that processes
11970 * the BEGIN probe has its buffer activated manually. In this
11971 * case, we take the (harmless) action of re-clearing the
11972 * INACTIVE bit.
11973 */
11974 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11975 }
11976
11977 dtrace_interrupt_enable(cookie);
11978 }
11979
11980 static int
11981 dtrace_buffer_canalloc(size_t size)
11982 {
11983 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11984 return (B_FALSE);
11985 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11986 return (B_FALSE);
11987
11988 return (B_TRUE);
11989 }
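
/*
 * The first test above guards the second against unsigned wraparound.
 * For example, with dtrace_buffer_memory_inuse = UINT64_MAX - 15 and
 * size = 32, the sum would wrap to 16 and falsely pass the maxsize
 * comparison; rejecting size > UINT64_MAX - inuse first makes the
 * subsequent addition safe.
 */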
11990
11991 static int
11992 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11993 processorid_t cpu)
11994 {
11995 dtrace_cpu_t *cp;
11996 dtrace_buffer_t *buf;
11997 size_t size_before_alloc = dtrace_buffer_memory_inuse;
11998
11999 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12000 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12001
12002 if (size > (size_t)dtrace_nonroot_maxsize &&
12003 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
12004 return (EFBIG);
12005
12006 cp = cpu_list;
12007
12008 do {
12009 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12010 continue;
12011
12012 buf = &bufs[cp->cpu_id];
12013
12014 /*
12015 * If there is already a buffer allocated for this CPU, it
12016 * is only possible that this is a DR event. In this case,
12017 * the buffer size must match our specified size.
12018 */
12019 if (buf->dtb_tomax != NULL) {
12020 ASSERT(buf->dtb_size == size);
12021 continue;
12022 }
12023
12024 ASSERT(buf->dtb_xamot == NULL);
12025
12026 /* DTrace, please do not eat all the memory. */
12027 if (dtrace_buffer_canalloc(size) == B_FALSE)
12028 goto err;
12029 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12030 goto err;
12031 dtrace_buffer_memory_inuse += size;
12032
12033 /* Ensure that the limit is always strictly lower than the size */
12034 limit = limit == size ? limit - 1 : limit;
12035 buf->dtb_cur_limit = limit;
12036 buf->dtb_limit = limit;
12037 buf->dtb_size = size;
12038 buf->dtb_flags = flags;
12039 buf->dtb_offset = 0;
12040 buf->dtb_drops = 0;
12041
12042 if (flags & DTRACEBUF_NOSWITCH)
12043 continue;
12044
12045 /* DTrace, please do not eat all the memory. */
12046 if (dtrace_buffer_canalloc(size) == B_FALSE)
12047 goto err;
12048 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12049 goto err;
12050 dtrace_buffer_memory_inuse += size;
12051 } while ((cp = cp->cpu_next) != cpu_list);
12052
12053 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
12054
12055 return (0);
12056
12057 err:
12058 cp = cpu_list;
12059
12060 do {
12061 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12062 continue;
12063
12064 buf = &bufs[cp->cpu_id];
12065
12066 if (buf->dtb_xamot != NULL) {
12067 ASSERT(buf->dtb_tomax != NULL);
12068 ASSERT(buf->dtb_size == size);
12069 kmem_free(buf->dtb_xamot, size);
12070 }
12071
12072 if (buf->dtb_tomax != NULL) {
12073 ASSERT(buf->dtb_size == size);
12074 kmem_free(buf->dtb_tomax, size);
12075 }
12076
12077 buf->dtb_tomax = NULL;
12078 buf->dtb_xamot = NULL;
12079 buf->dtb_size = 0;
12080 } while ((cp = cp->cpu_next) != cpu_list);
12081
12082 /* Restore the size saved before allocating memory */
12083 dtrace_buffer_memory_inuse = size_before_alloc;
12084
12085 return (ENOMEM);
12086 }
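
/*
 * A typical call (sketch only; the exact flags and limit come from the
 * consumer's options) allocates principal buffers on every CPU at once:
 *
 *	error = dtrace_buffer_alloc(state->dts_buffer, limit, size,
 *	    flags, DTRACE_CPUALL);
 *
 * On failure, the err: path above has already unwound every per-CPU
 * allocation, so the caller only needs to propagate the error.
 */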
12087
12088 /*
12089 * Note: called from probe context. This function just increments the drop
12090 * count on a buffer. It has been made a function to allow for the
12091 * possibility of understanding the source of mysterious drop counts. (A
12092 * problem for which one may be particularly disappointed that DTrace cannot
12093 * be used to understand DTrace.)
12094 */
12095 static void
12096 dtrace_buffer_drop(dtrace_buffer_t *buf)
12097 {
12098 buf->dtb_drops++;
12099 }
12100
12101 /*
12102 * Note: called from probe context. This function is called to reserve space
12103 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12104 * mstate. Returns the new offset in the buffer, or a negative value if an
12105 * error has occurred.
12106 */
12107 static intptr_t
12108 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12109 dtrace_state_t *state, dtrace_mstate_t *mstate)
12110 {
12111 intptr_t offs = buf->dtb_offset, soffs;
12112 intptr_t woffs;
12113 caddr_t tomax;
12114 size_t total_off;
12115
12116 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12117 return (-1);
12118
12119 if ((tomax = buf->dtb_tomax) == NULL) {
12120 dtrace_buffer_drop(buf);
12121 return (-1);
12122 }
12123
12124 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12125 while (offs & (align - 1)) {
12126 /*
12127 * Assert that our alignment is off by a number which
12128 * is itself sizeof (uint32_t) aligned.
12129 */
12130 ASSERT(!((align - (offs & (align - 1))) &
12131 (sizeof (uint32_t) - 1)));
12132 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12133 offs += sizeof (uint32_t);
12134 }
12135
12136 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12137 if (buf->dtb_cur_limit == buf->dtb_limit) {
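/*
 * First time past the configured limit: open the buffer up
 * to its full size so tracing can continue while the
 * consumer is woken to drain it.
 */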
12138 buf->dtb_cur_limit = buf->dtb_size;
12139
12140 os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12141 /**
12142 * Set an AST on the current processor
12143 * so that we can wake up the process
12144 * outside of probe context, when we know
12145 * it is safe to do so
12146 */
12147 minor_t minor = getminor(state->dts_dev);
12148 ASSERT(minor < 32);
12149
12150 os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
12151 ast_dtrace_on();
12152 }
12153 if ((uint64_t)soffs > buf->dtb_size) {
12154 dtrace_buffer_drop(buf);
12155 return (-1);
12156 }
12157 }
12158
12159 if (mstate == NULL)
12160 return (offs);
12161
12162 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12163 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12164 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12165
12166 return (offs);
12167 }
12168
12169 if (buf->dtb_flags & DTRACEBUF_FILL) {
12170 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12171 (buf->dtb_flags & DTRACEBUF_FULL))
12172 return (-1);
12173 goto out;
12174 }
12175
12176 total_off = needed + (offs & (align - 1));
12177
12178 /*
12179 * For a ring buffer, life is quite a bit more complicated. Before
12180 * we can store any padding, we need to adjust our wrapping offset.
12181 * (If we've never before wrapped or we're not about to, no adjustment
12182 * is required.)
12183 */
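/*
 * Illustrative layout once the buffer has wrapped (offs < woffs):
 *
 *	0         offs               woffs                 dtb_size
 *	| newest  | space being      | oldest still-valid  |
 *	| records | reclaimed below  | records             |
 *
 * New data lands at offs; woffs (kept in dtb_xamot_offset) is advanced
 * over whole records in the loop below until the reservation fits.
 */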
12184 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12185 offs + total_off > buf->dtb_size) {
12186 woffs = buf->dtb_xamot_offset;
12187
12188 if (offs + total_off > buf->dtb_size) {
12189 /*
12190 * We can't fit in the end of the buffer. First, a
12191 * sanity check that we can fit in the buffer at all.
12192 */
12193 if (total_off > buf->dtb_size) {
12194 dtrace_buffer_drop(buf);
12195 return (-1);
12196 }
12197
12198 /*
12199 * We're going to be storing at the top of the buffer,
12200 * so now we need to deal with the wrapped offset. We
12201 * only reset our wrapped offset to 0 if it is
12202 * currently greater than the current offset. If it
12203 * is less than the current offset, it is because a
12204 * previous allocation induced a wrap -- but the
12205 * allocation didn't subsequently take the space due
12206 * to an error or false predicate evaluation. In this
12207 * case, we'll just leave the wrapped offset alone: if
12208 * the wrapped offset hasn't been advanced far enough
12209 * for this allocation, it will be adjusted in the
12210 * lower loop.
12211 */
12212 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12213 if (woffs >= offs)
12214 woffs = 0;
12215 } else {
12216 woffs = 0;
12217 }
12218
12219 /*
12220 * Now we know that we're going to be storing to the
12221 * top of the buffer and that there is room for us
12222 * there. We need to clear the buffer from the current
12223 * offset to the end (there may be old gunk there).
12224 */
12225 while ((uint64_t)offs < buf->dtb_size)
12226 tomax[offs++] = 0;
12227
12228 /*
12229 * We need to set our offset to zero. And because we
12230 * are wrapping, we need to set the bit indicating as
12231 * much. We can also adjust our needed space back
12232 * down to the space required by the ECB -- we know
12233 * that the top of the buffer is aligned.
12234 */
12235 offs = 0;
12236 total_off = needed;
12237 buf->dtb_flags |= DTRACEBUF_WRAPPED;
12238 } else {
12239 /*
12240 * There is room for us in the buffer, so we simply
12241 * need to check the wrapped offset.
12242 */
12243 if (woffs < offs) {
12244 /*
12245 * The wrapped offset is less than the offset.
12246 * This can happen if we allocated buffer space
12247 * that induced a wrap, but then we didn't
12248 * subsequently take the space due to an error
12249 * or false predicate evaluation. This is
12250 * okay; we know that _this_ allocation isn't
12251 * going to induce a wrap. We still can't
12252 * reset the wrapped offset to be zero,
12253 * however: the space may have been trashed in
12254 * the previous failed probe attempt. But at
12255 * least the wrapped offset doesn't need to
12256 * be adjusted at all...
12257 */
12258 goto out;
12259 }
12260 }
12261
12262 while (offs + total_off > (size_t)woffs) {
12263 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12264 size_t size;
12265
12266 if (epid == DTRACE_EPIDNONE) {
12267 size = sizeof (uint32_t);
12268 } else {
12269 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12270 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12271
12272 size = state->dts_ecbs[epid - 1]->dte_size;
12273 }
12274
12275 ASSERT(woffs + size <= buf->dtb_size);
12276 ASSERT(size != 0);
12277
12278 if (woffs + size == buf->dtb_size) {
12279 /*
12280 * We've reached the end of the buffer; we want
12281 * to set the wrapped offset to 0 and break
12282 * out. However, if the offs is 0, then we're
12283 * in a strange edge-condition: the amount of
12284 * space that we want to reserve plus the size
12285 * of the record that we're overwriting is
12286 * greater than the size of the buffer. This
12287 * is problematic because if we reserve the
12288 * space but subsequently don't consume it (due
12289 * to a failed predicate or error) the wrapped
12290 * offset will be 0 -- yet the EPID at offset 0
12291 * will not be committed. This situation is
12292 * relatively easy to deal with: if we're in
12293 * this case, the buffer is indistinguishable
12294 * from one that hasn't wrapped; we need only
12295 * finish the job by clearing the wrapped bit,
12296 * explicitly setting the offset to be 0, and
12297 * zero'ing out the old data in the buffer.
12298 */
12299 if (offs == 0) {
12300 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12301 buf->dtb_offset = 0;
12302 woffs = total_off;
12303
12304 while ((uint64_t)woffs < buf->dtb_size)
12305 tomax[woffs++] = 0;
12306 }
12307
12308 woffs = 0;
12309 break;
12310 }
12311
12312 woffs += size;
12313 }
12314
12315 /*
12316 * We have a wrapped offset. It may be that the wrapped offset
12317 * has become zero -- that's okay.
12318 */
12319 buf->dtb_xamot_offset = woffs;
12320 }
12321
12322 out:
12323 /*
12324 * Now we can plow the buffer with any necessary padding.
12325 */
12326 while (offs & (align - 1)) {
12327 /*
12328 * Assert that our alignment is off by a number which
12329 * is itself sizeof (uint32_t) aligned.
12330 */
12331 ASSERT(!((align - (offs & (align - 1))) &
12332 (sizeof (uint32_t) - 1)));
12333 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12334 offs += sizeof (uint32_t);
12335 }
12336
12337 if (buf->dtb_flags & DTRACEBUF_FILL) {
12338 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12339 buf->dtb_flags |= DTRACEBUF_FULL;
12340 return (-1);
12341 }
12342 }
12343
12344 if (mstate == NULL)
12345 return (offs);
12346
12347 /*
12348 * For ring buffers and fill buffers, the scratch space is always
12349 * the inactive buffer.
12350 */
12351 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12352 mstate->dtms_scratch_size = buf->dtb_size;
12353 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12354
12355 return (offs);
12356 }
12357
12358 static void
12359 dtrace_buffer_polish(dtrace_buffer_t *buf)
12360 {
12361 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12362 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12363
12364 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12365 return;
12366
12367 /*
12368 * We need to polish the ring buffer. There are three cases:
12369 *
12370 * - The first (and presumably most common) is that there is no gap
12371 * between the buffer offset and the wrapped offset. In this case,
12372 * there is nothing in the buffer that isn't valid data; we can
12373 * mark the buffer as polished and return.
12374 *
12375 * - The second (less common than the first but still more common
12376 * than the third) is that there is a gap between the buffer offset
12377 * and the wrapped offset, and the wrapped offset is larger than the
12378 * buffer offset. This can happen because of an alignment issue, or
12379 * can happen because of a call to dtrace_buffer_reserve() that
12380 * didn't subsequently consume the buffer space. In this case,
12381 * we need to zero the data from the buffer offset to the wrapped
12382 * offset.
12383 *
12384 * - The third (and least common) is that there is a gap between the
12385 * buffer offset and the wrapped offset, but the wrapped offset is
12386 * _less_ than the buffer offset. This can only happen because a
12387 * call to dtrace_buffer_reserve() induced a wrap, but the space
12388 * was not subsequently consumed. In this case, we need to zero the
12389 * space from the offset to the end of the buffer _and_ from the
12390 * top of the buffer to the wrapped offset.
12391 */
12392 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12393 bzero(buf->dtb_tomax + buf->dtb_offset,
12394 buf->dtb_xamot_offset - buf->dtb_offset);
12395 }
12396
12397 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12398 bzero(buf->dtb_tomax + buf->dtb_offset,
12399 buf->dtb_size - buf->dtb_offset);
12400 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12401 }
12402 }
12403
12404 static void
12405 dtrace_buffer_free(dtrace_buffer_t *bufs)
12406 {
12407 int i;
12408
12409 for (i = 0; i < (int)NCPU; i++) {
12410 dtrace_buffer_t *buf = &bufs[i];
12411
12412 if (buf->dtb_tomax == NULL) {
12413 ASSERT(buf->dtb_xamot == NULL);
12414 ASSERT(buf->dtb_size == 0);
12415 continue;
12416 }
12417
12418 if (buf->dtb_xamot != NULL) {
12419 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12420 kmem_free(buf->dtb_xamot, buf->dtb_size);
12421
12422 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12423 dtrace_buffer_memory_inuse -= buf->dtb_size;
12424 }
12425
12426 kmem_free(buf->dtb_tomax, buf->dtb_size);
12427 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12428 dtrace_buffer_memory_inuse -= buf->dtb_size;
12429
12430 buf->dtb_size = 0;
12431 buf->dtb_tomax = NULL;
12432 buf->dtb_xamot = NULL;
12433 }
12434 }
12435
12436 /*
12437 * DTrace Enabling Functions
12438 */
12439 static dtrace_enabling_t *
12440 dtrace_enabling_create(dtrace_vstate_t *vstate)
12441 {
12442 dtrace_enabling_t *enab;
12443
12444 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12445 enab->dten_vstate = vstate;
12446
12447 return (enab);
12448 }
12449
12450 static void
12451 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12452 {
12453 dtrace_ecbdesc_t **ndesc;
12454 size_t osize, nsize;
12455
12456 /*
12457 * We can't add to enablings after we've enabled them, or after we've
12458 * retained them.
12459 */
12460 ASSERT(enab->dten_probegen == 0);
12461 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12462
12463 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
12464 if (ecb == NULL) return;
12465
12466 if (enab->dten_ndesc < enab->dten_maxdesc) {
12467 enab->dten_desc[enab->dten_ndesc++] = ecb;
12468 return;
12469 }
12470
12471 osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12472
12473 if (enab->dten_maxdesc == 0) {
12474 enab->dten_maxdesc = 1;
12475 } else {
12476 enab->dten_maxdesc <<= 1;
12477 }
12478
12479 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12480
12481 nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12482 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12483 bcopy(enab->dten_desc, ndesc, osize);
12484 kmem_free(enab->dten_desc, osize);
12485
12486 enab->dten_desc = ndesc;
12487 enab->dten_desc[enab->dten_ndesc++] = ecb;
12488 }
12489
12490 static void
12491 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12492 dtrace_probedesc_t *pd)
12493 {
12494 dtrace_ecbdesc_t *new;
12495 dtrace_predicate_t *pred;
12496 dtrace_actdesc_t *act;
12497
12498 /*
12499 * We're going to create a new ECB description that matches the
12500 * specified ECB in every way, but has the specified probe description.
12501 */
12502 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12503
12504 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12505 dtrace_predicate_hold(pred);
12506
12507 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12508 dtrace_actdesc_hold(act);
12509
12510 new->dted_action = ecb->dted_action;
12511 new->dted_pred = ecb->dted_pred;
12512 new->dted_probe = *pd;
12513 new->dted_uarg = ecb->dted_uarg;
12514
12515 dtrace_enabling_add(enab, new);
12516 }
12517
12518 static void
12519 dtrace_enabling_dump(dtrace_enabling_t *enab)
12520 {
12521 int i;
12522
12523 for (i = 0; i < enab->dten_ndesc; i++) {
12524 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12525
12526 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12527 desc->dtpd_provider, desc->dtpd_mod,
12528 desc->dtpd_func, desc->dtpd_name);
12529 }
12530 }
12531
12532 static void
12533 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12534 {
12535 int i;
12536 dtrace_ecbdesc_t *ep;
12537 dtrace_vstate_t *vstate = enab->dten_vstate;
12538
12539 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12540
12541 for (i = 0; i < enab->dten_ndesc; i++) {
12542 dtrace_actdesc_t *act, *next;
12543 dtrace_predicate_t *pred;
12544
12545 ep = enab->dten_desc[i];
12546
12547 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12548 dtrace_predicate_release(pred, vstate);
12549
12550 for (act = ep->dted_action; act != NULL; act = next) {
12551 next = act->dtad_next;
12552 dtrace_actdesc_release(act, vstate);
12553 }
12554
12555 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12556 }
12557
12558 kmem_free(enab->dten_desc,
12559 enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12560
12561 /*
12562 * If this was a retained enabling, decrement the dts_nretained count
12563 * and take it off of the dtrace_retained list.
12564 */
12565 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12566 dtrace_retained == enab) {
12567 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12568 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12569 enab->dten_vstate->dtvs_state->dts_nretained--;
12570 dtrace_retained_gen++;
12571 }
12572
12573 if (enab->dten_prev == NULL) {
12574 if (dtrace_retained == enab) {
12575 dtrace_retained = enab->dten_next;
12576
12577 if (dtrace_retained != NULL)
12578 dtrace_retained->dten_prev = NULL;
12579 }
12580 } else {
12581 ASSERT(enab != dtrace_retained);
12582 ASSERT(dtrace_retained != NULL);
12583 enab->dten_prev->dten_next = enab->dten_next;
12584 }
12585
12586 if (enab->dten_next != NULL) {
12587 ASSERT(dtrace_retained != NULL);
12588 enab->dten_next->dten_prev = enab->dten_prev;
12589 }
12590
12591 kmem_free(enab, sizeof (dtrace_enabling_t));
12592 }
12593
12594 static int
12595 dtrace_enabling_retain(dtrace_enabling_t *enab)
12596 {
12597 dtrace_state_t *state;
12598
12599 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12600 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12601 ASSERT(enab->dten_vstate != NULL);
12602
12603 state = enab->dten_vstate->dtvs_state;
12604 ASSERT(state != NULL);
12605
12606 /*
12607 * We only allow each state to retain dtrace_retain_max enablings.
12608 */
12609 if (state->dts_nretained >= dtrace_retain_max)
12610 return (ENOSPC);
12611
12612 state->dts_nretained++;
12613 dtrace_retained_gen++;
12614
12615 if (dtrace_retained == NULL) {
12616 dtrace_retained = enab;
12617 return (0);
12618 }
12619
12620 enab->dten_next = dtrace_retained;
12621 dtrace_retained->dten_prev = enab;
12622 dtrace_retained = enab;
12623
12624 return (0);
12625 }
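
/*
 * Retained enablings live on the global dtrace_retained list, newest at
 * the head; dtrace_retained_gen is bumped on every insertion or removal
 * so that walkers which drop dtrace_lock mid-scan (see
 * dtrace_enabling_provide()) can detect the change and restart.
 */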
12626
12627 static int
12628 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12629 dtrace_probedesc_t *create)
12630 {
12631 dtrace_enabling_t *new, *enab;
12632 int found = 0, err = ENOENT;
12633
12634 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12635 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12636 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12637 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12638 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12639
12640 new = dtrace_enabling_create(&state->dts_vstate);
12641
12642 /*
12643 * Iterate over all retained enablings, looking for enablings that
12644 * match the specified state.
12645 */
12646 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12647 int i;
12648
12649 /*
12650 * dtvs_state can only be NULL for helper enablings -- and
12651 * helper enablings can't be retained.
12652 */
12653 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12654
12655 if (enab->dten_vstate->dtvs_state != state)
12656 continue;
12657
12658 /*
12659 * Now iterate over each probe description; we're looking for
12660 * an exact match to the specified probe description.
12661 */
12662 for (i = 0; i < enab->dten_ndesc; i++) {
12663 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12664 dtrace_probedesc_t *pd = &ep->dted_probe;
12665
12666 /* APPLE NOTE: Darwin employs size bounded string operation. */
12667 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12668 continue;
12669
12670 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12671 continue;
12672
12673 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12674 continue;
12675
12676 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12677 continue;
12678
12679 /*
12680 * We have a winning probe! Add it to our growing
12681 * enabling.
12682 */
12683 found = 1;
12684 dtrace_enabling_addlike(new, ep, create);
12685 }
12686 }
12687
12688 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12689 dtrace_enabling_destroy(new);
12690 return (err);
12691 }
12692
12693 return (0);
12694 }
12695
12696 static void
12697 dtrace_enabling_retract(dtrace_state_t *state)
12698 {
12699 dtrace_enabling_t *enab, *next;
12700
12701 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12702
12703 /*
12704 * Iterate over all retained enablings, destroying those retained
12705 * for the specified state.
12706 */
12707 for (enab = dtrace_retained; enab != NULL; enab = next) {
12708 next = enab->dten_next;
12709
12710 /*
12711 * dtvs_state can only be NULL for helper enablings -- and
12712 * helper enablings can't be retained.
12713 */
12714 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12715
12716 if (enab->dten_vstate->dtvs_state == state) {
12717 ASSERT(state->dts_nretained > 0);
12718 dtrace_enabling_destroy(enab);
12719 }
12720 }
12721
12722 ASSERT(state->dts_nretained == 0);
12723 }
12724
12725 static int
12726 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
12727 {
12728 int i = 0;
12729 int total_matched = 0, matched = 0;
12730
12731 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12732 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12733
12734 for (i = 0; i < enab->dten_ndesc; i++) {
12735 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12736
12737 enab->dten_current = ep;
12738 enab->dten_error = 0;
12739
12740 /**
12741 * Before doing a dtrace_probe_enable(), which is really
12742 * expensive, check that this enabling satisfies the match
12743 * precondition, if one was supplied.
12744 */
12745 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
12746 continue;
12747 }
12748 /*
12749 * If a provider failed to enable a probe then get out and
12750 * let the consumer know we failed.
12751 */
12752 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12753 return (EBUSY);
12754
12755 total_matched += matched;
12756
12757 if (enab->dten_error != 0) {
12758 /*
12759 * If we get an error half-way through enabling the
12760 * probes, we kick out -- perhaps with some number of
12761 * them enabled. Leaving enabled probes enabled may
12762 * be slightly confusing for user-level, but we expect
12763 * that no one will attempt to actually drive on in
12764 * the face of such errors. If this is an anonymous
12765 * enabling (indicated with a NULL nmatched pointer),
12766 * we cmn_err() a message. We aren't expecting to
12767 * get such an error -- to the extent that it can exist
12768 * at all, it would be the result of corrupted DOF in the driver
12769 * properties.
12770 */
12771 if (nmatched == NULL) {
12772 cmn_err(CE_WARN, "dtrace_enabling_match() "
12773 "error on %p: %d", (void *)ep,
12774 enab->dten_error);
12775 }
12776
12777 return (enab->dten_error);
12778 }
12779
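/*
 * Stamp the descriptor with the current probe generation;
 * dtrace_ecb_create_enable() consults dted_probegen to avoid
 * re-enabling probes this enabling has already seen.
 */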
12780 ep->dted_probegen = dtrace_probegen;
12781 }
12782
12783 if (nmatched != NULL)
12784 *nmatched = total_matched;
12785
12786 return (0);
12787 }
12788
12789 static void
12790 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12791 {
12792 dtrace_enabling_t *enab;
12793
12794 lck_mtx_lock(&cpu_lock);
12795 lck_mtx_lock(&dtrace_lock);
12796
12797 /*
12798 * Iterate over all retained enablings to see if any probes match
12799 * against them. We only perform this operation on enablings for which
12800 * we have sufficient permissions by virtue of being in the global zone
12801 * or in the same zone as the DTrace client. Because we can be called
12802 * after dtrace_detach() has been called, we cannot assert that there
12803 * are retained enablings. We can safely load from dtrace_retained,
12804 * however: the taskq_destroy() at the end of dtrace_detach() will
12805 * block pending our completion.
12806 */
12807
12808 /*
12809 * Darwin doesn't do zones.
12810 * Behave as if always in the "global" zone.
12811 */
12812 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12813 (void) dtrace_enabling_match(enab, NULL, cond);
12814 }
12815
12816 lck_mtx_unlock(&dtrace_lock);
12817 lck_mtx_unlock(&cpu_lock);
12818
12819 }
12820
12821 static void
12822 dtrace_enabling_matchall(void)
12823 {
12824 dtrace_enabling_matchall_with_cond(NULL);
12825 }
12826
12827
12828
12829 /*
12830 * If an enabling is to be enabled without having matched probes (that is, if
12831 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12832 * enabling must be _primed_ by creating an ECB for every ECB description.
12833 * This must be done to assure that we know the number of speculations, the
12834 * number of aggregations, the minimum buffer size needed, etc. before we
12835 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12836 * enabling any probes, we create ECBs for every ECB description, but with a
12837 * NULL probe -- which is exactly what this function does.
12838 */
12839 static void
12840 dtrace_enabling_prime(dtrace_state_t *state)
12841 {
12842 dtrace_enabling_t *enab;
12843 int i;
12844
12845 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12846 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12847
12848 if (enab->dten_vstate->dtvs_state != state)
12849 continue;
12850
12851 /*
12852 * We don't want to prime an enabling more than once, lest
12853 * we allow a malicious user to induce resource exhaustion.
12854 * (The ECBs that result from priming an enabling aren't
12855 * leaked -- but they also aren't deallocated until the
12856 * consumer state is destroyed.)
12857 */
12858 if (enab->dten_primed)
12859 continue;
12860
12861 for (i = 0; i < enab->dten_ndesc; i++) {
12862 enab->dten_current = enab->dten_desc[i];
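/*
 * Passing a NULL probe description asks dtrace_probe_enable()
 * to create an ECB with a NULL probe (see dtrace_ecb_create()):
 * the ECB is created and sized, but nothing is enabled.
 */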
12863 (void) dtrace_probe_enable(NULL, enab, NULL);
12864 }
12865
12866 enab->dten_primed = 1;
12867 }
12868 }
12869
12870 /*
12871 * Called to indicate that probes should be provided due to retained
12872 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12873 * must take an initial lap through the enabling calling the dtps_provide()
12874 * entry point explicitly to allow for autocreated probes.
12875 */
12876 static void
12877 dtrace_enabling_provide(dtrace_provider_t *prv)
12878 {
12879 int i, all = 0;
12880 dtrace_probedesc_t desc;
12881 dtrace_genid_t gen;
12882
12883 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12884 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12885
12886 if (prv == NULL) {
12887 all = 1;
12888 prv = dtrace_provider;
12889 }
12890
12891 do {
12892 dtrace_enabling_t *enab;
12893 void *parg = prv->dtpv_arg;
12894
12895 retry:
12896 gen = dtrace_retained_gen;
12897 for (enab = dtrace_retained; enab != NULL;
12898 enab = enab->dten_next) {
12899 for (i = 0; i < enab->dten_ndesc; i++) {
12900 desc = enab->dten_desc[i]->dted_probe;
12901 lck_mtx_unlock(&dtrace_lock);
12902 prv->dtpv_pops.dtps_provide(parg, &desc);
12903 lck_mtx_lock(&dtrace_lock);
12904 /*
12905 * Process the retained enablings again if
12906 * they have changed while we weren't holding
12907 * dtrace_lock.
12908 */
12909 if (gen != dtrace_retained_gen)
12910 goto retry;
12911 }
12912 }
12913 } while (all && (prv = prv->dtpv_next) != NULL);
12914
12915 lck_mtx_unlock(&dtrace_lock);
12916 dtrace_probe_provide(NULL, all ? NULL : prv);
12917 lck_mtx_lock(&dtrace_lock);
12918 }
12919
12920 /*
12921 * DTrace DOF Functions
12922 */
12923 /*ARGSUSED*/
12924 static void
12925 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12926 {
12927 #pragma unused(dof) /* __APPLE__ */
12928 if (dtrace_err_verbose)
12929 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12930
12931 #ifdef DTRACE_ERRDEBUG
12932 dtrace_errdebug(str);
12933 #endif
12934 }
12935
12936 /*
12937 * Create DOF out of a currently enabled state. Right now, we only create
12938 * DOF containing the run-time options -- but this could be expanded to create
12939 * complete DOF representing the enabled state.
12940 */
12941 static dof_hdr_t *
12942 dtrace_dof_create(dtrace_state_t *state)
12943 {
12944 dof_hdr_t *dof;
12945 dof_sec_t *sec;
12946 dof_optdesc_t *opt;
12947 int i, len = sizeof (dof_hdr_t) +
12948 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12949 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12950
12951 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12952
12953 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12954 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12955 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12956 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12957 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12958
12959 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12960 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12961 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12962 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12963 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12964 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12965
12966 dof->dofh_flags = 0;
12967 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12968 dof->dofh_secsize = sizeof (dof_sec_t);
12969 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12970 dof->dofh_secoff = sizeof (dof_hdr_t);
12971 dof->dofh_loadsz = len;
12972 dof->dofh_filesz = len;
12973 dof->dofh_pad = 0;
12974
12975 /*
12976 * Fill in the option section header...
12977 */
12978 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12979 sec->dofs_type = DOF_SECT_OPTDESC;
12980 sec->dofs_align = sizeof (uint64_t);
12981 sec->dofs_flags = DOF_SECF_LOAD;
12982 sec->dofs_entsize = sizeof (dof_optdesc_t);
12983
12984 opt = (dof_optdesc_t *)((uintptr_t)sec +
12985 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12986
12987 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12988 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12989
12990 for (i = 0; i < DTRACEOPT_MAX; i++) {
12991 opt[i].dofo_option = i;
12992 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12993 opt[i].dofo_value = state->dts_options[i];
12994 }
12995
12996 return (dof);
12997 }
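
/*
 * A minimal sketch (illustrative only) of how a consumer could walk the
 * DOF emitted above; the single section is the option table:
 *
 *	dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof + dof->dofh_secoff);
 *	dof_optdesc_t *opt = (dof_optdesc_t *)((uintptr_t)dof +
 *	    sec->dofs_offset);
 *	uint64_t bufsize = opt[DTRACEOPT_BUFSIZE].dofo_value;
 */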
12998
12999 static dof_hdr_t *
13000 dtrace_dof_copyin(user_addr_t uarg, int *errp)
13001 {
13002 dof_hdr_t hdr, *dof;
13003
13004 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13005
13006 /*
13007 * First, we're going to copyin() the sizeof (dof_hdr_t).
13008 */
13009 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
13010 dtrace_dof_error(NULL, "failed to copyin DOF header");
13011 *errp = EFAULT;
13012 return (NULL);
13013 }
13014
13015 /*
13016 * Now we'll allocate the entire DOF and copy it in -- provided
13017 * that the length isn't outrageous.
13018 */
13019 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13020 dtrace_dof_error(&hdr, "load size exceeds maximum");
13021 *errp = E2BIG;
13022 return (NULL);
13023 }
13024
13025 if (hdr.dofh_loadsz < sizeof (hdr)) {
13026 dtrace_dof_error(&hdr, "invalid load size");
13027 *errp = EINVAL;
13028 return (NULL);
13029 }
13030
13031 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
13032
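/*
 * Copy in the full DOF and re-check dofh_loadsz: the header could
 * have been rewritten in user space between the two copyin()s.
 */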
13033 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
13034 dof->dofh_loadsz != hdr.dofh_loadsz) {
13035 kmem_free_aligned(dof, hdr.dofh_loadsz);
13036 *errp = EFAULT;
13037 return (NULL);
13038 }
13039
13040 return (dof);
13041 }
13042
13043 static dof_hdr_t *
13044 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
13045 {
13046 dof_hdr_t hdr, *dof;
13047
13048 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13049
13050 /*
13051 * First, we read sizeof (dof_hdr_t) bytes of the header from the target process.
13052 */
13053 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
13054 dtrace_dof_error(NULL, "failed to copyin DOF header");
13055 *errp = EFAULT;
13056 return (NULL);
13057 }
13058
13059 /*
13060 * Now we'll allocate the entire DOF and copy it in -- provided
13061 * that the length isn't outrageous.
13062 */
13063 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13064 dtrace_dof_error(&hdr, "load size exceeds maximum");
13065 *errp = E2BIG;
13066 return (NULL);
13067 }
13068
13069 if (hdr.dofh_loadsz < sizeof (hdr)) {
13070 dtrace_dof_error(&hdr, "invalid load size");
13071 *errp = EINVAL;
13072 return (NULL);
13073 }
13074
13075 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
13076
13077 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
13078 kmem_free_aligned(dof, hdr.dofh_loadsz);
13079 *errp = EFAULT;
13080 return (NULL);
13081 }
13082
13083 return (dof);
13084 }
13085
13086 static void
13087 dtrace_dof_destroy(dof_hdr_t *dof)
13088 {
13089 kmem_free_aligned(dof, dof->dofh_loadsz);
13090 }
13091
13092 static dof_hdr_t *
13093 dtrace_dof_property(const char *name)
13094 {
13095 unsigned int len = 0;
13096 dof_hdr_t *dof;
13097
13098 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13099 return NULL;
13100 }
13101
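/*
 * PEReadNVRAMProperty() is used twice: once with a NULL buffer, which
 * reports the property's length, and once to fetch the bytes into a
 * buffer sized from that length.
 */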
13102 if (!PEReadNVRAMProperty(name, NULL, &len)) {
13103 return NULL;
13104 }
13105
13106 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
13107
13108 if (!PEReadNVRAMProperty(name, dof, &len)) {
13109 dtrace_dof_destroy(dof);
13110 dtrace_dof_error(NULL, "unreadable DOF");
13111 return NULL;
13112 }
13113
13114 if (len < sizeof (dof_hdr_t)) {
13115 dtrace_dof_destroy(dof);
13116 dtrace_dof_error(NULL, "truncated header");
13117 return (NULL);
13118 }
13119
13120 if (len < dof->dofh_loadsz) {
13121 dtrace_dof_destroy(dof);
13122 dtrace_dof_error(NULL, "truncated DOF");
13123 return (NULL);
13124 }
13125
13126 if (len != dof->dofh_loadsz) {
13127 dtrace_dof_destroy(dof);
13128 dtrace_dof_error(NULL, "invalid DOF size");
13129 return (NULL);
13130 }
13131
13132 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13133 dtrace_dof_destroy(dof);
13134 dtrace_dof_error(NULL, "oversized DOF");
13135 return (NULL);
13136 }
13137
13138 return (dof);
13139 }
13140
13141 /*
13142 * Return the dof_sec_t pointer corresponding to a given section index. If the
13143 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13144 * a type other than DOF_SECT_NONE is specified, the header is checked against
13145 * this type and NULL is returned if the types do not match.
13146 */
13147 static dof_sec_t *
13148 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13149 {
13150 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13151 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13152
13153 if (i >= dof->dofh_secnum) {
13154 dtrace_dof_error(dof, "referenced section index is invalid");
13155 return (NULL);
13156 }
13157
13158 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13159 dtrace_dof_error(dof, "referenced section is not loadable");
13160 return (NULL);
13161 }
13162
13163 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13164 dtrace_dof_error(dof, "referenced section is the wrong type");
13165 return (NULL);
13166 }
13167
13168 return (sec);
13169 }
13170
13171 static dtrace_probedesc_t *
13172 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13173 {
13174 dof_probedesc_t *probe;
13175 dof_sec_t *strtab;
13176 uintptr_t daddr = (uintptr_t)dof;
13177 uintptr_t str;
13178 size_t size;
13179
13180 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13181 dtrace_dof_error(dof, "invalid probe section");
13182 return (NULL);
13183 }
13184
13185 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13186 dtrace_dof_error(dof, "bad alignment in probe description");
13187 return (NULL);
13188 }
13189
13190 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13191 dtrace_dof_error(dof, "truncated probe description");
13192 return (NULL);
13193 }
13194
13195 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13196 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13197
13198 if (strtab == NULL)
13199 return (NULL);
13200
13201 str = daddr + strtab->dofs_offset;
13202 size = strtab->dofs_size;
13203
13204 if (probe->dofp_provider >= strtab->dofs_size) {
13205 dtrace_dof_error(dof, "corrupt probe provider");
13206 return (NULL);
13207 }
13208
13209 (void) strncpy(desc->dtpd_provider,
13210 (char *)(str + probe->dofp_provider),
13211 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13212
13213 /* APPLE NOTE: Darwin employs size bounded string operation. */
13214 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
13215
13216 if (probe->dofp_mod >= strtab->dofs_size) {
13217 dtrace_dof_error(dof, "corrupt probe module");
13218 return (NULL);
13219 }
13220
13221 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13222 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13223
13224 /* APPLE NOTE: Darwin employs size bounded string operation. */
13225 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
13226
13227 if (probe->dofp_func >= strtab->dofs_size) {
13228 dtrace_dof_error(dof, "corrupt probe function");
13229 return (NULL);
13230 }
13231
13232 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13233 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13234
13235 /* APPLE NOTE: Darwin employs size bounded string operation. */
13236 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
13237
13238 if (probe->dofp_name >= strtab->dofs_size) {
13239 dtrace_dof_error(dof, "corrupt probe name");
13240 return (NULL);
13241 }
13242
13243 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13244 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13245
13246 /* APPLE NOTE: Darwin employs size bounded string operation. */
13247 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
13248
13249 return (desc);
13250 }
13251
13252 static dtrace_difo_t *
13253 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13254 cred_t *cr)
13255 {
13256 dtrace_difo_t *dp;
13257 size_t ttl = 0;
13258 dof_difohdr_t *dofd;
13259 uintptr_t daddr = (uintptr_t)dof;
13260 size_t max_size = dtrace_difo_maxsize;
13261 uint_t i;
13262 int l, n;
13263
13264
13265 static const struct {
13266 int section;
13267 int bufoffs;
13268 int lenoffs;
13269 int entsize;
13270 int align;
13271 const char *msg;
13272 } difo[] = {
13273 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13274 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13275 sizeof (dif_instr_t), "multiple DIF sections" },
13276
13277 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13278 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13279 sizeof (uint64_t), "multiple integer tables" },
13280
13281 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13282 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13283 sizeof (char), "multiple string tables" },
13284
13285 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13286 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13287 sizeof (uint_t), "multiple variable tables" },
13288
13289 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13290 };
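
/*
 * Each row above maps a DOF section type to the dtrace_difo_t field
 * (located via offsetof()) that receives its contents, along with the
 * expected entry size and alignment; the loop below uses bufoffs and
 * lenoffs to copy each matching subsection into place generically.
 */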
13291
13292 if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13293 dtrace_dof_error(dof, "invalid DIFO header section");
13294 return (NULL);
13295 }
13296
13297 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13298 dtrace_dof_error(dof, "bad alignment in DIFO header");
13299 return (NULL);
13300 }
13301
13302 if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13303 sec->dofs_size % sizeof (dof_secidx_t)) {
13304 dtrace_dof_error(dof, "bad size in DIFO header");
13305 return (NULL);
13306 }
13307
13308 dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13309 n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13310
13311 dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13312 dp->dtdo_rtype = dofd->dofd_rtype;
13313
13314 for (l = 0; l < n; l++) {
13315 dof_sec_t *subsec;
13316 void **bufp;
13317 uint32_t *lenp;
13318
13319 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13320 dofd->dofd_links[l])) == NULL)
13321 goto err; /* invalid section link */
13322
13323 if (ttl + subsec->dofs_size > max_size) {
13324 dtrace_dof_error(dof, "exceeds maximum size");
13325 goto err;
13326 }
13327
13328 ttl += subsec->dofs_size;
13329
13330 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13331
13332 if (subsec->dofs_type != (uint32_t)difo[i].section)
13333 continue;
13334
13335 if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13336 dtrace_dof_error(dof, "section not loaded");
13337 goto err;
13338 }
13339
13340 if (subsec->dofs_align != (uint32_t)difo[i].align) {
13341 dtrace_dof_error(dof, "bad alignment");
13342 goto err;
13343 }
13344
13345 bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13346 lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13347
13348 if (*bufp != NULL) {
13349 dtrace_dof_error(dof, difo[i].msg);
13350 goto err;
13351 }
13352
13353 if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
13354 dtrace_dof_error(dof, "entry size mismatch");
13355 goto err;
13356 }
13357
13358 if (subsec->dofs_entsize != 0 &&
13359 (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13360 dtrace_dof_error(dof, "corrupt entry size");
13361 goto err;
13362 }
13363
13364 *lenp = subsec->dofs_size;
13365 *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13366 bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13367 *bufp, subsec->dofs_size);
13368
13369 if (subsec->dofs_entsize != 0)
13370 *lenp /= subsec->dofs_entsize;
13371
13372 break;
13373 }
13374
13375 /*
13376 * If we encounter a loadable DIFO sub-section that is not
13377 * known to us, assume this is a broken program and fail.
13378 */
13379 if (difo[i].section == DOF_SECT_NONE &&
13380 (subsec->dofs_flags & DOF_SECF_LOAD)) {
13381 dtrace_dof_error(dof, "unrecognized DIFO subsection");
13382 goto err;
13383 }
13384 }
13385
13386 if (dp->dtdo_buf == NULL) {
13387 /*
13388 * We can't have a DIF object without DIF text.
13389 */
13390 dtrace_dof_error(dof, "missing DIF text");
13391 goto err;
13392 }
13393
13394 /*
13395 * Before we validate the DIF object, run through the variable table
13396 * looking for strings -- if any of their sizes are zero, we'll set
13397 * their size to be the system-wide default string size. Note that
13398 * this should _not_ happen if the "strsize" option has been set --
13399 * in this case, the compiler should have set the size to reflect the
13400 * setting of the option.
13401 */
13402 for (i = 0; i < dp->dtdo_varlen; i++) {
13403 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13404 dtrace_diftype_t *t = &v->dtdv_type;
13405
13406 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13407 continue;
13408
13409 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13410 t->dtdt_size = dtrace_strsize_default;
13411 }
13412
13413 if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13414 goto err;
13415
13416 dtrace_difo_init(dp, vstate);
13417 return (dp);
13418
13419 err:
13420 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13421 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13422 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13423 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13424
13425 kmem_free(dp, sizeof (dtrace_difo_t));
13426 return (NULL);
13427 }
13428
13429 static dtrace_predicate_t *
13430 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13431 cred_t *cr)
13432 {
13433 dtrace_difo_t *dp;
13434
13435 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13436 return (NULL);
13437
13438 return (dtrace_predicate_create(dp));
13439 }
13440
13441 static dtrace_actdesc_t *
13442 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13443 cred_t *cr)
13444 {
13445 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13446 dof_actdesc_t *desc;
13447 dof_sec_t *difosec;
13448 size_t offs;
13449 uintptr_t daddr = (uintptr_t)dof;
13450 uint64_t arg;
13451 dtrace_actkind_t kind;
13452
13453 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13454 dtrace_dof_error(dof, "invalid action section");
13455 return (NULL);
13456 }
13457
13458 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13459 dtrace_dof_error(dof, "truncated action description");
13460 return (NULL);
13461 }
13462
13463 if (sec->dofs_align != sizeof (uint64_t)) {
13464 dtrace_dof_error(dof, "bad alignment in action description");
13465 return (NULL);
13466 }
13467
13468 if (sec->dofs_size < sec->dofs_entsize) {
13469 dtrace_dof_error(dof, "section entry size exceeds total size");
13470 return (NULL);
13471 }
13472
13473 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13474 dtrace_dof_error(dof, "bad entry size in action description");
13475 return (NULL);
13476 }
13477
13478 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13479 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13480 return (NULL);
13481 }
13482
13483 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13484 desc = (dof_actdesc_t *)(daddr +
13485 (uintptr_t)sec->dofs_offset + offs);
13486 kind = (dtrace_actkind_t)desc->dofa_kind;
13487
13488 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13489 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13490 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13491 {
13492 dof_sec_t *strtab;
13493 char *str, *fmt;
13494 uint64_t i;
13495
13496 /*
13497 * The argument to these actions is an index into the
13498 * DOF string table. For printf()-like actions, this
13499 * is the format string. For print(), this is the
13500 * CTF type of the expression result.
13501 */
13502 if ((strtab = dtrace_dof_sect(dof,
13503 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13504 goto err;
13505
13506 str = (char *)((uintptr_t)dof +
13507 (uintptr_t)strtab->dofs_offset);
13508
13509 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13510 if (str[i] == '\0')
13511 break;
13512 }
13513
13514 if (i >= strtab->dofs_size) {
13515 dtrace_dof_error(dof, "bogus format string");
13516 goto err;
13517 }
13518
13519 if (i == desc->dofa_arg) {
13520 dtrace_dof_error(dof, "empty format string");
13521 goto err;
13522 }
13523
13524 i -= desc->dofa_arg;
13525 fmt = kmem_alloc(i + 1, KM_SLEEP);
13526 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13527 arg = (uint64_t)(uintptr_t)fmt;
13528 } else {
13529 if (kind == DTRACEACT_PRINTA) {
13530 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13531 arg = 0;
13532 } else {
13533 arg = desc->dofa_arg;
13534 }
13535 }
13536
13537 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13538 desc->dofa_uarg, arg);
13539
13540 if (last != NULL) {
13541 last->dtad_next = act;
13542 } else {
13543 first = act;
13544 }
13545
13546 last = act;
13547
13548 if (desc->dofa_difo == DOF_SECIDX_NONE)
13549 continue;
13550
13551 if ((difosec = dtrace_dof_sect(dof,
13552 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13553 goto err;
13554
13555 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13556
13557 if (act->dtad_difo == NULL)
13558 goto err;
13559 }
13560
13561 ASSERT(first != NULL);
13562 return (first);
13563
13564 err:
13565 for (act = first; act != NULL; act = next) {
13566 next = act->dtad_next;
13567 dtrace_actdesc_release(act, vstate);
13568 }
13569
13570 return (NULL);
13571 }
13572
13573 static dtrace_ecbdesc_t *
13574 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13575 cred_t *cr)
13576 {
13577 dtrace_ecbdesc_t *ep;
13578 dof_ecbdesc_t *ecb;
13579 dtrace_probedesc_t *desc;
13580 dtrace_predicate_t *pred = NULL;
13581
13582 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13583 dtrace_dof_error(dof, "truncated ECB description");
13584 return (NULL);
13585 }
13586
13587 if (sec->dofs_align != sizeof (uint64_t)) {
13588 dtrace_dof_error(dof, "bad alignment in ECB description");
13589 return (NULL);
13590 }
13591
13592 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13593 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13594
13595 if (sec == NULL)
13596 return (NULL);
13597
13598 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13599 ep->dted_uarg = ecb->dofe_uarg;
13600 desc = &ep->dted_probe;
13601
13602 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13603 goto err;
13604
13605 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13606 if ((sec = dtrace_dof_sect(dof,
13607 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13608 goto err;
13609
13610 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13611 goto err;
13612
13613 ep->dted_pred.dtpdd_predicate = pred;
13614 }
13615
13616 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13617 if ((sec = dtrace_dof_sect(dof,
13618 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13619 goto err;
13620
13621 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13622
13623 if (ep->dted_action == NULL)
13624 goto err;
13625 }
13626
13627 return (ep);
13628
13629 err:
13630 if (pred != NULL)
13631 dtrace_predicate_release(pred, vstate);
13632 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13633 return (NULL);
13634 }
13635
13636 /*
13637 * APPLE NOTE: dyld handles dof relocation.
13638 * Darwin does not need dtrace_dof_relocate()
13639 */
13640
13641 /*
13642 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13643 * header: it should be at the front of a memory region that is at least
13644 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13645 * size. It need not be validated in any other way.
13646 */
13647 static int
13648 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13649 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13650 {
13651 #pragma unused(ubase) /* __APPLE__ */
13652 uint64_t len = dof->dofh_loadsz, seclen;
13653 uintptr_t daddr = (uintptr_t)dof;
13654 dtrace_ecbdesc_t *ep;
13655 dtrace_enabling_t *enab;
13656 uint_t i;
13657
13658 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13659 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13660
13661 /*
13662 * Check the DOF header identification bytes. In addition to checking
13663 * valid settings, we also verify that unused bits/bytes are zeroed so
13664 * we can use them later without fear of regressing existing binaries.
13665 */
13666 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13667 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13668 dtrace_dof_error(dof, "DOF magic string mismatch");
13669 return (-1);
13670 }
13671
13672 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13673 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13674 dtrace_dof_error(dof, "DOF has invalid data model");
13675 return (-1);
13676 }
13677
13678 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13679 dtrace_dof_error(dof, "DOF encoding mismatch");
13680 return (-1);
13681 }
13682
13683 /*
13684 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
13685 */
13686 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13687 dtrace_dof_error(dof, "DOF version mismatch");
13688 return (-1);
13689 }
13690
13691 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13692 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13693 return (-1);
13694 }
13695
13696 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13697 dtrace_dof_error(dof, "DOF uses too many integer registers");
13698 return (-1);
13699 }
13700
13701 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13702 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13703 return (-1);
13704 }
13705
13706 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13707 if (dof->dofh_ident[i] != 0) {
13708 dtrace_dof_error(dof, "DOF has invalid ident byte set");
13709 return (-1);
13710 }
13711 }
13712
13713 if (dof->dofh_flags & ~DOF_FL_VALID) {
13714 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13715 return (-1);
13716 }
13717
13718 if (dof->dofh_secsize < sizeof(dof_sec_t)) {
13719 dtrace_dof_error(dof, "invalid section header size");
13720 return (-1);
13721 }
13722
13723 /*
13724 * Check that the section headers don't exceed the amount of DOF
13725 * data. Note that we cast the section size and number of sections
13726 * to uint64_t's to prevent possible overflow in the multiplication.
13727 */
13728 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13729
13730 if (dof->dofh_secoff > len || seclen > len ||
13731 dof->dofh_secoff + seclen > len) {
13732 dtrace_dof_error(dof, "truncated section headers");
13733 return (-1);
13734 }
13735
13736 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13737 dtrace_dof_error(dof, "misaligned section headers");
13738 return (-1);
13739 }
13740
13741 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13742 dtrace_dof_error(dof, "misaligned section size");
13743 return (-1);
13744 }
13745
13746 /*
13747 * Take an initial pass through the section headers to be sure that
13748 * the headers don't have stray offsets. If the 'noprobes' flag is
13749 * set, do not permit sections relating to providers, probes, or args.
13750 */
13751 for (i = 0; i < dof->dofh_secnum; i++) {
13752 dof_sec_t *sec = (dof_sec_t *)(daddr +
13753 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13754
13755 if (noprobes) {
13756 switch (sec->dofs_type) {
13757 case DOF_SECT_PROVIDER:
13758 case DOF_SECT_PROBES:
13759 case DOF_SECT_PRARGS:
13760 case DOF_SECT_PROFFS:
13761 dtrace_dof_error(dof, "illegal sections "
13762 "for enabling");
13763 return (-1);
13764 }
13765 }
13766
13767 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13768 continue; /* just ignore non-loadable sections */
13769
13770 if (sec->dofs_align & (sec->dofs_align - 1)) {
13771 dtrace_dof_error(dof, "bad section alignment");
13772 return (-1);
13773 }
13774
13775 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13776 dtrace_dof_error(dof, "misaligned section");
13777 return (-1);
13778 }
13779
13780 if (sec->dofs_offset > len || sec->dofs_size > len ||
13781 sec->dofs_offset + sec->dofs_size > len) {
13782 dtrace_dof_error(dof, "corrupt section header");
13783 return (-1);
13784 }
13785
13786 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13787 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13788 dtrace_dof_error(dof, "non-terminating string table");
13789 return (-1);
13790 }
13791 }
13792
13793 /*
13794 * APPLE NOTE: We have no further relocation to perform.
13795 * All dof values are relative offsets.
13796 */
13797
13798 if ((enab = *enabp) == NULL)
13799 enab = *enabp = dtrace_enabling_create(vstate);
13800
13801 for (i = 0; i < dof->dofh_secnum; i++) {
13802 dof_sec_t *sec = (dof_sec_t *)(daddr +
13803 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13804
13805 if (sec->dofs_type != DOF_SECT_ECBDESC)
13806 continue;
13807
13808 /*
13809 * APPLE NOTE: Defend against a gcc 4.0 botch on x86:
13810 * not all paths out of the inlined dtrace_dof_ecbdesc()
13811 * are checked for a NULL return value, so check for
13812 * NULL explicitly here.
13813 */
13814 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13815 if (ep == NULL) {
13816 dtrace_enabling_destroy(enab);
13817 *enabp = NULL;
13818 return (-1);
13819 }
13820
13821 dtrace_enabling_add(enab, ep);
13822 }
13823
13824 return (0);
13825 }
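
/*
 * Illustrative sketch (hypothetical helper, not part of this driver) of
 * the caller-side contract stated above dtrace_dof_slurp(): the mapping
 * must be at least sizeof (dof_hdr_t) bytes before dofh_loadsz may be
 * read, and at least dofh_loadsz bytes before the DOF may be slurped.
 */
static int
dof_slurp_contract_ok(const dof_hdr_t *dof, uint64_t mapped)
{
	if (mapped < sizeof (dof_hdr_t))
		return (0);	/* cannot safely read the header itself */

	if (dof->dofh_loadsz < sizeof (dof_hdr_t))
		return (0);	/* loadsz must at least cover the header */

	if (mapped < dof->dofh_loadsz)
		return (0);	/* mapping is smaller than the load size */

	return (1);
}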
13826
13827 /*
13828 * Process DOF for any options. This routine assumes that the DOF has been
13829 * at least processed by dtrace_dof_slurp().
13830 */
13831 static int
13832 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13833 {
13834 uint_t i;
13835 int rval;
13836 uint32_t entsize;
13837 size_t offs;
13838 dof_optdesc_t *desc;
13839
13840 for (i = 0; i < dof->dofh_secnum; i++) {
13841 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13842 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13843
13844 if (sec->dofs_type != DOF_SECT_OPTDESC)
13845 continue;
13846
13847 if (sec->dofs_align != sizeof (uint64_t)) {
13848 dtrace_dof_error(dof, "bad alignment in "
13849 "option description");
13850 return (EINVAL);
13851 }
13852
13853 if ((entsize = sec->dofs_entsize) == 0) {
13854 dtrace_dof_error(dof, "zeroed option entry size");
13855 return (EINVAL);
13856 }
13857
13858 if (entsize < sizeof (dof_optdesc_t)) {
13859 dtrace_dof_error(dof, "bad option entry size");
13860 return (EINVAL);
13861 }
13862
13863 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13864 desc = (dof_optdesc_t *)((uintptr_t)dof +
13865 (uintptr_t)sec->dofs_offset + offs);
13866
13867 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13868 dtrace_dof_error(dof, "non-zero option string");
13869 return (EINVAL);
13870 }
13871
13872 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13873 dtrace_dof_error(dof, "unset option");
13874 return (EINVAL);
13875 }
13876
13877 if ((rval = dtrace_state_option(state,
13878 desc->dofo_option, desc->dofo_value)) != 0) {
13879 dtrace_dof_error(dof, "rejected option");
13880 return (rval);
13881 }
13882 }
13883 }
13884
13885 return (0);
13886 }
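
/*
 * Illustrative sketch (hypothetical names) of the entsize-strided walk
 * above: entries are stepped by the section's dofs_entsize rather than
 * by sizeof (dof_optdesc_t), so a newer producer may append fields to
 * its entries and this older consumer still steps over them correctly.
 * The caller is assumed to have already validated that entsize is at
 * least sizeof (dof_optdesc_t), as dtrace_dof_options() does.
 */
static void
optdesc_walk(uintptr_t base, uint64_t secsize, uint32_t entsize,
    void (*visit)(const dof_optdesc_t *))
{
	uint64_t offs;

	for (offs = 0; offs < secsize; offs += entsize)
		visit((const dof_optdesc_t *)(base + offs));
}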
13887
13888 /*
13889 * DTrace Consumer State Functions
13890 */
13891 static int
13892 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13893 {
13894 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13895 void *base;
13896 uintptr_t limit;
13897 dtrace_dynvar_t *dvar, *next, *start;
13898 size_t i;
13899
13900 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13901 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13902
13903 bzero(dstate, sizeof (dtrace_dstate_t));
13904
13905 if ((dstate->dtds_chunksize = chunksize) == 0)
13906 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13907
13908 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13909
13910 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13911 size = min_size;
13912
13913 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13914 return (ENOMEM);
13915
13916 dstate->dtds_size = size;
13917 dstate->dtds_base = base;
13918 dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13919 bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13920
13921 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13922
13923 if (hashsize != 1 && (hashsize & 1))
13924 hashsize--;
13925
13926 dstate->dtds_hashsize = hashsize;
13927 dstate->dtds_hash = dstate->dtds_base;
13928
13929 /*
13930 * Set all of our hash buckets to point to the single sink, and (if
13931 * it hasn't already been set), set the sink's hash value to be the
13932 * sink sentinel value. The sink is needed for dynamic variable
13933 * lookups to know that they have iterated over an entire, valid hash
13934 * chain.
13935 */
13936 for (i = 0; i < hashsize; i++)
13937 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13938
13939 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13940 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13941
13942 /*
13943 * Determine number of active CPUs. Divide free list evenly among
13944 * active CPUs.
13945 */
13946 start = (dtrace_dynvar_t *)
13947 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13948 limit = (uintptr_t)base + size;
13949
13950 VERIFY((uintptr_t)start < limit);
13951 VERIFY((uintptr_t)start >= (uintptr_t)base);
13952
13953 maxper = (limit - (uintptr_t)start) / (int)NCPU;
13954 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13955
13956 for (i = 0; i < NCPU; i++) {
13957 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13958
13959 /*
13960 * If we don't even have enough chunks to make it once through
13961 * NCPUs, we're just going to allocate everything to the first
13962 * CPU. And if we're on the last CPU, we're going to allocate
13963 * whatever is left over. In either case, we set the limit to
13964 * be the limit of the dynamic variable space.
13965 */
13966 if (maxper == 0 || i == NCPU - 1) {
13967 limit = (uintptr_t)base + size;
13968 start = NULL;
13969 } else {
13970 limit = (uintptr_t)start + maxper;
13971 start = (dtrace_dynvar_t *)limit;
13972 }
13973
13974 VERIFY(limit <= (uintptr_t)base + size);
13975
13976 for (;;) {
13977 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13978 dstate->dtds_chunksize);
13979
13980 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13981 break;
13982
13983 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13984 (uintptr_t)dvar <= (uintptr_t)base + size);
13985 dvar->dtdv_next = next;
13986 dvar = next;
13987 }
13988
13989 if (maxper == 0)
13990 break;
13991 }
13992
13993 return (0);
13994 }
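
/*
 * Illustrative sketch (hypothetical helper, not used by the driver) of
 * the per-CPU carve performed above, reduced to its arithmetic: each CPU
 * receives an equal share of the space remaining after the hash table,
 * rounded down to a whole number of chunks; the last CPU absorbs the
 * remainder (and everything goes to CPU 0 when the share rounds to 0).
 */
static uintptr_t
dstate_percpu_share(uintptr_t space, size_t chunksize, unsigned int ncpu)
{
	uintptr_t maxper = space / ncpu;

	/* Round down to a chunk multiple, as dtrace_dstate_init() does. */
	return ((maxper / chunksize) * chunksize);
}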
13995
13996 static void
13997 dtrace_dstate_fini(dtrace_dstate_t *dstate)
13998 {
13999 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14000
14001 if (dstate->dtds_base == NULL)
14002 return;
14003
14004 kmem_free(dstate->dtds_base, dstate->dtds_size);
14005 kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14006 }
14007
14008 static void
14009 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14010 {
14011 /*
14012 * Logical XOR, where are you?
14013 */
14014 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14015
14016 if (vstate->dtvs_nglobals > 0) {
14017 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14018 sizeof (dtrace_statvar_t *));
14019 }
14020
14021 if (vstate->dtvs_ntlocals > 0) {
14022 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14023 sizeof (dtrace_difv_t));
14024 }
14025
14026 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14027
14028 if (vstate->dtvs_nlocals > 0) {
14029 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14030 sizeof (dtrace_statvar_t *));
14031 }
14032 }
14033
14034 static void
14035 dtrace_state_clean(dtrace_state_t *state)
14036 {
14037 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14038 return;
14039
14040 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14041 dtrace_speculation_clean(state);
14042 }
14043
14044 static void
14045 dtrace_state_deadman(dtrace_state_t *state)
14046 {
14047 hrtime_t now;
14048
14049 dtrace_sync();
14050
14051 now = dtrace_gethrtime();
14052
14053 if (state != dtrace_anon.dta_state &&
14054 now - state->dts_laststatus >= dtrace_deadman_user)
14055 return;
14056
14057 /*
14058 * We must be sure that dts_alive never appears to be less than the
14059 * value upon entry to dtrace_state_deadman(), and because we lack a
14060 * dtrace_cas64(), we cannot store to it atomically. We thus instead
14061 * store INT64_MAX to it, followed by a memory barrier, followed by
14062 * the new value. This assures that dts_alive never appears to be
14063 * less than its true value, regardless of the order in which the
14064 * stores to the underlying storage are issued.
14065 */
14066 state->dts_alive = INT64_MAX;
14067 dtrace_membar_producer();
14068 state->dts_alive = now;
14069 }
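
/*
 * Illustrative sketch of the store ordering used above, with hypothetical
 * names.  Without a 64-bit compare-and-swap, a plain 64-bit store may be
 * observed torn; publishing INT64_MAX first and fencing guarantees that a
 * concurrent reader sees the old value, INT64_MAX, or the new value --
 * never a value below the true one.
 */
static void
monotone_store64(int64_t *p, int64_t newval)
{
	*p = INT64_MAX;			/* sentinel can only look "too alive" */
	dtrace_membar_producer();	/* order the sentinel before the update */
	*p = newval;
}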
14070
14071 static int
14072 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
14073 {
14074 minor_t minor;
14075 major_t major;
14076 char c[30];
14077 dtrace_state_t *state;
14078 dtrace_optval_t *opt;
14079 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
14080 unsigned int cpu_it;
14081
14082 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14083 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14084
14085 /* Cause restart */
14086 *new_state = NULL;
14087
14088 if (devp != NULL) {
14089 minor = getminor(*devp);
14090 }
14091 else {
14092 minor = DTRACE_NCLIENTS - 1;
14093 }
14094
14095 state = dtrace_state_allocate(minor);
14096 if (NULL == state) {
14097 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
14098 return (ERESTART); /* can't reacquire */
14099 }
14100
14101 state->dts_epid = DTRACE_EPIDNONE + 1;
14102
14103 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
14104 state->dts_aggid_arena = vmem_create(c, (void *)1, INT32_MAX, 1,
14105 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14106
14107 if (devp != NULL) {
14108 major = getemajor(*devp);
14109 } else {
14110 major = ddi_driver_major(dtrace_devi);
14111 }
14112
14113 state->dts_dev = makedev(major, minor);
14114
14115 if (devp != NULL)
14116 *devp = state->dts_dev;
14117
14118 /*
14119 * We allocate NCPU buffers. On the one hand, this can be quite
14120 * a bit of memory per instance (nearly 36K on a Starcat). On the
14121 * other hand, it saves an additional memory reference in the probe
14122 * path.
14123 */
14124 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14125 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14126 state->dts_buf_over_limit = 0;
14127
14128 /*
14129 * Allocate and initialise the per-process per-CPU random state.
14130 * Since SI_SUB_RANDOM < SI_SUB_DTRACE_ANON, the entropy device is
14131 * assumed to be seeded at this point (if from the Fortuna seed file).
14132 */
14133 state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
14134 state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14135 (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
14136 for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
14137 state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14138 /*
14139 * Each CPU is assigned a 2^64 period, non-overlapping
14140 * subsequence.
14141 */
14142 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
14143 state->dts_rstate[cpu_it]);
14144 }
14145
14146 state->dts_cleaner = CYCLIC_NONE;
14147 state->dts_deadman = CYCLIC_NONE;
14148 state->dts_vstate.dtvs_state = state;
14149
14150 for (i = 0; i < DTRACEOPT_MAX; i++)
14151 state->dts_options[i] = DTRACEOPT_UNSET;
14152
14153 /*
14154 * Set the default options.
14155 */
14156 opt = state->dts_options;
14157 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14158 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14159 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14160 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14161 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14162 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14163 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14164 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14165 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14166 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14167 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14168 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14169 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14170 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14171 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
14172
14173 /*
14174 * Depending on the user credentials, we set flag bits which alter probe
14175 * visibility or the amount of destructiveness allowed. In the case of
14176 * actual anonymous tracing, or the possession of all privileges, all of
14177 * the normal checks are bypassed.
14178 */
14179 #if defined(__APPLE__)
14180 if (cr != NULL) {
14181 kauth_cred_ref(cr);
14182 state->dts_cred.dcr_cred = cr;
14183 }
14184 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14185 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14186 /*
14187 * Allow only proc credentials when DTrace is
14188 * restricted by the current security policy
14189 */
14190 state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
14191 state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14192 }
14193 else {
14194 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14195 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14196 }
14197 }
14198
14199 #else
14200 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14201 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14202 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14203 }
14204 else {
14205 /*
14206 * Set up the credentials for this instantiation. We take a
14207 * hold on the credential to prevent it from disappearing on
14208 * us; this in turn prevents the zone_t referenced by this
14209 * credential from disappearing. This means that we can
14210 * examine the credential and the zone from probe context.
14211 */
14212 crhold(cr);
14213 state->dts_cred.dcr_cred = cr;
14214
14215 /*
14216 * CRA_PROC means "we have *some* privilege for dtrace" and
14217 * unlocks the use of variables like pid, zonename, etc.
14218 */
14219 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14220 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14221 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14222 }
14223
14224 /*
14225 * dtrace_user allows use of syscall and profile providers.
14226 * If the user also has proc_owner and/or proc_zone, we
14227 * extend the scope to include additional visibility and
14228 * destructive power.
14229 */
14230 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14231 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14232 state->dts_cred.dcr_visible |=
14233 DTRACE_CRV_ALLPROC;
14234
14235 state->dts_cred.dcr_action |=
14236 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14237 }
14238
14239 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14240 state->dts_cred.dcr_visible |=
14241 DTRACE_CRV_ALLZONE;
14242
14243 state->dts_cred.dcr_action |=
14244 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14245 }
14246
14247 /*
14248 * If we have all privs in whatever zone this is,
14249 * we can do destructive things to processes which
14250 * have altered credentials.
14251 *
14252 * APPLE NOTE: Darwin doesn't do zones.
14253 * Behave as if zone always has destructive privs.
14254 */
14255
14256 state->dts_cred.dcr_action |=
14257 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14258 }
14259
14260 /*
14261 * Holding the dtrace_kernel privilege also implies that
14262 * the user has the dtrace_user privilege from a visibility
14263 * perspective. But without further privileges, some
14264 * destructive actions are not available.
14265 */
14266 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14267 /*
14268 * Make all probes in all zones visible. However,
14269 * this doesn't mean that all actions become available
14270 * to all zones.
14271 */
14272 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14273 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14274
14275 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14276 DTRACE_CRA_PROC;
14277 /*
14278 * Holding proc_owner means that destructive actions
14279 * for *this* zone are allowed.
14280 */
14281 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14282 state->dts_cred.dcr_action |=
14283 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14284
14285 /*
14286 * Holding proc_zone means that destructive actions
14287 * for this user/group ID in all zones are allowed.
14288 */
14289 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14290 state->dts_cred.dcr_action |=
14291 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14292
14293 /*
14294 * If we have all privs in whatever zone this is,
14295 * we can do destructive things to processes which
14296 * have altered credentials.
14297 *
14298 * APPLE NOTE: Darwin doesn't do zones.
14299 * Behave as if zone always has destructive privs.
14300 */
14301 state->dts_cred.dcr_action |=
14302 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14303 }
14304
14305 /*
14306 * Holding the dtrace_proc privilege gives control over fasttrap
14307 * and pid providers. We need to grant wider destructive
14308 * privileges in the event that the user has proc_owner and/or
14309 * proc_zone.
14310 */
14311 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14312 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14313 state->dts_cred.dcr_action |=
14314 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14315
14316 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14317 state->dts_cred.dcr_action |=
14318 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14319 }
14320 }
14321 #endif
14322
14323 *new_state = state;
14324 return(0); /* Success */
14325 }
14326
14327 static int
14328 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14329 {
14330 dtrace_optval_t *opt = state->dts_options, size;
14331 processorid_t cpu = 0;
14332 size_t limit = buf->dtb_size;
14333 int flags = 0, rval;
14334
14335 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14336 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14337 ASSERT(which < DTRACEOPT_MAX);
14338 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14339 (state == dtrace_anon.dta_state &&
14340 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14341
14342 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14343 return (0);
14344
14345 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14346 cpu = opt[DTRACEOPT_CPU];
14347
14348 if (which == DTRACEOPT_SPECSIZE)
14349 flags |= DTRACEBUF_NOSWITCH;
14350
14351 if (which == DTRACEOPT_BUFSIZE) {
14352 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14353 flags |= DTRACEBUF_RING;
14354
14355 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14356 flags |= DTRACEBUF_FILL;
14357
14358 if (state != dtrace_anon.dta_state ||
14359 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14360 flags |= DTRACEBUF_INACTIVE;
14361 }
14362
14363 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
14364 /*
14365 * The size must be 8-byte aligned. If the size is not 8-byte
14366 * aligned, drop it down by the difference.
14367 */
14368 if (size & (sizeof (uint64_t) - 1))
14369 size -= size & (sizeof (uint64_t) - 1);
14370
14371 if (size < state->dts_reserve) {
14372 /*
14373 * Buffers must always be large enough to accommodate
14374 * their prereserved space. We return E2BIG instead
14375 * of ENOMEM in this case to allow user-level
14376 * software to differentiate the cases.
14377 */
14378 return (E2BIG);
14379 }
14380 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
14381 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
14382
14383 if (rval != ENOMEM) {
14384 opt[which] = size;
14385 return (rval);
14386 }
14387
14388 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14389 return (rval);
14390 }
14391
14392 return (ENOMEM);
14393 }
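
/*
 * Illustrative sketch (the alloc callback is hypothetical) of the sizing
 * policy above: align the requested size down to 8 bytes, fail with E2BIG
 * if the result cannot cover the prereserved space, and otherwise halve
 * on ENOMEM until an allocation succeeds or the size drops below 8 bytes.
 */
static int
buffer_size_policy(int (*alloc)(dtrace_optval_t), dtrace_optval_t want,
    dtrace_optval_t reserve)
{
	dtrace_optval_t size;
	int rval;

	for (size = want; size >= (dtrace_optval_t)sizeof (uint64_t);
	    size >>= 1) {
		size -= size & (sizeof (uint64_t) - 1);

		if (size < reserve)
			return (E2BIG);

		if ((rval = alloc(size)) != ENOMEM)
			return (rval);
	}

	return (ENOMEM);
}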
14394
14395 static int
14396 dtrace_state_buffers(dtrace_state_t *state)
14397 {
14398 dtrace_speculation_t *spec = state->dts_speculations;
14399 int rval, i;
14400
14401 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14402 DTRACEOPT_BUFSIZE)) != 0)
14403 return (rval);
14404
14405 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14406 DTRACEOPT_AGGSIZE)) != 0)
14407 return (rval);
14408
14409 for (i = 0; i < state->dts_nspeculations; i++) {
14410 if ((rval = dtrace_state_buffer(state,
14411 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14412 return (rval);
14413 }
14414
14415 return (0);
14416 }
14417
14418 static void
14419 dtrace_state_prereserve(dtrace_state_t *state)
14420 {
14421 dtrace_ecb_t *ecb;
14422 dtrace_probe_t *probe;
14423
14424 state->dts_reserve = 0;
14425
14426 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14427 return;
14428
14429 /*
14430 * If our buffer policy is "fill", we need to set the
14431 * prereserved space to be the space required by the END probes.
14432 */
14433 probe = dtrace_probes[dtrace_probeid_end - 1];
14434 ASSERT(probe != NULL);
14435
14436 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14437 if (ecb->dte_state != state)
14438 continue;
14439
14440 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14441 }
14442 }
14443
14444 static int
14445 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14446 {
14447 dtrace_optval_t *opt = state->dts_options, sz, nspec;
14448 dtrace_speculation_t *spec;
14449 dtrace_buffer_t *buf;
14450 cyc_handler_t hdlr;
14451 cyc_time_t when;
14452 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14453 dtrace_icookie_t cookie;
14454
14455 lck_mtx_lock(&cpu_lock);
14456 lck_mtx_lock(&dtrace_lock);
14457
14458 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14459 rval = EBUSY;
14460 goto out;
14461 }
14462
14463 /*
14464 * Before we can perform any checks, we must prime all of the
14465 * retained enablings that correspond to this state.
14466 */
14467 dtrace_enabling_prime(state);
14468
14469 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14470 rval = EACCES;
14471 goto out;
14472 }
14473
14474 dtrace_state_prereserve(state);
14475
14476 /*
14477 * Now we want to try to allocate our speculations.
14478 * We do not automatically resize the number of speculations; if
14479 * this fails, we will fail the operation.
14480 */
14481 nspec = opt[DTRACEOPT_NSPEC];
14482 ASSERT(nspec != DTRACEOPT_UNSET);
14483
14484 if (nspec > INT_MAX) {
14485 rval = ENOMEM;
14486 goto out;
14487 }
14488
14489 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14490
14491 if (spec == NULL) {
14492 rval = ENOMEM;
14493 goto out;
14494 }
14495
14496 state->dts_speculations = spec;
14497 state->dts_nspeculations = (int)nspec;
14498
14499 for (i = 0; i < nspec; i++) {
14500 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14501 rval = ENOMEM;
14502 goto err;
14503 }
14504
14505 spec[i].dtsp_buffer = buf;
14506 }
14507
14508 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14509 if (dtrace_anon.dta_state == NULL) {
14510 rval = ENOENT;
14511 goto out;
14512 }
14513
14514 if (state->dts_necbs != 0) {
14515 rval = EALREADY;
14516 goto out;
14517 }
14518
14519 state->dts_anon = dtrace_anon_grab();
14520 ASSERT(state->dts_anon != NULL);
14521 state = state->dts_anon;
14522
14523 /*
14524 * We want "grabanon" to be set in the grabbed state, so we'll
14525 * copy that option value from the grabbing state into the
14526 * grabbed state.
14527 */
14528 state->dts_options[DTRACEOPT_GRABANON] =
14529 opt[DTRACEOPT_GRABANON];
14530
14531 *cpu = dtrace_anon.dta_beganon;
14532
14533 /*
14534 * If the anonymous state is active (as it almost certainly
14535 * is if the anonymous enabling ultimately matched anything),
14536 * we don't allow any further option processing -- but we
14537 * don't return failure.
14538 */
14539 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14540 goto out;
14541 }
14542
14543 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14544 opt[DTRACEOPT_AGGSIZE] != 0) {
14545 if (state->dts_aggregations == NULL) {
14546 /*
14547 * We're not going to create an aggregation buffer
14548 * because we don't have any ECBs that contain
14549 * aggregations -- set this option to 0.
14550 */
14551 opt[DTRACEOPT_AGGSIZE] = 0;
14552 } else {
14553 /*
14554 * If we have an aggregation buffer, we must also have
14555 * a buffer to use as scratch.
14556 */
14557 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14558 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14559 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14560 }
14561 }
14562 }
14563
14564 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14565 opt[DTRACEOPT_SPECSIZE] != 0) {
14566 if (!state->dts_speculates) {
14567 /*
14568 * We're not going to create speculation buffers
14569 * because we don't have any ECBs that actually
14570 * speculate -- set the speculation size to 0.
14571 */
14572 opt[DTRACEOPT_SPECSIZE] = 0;
14573 }
14574 }
14575
14576 /*
14577 * The bare minimum size for any buffer that we're actually going to
14578 * do anything to is sizeof (uint64_t).
14579 */
14580 sz = sizeof (uint64_t);
14581
14582 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14583 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14584 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14585 /*
14586 * A buffer size has been explicitly set to 0 (or to a size
14587 * that will be adjusted to 0) and we need the space -- we
14588 * need to return failure. We return ENOSPC to differentiate
14589 * it from failing to allocate a buffer due to failure to meet
14590 * the reserve (for which we return E2BIG).
14591 */
14592 rval = ENOSPC;
14593 goto out;
14594 }
14595
14596 if ((rval = dtrace_state_buffers(state)) != 0)
14597 goto err;
14598
14599 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14600 sz = dtrace_dstate_defsize;
14601
14602 do {
14603 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14604
14605 if (rval == 0)
14606 break;
14607
14608 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14609 goto err;
14610 } while (sz >>= 1);
14611
14612 opt[DTRACEOPT_DYNVARSIZE] = sz;
14613
14614 if (rval != 0)
14615 goto err;
14616
14617 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14618 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14619
14620 if (opt[DTRACEOPT_CLEANRATE] == 0)
14621 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14622
14623 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14624 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14625
14626 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14627 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14628
14629 if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
14630 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
14631
14632 if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
14633 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
14634
14635 if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
14636 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
14637
14638 if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
14639 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
14640
14641 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14642 hdlr.cyh_arg = state;
14643 hdlr.cyh_level = CY_LOW_LEVEL;
14644
14645 when.cyt_when = 0;
14646 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14647
14648 state->dts_cleaner = cyclic_add(&hdlr, &when);
14649
14650 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14651 hdlr.cyh_arg = state;
14652 hdlr.cyh_level = CY_LOW_LEVEL;
14653
14654 when.cyt_when = 0;
14655 when.cyt_interval = dtrace_deadman_interval;
14656
14657 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14658 state->dts_deadman = cyclic_add(&hdlr, &when);
14659
14660 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14661
14662 /*
14663 * Now it's time to actually fire the BEGIN probe. We need to disable
14664 * interrupts here both to record the CPU on which we fired the BEGIN
14665 * probe (the data from this CPU will be processed first at user
14666 * level) and to manually activate the buffer for this CPU.
14667 */
14668 cookie = dtrace_interrupt_disable();
14669 *cpu = CPU->cpu_id;
14670 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14671 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14672
14673 dtrace_probe(dtrace_probeid_begin,
14674 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14675 dtrace_interrupt_enable(cookie);
14676 /*
14677 * We may have had an exit action from a BEGIN probe; only change our
14678 * state to ACTIVE if we're still in WARMUP.
14679 */
14680 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14681 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14682
14683 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14684 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14685
14686 /*
14687 * Regardless of whether we're now in ACTIVE or DRAINING, we
14688 * want each CPU to transition its principal buffer out of the
14689 * INACTIVE state. Doing this assures that no CPU will suddenly begin
14690 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14691 * atomically transition from processing none of a state's ECBs to
14692 * processing all of them.
14693 */
14694 dtrace_xcall(DTRACE_CPUALL,
14695 (dtrace_xcall_t)dtrace_buffer_activate, state);
14696 goto out;
14697
14698 err:
14699 dtrace_buffer_free(state->dts_buffer);
14700 dtrace_buffer_free(state->dts_aggbuffer);
14701
14702 if ((nspec = state->dts_nspeculations) == 0) {
14703 ASSERT(state->dts_speculations == NULL);
14704 goto out;
14705 }
14706
14707 spec = state->dts_speculations;
14708 ASSERT(spec != NULL);
14709
14710 for (i = 0; i < state->dts_nspeculations; i++) {
14711 if ((buf = spec[i].dtsp_buffer) == NULL)
14712 break;
14713
14714 dtrace_buffer_free(buf);
14715 kmem_free(buf, bufsize);
14716 }
14717
14718 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14719 state->dts_nspeculations = 0;
14720 state->dts_speculations = NULL;
14721
14722 out:
14723 lck_mtx_unlock(&dtrace_lock);
14724 lck_mtx_unlock(&cpu_lock);
14725
14726 return (rval);
14727 }
14728
14729 static int
14730 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14731 {
14732 dtrace_icookie_t cookie;
14733
14734 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14735
14736 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14737 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14738 return (EINVAL);
14739
14740 /*
14741 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14742 * to be sure that every CPU has seen it. See below for the details
14743 * on why this is done.
14744 */
14745 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14746 dtrace_sync();
14747
14748 /*
14749 * By this point, it is impossible for any CPU to be still processing
14750 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14751 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14752 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14753 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14754 * iff we're in the END probe.
14755 */
14756 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14757 dtrace_sync();
14758 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14759
14760 /*
14761 * Finally, we can release the reserve and call the END probe. We
14762 * disable interrupts across calling the END probe to allow us to
14763 * return the CPU on which we actually called the END probe. This
14764 * allows user-land to be sure that this CPU's principal buffer is
14765 * processed last.
14766 */
14767 state->dts_reserve = 0;
14768
14769 cookie = dtrace_interrupt_disable();
14770 *cpu = CPU->cpu_id;
14771 dtrace_probe(dtrace_probeid_end,
14772 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14773 dtrace_interrupt_enable(cookie);
14774
14775 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14776 dtrace_sync();
14777
14778 return (0);
14779 }
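
/*
 * Illustrative sketch of the quiesce step used repeatedly above, with a
 * hypothetical name: each activity transition is followed by
 * dtrace_sync(), which acts as a barrier -- once it returns, no CPU can
 * still be executing probe context that observed the previous activity.
 */
static void
quiesce_to(dtrace_state_t *state, dtrace_activity_t activity)
{
	state->dts_activity = activity;	/* publish the new activity */
	dtrace_sync();			/* wait out all probe context */
}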
14780
14781 static int
14782 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14783 dtrace_optval_t val)
14784 {
14785 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14786
14787 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14788 return (EBUSY);
14789
14790 if (option >= DTRACEOPT_MAX)
14791 return (EINVAL);
14792
14793 if (option != DTRACEOPT_CPU && val < 0)
14794 return (EINVAL);
14795
14796 switch (option) {
14797 case DTRACEOPT_DESTRUCTIVE:
14798 if (dtrace_destructive_disallow)
14799 return (EACCES);
14800
14801 state->dts_cred.dcr_destructive = 1;
14802 break;
14803
14804 case DTRACEOPT_BUFSIZE:
14805 case DTRACEOPT_DYNVARSIZE:
14806 case DTRACEOPT_AGGSIZE:
14807 case DTRACEOPT_SPECSIZE:
14808 case DTRACEOPT_STRSIZE:
14809 if (val < 0)
14810 return (EINVAL);
14811
14812 if (val >= LONG_MAX) {
14813 /*
14814 * If this is an otherwise negative value, set it to
14815 * the highest multiple of 128m less than LONG_MAX.
14816 * Technically, we're adjusting the size without
14817 * regard to the buffer resizing policy, but in fact,
14818 * this has no effect -- if we set the buffer size to
14819 * ~LONG_MAX and the buffer policy is ultimately set to
14820 * be "manual", the buffer allocation is guaranteed to
14821 * fail, if only because the allocation requires two
14822 * buffers. (We set the size to the highest
14823 * multiple of 128m because it ensures that the size
14824 * will remain a multiple of a megabyte when
14825 * repeatedly halved -- all the way down to 15m.)
14826 */
14827 val = LONG_MAX - (1 << 27) + 1;
14828 }
14829 }
14830
14831 state->dts_options[option] = val;
14832
14833 return (0);
14834 }
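
/*
 * Illustrative sanity check of the clamp above, assuming LP64 (where
 * LONG_MAX is 2^63 - 1): LONG_MAX - (1 << 27) + 1 equals 2^63 - 2^27,
 * i.e. 2^27 * (2^36 - 1) -- the highest multiple of 128m below LONG_MAX
 * -- and it remains a multiple of one megabyte through seven halvings.
 */
static void
size_clamp_sanity(void)
{
	int64_t val = LONG_MAX - (1 << 27) + 1;
	int i;

	ASSERT((val % (128 << 20)) == 0);	/* multiple of 128m */

	for (i = 0; i < 7; i++) {
		val >>= 1;
		ASSERT((val % (1 << 20)) == 0);	/* still a megabyte multiple */
	}
}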
14835
14836 static void
14837 dtrace_state_destroy(dtrace_state_t *state)
14838 {
14839 dtrace_ecb_t *ecb;
14840 dtrace_vstate_t *vstate = &state->dts_vstate;
14841 minor_t minor = getminor(state->dts_dev);
14842 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14843 dtrace_speculation_t *spec = state->dts_speculations;
14844 int nspec = state->dts_nspeculations;
14845 uint32_t match;
14846
14847 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14848 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14849
14850 /*
14851 * First, retract any retained enablings for this state.
14852 */
14853 dtrace_enabling_retract(state);
14854 ASSERT(state->dts_nretained == 0);
14855
14856 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14857 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14858 /*
14859 * We have managed to come into dtrace_state_destroy() on a
14860 * hot enabling -- almost certainly because of a disorderly
14861 * shutdown of a consumer. (That is, a consumer that is
14862 * exiting without having called dtrace_stop().) In this case,
14863 * we're going to set our activity to be KILLED, and then
14864 * issue a sync to be sure that everyone is out of probe
14865 * context before we start blowing away ECBs.
14866 */
14867 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14868 dtrace_sync();
14869 }
14870
14871 /*
14872 * Release the credential hold we took in dtrace_state_create().
14873 */
14874 if (state->dts_cred.dcr_cred != NULL)
14875 kauth_cred_unref(&state->dts_cred.dcr_cred);
14876
14877 /*
14878 * Now we can safely disable and destroy any enabled probes. Because
14879 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14880 * (especially if they're all enabled), we take two passes through the
14881 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14882 * in the second we disable whatever is left over.
14883 */
14884 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14885 for (i = 0; i < state->dts_necbs; i++) {
14886 if ((ecb = state->dts_ecbs[i]) == NULL)
14887 continue;
14888
14889 if (match && ecb->dte_probe != NULL) {
14890 dtrace_probe_t *probe = ecb->dte_probe;
14891 dtrace_provider_t *prov = probe->dtpr_provider;
14892
14893 if (!(prov->dtpv_priv.dtpp_flags & match))
14894 continue;
14895 }
14896
14897 dtrace_ecb_disable(ecb);
14898 dtrace_ecb_destroy(ecb);
14899 }
14900
14901 if (!match)
14902 break;
14903 }
14904
14905 /*
14906 * Before we free the buffers, perform one more sync to assure that
14907 * every CPU is out of probe context.
14908 */
14909 dtrace_sync();
14910
14911 dtrace_buffer_free(state->dts_buffer);
14912 dtrace_buffer_free(state->dts_aggbuffer);
14913
14914 for (i = 0; i < (int)NCPU; i++) {
14915 kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
14916 }
14917 kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
14918
14919 for (i = 0; i < nspec; i++)
14920 dtrace_buffer_free(spec[i].dtsp_buffer);
14921
14922 if (state->dts_cleaner != CYCLIC_NONE)
14923 cyclic_remove(state->dts_cleaner);
14924
14925 if (state->dts_deadman != CYCLIC_NONE)
14926 cyclic_remove(state->dts_deadman);
14927
14928 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14929 dtrace_vstate_fini(vstate);
14930 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14931
14932 if (state->dts_aggregations != NULL) {
14933 #if DEBUG
14934 for (i = 0; i < state->dts_naggregations; i++)
14935 ASSERT(state->dts_aggregations[i] == NULL);
14936 #endif
14937 ASSERT(state->dts_naggregations > 0);
14938 kmem_free(state->dts_aggregations,
14939 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14940 }
14941
14942 kmem_free(state->dts_buffer, bufsize);
14943 kmem_free(state->dts_aggbuffer, bufsize);
14944
14945 for (i = 0; i < nspec; i++)
14946 kmem_free(spec[i].dtsp_buffer, bufsize);
14947
14948 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14949
14950 dtrace_format_destroy(state);
14951
14952 vmem_destroy(state->dts_aggid_arena);
14953 dtrace_state_free(minor);
14954 }
14955
14956 /*
14957 * DTrace Anonymous Enabling Functions
14958 */
14959
14960 int
14961 dtrace_keep_kernel_symbols(void)
14962 {
14963 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14964 return 0;
14965 }
14966
14967 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14968 return 1;
14969
14970 return 0;
14971 }
14972
14973 static dtrace_state_t *
14974 dtrace_anon_grab(void)
14975 {
14976 dtrace_state_t *state;
14977
14978 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14979
14980 if ((state = dtrace_anon.dta_state) == NULL) {
14981 ASSERT(dtrace_anon.dta_enabling == NULL);
14982 return (NULL);
14983 }
14984
14985 ASSERT(dtrace_anon.dta_enabling != NULL);
14986 ASSERT(dtrace_retained != NULL);
14987
14988 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14989 dtrace_anon.dta_enabling = NULL;
14990 dtrace_anon.dta_state = NULL;
14991
14992 return (state);
14993 }
14994
14995 static void
14996 dtrace_anon_property(void)
14997 {
14998 int i, rv;
14999 dtrace_state_t *state;
15000 dof_hdr_t *dof;
15001 char c[32]; /* enough for "dof-data-" + digits */
15002
15003 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15004 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15005
15006 for (i = 0; ; i++) {
15007 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15008
15009 dtrace_err_verbose = 1;
15010
15011 if ((dof = dtrace_dof_property(c)) == NULL) {
15012 dtrace_err_verbose = 0;
15013 break;
15014 }
15015
15016 #ifdef illumos
15017 /*
15018 * We want to create anonymous state, so we need to transition
15019 * the kernel debugger to indicate that DTrace is active. If
15020 * this fails (e.g. because the debugger has modified text in
15021 * some way), we won't continue with the processing.
15022 */
15023 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15024 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15025 "enabling ignored.");
15026 dtrace_dof_destroy(dof);
15027 break;
15028 }
15029 #endif
15030
15031 /*
15032 * If we haven't allocated an anonymous state, we'll do so now.
15033 */
15034 if ((state = dtrace_anon.dta_state) == NULL) {
15035 rv = dtrace_state_create(NULL, NULL, &state);
15036 dtrace_anon.dta_state = state;
15037 if (rv != 0 || state == NULL) {
15038 /*
15039 * This basically shouldn't happen: the only
15040 * failure mode from dtrace_state_create() is a
15041 * failure of ddi_soft_state_zalloc() that
15042 * itself should never happen. Still, the
15043 * interface allows for a failure mode, and
15044 * we want to fail as gracefully as possible:
15045 * we'll emit an error message and cease
15046 * processing anonymous state in this case.
15047 */
15048 cmn_err(CE_WARN, "failed to create "
15049 "anonymous state");
15050 dtrace_dof_destroy(dof);
15051 break;
15052 }
15053 }
15054
15055 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15056 &dtrace_anon.dta_enabling, 0, B_TRUE);
15057
15058 if (rv == 0)
15059 rv = dtrace_dof_options(dof, state);
15060
15061 dtrace_err_verbose = 0;
15062 dtrace_dof_destroy(dof);
15063
15064 if (rv != 0) {
15065 /*
15066 * This is malformed DOF; chuck any anonymous state
15067 * that we created.
15068 */
15069 ASSERT(dtrace_anon.dta_enabling == NULL);
15070 dtrace_state_destroy(state);
15071 dtrace_anon.dta_state = NULL;
15072 break;
15073 }
15074
15075 ASSERT(dtrace_anon.dta_enabling != NULL);
15076 }
15077
15078 if (dtrace_anon.dta_enabling != NULL) {
15079 int rval;
15080
15081 /*
15082 * dtrace_enabling_retain() can only fail because we are
15083 * trying to retain more enablings than are allowed -- but
15084 * we only have one anonymous enabling, and we are guaranteed
15085 * to be allowed at least one retained enabling; we assert
15086 * that dtrace_enabling_retain() returns success.
15087 */
15088 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15089 ASSERT(rval == 0);
15090
15091 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15092 }
15093 }
15094
15095 /*
15096 * DTrace Helper Functions
15097 */
15098 static void
15099 dtrace_helper_trace(dtrace_helper_action_t *helper,
15100 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15101 {
15102 uint32_t size, next, nnext;
15103 int i;
15104 dtrace_helptrace_t *ent;
15105 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15106
15107 if (!dtrace_helptrace_enabled)
15108 return;
15109
15110 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15111
15112 /*
15113 * What would a tracing framework be without its own tracing
15114 * framework? (Well, a hell of a lot simpler, for starters...)
15115 */
15116 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15117 sizeof (uint64_t) - sizeof (uint64_t);
15118
15119 /*
15120 * Iterate until we can allocate a slot in the trace buffer.
15121 */
15122 do {
15123 next = dtrace_helptrace_next;
15124
15125 if (next + size < dtrace_helptrace_bufsize) {
15126 nnext = next + size;
15127 } else {
15128 nnext = size;
15129 }
15130 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15131
15132 /*
15133 * We have our slot; fill it in.
15134 */
15135 if (nnext == size)
15136 next = 0;
15137
15138 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15139 ent->dtht_helper = helper;
15140 ent->dtht_where = where;
15141 ent->dtht_nlocals = vstate->dtvs_nlocals;
15142
15143 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15144 mstate->dtms_fltoffs : -1;
15145 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15146 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15147
15148 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15149 dtrace_statvar_t *svar;
15150
15151 if ((svar = vstate->dtvs_locals[i]) == NULL)
15152 continue;
15153
15154 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15155 ent->dtht_locals[i] =
15156 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15157 }
15158 }
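
/*
 * Illustrative sketch of the lock-free slot reservation above, with
 * hypothetical names.  Racing writers advance a shared cursor via
 * compare-and-swap; the winner owns [next, next + size), and a wrapped
 * reservation restarts the cursor at 'size' so the record lands at
 * offset 0, exactly as in dtrace_helper_trace().
 */
static uint32_t
ring_reserve(uint32_t *cursor, uint32_t size, uint32_t bufsize)
{
	uint32_t next, nnext;

	do {
		next = *cursor;
		nnext = (next + size < bufsize) ? next + size : size;
	} while (dtrace_cas32(cursor, next, nnext) != next);

	return (nnext == size ? 0 : next);
}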
15159
15160 __attribute__((noinline))
15161 static uint64_t
15162 dtrace_helper(int which, dtrace_mstate_t *mstate,
15163 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15164 {
15165 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15166 uint64_t sarg0 = mstate->dtms_arg[0];
15167 uint64_t sarg1 = mstate->dtms_arg[1];
15168 uint64_t rval = 0;
15169 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15170 dtrace_helper_action_t *helper;
15171 dtrace_vstate_t *vstate;
15172 dtrace_difo_t *pred;
15173 int i, trace = dtrace_helptrace_enabled;
15174
15175 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15176
15177 if (helpers == NULL)
15178 return (0);
15179
15180 if ((helper = helpers->dthps_actions[which]) == NULL)
15181 return (0);
15182
15183 vstate = &helpers->dthps_vstate;
15184 mstate->dtms_arg[0] = arg0;
15185 mstate->dtms_arg[1] = arg1;
15186
15187 /*
15188 * Now iterate over each helper. If its predicate evaluates to 'true',
15189 * we'll call the corresponding actions. Note that the below calls
15190 * to dtrace_dif_emulate() may set faults in machine state. This is
15191 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
15192 * the stored DIF offset with its own (which is the desired behavior).
15193 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15194 * from machine state; this is okay, too.
15195 */
15196 for (; helper != NULL; helper = helper->dtha_next) {
15197 if ((pred = helper->dtha_predicate) != NULL) {
15198 if (trace)
15199 dtrace_helper_trace(helper, mstate, vstate, 0);
15200
15201 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15202 goto next;
15203
15204 if (*flags & CPU_DTRACE_FAULT)
15205 goto err;
15206 }
15207
15208 for (i = 0; i < helper->dtha_nactions; i++) {
15209 if (trace)
15210 dtrace_helper_trace(helper,
15211 mstate, vstate, i + 1);
15212
15213 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15214 mstate, vstate, state);
15215
15216 if (*flags & CPU_DTRACE_FAULT)
15217 goto err;
15218 }
15219
15220 next:
15221 if (trace)
15222 dtrace_helper_trace(helper, mstate, vstate,
15223 DTRACE_HELPTRACE_NEXT);
15224 }
15225
15226 if (trace)
15227 dtrace_helper_trace(helper, mstate, vstate,
15228 DTRACE_HELPTRACE_DONE);
15229
15230 /*
15231 * Restore the arg0 that we saved upon entry.
15232 */
15233 mstate->dtms_arg[0] = sarg0;
15234 mstate->dtms_arg[1] = sarg1;
15235
15236 return (rval);
15237
15238 err:
15239 if (trace)
15240 dtrace_helper_trace(helper, mstate, vstate,
15241 DTRACE_HELPTRACE_ERR);
15242
15243 /*
15244 * Restore the arg0 that we saved upon entry.
15245 */
15246 mstate->dtms_arg[0] = sarg0;
15247 mstate->dtms_arg[1] = sarg1;
15248
15249 return (0);
15250 }
15251
15252 static void
15253 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15254 dtrace_vstate_t *vstate)
15255 {
15256 int i;
15257
15258 if (helper->dtha_predicate != NULL)
15259 dtrace_difo_release(helper->dtha_predicate, vstate);
15260
15261 for (i = 0; i < helper->dtha_nactions; i++) {
15262 ASSERT(helper->dtha_actions[i] != NULL);
15263 dtrace_difo_release(helper->dtha_actions[i], vstate);
15264 }
15265
15266 kmem_free(helper->dtha_actions,
15267 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15268 kmem_free(helper, sizeof (dtrace_helper_action_t));
15269 }
15270
15271 static int
15272 dtrace_helper_destroygen(proc_t* p, int gen)
15273 {
15274 dtrace_helpers_t *help = p->p_dtrace_helpers;
15275 dtrace_vstate_t *vstate;
15276 uint_t i;
15277
15278 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15279 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15280
15281 if (help == NULL || gen > help->dthps_generation)
15282 return (EINVAL);
15283
15284 vstate = &help->dthps_vstate;
15285
15286 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15287 dtrace_helper_action_t *last = NULL, *h, *next;
15288
15289 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15290 next = h->dtha_next;
15291
15292 if (h->dtha_generation == gen) {
15293 if (last != NULL) {
15294 last->dtha_next = next;
15295 } else {
15296 help->dthps_actions[i] = next;
15297 }
15298
15299 dtrace_helper_action_destroy(h, vstate);
15300 } else {
15301 last = h;
15302 }
15303 }
15304 }
15305
15306 /*
15307 * Iterate until we've cleared out all helper providers with the
15308 * given generation number.
15309 */
15310 for (;;) {
15311 dtrace_helper_provider_t *prov = NULL;
15312
15313 /*
15314 * Look for a helper provider with the right generation. We
15315 * have to start back at the beginning of the list each time
15316 * because we drop dtrace_lock. It's unlikely that we'll make
15317 * more than two passes.
15318 */
15319 for (i = 0; i < help->dthps_nprovs; i++) {
15320 prov = help->dthps_provs[i];
15321
15322 if (prov->dthp_generation == gen)
15323 break;
15324 }
15325
15326 /*
15327 * If there were no matches, we're done.
15328 */
15329 if (i == help->dthps_nprovs)
15330 break;
15331
15332 /*
15333 * Move the last helper provider into this slot.
15334 */
15335 help->dthps_nprovs--;
15336 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15337 help->dthps_provs[help->dthps_nprovs] = NULL;
15338
15339 lck_mtx_unlock(&dtrace_lock);
15340
15341 /*
15342 * If we have a meta provider, remove this helper provider.
15343 */
15344 if (dtrace_meta_pid != NULL) {
15345 ASSERT(dtrace_deferred_pid == NULL);
15346 dtrace_helper_provider_remove(&prov->dthp_prov,
15347 p);
15348 }
15349
15350 dtrace_helper_provider_destroy(prov);
15351
15352 lck_mtx_lock(&dtrace_lock);
15353 }
15354
15355 return (0);
15356 }
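
/*
 * Illustrative sketch (hypothetical names) of the order-insensitive
 * removal above: rather than shifting the tail of the array, the last
 * element is moved into the vacated slot, which is why the search must
 * restart from the beginning each time dtrace_lock is dropped.
 */
static void
provtab_swap_remove(dtrace_helper_provider_t **provs, uint_t *nprovs,
    uint_t i)
{
	(*nprovs)--;
	provs[i] = provs[*nprovs];
	provs[*nprovs] = NULL;
}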
15357
15358 static int
15359 dtrace_helper_validate(dtrace_helper_action_t *helper)
15360 {
15361 int err = 0, i;
15362 dtrace_difo_t *dp;
15363
15364 if ((dp = helper->dtha_predicate) != NULL)
15365 err += dtrace_difo_validate_helper(dp);
15366
15367 for (i = 0; i < helper->dtha_nactions; i++)
15368 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15369
15370 return (err == 0);
15371 }
15372
15373 static int
15374 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15375 {
15376 dtrace_helpers_t *help;
15377 dtrace_helper_action_t *helper, *last;
15378 dtrace_actdesc_t *act;
15379 dtrace_vstate_t *vstate;
15380 dtrace_predicate_t *pred;
15381 int count = 0, nactions = 0, i;
15382
15383 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15384 return (EINVAL);
15385
15386 help = p->p_dtrace_helpers;
15387 last = help->dthps_actions[which];
15388 vstate = &help->dthps_vstate;
15389
15390 for (count = 0; last != NULL; last = last->dtha_next) {
15391 count++;
15392 if (last->dtha_next == NULL)
15393 break;
15394 }
15395
15396 /*
15397 * If we already have dtrace_helper_actions_max helper actions for this
15398 * helper action type, we'll refuse to add a new one.
15399 */
15400 if (count >= dtrace_helper_actions_max)
15401 return (ENOSPC);
15402
15403 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15404 helper->dtha_generation = help->dthps_generation;
15405
15406 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15407 ASSERT(pred->dtp_difo != NULL);
15408 dtrace_difo_hold(pred->dtp_difo);
15409 helper->dtha_predicate = pred->dtp_difo;
15410 }
15411
15412 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15413 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15414 goto err;
15415
15416 if (act->dtad_difo == NULL)
15417 goto err;
15418
15419 nactions++;
15420 }
15421
15422 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15423 (helper->dtha_nactions = nactions), KM_SLEEP);
15424
15425 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15426 dtrace_difo_hold(act->dtad_difo);
15427 helper->dtha_actions[i++] = act->dtad_difo;
15428 }
15429
15430 if (!dtrace_helper_validate(helper))
15431 goto err;
15432
15433 if (last == NULL) {
15434 help->dthps_actions[which] = helper;
15435 } else {
15436 last->dtha_next = helper;
15437 }
15438
15439 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15440 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15441 dtrace_helptrace_next = 0;
15442 }
15443
15444 return (0);
15445 err:
15446 dtrace_helper_action_destroy(helper, vstate);
15447 return (EINVAL);
15448 }
15449
15450 static void
15451 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15452 dof_helper_t *dofhp)
15453 {
15454 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15455 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15456
15457 lck_mtx_lock(&dtrace_lock);
15458
15459 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15460 /*
15461 * If the dtrace module is loaded but not attached, or if
15462 * there isn't a meta provider registered to deal with
15463 * these provider descriptions, we need to postpone creating
15464 * the actual providers until later.
15465 */
15466
15467 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15468 dtrace_deferred_pid != help) {
15469 help->dthps_deferred = 1;
15470 help->dthps_pid = p->p_pid;
15471 help->dthps_next = dtrace_deferred_pid;
15472 help->dthps_prev = NULL;
15473 if (dtrace_deferred_pid != NULL)
15474 dtrace_deferred_pid->dthps_prev = help;
15475 dtrace_deferred_pid = help;
15476 }
15477
15478 lck_mtx_unlock(&dtrace_lock);
15479
15480 } else if (dofhp != NULL) {
15481 /*
15482 * If the dtrace module is loaded and we have a particular
15483 * helper provider description, pass that off to the
15484 * meta provider.
15485 */
15486
15487 lck_mtx_unlock(&dtrace_lock);
15488
15489 dtrace_helper_provide(dofhp, p);
15490
15491 } else {
15492 /*
15493 * Otherwise, just pass all the helper provider descriptions
15494 * off to the meta provider.
15495 */
15496
15497 uint_t i;
15498 lck_mtx_unlock(&dtrace_lock);
15499
15500 for (i = 0; i < help->dthps_nprovs; i++) {
15501 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15502 p);
15503 }
15504 }
15505 }
15506
15507 static int
15508 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15509 {
15510 dtrace_helpers_t *help;
15511 dtrace_helper_provider_t *hprov, **tmp_provs;
15512 uint_t tmp_maxprovs, i;
15513
15514 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15515 help = p->p_dtrace_helpers;
15516 ASSERT(help != NULL);
15517
15518 /*
15519 * If we already have dtrace_helper_providers_max helper providers,
15520 * we'll refuse to add a new one.
15521 */
15522 if (help->dthps_nprovs >= dtrace_helper_providers_max)
15523 return (ENOSPC);
15524
15525 /*
15526 * Check to make sure this isn't a duplicate.
15527 */
15528 for (i = 0; i < help->dthps_nprovs; i++) {
15529 if (dofhp->dofhp_addr ==
15530 help->dthps_provs[i]->dthp_prov.dofhp_addr)
15531 return (EALREADY);
15532 }
15533
15534 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15535 hprov->dthp_prov = *dofhp;
15536 hprov->dthp_ref = 1;
15537 hprov->dthp_generation = gen;
15538
15539 /*
15540 * Allocate a bigger table for helper providers if it's already full.
15541 */
15542 if (help->dthps_maxprovs == help->dthps_nprovs) {
15543 tmp_maxprovs = help->dthps_maxprovs;
15544 tmp_provs = help->dthps_provs;
15545
15546 if (help->dthps_maxprovs == 0)
15547 help->dthps_maxprovs = 2;
15548 else
15549 help->dthps_maxprovs *= 2;
15550 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15551 help->dthps_maxprovs = dtrace_helper_providers_max;
15552
15553 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15554
15555 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15556 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15557
15558 if (tmp_provs != NULL) {
15559 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15560 sizeof (dtrace_helper_provider_t *));
15561 kmem_free(tmp_provs, tmp_maxprovs *
15562 sizeof (dtrace_helper_provider_t *));
15563 }
15564 }
15565
15566 help->dthps_provs[help->dthps_nprovs] = hprov;
15567 help->dthps_nprovs++;
15568
15569 return (0);
15570 }
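
/*
 * The table growth above follows the double-until-capped reallocation
 * idiom. A compiled-out userspace sketch (all identifiers hypothetical;
 * malloc/calloc/free stand in for kmem_zalloc/kmem_free):
 */
#if 0
#include <stdlib.h>
#include <string.h>

struct table {
	void	**slots;
	unsigned  nslots;	/* slots in use */
	unsigned  maxslots;	/* slots allocated */
};

static int
table_append(struct table *t, void *item, unsigned hard_cap)
{
	if (t->nslots >= hard_cap)
		return (-1);		/* ENOSPC analog */

	if (t->maxslots == t->nslots) {
		unsigned newmax = (t->maxslots == 0) ? 2 : t->maxslots * 2;
		void **newslots;

		if (newmax > hard_cap)
			newmax = hard_cap;

		newslots = calloc(newmax, sizeof (void *));
		if (newslots == NULL)
			return (-1);
		if (t->slots != NULL) {
			memcpy(newslots, t->slots,
			    t->nslots * sizeof (void *));
			free(t->slots);
		}
		t->slots = newslots;
		t->maxslots = newmax;
	}

	t->slots[t->nslots++] = item;
	return (0);
}
#endif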
15571
15572 static void
15573 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15574 {
15575 lck_mtx_lock(&dtrace_lock);
15576
15577 if (--hprov->dthp_ref == 0) {
15578 dof_hdr_t *dof;
15579 lck_mtx_unlock(&dtrace_lock);
15580 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15581 dtrace_dof_destroy(dof);
15582 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15583 } else {
15584 lck_mtx_unlock(&dtrace_lock);
15585 }
15586 }
15587
15588 static int
15589 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15590 {
15591 uintptr_t daddr = (uintptr_t)dof;
15592 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15593 dof_provider_t *provider;
15594 dof_probe_t *probe;
15595 uint8_t *arg;
15596 char *strtab, *typestr;
15597 dof_stridx_t typeidx;
15598 size_t typesz;
15599 uint_t nprobes, j, k;
15600
15601 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15602
15603 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15604 dtrace_dof_error(dof, "misaligned section offset");
15605 return (-1);
15606 }
15607
15608 /*
15609 * The section needs to be large enough to contain the DOF provider
15610 * structure appropriate for the given version.
15611 */
15612 if (sec->dofs_size <
15613 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15614 offsetof(dof_provider_t, dofpv_prenoffs) :
15615 sizeof (dof_provider_t))) {
15616 dtrace_dof_error(dof, "provider section too small");
15617 return (-1);
15618 }
15619
15620 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15621 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15622 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15623 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15624 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15625
15626 if (str_sec == NULL || prb_sec == NULL ||
15627 arg_sec == NULL || off_sec == NULL)
15628 return (-1);
15629
15630 enoff_sec = NULL;
15631
15632 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15633 provider->dofpv_prenoffs != DOF_SECT_NONE &&
15634 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15635 provider->dofpv_prenoffs)) == NULL)
15636 return (-1);
15637
15638 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15639
15640 if (provider->dofpv_name >= str_sec->dofs_size ||
15641 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15642 dtrace_dof_error(dof, "invalid provider name");
15643 return (-1);
15644 }
15645
15646 if (prb_sec->dofs_entsize == 0 ||
15647 prb_sec->dofs_entsize > prb_sec->dofs_size) {
15648 dtrace_dof_error(dof, "invalid entry size");
15649 return (-1);
15650 }
15651
15652 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15653 dtrace_dof_error(dof, "misaligned entry size");
15654 return (-1);
15655 }
15656
15657 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15658 dtrace_dof_error(dof, "invalid entry size");
15659 return (-1);
15660 }
15661
15662 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15663 dtrace_dof_error(dof, "misaligned section offset");
15664 return (-1);
15665 }
15666
15667 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15668 dtrace_dof_error(dof, "invalid entry size");
15669 return (-1);
15670 }
15671
15672 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15673
15674 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15675
15676 /*
15677 * Take a pass through the probes to check for errors.
15678 */
15679 for (j = 0; j < nprobes; j++) {
15680 probe = (dof_probe_t *)(uintptr_t)(daddr +
15681 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15682
15683 if (probe->dofpr_func >= str_sec->dofs_size) {
15684 dtrace_dof_error(dof, "invalid function name");
15685 return (-1);
15686 }
15687
15688 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15689 dtrace_dof_error(dof, "function name too long");
15690 return (-1);
15691 }
15692
15693 if (probe->dofpr_name >= str_sec->dofs_size ||
15694 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15695 dtrace_dof_error(dof, "invalid probe name");
15696 return (-1);
15697 }
15698
15699 /*
15700 * The offset count must not wrap the index, and the offsets
15701 * must also not overflow the section's data.
15702 */
15703 if (probe->dofpr_offidx + probe->dofpr_noffs <
15704 probe->dofpr_offidx ||
15705 (probe->dofpr_offidx + probe->dofpr_noffs) *
15706 off_sec->dofs_entsize > off_sec->dofs_size) {
15707 dtrace_dof_error(dof, "invalid probe offset");
15708 return (-1);
15709 }
15710
15711 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15712 /*
15713 * If there's no is-enabled offset section, make sure
15714 * there aren't any is-enabled offsets. Otherwise
15715 * perform the same checks as for probe offsets
15716 * (immediately above).
15717 */
15718 if (enoff_sec == NULL) {
15719 if (probe->dofpr_enoffidx != 0 ||
15720 probe->dofpr_nenoffs != 0) {
15721 dtrace_dof_error(dof, "is-enabled "
15722 "offsets with null section");
15723 return (-1);
15724 }
15725 } else if (probe->dofpr_enoffidx +
15726 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15727 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15728 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15729 dtrace_dof_error(dof, "invalid is-enabled "
15730 "offset");
15731 return (-1);
15732 }
15733
15734 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15735 dtrace_dof_error(dof, "zero probe and "
15736 "is-enabled offsets");
15737 return (-1);
15738 }
15739 } else if (probe->dofpr_noffs == 0) {
15740 dtrace_dof_error(dof, "zero probe offsets");
15741 return (-1);
15742 }
15743
15744 if (probe->dofpr_argidx + probe->dofpr_xargc <
15745 probe->dofpr_argidx ||
15746 (probe->dofpr_argidx + probe->dofpr_xargc) *
15747 arg_sec->dofs_entsize > arg_sec->dofs_size) {
15748 dtrace_dof_error(dof, "invalid args");
15749 return (-1);
15750 }
15751
15752 typeidx = probe->dofpr_nargv;
15753 typestr = strtab + probe->dofpr_nargv;
15754 for (k = 0; k < probe->dofpr_nargc; k++) {
15755 if (typeidx >= str_sec->dofs_size) {
15756 dtrace_dof_error(dof, "bad "
15757 "native argument type");
15758 return (-1);
15759 }
15760
15761 typesz = strlen(typestr) + 1;
15762 if (typesz > DTRACE_ARGTYPELEN) {
15763 dtrace_dof_error(dof, "native "
15764 "argument type too long");
15765 return (-1);
15766 }
15767 typeidx += typesz;
15768 typestr += typesz;
15769 }
15770
15771 typeidx = probe->dofpr_xargv;
15772 typestr = strtab + probe->dofpr_xargv;
15773 for (k = 0; k < probe->dofpr_xargc; k++) {
15774 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15775 dtrace_dof_error(dof, "bad "
15776 "native argument index");
15777 return (-1);
15778 }
15779
15780 if (typeidx >= str_sec->dofs_size) {
15781 dtrace_dof_error(dof, "bad "
15782 "translated argument type");
15783 return (-1);
15784 }
15785
15786 typesz = strlen(typestr) + 1;
15787 if (typesz > DTRACE_ARGTYPELEN) {
15788 dtrace_dof_error(dof, "translated argument "
15789 "type too long");
15790 return (-1);
15791 }
15792
15793 typeidx += typesz;
15794 typestr += typesz;
15795 }
15796 }
15797
15798 return (0);
15799 }
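
/*
 * The recurring "idx + n < idx" tests above catch unsigned wraparound
 * before the scaled bounds comparison. Isolated, the check looks like
 * this (compiled out; all identifiers hypothetical):
 */
#if 0
#include <stdint.h>

/*
 * Validate that entries [idx, idx + n) of entsize-byte records fit
 * within a section of secsize bytes. Returns 0 if valid, -1 if not.
 */
static int
range_ok(uint32_t idx, uint32_t n, uint64_t entsize, uint64_t secsize)
{
	if (idx + n < idx)			/* the index wrapped */
		return (-1);
	if ((uint64_t)(idx + n) * entsize > secsize)	/* off the end */
		return (-1);
	return (0);
}
#endif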
15800
15801 static int
15802 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15803 {
15804 dtrace_helpers_t *help;
15805 dtrace_vstate_t *vstate;
15806 dtrace_enabling_t *enab = NULL;
15807 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15808 uintptr_t daddr = (uintptr_t)dof;
15809
15810 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15811 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15812
15813 if ((help = p->p_dtrace_helpers) == NULL)
15814 help = dtrace_helpers_create(p);
15815
15816 vstate = &help->dthps_vstate;
15817
15818 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15819 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15820 dtrace_dof_destroy(dof);
15821 return (rv);
15822 }
15823
15824 /*
15825 * Look for helper providers and validate their descriptions.
15826 */
15827 if (dhp != NULL) {
15828 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15829 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15830 dof->dofh_secoff + i * dof->dofh_secsize);
15831
15832 if (sec->dofs_type != DOF_SECT_PROVIDER)
15833 continue;
15834
15835 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15836 dtrace_enabling_destroy(enab);
15837 dtrace_dof_destroy(dof);
15838 return (-1);
15839 }
15840
15841 nprovs++;
15842 }
15843 }
15844
15845 /*
15846 * Now we need to walk through the ECB descriptions in the enabling.
15847 */
15848 for (i = 0; i < enab->dten_ndesc; i++) {
15849 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15850 dtrace_probedesc_t *desc = &ep->dted_probe;
15851
15852 /* APPLE NOTE: Darwin employs size-bounded string operations. */
15853 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15854 continue;
15855
15856 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15857 continue;
15858
15859 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15860 continue;
15861
15862 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15863 ep)) != 0) {
15864 /*
15865 * Adding this helper action failed -- we are now going
15866 * to rip out the entire generation and return failure.
15867 */
15868 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15869 dtrace_enabling_destroy(enab);
15870 dtrace_dof_destroy(dof);
15871 return (-1);
15872 }
15873
15874 nhelpers++;
15875 }
15876
15877 if (nhelpers < enab->dten_ndesc)
15878 dtrace_dof_error(dof, "unmatched helpers");
15879
15880 gen = help->dthps_generation++;
15881 dtrace_enabling_destroy(enab);
15882
15883 if (dhp != NULL && nprovs > 0) {
15884 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15885 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15886 lck_mtx_unlock(&dtrace_lock);
15887 dtrace_helper_provider_register(p, help, dhp);
15888 lck_mtx_lock(&dtrace_lock);
15889
15890 destroy = 0;
15891 }
15892 }
15893
15894 if (destroy)
15895 dtrace_dof_destroy(dof);
15896
15897 return (gen);
15898 }
15899
15900 /*
15901 * APPLE NOTE: DTrace lazy dof implementation
15902 *
15903 * DTrace user static probes (USDT probes) and helper actions are loaded
15904 * in a process by processing dof sections. The dof sections are passed
15905 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15906 * expensive to process dof for a process that will never use it. There
15907 * is a memory cost (allocating the providers/probes), and a cpu cost
15908 * (creating the providers/probes).
15909 *
15910 * To reduce this cost, we use "lazy dof". The normal procedure for
15911 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15912 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15913 * used, each process retains the dof_ioctl_data_t block, instead of
15914 * copying in the data it points to.
15915 *
15916 * The dof_ioctl_data_t blocks are managed as if they were the actual
15917 * processed dof; on fork the block is copied to the child, on exec and
15918 * exit the block is freed.
15919 *
15920 * If the process loads library(s) containing additional dof, the
15921 * new dof_ioctl_data_t is merged with the existing block.
15922 *
15923 * There are a few catches that make this slightly more difficult.
15924 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15925 * identifier value for each dof in the block. In non-lazy dof terms,
15926 * this is the generation that dof was loaded in. If we hand back
15927 * a UID for a lazy dof, that same UID must be able to unload the
15928 * dof once it has become non-lazy. To meet this requirement, the
15929 * code that loads lazy dof requires that the UID's for dof(s) in
15930 * the lazy dof be sorted in ascending order. It is okay to skip
15931 * UID's, e.g., 1 -> 5 -> 6 is legal.
15932 *
15933 * Once a process has become non-lazy, it will stay non-lazy. All
15934 * future dof operations for that process will be non-lazy, even
15935 * if the dof mode transitions back to lazy.
15936 *
15937 * Always do lazy dof checks before non-lazy ones (i.e., in fork, exit, and exec).
15938 * That way if the lazy check fails due to transitioning to non-lazy, the
15939 * right thing is done with the newly faulted in dof.
15940 */
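
/*
 * The "unique, ascending generation" rule described above, isolated as a
 * compiled-out sketch (all identifiers hypothetical, mirroring the
 * dofhp_addr/dofhp_dof pair): each incoming entry's dof field is
 * overwritten with the next generation, continuing from the last one
 * already recorded.
 */
#if 0
#include <stdint.h>

struct lazy_entry {
	uint64_t addr;	/* user address of the dof (dofhp_addr analog) */
	uint64_t dof;	/* overloaded: holds the generation while lazy */
};

static void
assign_generations(struct lazy_entry *existing, unsigned nexisting,
    struct lazy_entry *incoming, unsigned nincoming)
{
	uint64_t gen = (nexisting > 0) ?
	    existing[nexisting - 1].dof + 1 : 1;
	unsigned i;

	for (i = 0; i < nincoming; i++)
		incoming[i].dof = gen++;	/* strictly ascending */
}
#endif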
15941
15942 /*
15943 * This method is a bit tricky. It must handle four cases:
15944 *
15945 * dof should not be lazy.
15946 * dof should have been handled lazily, but there was an error.
15947 * dof was handled lazily, and needs to be freed.
15948 * dof was handled lazily, and must not be freed.
15949 *
15950 *
15951 * Returns EACCES if dof should be handled non-lazily.
15952 *
15953 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15954 *
15955 * If the dofs data is claimed by this method, dofs_claimed will be set.
15956 * Callers should not free claimed dofs.
15957 */
15958 static int
15959 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15960 {
15961 ASSERT(p);
15962 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15963
15964 int rval = 0;
15965 *dofs_claimed = 0;
15966
15967 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15968
15969 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15970 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15971
15972 /*
15973 * Any existing helpers force non-lazy behavior.
15974 */
15975 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15976 dtrace_sprlock(p);
15977
15978 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15979 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15980 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15981
15982 /*
15983 * Range check...
15984 */
15985 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15986 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15987 rval = EINVAL;
15988 goto unlock;
15989 }
15990
15991 /*
15992 * Each dof being added must be assigned a unique generation.
15993 */
15994 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15995 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15996 /*
15997 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15998 */
15999 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
16000 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
16001 }
16002
16003
16004 if (existing_dofs) {
16005 /*
16006 * Merge the existing and incoming dofs
16007 */
16008 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
16009 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
16010
16011 bcopy(&existing_dofs->dofiod_helpers[0],
16012 &merged_dofs->dofiod_helpers[0],
16013 sizeof(dof_helper_t) * existing_dofs_count);
16014 bcopy(&incoming_dofs->dofiod_helpers[0],
16015 &merged_dofs->dofiod_helpers[existing_dofs_count],
16016 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
16017
16018 merged_dofs->dofiod_count = merged_dofs_count;
16019
16020 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16021
16022 p->p_dtrace_lazy_dofs = merged_dofs;
16023 } else {
16024 /*
16025 * Claim the incoming dofs
16026 */
16027 *dofs_claimed = 1;
16028 p->p_dtrace_lazy_dofs = incoming_dofs;
16029 }
16030
16031 #if DEBUG
16032 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16033 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16034 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16035 }
16036 #endif /* DEBUG */
16037
16038 unlock:
16039 dtrace_sprunlock(p);
16040 } else {
16041 rval = EACCES;
16042 }
16043
16044 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16045
16046 return rval;
16047 }
16048
16049 /*
16050 * Returns:
16051 *
16052 * EINVAL: lazy dof is enabled, but the requested generation was not found.
16053 * EACCES: This removal needs to be handled non-lazily.
16054 */
16055 static int
16056 dtrace_lazy_dofs_remove(proc_t *p, int generation)
16057 {
16058 int rval = EINVAL;
16059
16060 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16061
16062 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16063 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
16064
16065 /*
16066 * Any existing helpers force non-lazy behavior.
16067 */
16068 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
16069 dtrace_sprlock(p);
16070
16071 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16072
16073 if (existing_dofs) {
16074 int index, existing_dofs_count = existing_dofs->dofiod_count;
16075 for (index=0; index<existing_dofs_count; index++) {
16076 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
16077 dof_ioctl_data_t* removed_dofs = NULL;
16078
16079 /*
16080 * If there is only 1 dof, we delete it and swap in NULL; otherwise we allocate a smaller block and copy the survivors.
16081 */
16082 if (existing_dofs_count > 1) {
16083 int removed_dofs_count = existing_dofs_count - 1;
16084 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
16085
16086 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
16087 removed_dofs->dofiod_count = removed_dofs_count;
16088
16089 /*
16090 * copy the remaining data.
16091 */
16092 if (index > 0) {
16093 bcopy(&existing_dofs->dofiod_helpers[0],
16094 &removed_dofs->dofiod_helpers[0],
16095 index * sizeof(dof_helper_t));
16096 }
16097
16098 if (index < existing_dofs_count-1) {
16099 bcopy(&existing_dofs->dofiod_helpers[index+1],
16100 &removed_dofs->dofiod_helpers[index],
16101 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
16102 }
16103 }
16104
16105 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16106
16107 p->p_dtrace_lazy_dofs = removed_dofs;
16108
16109 rval = KERN_SUCCESS;
16110
16111 break;
16112 }
16113 }
16114
16115 #if DEBUG
16116 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16117 if (all_dofs) {
16118 unsigned int i;
16119 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16120 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16121 }
16122 }
16123 #endif
16124
16125 }
16126 dtrace_sprunlock(p);
16127 } else {
16128 rval = EACCES;
16129 }
16130
16131 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16132
16133 return rval;
16134 }
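
/*
 * Removal above is done by allocating a block one entry smaller and
 * copying the entries on either side of the victim, never shifting in
 * place. A compiled-out sketch (all identifiers hypothetical; entries
 * reduced to bare uint64_t values):
 */
#if 0
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/*
 * Returns a newly allocated array of n - 1 entries with entry "victim"
 * removed, or NULL when n == 1 (the caller swaps in NULL). The caller
 * frees the old array.
 */
static uint64_t *
remove_entry(const uint64_t *old, unsigned n, unsigned victim)
{
	uint64_t *copy;

	if (n <= 1)
		return (NULL);

	copy = malloc((n - 1) * sizeof (uint64_t));
	if (copy == NULL)
		return (NULL);	/* indistinguishable from n == 1 here */

	if (victim > 0)
		memcpy(copy, old, victim * sizeof (uint64_t));
	if (victim < n - 1)
		memcpy(copy + victim, old + victim + 1,
		    (n - victim - 1) * sizeof (uint64_t));
	return (copy);
}
#endif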
16135
16136 void
16137 dtrace_lazy_dofs_destroy(proc_t *p)
16138 {
16139 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16140 dtrace_sprlock(p);
16141
16142 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16143
16144 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16145 p->p_dtrace_lazy_dofs = NULL;
16146
16147 dtrace_sprunlock(p);
16148 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16149
16150 if (lazy_dofs) {
16151 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16152 }
16153 }
16154
16155 static int
16156 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
16157 {
16158 #pragma unused(ignored)
16159 /*
16160 * Okay to NULL test without taking the sprlock.
16161 */
16162 return p->p_dtrace_lazy_dofs != NULL;
16163 }
16164
16165 static void
16166 dtrace_lazy_dofs_process(proc_t *p) {
16167 /*
16168 * It is possible this process may exit during our attempt to
16169 * fault in the dof. We could fix this by holding locks longer,
16170 * but the errors are benign.
16171 */
16172 dtrace_sprlock(p);
16173
16174
16175 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16176 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16177
16178 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16179 p->p_dtrace_lazy_dofs = NULL;
16180
16181 dtrace_sprunlock(p);
16182 lck_mtx_lock(&dtrace_meta_lock);
16183 /*
16184 * Process each dof_helper_t
16185 */
16186 if (lazy_dofs != NULL) {
16187 unsigned int i;
16188 int rval;
16189
16190 for (i=0; i<lazy_dofs->dofiod_count; i++) {
16191 /*
16192 * When loading lazy dof, we depend on the generations being sorted in ascending order.
16193 */
16194 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
16195
16196 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16197
16198 /*
16199 * We stored the generation in dofhp_dof. Save it, and restore the original value.
16200 */
16201 int generation = dhp->dofhp_dof;
16202 dhp->dofhp_dof = dhp->dofhp_addr;
16203
16204 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
16205
16206 if (dof != NULL) {
16207 dtrace_helpers_t *help;
16208
16209 lck_mtx_lock(&dtrace_lock);
16210
16211 /*
16212 * This must be done with the dtrace_lock held
16213 */
16214 if ((help = p->p_dtrace_helpers) == NULL)
16215 help = dtrace_helpers_create(p);
16216
16217 /*
16218 * If the generation value has been bumped, someone snuck in
16219 * when we released the dtrace lock. We have to dump this generation,
16220 * there is no safe way to load it.
16221 */
16222 if (help->dthps_generation <= generation) {
16223 help->dthps_generation = generation;
16224
16225 /*
16226 * dtrace_helper_slurp() takes responsibility for the dof --
16227 * it may free it now or it may save it and free it later.
16228 */
16229 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16230 dtrace_dof_error(NULL, "returned value did not match expected generation");
16231 }
16232 }
16233
16234 lck_mtx_unlock(&dtrace_lock);
16235 }
16236 }
16237 lck_mtx_unlock(&dtrace_meta_lock);
16238 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16239 } else {
16240 lck_mtx_unlock(&dtrace_meta_lock);
16241 }
16242 }
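
/*
 * The generation is smuggled through dofhp_dof while the dof is lazy,
 * then the field is restored to its true meaning (a copy of dofhp_addr)
 * before the dof is slurped. The save/restore step in isolation, as a
 * compiled-out sketch with hypothetical names:
 */
#if 0
#include <stdint.h>

struct helper {
	uint64_t addr;
	uint64_t dof;	/* generation while lazy, else == addr */
};

static uint64_t
unpack_generation(struct helper *h)
{
	uint64_t generation = h->dof;	/* save the overloaded value */

	h->dof = h->addr;		/* restore the original meaning */
	return (generation);
}
#endif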
16243
16244 static int
16245 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
16246 {
16247 #pragma unused(ignored)
16248
16249 dtrace_lazy_dofs_process(p);
16250
16251 return PROC_RETURNED;
16252 }
16253
16254 #define DTRACE_LAZY_DOFS_DUPLICATED 1
16255
16256 static int
16257 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
16258 {
16259 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16260 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16261 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16262
16263 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16264 dtrace_sprlock(parent);
16265
16266 /*
16267 * We need to make sure that the transition to lazy dofs -> helpers
16268 * was atomic for our parent
16269 */
16270 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
16271 /*
16272 * In theory we should hold the child sprlock, but this is safe...
16273 */
16274 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16275
16276 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16277 dof_ioctl_data_t* child_dofs = NULL;
16278 if (parent_dofs) {
16279 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16280 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16281 bcopy(parent_dofs, child_dofs, parent_dofs_size);
16282 }
16283
16284 dtrace_sprunlock(parent);
16285
16286 if (child_dofs) {
16287 dtrace_sprlock(child);
16288 child->p_dtrace_lazy_dofs = child_dofs;
16289 dtrace_sprunlock(child);
16290 /*
16291 * We process the DOF at this point if the mode is set to
16292 * LAZY_OFF. This can happen if DTrace is still processing the
16293 * DOF of another process (which can happen because the
16294 * protected pager can have a huge latency)
16295 * but has not processed our parent yet.
16296 */
16297 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16298 dtrace_lazy_dofs_process(child);
16299 }
16300 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16301
16302 return DTRACE_LAZY_DOFS_DUPLICATED;
16303 }
16304 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16305
16306 return 0;
16307 }
16308
16309 static dtrace_helpers_t *
16310 dtrace_helpers_create(proc_t *p)
16311 {
16312 dtrace_helpers_t *help;
16313
16314 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16315 ASSERT(p->p_dtrace_helpers == NULL);
16316
16317 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16318 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16319 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16320
16321 p->p_dtrace_helpers = help;
16322 dtrace_helpers++;
16323
16324 return (help);
16325 }
16326
16327 static void
16328 dtrace_helpers_destroy(proc_t* p)
16329 {
16330 dtrace_helpers_t *help;
16331 dtrace_vstate_t *vstate;
16332 uint_t i;
16333
16334 lck_mtx_lock(&dtrace_meta_lock);
16335 lck_mtx_lock(&dtrace_lock);
16336
16337 ASSERT(p->p_dtrace_helpers != NULL);
16338 ASSERT(dtrace_helpers > 0);
16339
16340 help = p->p_dtrace_helpers;
16341 vstate = &help->dthps_vstate;
16342
16343 /*
16344 * We're now going to lose the help from this process.
16345 */
16346 p->p_dtrace_helpers = NULL;
16347 dtrace_sync();
16348
16349 /*
16350 * Destroy the helper actions.
16351 */
16352 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16353 dtrace_helper_action_t *h, *next;
16354
16355 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16356 next = h->dtha_next;
16357 dtrace_helper_action_destroy(h, vstate);
16359 }
16360 }
16361
16362 lck_mtx_unlock(&dtrace_lock);
16363
16364 /*
16365 * Destroy the helper providers.
16366 */
16367 if (help->dthps_maxprovs > 0) {
16368 if (dtrace_meta_pid != NULL) {
16369 ASSERT(dtrace_deferred_pid == NULL);
16370
16371 for (i = 0; i < help->dthps_nprovs; i++) {
16372 dtrace_helper_provider_remove(
16373 &help->dthps_provs[i]->dthp_prov, p);
16374 }
16375 } else {
16376 lck_mtx_lock(&dtrace_lock);
16377 ASSERT(help->dthps_deferred == 0 ||
16378 help->dthps_next != NULL ||
16379 help->dthps_prev != NULL ||
16380 help == dtrace_deferred_pid);
16381
16382 /*
16383 * Remove the helper from the deferred list.
16384 */
16385 if (help->dthps_next != NULL)
16386 help->dthps_next->dthps_prev = help->dthps_prev;
16387 if (help->dthps_prev != NULL)
16388 help->dthps_prev->dthps_next = help->dthps_next;
16389 if (dtrace_deferred_pid == help) {
16390 dtrace_deferred_pid = help->dthps_next;
16391 ASSERT(help->dthps_prev == NULL);
16392 }
16393
16394 lck_mtx_unlock(&dtrace_lock);
16395 }
16396
16397
16398 for (i = 0; i < help->dthps_nprovs; i++) {
16399 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16400 }
16401
16402 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16403 sizeof (dtrace_helper_provider_t *));
16404 }
16405
16406 lck_mtx_lock(&dtrace_lock);
16407
16408 dtrace_vstate_fini(&help->dthps_vstate);
16409 kmem_free(help->dthps_actions,
16410 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16411 kmem_free(help, sizeof (dtrace_helpers_t));
16412
16413 --dtrace_helpers;
16414 lck_mtx_unlock(&dtrace_lock);
16415 lck_mtx_unlock(&dtrace_meta_lock);
16416 }
16417
16418 static void
16419 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16420 {
16421 dtrace_helpers_t *help, *newhelp;
16422 dtrace_helper_action_t *helper, *new, *last;
16423 dtrace_difo_t *dp;
16424 dtrace_vstate_t *vstate;
16425 uint_t i;
16426 int j, sz, hasprovs = 0;
16427
16428 lck_mtx_lock(&dtrace_meta_lock);
16429 lck_mtx_lock(&dtrace_lock);
16430 ASSERT(from->p_dtrace_helpers != NULL);
16431 ASSERT(dtrace_helpers > 0);
16432
16433 help = from->p_dtrace_helpers;
16434 newhelp = dtrace_helpers_create(to);
16435 ASSERT(to->p_dtrace_helpers != NULL);
16436
16437 newhelp->dthps_generation = help->dthps_generation;
16438 vstate = &newhelp->dthps_vstate;
16439
16440 /*
16441 * Duplicate the helper actions.
16442 */
16443 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16444 if ((helper = help->dthps_actions[i]) == NULL)
16445 continue;
16446
16447 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16448 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16449 KM_SLEEP);
16450 new->dtha_generation = helper->dtha_generation;
16451
16452 if ((dp = helper->dtha_predicate) != NULL) {
16453 dp = dtrace_difo_duplicate(dp, vstate);
16454 new->dtha_predicate = dp;
16455 }
16456
16457 new->dtha_nactions = helper->dtha_nactions;
16458 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16459 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16460
16461 for (j = 0; j < new->dtha_nactions; j++) {
16462 dtrace_difo_t *dpj = helper->dtha_actions[j];
16463
16464 ASSERT(dpj != NULL);
16465 dpj = dtrace_difo_duplicate(dpj, vstate);
16466 new->dtha_actions[j] = dpj;
16467 }
16468
16469 if (last != NULL) {
16470 last->dtha_next = new;
16471 } else {
16472 newhelp->dthps_actions[i] = new;
16473 }
16474
16475 last = new;
16476 }
16477 }
16478
16479 /*
16480 * Duplicate the helper providers and register them with the
16481 * DTrace framework.
16482 */
16483 if (help->dthps_nprovs > 0) {
16484 newhelp->dthps_nprovs = help->dthps_nprovs;
16485 newhelp->dthps_maxprovs = help->dthps_nprovs;
16486 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16487 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16488 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16489 newhelp->dthps_provs[i] = help->dthps_provs[i];
16490 newhelp->dthps_provs[i]->dthp_ref++;
16491 }
16492
16493 hasprovs = 1;
16494 }
16495
16496 lck_mtx_unlock(&dtrace_lock);
16497
16498 if (hasprovs)
16499 dtrace_helper_provider_register(to, newhelp, NULL);
16500
16501 lck_mtx_unlock(&dtrace_meta_lock);
16502 }
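
/*
 * As the duplication above shows, fork-time copying deep-copies the
 * per-process actions but merely takes a new reference on each shared
 * provider. That sharing step, as a compiled-out sketch (all
 * identifiers hypothetical):
 */
#if 0
struct provider {
	int refcnt;
};

static void
share_providers(struct provider **from, struct provider **to, unsigned n)
{
	unsigned i;

	for (i = 0; i < n; i++) {
		to[i] = from[i];	/* same object in both processes */
		to[i]->refcnt++;	/* released on either side's exit */
	}
}
#endif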
16503
16504 /*
16505 * DTrace Process functions
16506 */
16507
16508 void
16509 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
16510 {
16511 /*
16512 * This code applies to new processes who are copying the task
16513 * and thread state and address spaces of their parent process.
16514 */
16515 if (!spawn) {
16516 /*
16517 * APPLE NOTE: Solaris does a sprlock() and drops the
16518 * proc_lock here. We're cheating a bit and only taking
16519 * the p_dtrace_sprlock lock. A full sprlock would
16520 * task_suspend the parent.
16521 */
16522 dtrace_sprlock(parent_proc);
16523
16524 /*
16525 * Remove all DTrace tracepoints from the child process. We
16526 * need to do this _before_ duplicating USDT providers since
16527 * any associated probes may be immediately enabled.
16528 */
16529 if (parent_proc->p_dtrace_count > 0) {
16530 dtrace_fasttrap_fork(parent_proc, child_proc);
16531 }
16532
16533 dtrace_sprunlock(parent_proc);
16534
16535 /*
16536 * Duplicate any lazy dof(s). This must be done while NOT
16537 * holding the parent sprlock! Lock ordering is
16538 * dtrace_dof_mode_lock, then sprlock. It is imperative we
16539 * always call dtrace_lazy_dofs_duplicate, rather than null
16540 * check and call if !NULL. If we NULL test, during lazy dof
16541 * faulting we can race with the faulting code and proceed
16542 * from here to beyond the helpers copy. The lazy dof
16543 * faulting will then fail to copy the helpers to the child
16544 * process. We return early if we duplicated lazy dofs, as a
16545 * process may only have one of the two at a time; this avoids a
16546 * race between a dtrace client and dtrace_proc_fork in which a
16547 * process would end up with both lazy dofs and helpers.
16548 */
16549 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16550 return;
16551 }
16552
16553 /*
16554 * Duplicate any helper actions and providers if they haven't
16555 * already.
16556 */
16557 #if !defined(__APPLE__)
16558 /*
16559 * The SFORKING
16560 * we set above informs the code that enables USDT probes that
16561 * sprlock() may fail because the child is being forked.
16562 */
16563 #endif
16564 /*
16565 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16566 * never fails to find the child. We do not set SFORKING.
16567 */
16568 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16569 (*dtrace_helpers_fork)(parent_proc, child_proc);
16570 }
16571 }
16572 }
16573
16574 void
16575 dtrace_proc_exec(proc_t *p)
16576 {
16577 /*
16578 * Invalidate any predicate evaluation already cached for this thread by DTrace.
16579 * That's because we've just stored to p_comm and DTrace refers to that when it
16580 * evaluates the "execname" special variable. uid and gid may have changed as well.
16581 */
16582 dtrace_set_thread_predcache(current_thread(), 0);
16583
16584 /*
16585 * Free any outstanding lazy dof entries. It is imperative we
16586 * always call dtrace_lazy_dofs_destroy, rather than null check
16587 * and call if !NULL. If we NULL test, during lazy dof faulting
16588 * we can race with the faulting code and proceed from here to
16589 * beyond the helpers cleanup. The lazy dof faulting will then
16590 * install new helpers which no longer belong to this process!
16591 */
16592 dtrace_lazy_dofs_destroy(p);
16593
16594
16595 /*
16596 * Clean up any DTrace helpers for the process.
16597 */
16598 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16599 (*dtrace_helpers_cleanup)(p);
16600 }
16601
16602 /*
16603 * Cleanup the DTrace provider associated with this process.
16604 */
16605 proc_lock(p);
16606 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16607 (*dtrace_fasttrap_exec_ptr)(p);
16608 }
16609 proc_unlock(p);
16610 }
16611
16612 void
16613 dtrace_proc_exit(proc_t *p)
16614 {
16615 /*
16616 * Free any outstanding lazy dof entries. It is imperative we
16617 * always call dtrace_lazy_dofs_destroy, rather than null check
16618 * and call if !NULL. If we NULL test, during lazy dof faulting
16619 * we can race with the faulting code and proceed from here to
16620 * beyond the helpers cleanup. The lazy dof faulting will then
16621 * install new helpers which will never be cleaned up, and leak.
16622 */
16623 dtrace_lazy_dofs_destroy(p);
16624
16625 /*
16626 * Clean up any DTrace helper actions or probes for the process.
16627 */
16628 if (p->p_dtrace_helpers != NULL) {
16629 (*dtrace_helpers_cleanup)(p);
16630 }
16631
16632 /*
16633 * Clean up any DTrace probes associated with this process.
16634 */
16635 /*
16636 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16637 * so call this after dtrace_helpers_cleanup().
16638 */
16639 proc_lock(p);
16640 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16641 (*dtrace_fasttrap_exit_ptr)(p);
16642 }
16643 proc_unlock(p);
16644 }
16645
16646 /*
16647 * DTrace Hook Functions
16648 */
16649
16650 /*
16651 * APPLE NOTE: dtrace_modctl_* routines for kext support.
16652 * Used to manipulate the modctl list within dtrace xnu.
16653 */
16654
16655 modctl_t *dtrace_modctl_list;
16656
16657 static void
16658 dtrace_modctl_add(struct modctl * newctl)
16659 {
16660 struct modctl *nextp, *prevp;
16661
16662 ASSERT(newctl != NULL);
16663 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16664
16665 // Insert the new module at the front of the list.
16666
16667 newctl->mod_next = dtrace_modctl_list;
16668 dtrace_modctl_list = newctl;
16669
16670 /*
16671 * If a module exists with the same name, then that module
16672 * must have been unloaded with enabled probes. We will move
16673 * the unloaded module to the new module's stale chain and
16674 * then stop traversing the list.
16675 */
16676
16677 prevp = newctl;
16678 nextp = newctl->mod_next;
16679
16680 while (nextp != NULL) {
16681 if (nextp->mod_loaded) {
16682 /* This is a loaded module. Keep traversing. */
16683 prevp = nextp;
16684 nextp = nextp->mod_next;
16685 continue;
16686 }
16687 else {
16688 /* Found an unloaded module */
16689 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16690 /* Names don't match. Keep traversing. */
16691 prevp = nextp;
16692 nextp = nextp->mod_next;
16693 continue;
16694 }
16695 else {
16696 /* We found a stale entry, move it. We're done. */
16697 prevp->mod_next = nextp->mod_next;
16698 newctl->mod_stale = nextp;
16699 nextp->mod_next = NULL;
16700 break;
16701 }
16702 }
16703 }
16704 }
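
/*
 * The stale-chain adoption above, isolated as a compiled-out sketch
 * (all identifiers hypothetical; modctl reduced to the fields the walk
 * actually touches): after the front insertion, a same-named unloaded
 * entry is unlinked from the main list and parked on the new entry's
 * stale chain.
 */
#if 0
#include <stddef.h>
#include <string.h>

struct mod {
	struct mod *next;
	struct mod *stale;
	int	    loaded;
	char	    name[64];
};

static void
adopt_stale(struct mod *newmod)
{
	struct mod *prev = newmod, *cur = newmod->next;

	for (; cur != NULL; prev = cur, cur = cur->next) {
		if (cur->loaded ||
		    strncmp(newmod->name, cur->name, sizeof (cur->name)))
			continue;
		prev->next = cur->next;		/* unlink from main list */
		newmod->stale = cur;		/* park on stale chain */
		cur->next = NULL;
		break;
	}
}
#endif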
16705
16706 static modctl_t *
16707 dtrace_modctl_lookup(struct kmod_info * kmod)
16708 {
16709 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16710
16711 struct modctl * ctl;
16712
16713 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16714 if (ctl->mod_id == kmod->id)
16715 return(ctl);
16716 }
16717 return (NULL);
16718 }
16719
16720 /*
16721 * This routine is called from dtrace_module_unloaded().
16722 * It removes a modctl structure and its stale chain
16723 * from the kext shadow list.
16724 */
16725 static void
16726 dtrace_modctl_remove(struct modctl * ctl)
16727 {
16728 ASSERT(ctl != NULL);
16729 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16730 modctl_t *prevp, *nextp, *curp;
16731
16732 // Remove stale chain first
16733 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16734 nextp = curp->mod_stale;
16735 /* There should NEVER be user symbols allocated at this point */
16736 ASSERT(curp->mod_user_symbols == NULL);
16737 kmem_free(curp, sizeof(modctl_t));
16738 }
16739
16740 prevp = NULL;
16741 curp = dtrace_modctl_list;
16742
16743 while (curp != ctl) {
16744 prevp = curp;
16745 curp = curp->mod_next;
16746 }
16747
16748 if (prevp != NULL) {
16749 prevp->mod_next = ctl->mod_next;
16750 }
16751 else {
16752 dtrace_modctl_list = ctl->mod_next;
16753 }
16754
16755 /* There should NEVER be user symbols allocated at this point */
16756 ASSERT(ctl->mod_user_symbols == NULL);
16757
16758 kmem_free (ctl, sizeof(modctl_t));
16759 }
16760
16761 /*
16762 * APPLE NOTE: The kext loader will call dtrace_module_loaded
16763 * when the kext is loaded in memory, but before calling the
16764 * kext's start routine.
16765 *
16766 * Return 0 on success
16767 * Return -1 on failure
16768 */
16769
16770 static int
16771 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16772 {
16773 dtrace_provider_t *prv;
16774
16775 /*
16776 * If kernel symbols have been disabled, return immediately.
16777 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode; it is safe to test without holding locks.
16778 */
16779 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16780 return 0;
16781
16782 struct modctl *ctl = NULL;
16783 if (!kmod || kmod->address == 0 || kmod->size == 0)
16784 return(-1);
16785
16786 lck_mtx_lock(&dtrace_provider_lock);
16787 lck_mtx_lock(&mod_lock);
16788
16789 /*
16790 * Have we seen this kext before?
16791 */
16792
16793 ctl = dtrace_modctl_lookup(kmod);
16794
16795 if (ctl != NULL) {
16796 /* bail... we already have this kext in the modctl list */
16797 lck_mtx_unlock(&mod_lock);
16798 lck_mtx_unlock(&dtrace_provider_lock);
16799 if (dtrace_err_verbose)
16800 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16801 return(-1);
16802 }
16803 else {
16804 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16805 if (ctl == NULL) {
16806 if (dtrace_err_verbose)
16807 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16808 lck_mtx_unlock(&mod_lock);
16809 lck_mtx_unlock(&dtrace_provider_lock);
16810 return (-1);
16811 }
16812 ctl->mod_next = NULL;
16813 ctl->mod_stale = NULL;
16814 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16815 ctl->mod_loadcnt = kmod->id;
16816 ctl->mod_nenabled = 0;
16817 ctl->mod_address = kmod->address;
16818 ctl->mod_size = kmod->size;
16819 ctl->mod_id = kmod->id;
16820 ctl->mod_loaded = 1;
16821 ctl->mod_flags = 0;
16822 ctl->mod_user_symbols = NULL;
16823
16824 /*
16825 * Find the UUID for this module, if it has one
16826 */
16827 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16828 struct load_command* load_cmd = (struct load_command *)&header[1];
16829 uint32_t i;
16830 for (i = 0; i < header->ncmds; i++) {
16831 if (load_cmd->cmd == LC_UUID) {
16832 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16833 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16834 ctl->mod_flags |= MODCTL_HAS_UUID;
16835 break;
16836 }
16837 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16838 }
16839
16840 if (ctl->mod_address == g_kernel_kmod_info.address) {
16841 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16842 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16843 }
16844 /*
16845 * Static kexts have a UUID that is not used for symbolication, as all their
16846 * symbols are in the kernel.
16847 */
16848 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16849 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16850 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16851 }
16852 }
16853 dtrace_modctl_add(ctl);
16854
16855 /*
16856 * We must hold the dtrace_lock to safely test the non-permanent dtrace_kernel_symbol_mode values.
16857 */
16858 lck_mtx_lock(&dtrace_lock);
16859
16860 /*
16861 * DTrace must decide if it will instrument modules lazily via
16862 * userspace symbols (default mode), or instrument immediately via
16863 * kernel symbols (non-default mode).
16864 *
16865 * When in default/lazy mode, DTrace will only support modules
16866 * built with a valid UUID.
16867 *
16868 * Overriding the default can be done explicitly in one of
16869 * the following two ways.
16870 *
16871 * A module can force symbols from kernel space using the plist key,
16872 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16873 * we fall through and instrument this module now.
16874 *
16875 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16876 * from kernel space (see dtrace_impl.h). If this system state is set
16877 * to a non-userspace mode, we fall through and instrument the module now.
16878 */
16879
16880 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16881 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16882 {
16883 /* We will instrument the module lazily -- this is the default */
16884 lck_mtx_unlock(&dtrace_lock);
16885 lck_mtx_unlock(&mod_lock);
16886 lck_mtx_unlock(&dtrace_provider_lock);
16887 return 0;
16888 }
16889
16890 /* We will instrument the module immediately using kernel symbols */
16891 if (!(flag & KMOD_DTRACE_NO_KERNEL_SYMS)) {
16892 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16893 }
16894
16895 lck_mtx_unlock(&dtrace_lock);
16896
16897 /*
16898 * We're going to call each provider's per-module provide operation,
16899 * specifying only this module.
16900 */
16901 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16902 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16903
16904 /*
16905 * APPLE NOTE: The contract with the kext loader is that once this function
16906 * has completed, it may delete kernel symbols at will.
16907 * We must set this while still holding the mod_lock.
16908 */
16909 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16910
16911 lck_mtx_unlock(&mod_lock);
16912 lck_mtx_unlock(&dtrace_provider_lock);
16913
16914 /*
16915 * If we have any retained enablings, we need to match against them.
16916 * Enabling probes requires that cpu_lock be held, and we cannot hold
16917 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16918 * module. (In particular, this happens when loading scheduling
16919 * classes.) So if we have any retained enablings, we need to dispatch
16920 * our task queue to do the match for us.
16921 */
16922 lck_mtx_lock(&dtrace_lock);
16923
16924 if (dtrace_retained == NULL) {
16925 lck_mtx_unlock(&dtrace_lock);
16926 return 0;
16927 }
16928
16929 /* APPLE NOTE!
16930 *
16931 * The cpu_lock mentioned above is only held by dtrace code; Apple's xnu never actually
16932 * holds it for any reason. Thus the comment above does not apply: we can directly invoke
16933 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16934 * the delay call as well.
16935 */
16936 lck_mtx_unlock(&dtrace_lock);
16937
16938 dtrace_enabling_matchall();
16939
16940 return 0;
16941 }
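
/*
 * The LC_UUID scan in dtrace_module_loaded() is the standard Mach-O
 * load-command walk. A compiled-out userspace analog (assuming a 64-bit
 * process; image 0 is normally the main executable) that prints the
 * image UUID:
 */
#if 0
#include <mach-o/dyld.h>
#include <mach-o/loader.h>
#include <stdint.h>
#include <stdio.h>

static void
print_main_image_uuid(void)
{
	const struct mach_header_64 *hdr =
	    (const struct mach_header_64 *)_dyld_get_image_header(0);
	const struct load_command *lc =
	    (const struct load_command *)(hdr + 1);
	uint32_t i;

	for (i = 0; i < hdr->ncmds; i++) {
		if (lc->cmd == LC_UUID) {
			const struct uuid_command *uc =
			    (const struct uuid_command *)lc;
			for (int j = 0; j < 16; j++)
				printf("%02X", uc->uuid[j]);
			printf("\n");
			return;
		}
		lc = (const struct load_command *)
		    ((const char *)lc + lc->cmdsize);
	}
}
#endif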
16942
16943 /*
16944 * Return 0 on success
16945 * Return -1 on failure
16946 */
16947 static int
16948 dtrace_module_unloaded(struct kmod_info *kmod)
16949 {
16950 dtrace_probe_t template, *probe, *first, *next;
16951 dtrace_provider_t *prov;
16952 struct modctl *ctl = NULL;
16953 struct modctl *syncctl = NULL;
16954 struct modctl *nextsyncctl = NULL;
16955 int syncmode = 0;
16956
16957 lck_mtx_lock(&dtrace_provider_lock);
16958 lck_mtx_lock(&mod_lock);
16959 lck_mtx_lock(&dtrace_lock);
16960
16961 if (kmod == NULL) {
16962 syncmode = 1;
16963 }
16964 else {
16965 ctl = dtrace_modctl_lookup(kmod);
16966 if (ctl == NULL)
16967 {
16968 lck_mtx_unlock(&dtrace_lock);
16969 lck_mtx_unlock(&mod_lock);
16970 lck_mtx_unlock(&dtrace_provider_lock);
16971 return (-1);
16972 }
16973 ctl->mod_loaded = 0;
16974 ctl->mod_address = 0;
16975 ctl->mod_size = 0;
16976 }
16977
16978 if (dtrace_bymod == NULL) {
16979 /*
16980 * The DTrace module is loaded (obviously) but not attached;
16981 * we don't have any work to do.
16982 */
16983 if (ctl != NULL)
16984 (void)dtrace_modctl_remove(ctl);
16985 lck_mtx_unlock(&dtrace_lock);
16986 lck_mtx_unlock(&mod_lock);
16987 lck_mtx_unlock(&dtrace_provider_lock);
16988 return(0);
16989 }
16990
16991 /* Syncmode set means we target and traverse the entire modctl list. */
16992 if (syncmode)
16993 nextsyncctl = dtrace_modctl_list;
16994
16995 syncloop:
16996 if (syncmode)
16997 {
16998 /* find a stale modctl struct */
16999 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
17000 if (syncctl->mod_address == 0)
17001 break;
17002 }
17003 if (syncctl==NULL)
17004 {
17005 /* We have no more work to do */
17006 lck_mtx_unlock(&dtrace_lock);
17007 lck_mtx_unlock(&mod_lock);
17008 lck_mtx_unlock(&dtrace_provider_lock);
17009 return(0);
17010 }
17011 else {
17012 /* keep track of next syncctl in case this one is removed */
17013 nextsyncctl = syncctl->mod_next;
17014 ctl = syncctl;
17015 }
17016 }
17017
17018 template.dtpr_mod = ctl->mod_modname;
17019
17020 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
17021 probe != NULL; probe = probe->dtpr_nextmod) {
17022 if (probe->dtpr_ecb != NULL) {
17023 /*
17024 * This shouldn't _actually_ be possible -- we're
17025 * unloading a module that has an enabled probe in it.
17026 * (It's normally up to the provider to make sure that
17027 * this can't happen.) However, because dtps_enable()
17028 * doesn't have a failure mode, there can be an
17029 * enable/unload race. Upshot: we don't want to
17030 * assert, but we're not going to disable the
17031 * probe, either.
17032 */
17033
17034
17035 if (syncmode) {
17036 /* We're syncing, let's look at next in list */
17037 goto syncloop;
17038 }
17039
17040 lck_mtx_unlock(&dtrace_lock);
17041 lck_mtx_unlock(&mod_lock);
17042 lck_mtx_unlock(&dtrace_provider_lock);
17043
17044 if (dtrace_err_verbose) {
17045 cmn_err(CE_WARN, "unloaded module '%s' had "
17046 "enabled probes", ctl->mod_modname);
17047 }
17048 return(-1);
17049 }
17050 }
17051
17052 probe = first;
17053
17054 for (first = NULL; probe != NULL; probe = next) {
17055 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
17056
17057 dtrace_probes[probe->dtpr_id - 1] = NULL;
17058 probe->dtpr_provider->dtpv_probe_count--;
17059
17060 next = probe->dtpr_nextmod;
17061 dtrace_hash_remove(dtrace_byprov, probe);
17062 dtrace_hash_remove(dtrace_bymod, probe);
17063 dtrace_hash_remove(dtrace_byfunc, probe);
17064 dtrace_hash_remove(dtrace_byname, probe);
17065
17066 if (first == NULL) {
17067 first = probe;
17068 probe->dtpr_nextmod = NULL;
17069 } else {
17070 probe->dtpr_nextmod = first;
17071 first = probe;
17072 }
17073 }
17074
17075 /*
17076 * We've removed all of the module's probes from the hash chains and
17077 * from the probe array. Now issue a dtrace_sync() to be sure that
17078 * everyone has cleared out from any probe array processing.
17079 */
17080 dtrace_sync();
17081
17082 for (probe = first; probe != NULL; probe = first) {
17083 first = probe->dtpr_nextmod;
17084 prov = probe->dtpr_provider;
17085 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
17086 probe->dtpr_arg);
17087 dtrace_strunref(probe->dtpr_mod);
17088 dtrace_strunref(probe->dtpr_func);
17089 dtrace_strunref(probe->dtpr_name);
17090 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
17091
17092 zfree(dtrace_probe_t_zone, probe);
17093 }
17094
17095 dtrace_modctl_remove(ctl);
17096
17097 if (syncmode)
17098 goto syncloop;
17099
17100 lck_mtx_unlock(&dtrace_lock);
17101 lck_mtx_unlock(&mod_lock);
17102 lck_mtx_unlock(&dtrace_provider_lock);
17103
17104 return(0);
17105 }
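
/*
 * The teardown above has an RCU-like shape: unlink every probe from all
 * lookup structures first, dtrace_sync() once so probe context can no
 * longer observe the entries, and only then destroy and free them. A
 * compiled-out sketch with stand-in functions (all identifiers
 * hypothetical):
 */
#if 0
#include <stdlib.h>

struct probe {
	struct probe *next;
};

/* Stand-ins for dtrace_hash_remove() and dtrace_sync(). */
static void hash_remove(struct probe *p) { (void)p; }
static void sync_probe_context(void) { }

static void
unload_all(struct probe **list)
{
	struct probe *p;

	/* Pass 1: make entries unreachable from every lookup path. */
	for (p = *list; p != NULL; p = p->next)
		hash_remove(p);

	/* Wait until no CPU can still be mid-lookup on an old pointer. */
	sync_probe_context();

	/* Pass 2: only now is it safe to destroy and free. */
	while ((p = *list) != NULL) {
		*list = p->next;
		free(p);
	}
}
#endif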
17106
17107 void
17108 dtrace_suspend(void)
17109 {
17110 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
17111 }
17112
17113 void
17114 dtrace_resume(void)
17115 {
17116 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
17117 }
17118
17119 static int
17120 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
17121 {
17122 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17123 lck_mtx_lock(&dtrace_lock);
17124
17125 switch (what) {
17126 case CPU_CONFIG: {
17127 dtrace_state_t *state;
17128 dtrace_optval_t *opt, rs, c;
17129
17130 /*
17131 * For now, we only allocate a new buffer for anonymous state.
17132 */
17133 if ((state = dtrace_anon.dta_state) == NULL)
17134 break;
17135
17136 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17137 break;
17138
17139 opt = state->dts_options;
17140 c = opt[DTRACEOPT_CPU];
17141
17142 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17143 break;
17144
17145 /*
17146 * Regardless of what the actual policy is, we're going to
17147 * temporarily set our resize policy to be manual. We're
17148 * also going to temporarily set our CPU option to denote
17149 * the newly configured CPU.
17150 */
17151 rs = opt[DTRACEOPT_BUFRESIZE];
17152 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17153 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17154
17155 (void) dtrace_state_buffers(state);
17156
17157 opt[DTRACEOPT_BUFRESIZE] = rs;
17158 opt[DTRACEOPT_CPU] = c;
17159
17160 break;
17161 }
17162
17163 case CPU_UNCONFIG:
17164 /*
17165 * We don't free the buffer in the CPU_UNCONFIG case. (The
17166 * buffer will be freed when the consumer exits.)
17167 */
17168 break;
17169
17170 default:
17171 break;
17172 }
17173
17174 lck_mtx_unlock(&dtrace_lock);
17175 return (0);
17176 }
17177
17178 static void
17179 dtrace_cpu_setup_initial(processorid_t cpu)
17180 {
17181 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17182 }
17183
17184 static void
17185 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17186 {
17187 if (dtrace_toxranges >= dtrace_toxranges_max) {
17188 int osize, nsize;
17189 dtrace_toxrange_t *range;
17190
17191 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17192
17193 if (osize == 0) {
17194 ASSERT(dtrace_toxrange == NULL);
17195 ASSERT(dtrace_toxranges_max == 0);
17196 dtrace_toxranges_max = 1;
17197 } else {
17198 dtrace_toxranges_max <<= 1;
17199 }
17200
17201 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17202 range = kmem_zalloc(nsize, KM_SLEEP);
17203
17204 if (dtrace_toxrange != NULL) {
17205 ASSERT(osize != 0);
17206 bcopy(dtrace_toxrange, range, osize);
17207 kmem_free(dtrace_toxrange, osize);
17208 }
17209
17210 dtrace_toxrange = range;
17211 }
17212
17213 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17214 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17215
17216 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17217 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17218 dtrace_toxranges++;
17219 }
17220
17221 /*
17222 * DTrace Driver Cookbook Functions
17223 */
17224 /*ARGSUSED*/
17225 static int
17226 dtrace_attach(dev_info_t *devi)
17227 {
17228 dtrace_provider_id_t id;
17229 dtrace_state_t *state = NULL;
17230 dtrace_enabling_t *enab;
17231
17232 lck_mtx_lock(&cpu_lock);
17233 lck_mtx_lock(&dtrace_provider_lock);
17234 lck_mtx_lock(&dtrace_lock);
17235
17236 /* Darwin uses a BSD cloning device driver to automagically obtain the minor device number. */
17237 dtrace_devi = devi;
17238
17239 dtrace_modload = dtrace_module_loaded;
17240 dtrace_modunload = dtrace_module_unloaded;
17241 dtrace_cpu_init = dtrace_cpu_setup_initial;
17242 dtrace_helpers_cleanup = dtrace_helpers_destroy;
17243 dtrace_helpers_fork = dtrace_helpers_duplicate;
17244 dtrace_cpustart_init = dtrace_suspend;
17245 dtrace_cpustart_fini = dtrace_resume;
17246 dtrace_debugger_init = dtrace_suspend;
17247 dtrace_debugger_fini = dtrace_resume;
17248
17249 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17250
17251 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17252
17253 dtrace_arena = vmem_create("dtrace", (void *)1, INT32_MAX, 1,
17254 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
17255
17256 dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
17257 sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
17258 NULL, NULL, NULL, NULL, NULL, 0);
17259
17260 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17261
17262 dtrace_nprobes = dtrace_nprobes_default;
17263 dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
17264 KM_SLEEP);
17265
17266 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
17267 0, /* unused */
17268 offsetof(dtrace_probe_t, dtpr_nextprov),
17269 offsetof(dtrace_probe_t, dtpr_prevprov));
17270
17271 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
17272 offsetof(dtrace_probe_t, dtpr_mod),
17273 offsetof(dtrace_probe_t, dtpr_nextmod),
17274 offsetof(dtrace_probe_t, dtpr_prevmod));
17275
17276 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
17277 offsetof(dtrace_probe_t, dtpr_func),
17278 offsetof(dtrace_probe_t, dtpr_nextfunc),
17279 offsetof(dtrace_probe_t, dtpr_prevfunc));
17280
17281 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
17282 offsetof(dtrace_probe_t, dtpr_name),
17283 offsetof(dtrace_probe_t, dtpr_nextname),
17284 offsetof(dtrace_probe_t, dtpr_prevname));
17285
17286 if (dtrace_retain_max < 1) {
17287 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
17288 "setting to 1", dtrace_retain_max);
17289 dtrace_retain_max = 1;
17290 }
17291
17292 /*
17293 * Now discover our toxic ranges.
17294 */
17295 dtrace_toxic_ranges(dtrace_toxrange_add);
17296
17297 /*
17298 * Before we register ourselves as a provider to our own framework,
17299 * we would like to assert that dtrace_provider is NULL -- but that's
17300 * not true if we were loaded as a dependency of a DTrace provider.
17301 * Once we've registered, we can assert that dtrace_provider is our
17302 * pseudo provider.
17303 */
17304 (void) dtrace_register("dtrace", &dtrace_provider_attr,
17305 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
17306
17307 ASSERT(dtrace_provider != NULL);
17308 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17309
17310 #if defined (__x86_64__)
17311 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17312 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
17313 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17314 dtrace_provider, NULL, NULL, "END", 0, NULL);
17315 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17316 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
17317 #elif (defined(__arm__) || defined(__arm64__))
17318 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17319 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
17320 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17321 dtrace_provider, NULL, NULL, "END", 1, NULL);
17322 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17323 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
17324 #else
17325 #error Unknown Architecture
17326 #endif
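/*
 * The trailing integer argument to dtrace_probe_create() above is the
 * artificial frame ("aframes") count for each probe; it differs per
 * architecture because the number of stack frames to skip at the probe
 * site is architecture-dependent.
 */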
17327
17328 dtrace_anon_property();
17329 lck_mtx_unlock(&cpu_lock);
17330
17331 /*
17332 * If DTrace helper tracing is enabled, we need to allocate the
17333 * trace buffer and initialize the values.
17334 */
17335 if (dtrace_helptrace_enabled) {
17336 ASSERT(dtrace_helptrace_buffer == NULL);
17337 dtrace_helptrace_buffer =
17338 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17339 dtrace_helptrace_next = 0;
17340 }
17341
17342 /*
17343 * If there are already providers, we must ask them to provide their
17344 * probes, and then match any anonymous enabling against them. Note
17345 * that there should be no other retained enablings at this
17346 * time: the only retained enabling should be the anonymous
17347 * one.
17348 */
17349 if (dtrace_anon.dta_enabling != NULL) {
17350 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17351
17352 /*
17353 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
17354 */
17355 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17356 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17357 }
17358
17359 dtrace_enabling_provide(NULL);
17360 state = dtrace_anon.dta_state;
17361
17362 /*
17363 * We couldn't hold cpu_lock across the above call to
17364 * dtrace_enabling_provide(), but we must hold it to actually
17365 * enable the probes. We have to drop all of our locks, pick
17366 * up cpu_lock, and regain our locks before matching the
17367 * retained anonymous enabling.
17368 */
17369 lck_mtx_unlock(&dtrace_lock);
17370 lck_mtx_unlock(&dtrace_provider_lock);
17371
17372 lck_mtx_lock(&cpu_lock);
17373 lck_mtx_lock(&dtrace_provider_lock);
17374 lck_mtx_lock(&dtrace_lock);
17375
17376 if ((enab = dtrace_anon.dta_enabling) != NULL)
17377 (void) dtrace_enabling_match(enab, NULL, NULL);
17378
17379 lck_mtx_unlock(&cpu_lock);
17380 }
17381
17382 lck_mtx_unlock(&dtrace_lock);
17383 lck_mtx_unlock(&dtrace_provider_lock);
17384
17385 if (state != NULL) {
17386 /*
17387 * If we created any anonymous state, set it going now.
17388 */
17389 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17390 }
17391
17392 return (DDI_SUCCESS);
17393 }
17394
17395 /*ARGSUSED*/
17396 static int
17397 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17398 {
17399 #pragma unused(flag, otyp)
17400 dtrace_state_t *state;
17401 uint32_t priv;
17402 uid_t uid;
17403 zoneid_t zoneid;
17404 int rv;
17405
17406 /* APPLE: Darwin puts Helper on its own major device. */
17407
17408 /*
17409 * If no DTRACE_PRIV_* bits are set in the credential, then the
17410 * caller lacks sufficient permission to do anything with DTrace.
17411 */
17412 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17413 if (priv == DTRACE_PRIV_NONE)
17414 return (EACCES);
17415
17416 /*
17417 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17418 * It certainly can't be later than now!
17419 */
17420 fasttrap_init();
17421
17422 /*
17423 * Ask all providers to provide all their probes.
17424 */
17425 lck_mtx_lock(&dtrace_provider_lock);
17426 dtrace_probe_provide(NULL, NULL);
17427 lck_mtx_unlock(&dtrace_provider_lock);
17428
17429 lck_mtx_lock(&cpu_lock);
17430 lck_mtx_lock(&dtrace_lock);
17431 dtrace_opens++;
17432 dtrace_membar_producer();
17433
17434 #ifdef illumos
17435 /*
17436 * If the kernel debugger is active (that is, if the kernel debugger
17437 * modified text in some way), we won't allow the open.
17438 */
17439 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17440 dtrace_opens--;
17441 lck_mtx_unlock(&dtrace_lock);
17442 lck_mtx_unlock(&cpu_lock);
17443 return (EBUSY);
17444 }
17445 #endif
17446
17447 rv = dtrace_state_create(devp, cred_p, &state);
17448 lck_mtx_unlock(&cpu_lock);
17449
17450 if (rv != 0 || state == NULL) {
17451 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17452 #ifdef illumos
17453 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17454 #endif
17455 }
17456 lck_mtx_unlock(&dtrace_lock);
17457 /* propagate EAGAIN or ERESTART */
17458 return (rv);
17459 }
17460
17461 lck_mtx_unlock(&dtrace_lock);
17462
17463 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17464
17465 /*
17466 * If we are currently lazy, transition states.
17467 *
17468 * Unlike dtrace_close, we do not need to check the
17469 * value of dtrace_opens, as any positive value (and
17470 * our own open counts as 1) means we transition states.
17471 */
17472 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17473 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17474 /*
17475 * We do not need to hold the exclusive lock while processing
17476 * DOF on processes. We do need to make sure the mode does not get
17477 * changed to DTRACE_DOF_MODE_LAZY_ON during that stage though
17478 * (which should not happen anyway since it only happens in
17479 * dtrace_close). There is no way incomplete USDT probes can be
17480 * activated by any DTrace clients here, since they all have to
17481 * call dtrace_open and be blocked on dtrace_dof_mode_lock.
17482 */
17483 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
17484 /*
17485 * Iterate all existing processes and load lazy dofs.
17486 */
17487 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17488 dtrace_lazy_dofs_proc_iterate_doit,
17489 NULL,
17490 dtrace_lazy_dofs_proc_iterate_filter,
17491 NULL);
17492
17493 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
17494 }
17495 else {
17496 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17497 }
17498
17499
17500 /*
17501 * Update kernel symbol state.
17502 *
17503 * We must own the provider and dtrace locks.
17504 *
17505 * NOTE! It may appear there is a race by setting this value so late
17506 * after dtrace_probe_provide. However, any kext loaded after the
17507 * call to probe provide and before we set LAZY_OFF will be marked as
17508 * eligible for symbols from userspace. The same dtrace that is currently
17509 * calling dtrace_open() (this call!) will get a list of kexts needing
17510 * symbols and fill them in, thus closing the race window.
17511 *
17512 * We want to set this value only after it is certain it will succeed, as
17513 * this significantly reduces the complexity of error exits.
17514 */
17515 lck_mtx_lock(&dtrace_lock);
17516 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17517 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17518 }
17519 lck_mtx_unlock(&dtrace_lock);
17520
17521 return (0);
17522 }
17523
17524 /*ARGSUSED*/
17525 static int
17526 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17527 {
17528 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17529 minor_t minor = getminor(dev);
17530 dtrace_state_t *state;
17531
17532 /* APPLE NOTE: Darwin puts Helper on its own major device. */
17533 state = dtrace_state_get(minor);
17534
17535 lck_mtx_lock(&cpu_lock);
17536 lck_mtx_lock(&dtrace_lock);
17537
17538 if (state->dts_anon) {
17539 /*
17540 * There is anonymous state. Destroy that first.
17541 */
17542 ASSERT(dtrace_anon.dta_state == NULL);
17543 dtrace_state_destroy(state->dts_anon);
17544 }
17545
17546 dtrace_state_destroy(state);
17547 ASSERT(dtrace_opens > 0);
17548
17549 /*
17550 * Only relinquish control of the kernel debugger interface when there
17551 * are no consumers and no anonymous enablings.
17552 */
17553 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17554 #ifdef illumos
17555 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17556 #endif
17557 }
17558
17559 lck_mtx_unlock(&dtrace_lock);
17560 lck_mtx_unlock(&cpu_lock);
17561
17562 /*
17563 * Lock ordering requires the dof mode lock be taken before
17564 * the dtrace_lock.
17565 */
17566 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17567 lck_mtx_lock(&dtrace_lock);
17568
17569 if (dtrace_opens == 0) {
17570 /*
17571 * If we are currently lazy-off, and this is the last close, transition to
17572 * lazy state.
17573 */
17574 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17575 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17576 }
17577
17578 /*
17579 * If we are the last dtrace client, switch back to lazy (from userspace) symbols.
17580 */
17581 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17582 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17583 }
17584 }
17585
17586 lck_mtx_unlock(&dtrace_lock);
17587 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17588
17589 /*
17590 * Kext probes may be retained past the end of the kext's lifespan. The
17591 * probes are kept until the last reference to them has been removed.
17592 * Since closing an active dtrace context is likely to drop that last reference,
17593 * let's take a shot at cleaning out the orphaned probes now.
17594 */
17595 dtrace_module_unloaded(NULL);
17596
17597 return (0);
17598 }
17599
17600 /*ARGSUSED*/
17601 static int
17602 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17603 {
17604 #pragma unused(rv)
17605 /*
17606 * Safe to check this outside the dof mode lock
17607 */
17608 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17609 return KERN_SUCCESS;
17610
17611 switch (cmd) {
17612 #if defined (__arm64__)
17613 case DTRACEHIOC_ADDDOF_U32:
17614 case DTRACEHIOC_ADDDOF_U64:
17615 #else
17616 case DTRACEHIOC_ADDDOF:
17617 #endif /* __arm64__*/
17618 {
17619 dof_helper_t *dhp = NULL;
17620 size_t dof_ioctl_data_size;
17621 dof_ioctl_data_t* multi_dof;
17622 unsigned int i;
17623 int rval = 0;
17624 user_addr_t user_address = *(user_addr_t*)arg;
17625 uint64_t dof_count;
17626 int multi_dof_claimed = 0;
17627 proc_t* p = current_proc();
17628
17629 /*
17630 * If this is a restricted process and dtrace is restricted,
17631 * do not allow DOFs to be registered
17632 */
17633 if (dtrace_is_restricted() &&
17634 !dtrace_are_restrictions_relaxed() &&
17635 !dtrace_can_attach_to_proc(current_proc())) {
17636 return (EACCES);
17637 }
17638
17639 /*
17640 * Read the number of DOF sections being passed in.
17641 */
17642 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17643 &dof_count,
17644 sizeof(dof_count))) {
17645 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
17646 return (EFAULT);
17647 }
17648
17649 /*
17650 * Range check the count.
17651 */
17652 if (dof_count == 0 || dof_count > 1024) {
17653 dtrace_dof_error(NULL, "dofiod_count is not valid");
17654 return (EINVAL);
17655 }
17656
17657 /*
17658 * Allocate a correctly sized structure and copyin the data.
17659 */
17660 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17661 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17662 return (ENOMEM);
17663
17664 /* NOTE! We can no longer exit this method via return */
17665 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
17666 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
17667 rval = EFAULT;
17668 goto cleanup;
17669 }
17670
17671 /*
17672 * Check that the count didn't change between the first copyin and the second.
17673 */
17674 if (multi_dof->dofiod_count != dof_count) {
17675 rval = EINVAL;
17676 goto cleanup;
17677 }
17678
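/*
 * A minimal sketch of the double-copyin pattern used above, with
 * hypothetical names for illustration:
 *
 *	copyin(uaddr + offsetof(T, count), &count, sizeof (count));
 *	buf = kmem_alloc(SIZE(count), KM_SLEEP);
 *	copyin(uaddr, buf, SIZE(count));
 *	if (buf->count != count)
 *		bail with EINVAL -- userspace raced us
 *
 * Re-checking the embedded count defends against a concurrent userspace
 * thread resizing the structure between the two copyins, which could
 * otherwise walk the loop below off the end of the allocation.
 */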
17679 /*
17680 * Try to process lazily first.
17681 */
17682 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
17683
17684 /*
17685 * If rval is EACCES, we must be non-lazy.
17686 */
17687 if (rval == EACCES) {
17688 rval = 0;
17689 /*
17690 * Process each dof_helper_t
17691 */
17692 i = 0;
17693 do {
17694 dhp = &multi_dof->dofiod_helpers[i];
17695
17696 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
17697
17698 if (dof != NULL) {
17699 lck_mtx_lock(&dtrace_meta_lock);
17700 lck_mtx_lock(&dtrace_lock);
17701
17702 /*
17703 * dtrace_helper_slurp() takes responsibility for the dof --
17704 * it may free it now or it may save it and free it later.
17705 */
17706 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
17707 rval = EINVAL;
17708 }
17709
17710 lck_mtx_unlock(&dtrace_lock);
17711 lck_mtx_unlock(&dtrace_meta_lock);
17712 }
17713 } while (++i < multi_dof->dofiod_count && rval == 0);
17714 }
17715
17716 /*
17717 * We need to copyout the multi_dof struct, because it contains
17718 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE.
17719 *
17720 * This could certainly be better optimized.
17721 */
17722 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
17723 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
17724 /* Don't overwrite pre-existing error code */
17725 if (rval == 0) rval = EFAULT;
17726 }
17727
17728 cleanup:
17729 /*
17730 * If we had to allocate struct memory, free it.
17731 */
17732 if (multi_dof != NULL && !multi_dof_claimed) {
17733 kmem_free(multi_dof, dof_ioctl_data_size);
17734 }
17735
17736 return rval;
17737 }
17738
17739 case DTRACEHIOC_REMOVE: {
17740 int generation = *(int*)arg;
17741 proc_t* p = current_proc();
17742
17743 /*
17744 * Try lazy first.
17745 */
17746 int rval = dtrace_lazy_dofs_remove(p, generation);
17747
17748 /*
17749 * EACCES means non-lazy
17750 */
17751 if (rval == EACCES) {
17752 lck_mtx_lock(&dtrace_meta_lock);
17753 lck_mtx_lock(&dtrace_lock);
17754 rval = dtrace_helper_destroygen(p, generation);
17755 lck_mtx_unlock(&dtrace_lock);
17756 lck_mtx_unlock(&dtrace_meta_lock);
17757 }
17758
17759 return (rval);
17760 }
17761
17762 default:
17763 break;
17764 }
17765
17766 return ENOTTY;
17767 }
17768
17769 /*ARGSUSED*/
17770 static int
17771 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
17772 {
17773 #pragma unused(md)
17774 minor_t minor = getminor(dev);
17775 dtrace_state_t *state;
17776 int rval;
17777
17778 /* Darwin puts Helper on its own major device. */
17779
17780 state = dtrace_state_get(minor);
17781
17782 if (state->dts_anon) {
17783 ASSERT(dtrace_anon.dta_state == NULL);
17784 state = state->dts_anon;
17785 }
17786
17787 switch (cmd) {
17788 case DTRACEIOC_PROVIDER: {
17789 dtrace_providerdesc_t pvd;
17790 dtrace_provider_t *pvp;
17791
17792 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17793 return (EFAULT);
17794
17795 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17796 lck_mtx_lock(&dtrace_provider_lock);
17797
17798 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17799 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17800 break;
17801 }
17802
17803 lck_mtx_unlock(&dtrace_provider_lock);
17804
17805 if (pvp == NULL)
17806 return (ESRCH);
17807
17808 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17809 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17810 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17811 return (EFAULT);
17812
17813 return (0);
17814 }
17815
17816 case DTRACEIOC_EPROBE: {
17817 dtrace_eprobedesc_t epdesc;
17818 dtrace_ecb_t *ecb;
17819 dtrace_action_t *act;
17820 void *buf;
17821 size_t size;
17822 uintptr_t dest;
17823 int nrecs;
17824
17825 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17826 return (EFAULT);
17827
17828 lck_mtx_lock(&dtrace_lock);
17829
17830 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17831 lck_mtx_unlock(&dtrace_lock);
17832 return (EINVAL);
17833 }
17834
17835 if (ecb->dte_probe == NULL) {
17836 lck_mtx_unlock(&dtrace_lock);
17837 return (EINVAL);
17838 }
17839
17840 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17841 epdesc.dtepd_uarg = ecb->dte_uarg;
17842 epdesc.dtepd_size = ecb->dte_size;
17843
17844 nrecs = epdesc.dtepd_nrecs;
17845 epdesc.dtepd_nrecs = 0;
17846 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17847 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17848 continue;
17849
17850 epdesc.dtepd_nrecs++;
17851 }
17852
17853 /*
17854 * Now that we have the size, we need to allocate a temporary
17855 * buffer in which to store the complete description. We need
17856 * the temporary buffer to be able to drop dtrace_lock()
17857 * across the copyout(), below.
17858 */
17859 size = sizeof (dtrace_eprobedesc_t) +
17860 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17861
17862 buf = kmem_alloc(size, KM_SLEEP);
17863 dest = (uintptr_t)buf;
17864
17865 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17866 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17867
17868 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17869 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17870 continue;
17871
17872 if (nrecs-- == 0)
17873 break;
17874
17875 bcopy(&act->dta_rec, (void *)dest,
17876 sizeof (dtrace_recdesc_t));
17877 dest += sizeof (dtrace_recdesc_t);
17878 }
17879
17880 lck_mtx_unlock(&dtrace_lock);
17881
17882 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17883 kmem_free(buf, size);
17884 return (EFAULT);
17885 }
17886
17887 kmem_free(buf, size);
17888 return (0);
17889 }
17890
17891 case DTRACEIOC_AGGDESC: {
17892 dtrace_aggdesc_t aggdesc;
17893 dtrace_action_t *act;
17894 dtrace_aggregation_t *agg;
17895 int nrecs;
17896 uint32_t offs;
17897 dtrace_recdesc_t *lrec;
17898 void *buf;
17899 size_t size;
17900 uintptr_t dest;
17901
17902 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17903 return (EFAULT);
17904
17905 lck_mtx_lock(&dtrace_lock);
17906
17907 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17908 lck_mtx_unlock(&dtrace_lock);
17909 return (EINVAL);
17910 }
17911
17912 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17913
17914 nrecs = aggdesc.dtagd_nrecs;
17915 aggdesc.dtagd_nrecs = 0;
17916
17917 offs = agg->dtag_base;
17918 lrec = &agg->dtag_action.dta_rec;
17919 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17920
17921 for (act = agg->dtag_first; ; act = act->dta_next) {
17922 ASSERT(act->dta_intuple ||
17923 DTRACEACT_ISAGG(act->dta_kind));
17924
17925 /*
17926 * If this action has a record size of zero, it
17927 * denotes an argument to the aggregating action.
17928 * Because the presence of this record doesn't (or
17929 * shouldn't) affect the way the data is interpreted,
17930 * we don't copy it out, to spare user level the
17931 * confusion of dealing with a zero-length record.
17932 */
17933 if (act->dta_rec.dtrd_size == 0) {
17934 ASSERT(agg->dtag_hasarg);
17935 continue;
17936 }
17937
17938 aggdesc.dtagd_nrecs++;
17939
17940 if (act == &agg->dtag_action)
17941 break;
17942 }
17943
17944 /*
17945 * Now that we have the size, we need to allocate a temporary
17946 * buffer in which to store the complete description. We need
17947 * the temporary buffer to be able to drop dtrace_lock()
17948 * across the copyout(), below.
17949 */
17950 size = sizeof (dtrace_aggdesc_t) +
17951 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17952
17953 buf = kmem_alloc(size, KM_SLEEP);
17954 dest = (uintptr_t)buf;
17955
17956 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17957 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17958
17959 for (act = agg->dtag_first; ; act = act->dta_next) {
17960 dtrace_recdesc_t rec = act->dta_rec;
17961
17962 /*
17963 * See the comment in the above loop for why we pass
17964 * over zero-length records.
17965 */
17966 if (rec.dtrd_size == 0) {
17967 ASSERT(agg->dtag_hasarg);
17968 continue;
17969 }
17970
17971 if (nrecs-- == 0)
17972 break;
17973
17974 rec.dtrd_offset -= offs;
17975 bcopy(&rec, (void *)dest, sizeof (rec));
17976 dest += sizeof (dtrace_recdesc_t);
17977
17978 if (act == &agg->dtag_action)
17979 break;
17980 }
17981
17982 lck_mtx_unlock(&dtrace_lock);
17983
17984 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17985 kmem_free(buf, size);
17986 return (EFAULT);
17987 }
17988
17989 kmem_free(buf, size);
17990 return (0);
17991 }
17992
17993 case DTRACEIOC_ENABLE: {
17994 dof_hdr_t *dof;
17995 dtrace_enabling_t *enab = NULL;
17996 dtrace_vstate_t *vstate;
17997 int err = 0;
17998
17999 *rv = 0;
18000
18001 /*
18002 * If a NULL argument has been passed, we take this as our
18003 * cue to reevaluate our enablings.
18004 */
18005 if (arg == 0) {
18006 dtrace_enabling_matchall();
18007
18008 return (0);
18009 }
18010
18011 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
18012 return (rval);
18013
18014 lck_mtx_lock(&cpu_lock);
18015 lck_mtx_lock(&dtrace_lock);
18016 vstate = &state->dts_vstate;
18017
18018 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
18019 lck_mtx_unlock(&dtrace_lock);
18020 lck_mtx_unlock(&cpu_lock);
18021 dtrace_dof_destroy(dof);
18022 return (EBUSY);
18023 }
18024
18025 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
18026 lck_mtx_unlock(&dtrace_lock);
18027 lck_mtx_unlock(&cpu_lock);
18028 dtrace_dof_destroy(dof);
18029 return (EINVAL);
18030 }
18031
18032 if ((rval = dtrace_dof_options(dof, state)) != 0) {
18033 dtrace_enabling_destroy(enab);
18034 lck_mtx_unlock(&dtrace_lock);
18035 lck_mtx_unlock(&cpu_lock);
18036 dtrace_dof_destroy(dof);
18037 return (rval);
18038 }
18039
18040 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
18041 err = dtrace_enabling_retain(enab);
18042 } else {
18043 dtrace_enabling_destroy(enab);
18044 }
18045
18046 lck_mtx_unlock(&dtrace_lock);
18047 lck_mtx_unlock(&cpu_lock);
18048 dtrace_dof_destroy(dof);
18049
18050 return (err);
18051 }
18052
18053 case DTRACEIOC_REPLICATE: {
18054 dtrace_repldesc_t desc;
18055 dtrace_probedesc_t *match = &desc.dtrpd_match;
18056 dtrace_probedesc_t *create = &desc.dtrpd_create;
18057 int err;
18058
18059 if (copyin(arg, &desc, sizeof (desc)) != 0)
18060 return (EFAULT);
18061
18062 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18063 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18064 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18065 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18066
18067 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18068 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18069 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18070 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18071
18072 lck_mtx_lock(&dtrace_lock);
18073 err = dtrace_enabling_replicate(state, match, create);
18074 lck_mtx_unlock(&dtrace_lock);
18075
18076 return (err);
18077 }
18078
18079 case DTRACEIOC_PROBEMATCH:
18080 case DTRACEIOC_PROBES: {
18081 dtrace_probe_t *probe = NULL;
18082 dtrace_probedesc_t desc;
18083 dtrace_probekey_t pkey;
18084 dtrace_id_t i;
18085 int m = 0;
18086 uint32_t priv;
18087 uid_t uid;
18088 zoneid_t zoneid;
18089
18090 if (copyin(arg, &desc, sizeof (desc)) != 0)
18091 return (EFAULT);
18092
18093 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18094 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18095 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18096 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18097
18098 /*
18099 * Before we attempt to match this probe, we want to give
18100 * all providers the opportunity to provide it.
18101 */
18102 if (desc.dtpd_id == DTRACE_IDNONE) {
18103 lck_mtx_lock(&dtrace_provider_lock);
18104 dtrace_probe_provide(&desc, NULL);
18105 lck_mtx_unlock(&dtrace_provider_lock);
18106 desc.dtpd_id++;
18107 }
18108
18109 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18110
18111 lck_mtx_lock(&dtrace_lock);
18112
18113 if (cmd == DTRACEIOC_PROBEMATCH) {
18114 dtrace_probekey(&desc, &pkey);
18115 pkey.dtpk_id = DTRACE_IDNONE;
18116
18117 /* Quiet compiler warning */
18118 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18119 if ((probe = dtrace_probes[i - 1]) != NULL &&
18120 (m = dtrace_match_probe(probe, &pkey,
18121 priv, uid, zoneid)) != 0)
18122 break;
18123 }
18124
18125 if (m < 0) {
18126 lck_mtx_unlock(&dtrace_lock);
18127 return (EINVAL);
18128 }
18129 dtrace_probekey_release(&pkey);
18130
18131 } else {
18132 /* Quiet compiler warning */
18133 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18134 if ((probe = dtrace_probes[i - 1]) != NULL &&
18135 dtrace_match_priv(probe, priv, uid, zoneid))
18136 break;
18137 }
18138 }
18139
18140 if (probe == NULL) {
18141 lck_mtx_unlock(&dtrace_lock);
18142 return (ESRCH);
18143 }
18144
18145 dtrace_probe_description(probe, &desc);
18146 lck_mtx_unlock(&dtrace_lock);
18147
18148 if (copyout(&desc, arg, sizeof (desc)) != 0)
18149 return (EFAULT);
18150
18151 return (0);
18152 }
18153
18154 case DTRACEIOC_PROBEARG: {
18155 dtrace_argdesc_t desc;
18156 dtrace_probe_t *probe;
18157 dtrace_provider_t *prov;
18158
18159 if (copyin(arg, &desc, sizeof (desc)) != 0)
18160 return (EFAULT);
18161
18162 if (desc.dtargd_id == DTRACE_IDNONE)
18163 return (EINVAL);
18164
18165 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18166 return (EINVAL);
18167
18168 lck_mtx_lock(&dtrace_provider_lock);
18169 lck_mtx_lock(&mod_lock);
18170 lck_mtx_lock(&dtrace_lock);
18171
18172 /* Quiet compiler warning */
18173 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18174 lck_mtx_unlock(&dtrace_lock);
18175 lck_mtx_unlock(&mod_lock);
18176 lck_mtx_unlock(&dtrace_provider_lock);
18177 return (EINVAL);
18178 }
18179
18180 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18181 lck_mtx_unlock(&dtrace_lock);
18182 lck_mtx_unlock(&mod_lock);
18183 lck_mtx_unlock(&dtrace_provider_lock);
18184 return (EINVAL);
18185 }
18186
18187 lck_mtx_unlock(&dtrace_lock);
18188
18189 prov = probe->dtpr_provider;
18190
18191 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18192 /*
18193 * There isn't any typed information for this probe.
18194 * Set the argument number to DTRACE_ARGNONE.
18195 */
18196 desc.dtargd_ndx = DTRACE_ARGNONE;
18197 } else {
18198 desc.dtargd_native[0] = '\0';
18199 desc.dtargd_xlate[0] = '\0';
18200 desc.dtargd_mapping = desc.dtargd_ndx;
18201
18202 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18203 probe->dtpr_id, probe->dtpr_arg, &desc);
18204 }
18205
18206 lck_mtx_unlock(&mod_lock);
18207 lck_mtx_unlock(&dtrace_provider_lock);
18208
18209 if (copyout(&desc, arg, sizeof (desc)) != 0)
18210 return (EFAULT);
18211
18212 return (0);
18213 }
18214
18215 case DTRACEIOC_GO: {
18216 processorid_t cpuid;
18217 rval = dtrace_state_go(state, &cpuid);
18218
18219 if (rval != 0)
18220 return (rval);
18221
18222 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18223 return (EFAULT);
18224
18225 return (0);
18226 }
18227
18228 case DTRACEIOC_STOP: {
18229 processorid_t cpuid;
18230
18231 lck_mtx_lock(&dtrace_lock);
18232 rval = dtrace_state_stop(state, &cpuid);
18233 lck_mtx_unlock(&dtrace_lock);
18234
18235 if (rval != 0)
18236 return (rval);
18237
18238 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18239 return (EFAULT);
18240
18241 return (0);
18242 }
18243
18244 case DTRACEIOC_DOFGET: {
18245 dof_hdr_t hdr, *dof;
18246 uint64_t len;
18247
18248 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18249 return (EFAULT);
18250
18251 lck_mtx_lock(&dtrace_lock);
18252 dof = dtrace_dof_create(state);
18253 lck_mtx_unlock(&dtrace_lock);
18254
18255 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18256 rval = copyout(dof, arg, len);
18257 dtrace_dof_destroy(dof);
18258
18259 return (rval == 0 ? 0 : EFAULT);
18260 }
18261
18262 case DTRACEIOC_SLEEP: {
18263 int64_t time;
18264 uint64_t abstime;
18265 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18266
18267 if (copyin(arg, &time, sizeof(time)) != 0)
18268 return (EFAULT);
18269
18270 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
18271 clock_absolutetime_interval_to_deadline(abstime, &abstime);
18272
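/*
 * Block until the deadline expires or a wakeup arrives (from the
 * buffer-over-limit AST path or from DTRACEIOC_SIGNAL). We report
 * DTRACE_WAKE_BUF_LIMIT only when a buffer has crossed its limit;
 * all other wakeups report DTRACE_WAKE_TIMEOUT.
 */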
18273 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
18274 if (state->dts_buf_over_limit > 0) {
18275 clear_wait(current_thread(), THREAD_INTERRUPTED);
18276 rvalue = DTRACE_WAKE_BUF_LIMIT;
18277 } else {
18278 thread_block(THREAD_CONTINUE_NULL);
18279 if (state->dts_buf_over_limit > 0) {
18280 rvalue = DTRACE_WAKE_BUF_LIMIT;
18281 }
18282 }
18283 }
18284
18285 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
18286 return (EFAULT);
18287
18288 return (0);
18289 }
18290
18291 case DTRACEIOC_SIGNAL: {
18292 wakeup(state);
18293 return (0);
18294 }
18295
18296 case DTRACEIOC_AGGSNAP:
18297 case DTRACEIOC_BUFSNAP: {
18298 dtrace_bufdesc_t desc;
18299 caddr_t cached;
18300 boolean_t over_limit;
18301 dtrace_buffer_t *buf;
18302
18303 if (copyin(arg, &desc, sizeof (desc)) != 0)
18304 return (EFAULT);
18305
18306 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18307 return (EINVAL);
18308
18309 lck_mtx_lock(&dtrace_lock);
18310
18311 if (cmd == DTRACEIOC_BUFSNAP) {
18312 buf = &state->dts_buffer[desc.dtbd_cpu];
18313 } else {
18314 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18315 }
18316
18317 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18318 size_t sz = buf->dtb_offset;
18319
18320 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18321 lck_mtx_unlock(&dtrace_lock);
18322 return (EBUSY);
18323 }
18324
18325 /*
18326 * If this buffer has already been consumed, we're
18327 * going to indicate that there's nothing left here
18328 * to consume.
18329 */
18330 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18331 lck_mtx_unlock(&dtrace_lock);
18332
18333 desc.dtbd_size = 0;
18334 desc.dtbd_drops = 0;
18335 desc.dtbd_errors = 0;
18336 desc.dtbd_oldest = 0;
18337 sz = sizeof (desc);
18338
18339 if (copyout(&desc, arg, sz) != 0)
18340 return (EFAULT);
18341
18342 return (0);
18343 }
18344
18345 /*
18346 * If this is a ring buffer that has wrapped, we want
18347 * to copy the whole thing out.
18348 */
18349 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18350 dtrace_buffer_polish(buf);
18351 sz = buf->dtb_size;
18352 }
18353
18354 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18355 lck_mtx_unlock(&dtrace_lock);
18356 return (EFAULT);
18357 }
18358
18359 desc.dtbd_size = sz;
18360 desc.dtbd_drops = buf->dtb_drops;
18361 desc.dtbd_errors = buf->dtb_errors;
18362 desc.dtbd_oldest = buf->dtb_xamot_offset;
18363 desc.dtbd_timestamp = dtrace_gethrtime();
18364
18365 lck_mtx_unlock(&dtrace_lock);
18366
18367 if (copyout(&desc, arg, sizeof (desc)) != 0)
18368 return (EFAULT);
18369
18370 buf->dtb_flags |= DTRACEBUF_CONSUMED;
18371
18372 return (0);
18373 }
18374
18375 if (buf->dtb_tomax == NULL) {
18376 ASSERT(buf->dtb_xamot == NULL);
18377 lck_mtx_unlock(&dtrace_lock);
18378 return (ENOENT);
18379 }
18380
18381 cached = buf->dtb_tomax;
18382 over_limit = buf->dtb_cur_limit == buf->dtb_size;
18383
18384 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18385
18386 dtrace_xcall(desc.dtbd_cpu,
18387 (dtrace_xcall_t)dtrace_buffer_switch, buf);
18388
18389 state->dts_errors += buf->dtb_xamot_errors;
18390
18391 /*
18392 * If the buffers did not actually switch, then the cross call
18393 * did not take place -- presumably because the given CPU is
18394 * not in the ready set. If this is the case, we'll return
18395 * ENOENT.
18396 */
18397 if (buf->dtb_tomax == cached) {
18398 ASSERT(buf->dtb_xamot != cached);
18399 lck_mtx_unlock(&dtrace_lock);
18400 return (ENOENT);
18401 }
18402
18403 ASSERT(cached == buf->dtb_xamot);
18404 /*
18405 * At this point we know the buffers have switched, so we
18406 * can decrement the over-limit count if the buffer was over
18407 * its limit. The new buffer might already be over its limit,
18408 * but we don't care, since we're guaranteed not to be
18409 * checking the buffer over-limit count at this point.
18410 */
18411 if (over_limit) {
18412 uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18413 #pragma unused(old)
18414
18415 /*
18416 * Verify that we didn't underflow the value
18417 */
18418 ASSERT(old != 0);
18419 }
18420
18421 /*
18422 * We have our snapshot; now copy it out.
18423 */
18424 if (dtrace_buffer_copyout(buf->dtb_xamot,
18425 (user_addr_t)desc.dtbd_data,
18426 buf->dtb_xamot_offset) != 0) {
18427 lck_mtx_unlock(&dtrace_lock);
18428 return (EFAULT);
18429 }
18430
18431 desc.dtbd_size = buf->dtb_xamot_offset;
18432 desc.dtbd_drops = buf->dtb_xamot_drops;
18433 desc.dtbd_errors = buf->dtb_xamot_errors;
18434 desc.dtbd_oldest = 0;
18435 desc.dtbd_timestamp = buf->dtb_switched;
18436
18437 lck_mtx_unlock(&dtrace_lock);
18438
18439 /*
18440 * Finally, copy out the buffer description.
18441 */
18442 if (copyout(&desc, arg, sizeof (desc)) != 0)
18443 return (EFAULT);
18444
18445 return (0);
18446 }
18447
18448 case DTRACEIOC_CONF: {
18449 dtrace_conf_t conf;
18450
18451 bzero(&conf, sizeof (conf));
18452 conf.dtc_difversion = DIF_VERSION;
18453 conf.dtc_difintregs = DIF_DIR_NREGS;
18454 conf.dtc_diftupregs = DIF_DTR_NREGS;
18455 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18456
18457 if (copyout(&conf, arg, sizeof (conf)) != 0)
18458 return (EFAULT);
18459
18460 return (0);
18461 }
18462
18463 case DTRACEIOC_STATUS: {
18464 dtrace_status_t stat;
18465 dtrace_dstate_t *dstate;
18466 int i, j;
18467 uint64_t nerrs;
18468
18469 /*
18470 * See the comment in dtrace_state_deadman() for the reason
18471 * for setting dts_laststatus to INT64_MAX before setting
18472 * it to the correct value.
18473 */
18474 state->dts_laststatus = INT64_MAX;
18475 dtrace_membar_producer();
18476 state->dts_laststatus = dtrace_gethrtime();
18477
18478 bzero(&stat, sizeof (stat));
18479
18480 lck_mtx_lock(&dtrace_lock);
18481
18482 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18483 lck_mtx_unlock(&dtrace_lock);
18484 return (ENOENT);
18485 }
18486
18487 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18488 stat.dtst_exiting = 1;
18489
18490 nerrs = state->dts_errors;
18491 dstate = &state->dts_vstate.dtvs_dynvars;
18492
18493 for (i = 0; i < (int)NCPU; i++) {
18494 dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18495
18496 stat.dtst_dyndrops += dcpu->dtdsc_drops;
18497 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18498 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18499
18500 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18501 stat.dtst_filled++;
18502
18503 nerrs += state->dts_buffer[i].dtb_errors;
18504
18505 for (j = 0; j < state->dts_nspeculations; j++) {
18506 dtrace_speculation_t *spec;
18507 dtrace_buffer_t *buf;
18508
18509 spec = &state->dts_speculations[j];
18510 buf = &spec->dtsp_buffer[i];
18511 stat.dtst_specdrops += buf->dtb_xamot_drops;
18512 }
18513 }
18514
18515 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18516 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18517 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18518 stat.dtst_dblerrors = state->dts_dblerrors;
18519 stat.dtst_killed =
18520 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18521 stat.dtst_errors = nerrs;
18522
18523 lck_mtx_unlock(&dtrace_lock);
18524
18525 if (copyout(&stat, arg, sizeof (stat)) != 0)
18526 return (EFAULT);
18527
18528 return (0);
18529 }
18530
18531 case DTRACEIOC_FORMAT: {
18532 dtrace_fmtdesc_t fmt;
18533 char *str;
18534 int len;
18535
18536 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18537 return (EFAULT);
18538
18539 lck_mtx_lock(&dtrace_lock);
18540
18541 if (fmt.dtfd_format == 0 ||
18542 fmt.dtfd_format > state->dts_nformats) {
18543 lck_mtx_unlock(&dtrace_lock);
18544 return (EINVAL);
18545 }
18546
18547 /*
18548 * Format strings are allocated contiguously and they are
18549 * never freed; if a format index is less than the number
18550 * of formats, we can assert that the format map is non-NULL
18551 * and that the format for the specified index is non-NULL.
18552 */
18553 ASSERT(state->dts_formats != NULL);
18554 str = state->dts_formats[fmt.dtfd_format - 1]->dtf_str;
18555 ASSERT(str != NULL);
18556
18557 len = strlen(str) + 1;
18558
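/*
 * If the caller's buffer is too small, report the required length so
 * that it can reallocate and retry; otherwise, copy the string out.
 */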
18559 if (len > fmt.dtfd_length) {
18560 fmt.dtfd_length = len;
18561
18562 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18563 lck_mtx_unlock(&dtrace_lock);
18564 return (EINVAL);
18565 }
18566 } else {
18567 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18568 lck_mtx_unlock(&dtrace_lock);
18569 return (EINVAL);
18570 }
18571 }
18572
18573 lck_mtx_unlock(&dtrace_lock);
18574 return (0);
18575 }
18576
18577 case DTRACEIOC_MODUUIDSLIST: {
18578 size_t module_uuids_list_size;
18579 dtrace_module_uuids_list_t* uuids_list;
18580 uint64_t dtmul_count;
18581
18582 /*
18583 * Security restrictions can make this operation illegal; when they are
18584 * in force, DTrace must refuse to provide any fbt probes.
18585 */
18586 if (dtrace_fbt_probes_restricted()) {
18587 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18588 return (EPERM);
18589 }
18590
18591 /*
18592 * Fail if the kernel symbol mode makes this operation illegal.
18593 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states; it is legal to check
18594 * for them without holding the dtrace_lock.
18595 */
18596 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18597 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18598 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18599 return (EPERM);
18600 }
18601
18602 /*
18603 * Read the number of module UUIDs being passed in.
18604 */
18605 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18606 &dtmul_count,
18607 sizeof(dtmul_count))) {
18608 cmn_err(CE_WARN, "failed to copyin dtmul_count");
18609 return (EFAULT);
18610 }
18611
18612 /*
18613 * Range check the count. More than 2k kexts is probably an error.
18614 */
18615 if (dtmul_count > 2048) {
18616 cmn_err(CE_WARN, "dtmul_count is not valid");
18617 return (EINVAL);
18618 }
18619
18620 /*
18621 * For all queries, we return EINVAL when the user specified
18622 * count does not match the actual number of modules we find
18623 * available.
18624 *
18625 * If the user specified count is zero, then this serves as a
18626 * simple query to count the available modules in need of symbols.
18627 */
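/*
 * The resulting two-call consumer flow, sketched as hypothetical
 * pseudo-code (not taken from libdtrace):
 *
 *	list.dtmul_count = 0;
 *	ioctl(fd, DTRACEIOC_MODUUIDSLIST, &list);	count query
 *	list = malloc(DTRACE_MODULE_UUIDS_LIST_SIZE(count));
 *	list->dtmul_count = count;
 *	ioctl(fd, DTRACEIOC_MODUUIDSLIST, list);	UUIDs filled in
 */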
18628
18629 rval = 0;
18630
18631 if (dtmul_count == 0)
18632 {
18633 lck_mtx_lock(&mod_lock);
18634 struct modctl* ctl = dtrace_modctl_list;
18635 while (ctl) {
18636 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18637 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18638 dtmul_count++;
18639 rval = EINVAL;
18640 }
18641 ctl = ctl->mod_next;
18642 }
18643 lck_mtx_unlock(&mod_lock);
18644
18645 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
18646 return (EFAULT);
18647 else
18648 return (rval);
18649 }
18650
18651 /*
18652 * If we reach this point, then we have a request for full list data.
18653 * Allocate a correctly sized structure and copyin the data.
18654 */
18655 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18656 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18657 return (ENOMEM);
18658
18659 /* NOTE! We can no longer exit this method via return */
18660 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
18661 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18662 rval = EFAULT;
18663 goto moduuidslist_cleanup;
18664 }
18665
18666 /*
18667 * Check that the count didn't change between the first copyin and the second.
18668 */
18669 if (uuids_list->dtmul_count != dtmul_count) {
18670 rval = EINVAL;
18671 goto moduuidslist_cleanup;
18672 }
18673
18674 /*
18675 * Build the list of UUID's that need symbols
18676 */
18677 lck_mtx_lock(&mod_lock);
18678
18679 dtmul_count = 0;
18680
18681 struct modctl* ctl = dtrace_modctl_list;
18682 while (ctl) {
18683 /*
18684 * We assume that userspace symbols will be "better" than kernel level symbols,
18685 * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
18686 * are available, add user syms if the module might use them.
18687 */
18688 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18689 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18690 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18691 if (dtmul_count++ < uuids_list->dtmul_count) {
18692 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
18693 }
18694 }
18695 ctl = ctl->mod_next;
18696 }
18697
18698 lck_mtx_unlock(&mod_lock);
18699
18700 if (uuids_list->dtmul_count < dtmul_count)
18701 rval = EINVAL;
18702
18703 uuids_list->dtmul_count = dtmul_count;
18704
18705 /*
18706 * Copyout the symbols list (or at least the count!)
18707 */
18708 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
18709 cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
18710 rval = EFAULT;
18711 }
18712
18713 moduuidslist_cleanup:
18714 /*
18715 * If we had to allocate struct memory, free it.
18716 */
18717 if (uuids_list != NULL) {
18718 kmem_free(uuids_list, module_uuids_list_size);
18719 }
18720
18721 return rval;
18722 }
18723
18724 case DTRACEIOC_PROVMODSYMS: {
18725 size_t module_symbols_size;
18726 dtrace_module_symbols_t* module_symbols;
18727 uint64_t dtmodsyms_count;
18728
18729 /*
18730 * Security restrictions can make this operation illegal; when they are
18731 * in force, DTrace must refuse to provide any fbt probes.
18732 */
18733 if (dtrace_fbt_probes_restricted()) {
18734 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18735 return (EPERM);
18736 }
18737
18738 /*
18739 * Fail if the kernel symbol mode makes this operation illegal.
18740 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18741 * for them without holding the dtrace_lock.
18742 */
18743 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18744 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18745 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18746 return (EPERM);
18747 }
18748
18749 /*
18750 * Read the number of module symbols structs being passed in.
18751 */
18752 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18753 &dtmodsyms_count,
18754 sizeof(dtmodsyms_count))) {
18755 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18756 return (EFAULT);
18757 }
18758
18759 /*
18760 * Range check the count. A zero count is invalid; the upper bound
18761 * is enforced by the dtrace_copy_maxsize() check below.
18762 */
18763 if (dtmodsyms_count == 0) {
18764 cmn_err(CE_WARN, "dtmodsyms_count is not valid");
18765 return (EINVAL);
18766 }
18767
18768 /*
18769 * Allocate a correctly sized structure and copyin the data.
18770 */
18771 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18772 if (module_symbols_size > (size_t)dtrace_copy_maxsize()) {
18773 size_t dtmodsyms_max = DTRACE_MODULE_SYMBOLS_COUNT(dtrace_copy_maxsize());
18774 cmn_err(CE_WARN, "dtmodsyms_count %ld is too high, maximum is %ld", dtmodsyms_count, dtmodsyms_max);
18775 return (ENOBUFS);
18776 }
18777
18778 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18779 return (ENOMEM);
18780
18781 rval = 0;
18782
18783 /* NOTE! We can no longer exit this method via return */
18784 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18785 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18786 rval = EFAULT;
18787 goto module_symbols_cleanup;
18788 }
18789
18790 /*
18791 * Check that the count didn't change between the first copyin and the second.
18792 */
18793 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18794 rval = EINVAL;
18795 goto module_symbols_cleanup;
18796 }
18797
18798 /*
18799 * Find the modctl to add symbols to.
18800 */
18801 lck_mtx_lock(&dtrace_provider_lock);
18802 lck_mtx_lock(&mod_lock);
18803
18804 struct modctl* ctl = dtrace_modctl_list;
18805 while (ctl) {
18806 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18807 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18808 dtrace_provider_t *prv;
18809 ctl->mod_user_symbols = module_symbols;
18810
18811 /*
18812 * We're going to call each provider's per-module provide operation,
18813 * specifying only this module.
18814 */
18815 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18816 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18817 /*
18818 * We gave every provider a chance to provide with the user syms; go ahead and clear them.
18819 */
18820 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18821 }
18822 ctl = ctl->mod_next;
18823 }
18824
18825 lck_mtx_unlock(&mod_lock);
18826 lck_mtx_unlock(&dtrace_provider_lock);
18827
18828 module_symbols_cleanup:
18829 /*
18830 * If we had to allocate struct memory, free it.
18831 */
18832 if (module_symbols != NULL) {
18833 kmem_free(module_symbols, module_symbols_size);
18834 }
18835
18836 return rval;
18837 }
18838
18839 case DTRACEIOC_PROCWAITFOR: {
18840 dtrace_procdesc_t pdesc = {
18841 .p_name = {0},
18842 .p_pid = -1
18843 };
18844
18845 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18846 goto proc_waitfor_error;
18847
18848 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18849 goto proc_waitfor_error;
18850
18851 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18852 goto proc_waitfor_error;
18853
18854 return 0;
18855
18856 proc_waitfor_error:
18857 /* The process was suspended; revert this since the client will not do it. */
18858 if (pdesc.p_pid != -1) {
18859 proc_t *proc = proc_find(pdesc.p_pid);
18860 if (proc != PROC_NULL) {
18861 task_pidresume(proc->task);
18862 proc_rele(proc);
18863 }
18864 }
18865
18866 return rval;
18867 }
18868
18869 default:
18870 break;
18871 }
18872
18873 return (ENOTTY);
18874 }
18875
18876 /*
18877 * APPLE NOTE: dtrace_detach not implemented
18878 */
18879 #if !defined(__APPLE__)
18880 /*ARGSUSED*/
18881 static int
18882 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18883 {
18884 dtrace_state_t *state;
18885
18886 switch (cmd) {
18887 case DDI_DETACH:
18888 break;
18889
18890 case DDI_SUSPEND:
18891 return (DDI_SUCCESS);
18892
18893 default:
18894 return (DDI_FAILURE);
18895 }
18896
18897 lck_mtx_lock(&cpu_lock);
18898 lck_mtx_lock(&dtrace_provider_lock);
18899 lck_mtx_lock(&dtrace_lock);
18900
18901 ASSERT(dtrace_opens == 0);
18902
18903 if (dtrace_helpers > 0) {
18904 lck_mtx_unlock(&dtrace_lock);
18905 lck_mtx_unlock(&dtrace_provider_lock);
18906 lck_mtx_unlock(&cpu_lock);
18907 return (DDI_FAILURE);
18908 }
18909
18910 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18911 lck_mtx_unlock(&dtrace_lock);
18912 lck_mtx_unlock(&dtrace_provider_lock);
18913 lck_mtx_unlock(&cpu_lock);
18914 return (DDI_FAILURE);
18915 }
18916
18917 dtrace_provider = NULL;
18918
18919 if ((state = dtrace_anon_grab()) != NULL) {
18920 /*
18921 * If there were ECBs on this state, the provider should
18922 * not have been allowed to detach; assert that there is
18923 * none.
18924 */
18925 ASSERT(state->dts_necbs == 0);
18926 dtrace_state_destroy(state);
18927
18928 /*
18929 * If we're being detached with anonymous state, we need to
18930 * indicate to the kernel debugger that DTrace is now inactive.
18931 */
18932 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18933 }
18934
18935 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18936 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18937 dtrace_cpu_init = NULL;
18938 dtrace_helpers_cleanup = NULL;
18939 dtrace_helpers_fork = NULL;
18940 dtrace_cpustart_init = NULL;
18941 dtrace_cpustart_fini = NULL;
18942 dtrace_debugger_init = NULL;
18943 dtrace_debugger_fini = NULL;
18944 dtrace_kreloc_init = NULL;
18945 dtrace_kreloc_fini = NULL;
18946 dtrace_modload = NULL;
18947 dtrace_modunload = NULL;
18948
18949 lck_mtx_unlock(&cpu_lock);
18950
18951 if (dtrace_helptrace_enabled) {
18952 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18953 dtrace_helptrace_buffer = NULL;
18954 }
18955
18956 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18957 dtrace_probes = NULL;
18958 dtrace_nprobes = 0;
18959
18960 dtrace_hash_destroy(dtrace_strings);
18961 dtrace_hash_destroy(dtrace_byprov);
18962 dtrace_hash_destroy(dtrace_bymod);
18963 dtrace_hash_destroy(dtrace_byfunc);
18964 dtrace_hash_destroy(dtrace_byname);
18965 dtrace_strings = NULL;
18966 dtrace_byprov = NULL;
18967 dtrace_bymod = NULL;
18968 dtrace_byfunc = NULL;
18969 dtrace_byname = NULL;
18970
18971 kmem_cache_destroy(dtrace_state_cache);
18972 vmem_destroy(dtrace_arena);
18973
18974 if (dtrace_toxrange != NULL) {
18975 kmem_free(dtrace_toxrange,
18976 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18977 dtrace_toxrange = NULL;
18978 dtrace_toxranges = 0;
18979 dtrace_toxranges_max = 0;
18980 }
18981
18982 ddi_remove_minor_node(dtrace_devi, NULL);
18983 dtrace_devi = NULL;
18984
18985 ddi_soft_state_fini(&dtrace_softstate);
18986
18987 ASSERT(dtrace_vtime_references == 0);
18988 ASSERT(dtrace_opens == 0);
18989 ASSERT(dtrace_retained == NULL);
18990
18991 lck_mtx_unlock(&dtrace_lock);
18992 lck_mtx_unlock(&dtrace_provider_lock);
18993
18994 #ifdef illumos
18995 /*
18996 * We don't destroy the task queue until after we have dropped our
18997 * locks (taskq_destroy() may block on running tasks). To prevent
18998 * attempting to do work after we have effectively detached but before
18999 * the task queue has been destroyed, all tasks dispatched via the
19000 * task queue must check that DTrace is still attached before
19001 * performing any operation.
19002 */
19003 taskq_destroy(dtrace_taskq);
19004 dtrace_taskq = NULL;
19005 #endif
19006
19007 return (DDI_SUCCESS);
19008 }
19009 #endif /* __APPLE__ */
19010
19011 d_open_t _dtrace_open, helper_open;
19012 d_close_t _dtrace_close, helper_close;
19013 d_ioctl_t _dtrace_ioctl, helper_ioctl;
19014
19015 int
19016 _dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
19017 {
19018 #pragma unused(p)
19019 dev_t locdev = dev;
19020
19021 return dtrace_open( &locdev, flags, devtype, CRED());
19022 }
19023
19024 int
19025 helper_open(dev_t dev, int flags, int devtype, struct proc *p)
19026 {
19027 #pragma unused(dev,flags,devtype,p)
19028 return 0;
19029 }
19030
19031 int
19032 _dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
19033 {
19034 #pragma unused(p)
19035 return dtrace_close( dev, flags, devtype, CRED());
19036 }
19037
19038 int
19039 helper_close(dev_t dev, int flags, int devtype, struct proc *p)
19040 {
19041 #pragma unused(dev,flags,devtype,p)
19042 return 0;
19043 }
19044
19045 int
19046 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19047 {
19048 #pragma unused(p)
19049 int err, rv = 0;
19050 user_addr_t uaddrp;
19051
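/*
 * The BSD ioctl path copies the argument in by value; widen the
 * user pointer to a user_addr_t according to the caller's ABI.
 */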
19052 if (proc_is64bit(p))
19053 uaddrp = *(user_addr_t *)data;
19054 else
19055 uaddrp = (user_addr_t) *(uint32_t *)data;
19056
19057 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
19058
19059 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
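/*
 * Worked example (illustrative values): err = EINVAL (22) returns 22,
 * so errno is 22. A Solaris-style rv = 5 returns 5 << 12 = 20480, so
 * errno lands at a value >= 4096 that the consumer shifts back down.
 */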
19060 if (err != 0) {
19061 ASSERT( (err & 0xfffff000) == 0 );
19062 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19063 } else if (rv != 0) {
19064 ASSERT( (rv & 0xfff00000) == 0 );
19065 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19066 } else
19067 return 0;
19068 }
19069
19070 int
19071 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19072 {
19073 #pragma unused(dev,fflag,p)
19074 int err, rv = 0;
19075
19076 err = dtrace_ioctl_helper(cmd, data, &rv);
19077 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19078 if (err != 0) {
19079 ASSERT( (err & 0xfffff000) == 0 );
19080 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19081 } else if (rv != 0) {
19082 ASSERT( (rv & 0xfff00000) == 0 );
19083 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19084 } else
19085 return 0;
19086 }
19087
19088 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
19089
19090 const static struct cdevsw helper_cdevsw =
19091 {
19092 .d_open = helper_open,
19093 .d_close = helper_close,
19094 .d_read = eno_rdwrt,
19095 .d_write = eno_rdwrt,
19096 .d_ioctl = helper_ioctl,
19097 .d_stop = (stop_fcn_t *)nulldev,
19098 .d_reset = (reset_fcn_t *)nulldev,
19099 .d_select = eno_select,
19100 .d_mmap = eno_mmap,
19101 .d_strategy = eno_strat,
19102 .d_reserved_1 = eno_getc,
19103 .d_reserved_2 = eno_putc,
19104 };
19105
19106 static int helper_majdevno = 0;
19107
19108 static int gDTraceInited = 0;
19109
19110 void
19111 helper_init( void )
19112 {
19113 /*
19114 * Once the "helper" is initialized, it can take ioctl calls that use locks
19115 * and zones initialized in dtrace_init. Make certain dtrace_init was called
19116 * before us.
19117 */
19118
19119 if (!gDTraceInited) {
19120 panic("helper_init before dtrace_init\n");
19121 }
19122
19123 if (0 >= helper_majdevno)
19124 {
19125 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19126
19127 if (helper_majdevno < 0) {
19128 printf("helper_init: failed to allocate a major number!\n");
19129 return;
19130 }
19131
19132 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19133 DTRACEMNR_HELPER, 0 )) {
19134 printf("dtrace_init: failed to devfs_make_node for helper!\n");
19135 return;
19136 }
19137 } else
19138 panic("helper_init: called twice!\n");
19139 }
19140
19141 #undef HELPER_MAJOR
19142
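/*
 * Devfs clone callback: each open of the dtrace device reserves a fresh
 * minor number -- and hence a distinct consumer state -- via
 * dtrace_state_reserve().
 */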
19143 static int
19144 dtrace_clone_func(dev_t dev, int action)
19145 {
19146 #pragma unused(dev)
19147
19148 if (action == DEVFS_CLONE_ALLOC) {
19149 return dtrace_state_reserve();
19150 }
19151 else if (action == DEVFS_CLONE_FREE) {
19152 return 0;
19153 }
19154 else return -1;
19155 }
19156
19157 void dtrace_ast(void);
19158
19159 void
19160 dtrace_ast(void)
19161 {
19162 int i;
19163 uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
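/* Each set bit identifies a consumer state (by minor number) needing a wakeup. */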
19164 if (clients == 0)
19165 return;
19166 /*
19167 * We disable preemption here to be sure that we won't get
19168 * interrupted by a wakeup to a higher-priority thread, so
19169 * that we issue all of the wakeups.
19170 */
19171 disable_preemption();
19172 for (i = 0; i < DTRACE_NCLIENTS; i++) {
19173 if (clients & (1 << i)) {
19174 dtrace_state_t *state = dtrace_state_get(i);
19175 if (state) {
19176 wakeup(state);
19177 }
19178
19179 }
19180 }
19181 enable_preemption();
19182 }
19183
19184
19185 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
19186
19187 static const struct cdevsw dtrace_cdevsw =
19188 {
19189 .d_open = _dtrace_open,
19190 .d_close = _dtrace_close,
19191 .d_read = eno_rdwrt,
19192 .d_write = eno_rdwrt,
19193 .d_ioctl = _dtrace_ioctl,
19194 .d_stop = (stop_fcn_t *)nulldev,
19195 .d_reset = (reset_fcn_t *)nulldev,
19196 .d_select = eno_select,
19197 .d_mmap = eno_mmap,
19198 .d_strategy = eno_strat,
19199 .d_reserved_1 = eno_getc,
19200 .d_reserved_2 = eno_putc,
19201 };
19202
19203 lck_attr_t* dtrace_lck_attr;
19204 lck_grp_attr_t* dtrace_lck_grp_attr;
19205 lck_grp_t* dtrace_lck_grp;
19206
19207 static int gMajDevNo;
19208
19209 void dtrace_early_init (void)
19210 {
19211 dtrace_restriction_policy_load();
19212
19213 /*
19214 * See dtrace_impl.h for a description of kernel symbol modes.
19215 * The default is to wait for symbols from userspace (lazy symbols).
19216 */
19217 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19218 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19219 }
19220 }
19221
19222 void
19223 dtrace_init( void )
19224 {
19225 if (0 == gDTraceInited) {
19226 unsigned int i, ncpu;
19227 size_t size = sizeof(dtrace_buffer_memory_maxsize);
19228
19229 /*
19230 * Disable destructive actions when dtrace is running
19231 * in a restricted environment
19232 */
19233 dtrace_destructive_disallow = dtrace_is_restricted() &&
19234 !dtrace_are_restrictions_relaxed();
19235
19236 /*
19237 * DTrace allocates buffers based on the maximum number
19238 * of enabled cpus. This call avoids any race when finding
19239 * that count.
19240 */
19241 ASSERT(dtrace_max_cpus == 0);
19242 ncpu = dtrace_max_cpus = ml_wait_max_cpus();
19243
19244 /*
19245 * Retrieve the size of the physical memory in order to define
19246 * the maximum size of the state buffer memory. If we cannot
19247 * retrieve this value, we'll assume 1 GB of memory per CPU; that's
19248 * still better than raising a kernel panic.
19249 */
19250 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19251 &size, NULL, 0))
19252 {
19253 dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
19254 printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
19255 dtrace_buffer_memory_maxsize);
19256 }
19257
19258 /*
19259 * Finally, divide by three to prevent DTrace from eating too
19260 * much memory.
19261 */
19262 dtrace_buffer_memory_maxsize /= 3;
19263 ASSERT(dtrace_buffer_memory_maxsize > 0);
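		/*
		 * Worked example: with hw.memsize reporting 16 GB, the cap
		 * becomes 16 GB / 3 (about 5.3 GB); if the sysctl fails on an
		 * 8-CPU machine, the fallback cap is (8 * 1 GB) / 3 (about
		 * 2.7 GB).
		 */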

		gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);

		if (gMajDevNo < 0) {
			printf("dtrace_init: failed to allocate a major number!\n");
			gDTraceInited = 0;
			return;
		}

		if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
		    dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
			printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
			gDTraceInited = 0;
			return;
		}
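		/*
		 * /dev/dtrace is a cloning node: every open(2) is routed through
		 * dtrace_clone_func() above, so each consumer is handed its own
		 * minor number and, with it, its own consumer state.
		 */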

		/*
		 * Create the dtrace lock group and attrs.
		 */
		dtrace_lck_attr = lck_attr_alloc_init();
		dtrace_lck_grp_attr = lck_grp_attr_alloc_init();
		dtrace_lck_grp = lck_grp_alloc_init("dtrace", dtrace_lck_grp_attr);

		/*
		 * We have to initialize all locks explicitly.
		 */
		lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
		lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
		lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
		lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
#if DEBUG
		lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
#endif
		lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);

		/*
		 * The cpu_core structure consists of per-CPU state available in any context.
		 * On some architectures, this may mean that the page(s) containing the
		 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
		 * is up to the platform to assure that this is performed properly. Note that
		 * the structure is sized to avoid false sharing.
		 */
		lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
		lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
		lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);

		/*
		 * Initialize the CPU offline/online hooks.
		 */
		dtrace_install_cpu_hooks();

		dtrace_modctl_list = NULL;

		cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
		for (i = 0; i < ncpu; ++i) {
			lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
		}

		cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
		for (i = 0; i < ncpu; ++i) {
			cpu_list[i].cpu_id = (processorid_t)i;
			cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
			LIST_INIT(&cpu_list[i].cpu_cyc_list);
			lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
		}
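		/*
		 * The (i+1) % ncpu link above makes cpu_list a circular singly
		 * linked list: the last CPU points back at cpu_list[0], so a
		 * walker may start at any CPU and stop when it returns to its
		 * starting point.
		 */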

		lck_mtx_lock(&cpu_lock);
		for (i = 0; i < ncpu; ++i) {
			/* FIXME: track CPU configuration */
			dtrace_cpu_setup_initial((processorid_t)i); /* In lieu of register_cpu_setup_func() callback */
		}
		lck_mtx_unlock(&cpu_lock);

		(void)dtrace_abs_to_nano(0LL); /* Force a one-time call to clock_timebase_info (which can take a lock) */

		dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
		    offsetof(dtrace_string_t, dtst_str),
		    offsetof(dtrace_string_t, dtst_next),
		    offsetof(dtrace_string_t, dtst_prev));

		dtrace_isa_init();

		/*
		 * See dtrace_impl.h for a description of dof modes.
		 * The default is lazy dof on macOS and DTRACE_DOF_MODE_NEVER
		 * on other platforms.
		 *
		 * FIXME: Warn if the state is LAZY_OFF? It won't break anything, but
		 * makes no sense...
		 */
		if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
#if defined(XNU_TARGET_OS_OSX)
			dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
#else
			dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
#endif
		}

		/*
		 * Sanity check of dof mode value.
		 */
		switch (dtrace_dof_mode) {
		case DTRACE_DOF_MODE_NEVER:
		case DTRACE_DOF_MODE_LAZY_ON:
			/* valid modes, but nothing else we need to do */
			break;

		case DTRACE_DOF_MODE_LAZY_OFF:
		case DTRACE_DOF_MODE_NON_LAZY:
			/* Cannot wait for a dtrace_open to init fasttrap */
			fasttrap_init();
			break;

		default:
			/* Invalid, clamp to non-lazy */
			dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
			fasttrap_init();
			break;
		}
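		/*
		 * In the lazy modes, fasttrap_init() can be deferred until the
		 * first dtrace_open(). In the non-lazy modes, DOF from user
		 * processes is consumed as those processes launch, which can
		 * happen long before any consumer opens /dev/dtrace, so
		 * fasttrap must be brought up here.
		 */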

#if CONFIG_DTRACE
		if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
			commpage_update_dof(true);
#endif

		gDTraceInited = 1;

	} else
		panic("dtrace_init: called twice!\n");
}

void
dtrace_postinit(void)
{
	/*
	 * Called from bsd_init after all providers' *_init() routines have been
	 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
	 * to go.
	 */
	dtrace_attach((dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */

	/*
	 * Add mach_kernel to the module list for lazy processing.
	 */
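	/*
	 * The kernel proper is not a kext, so it never arrives through the
	 * normal kmod load path. Synthesize a kmod_info for it -- id 1,
	 * spanning the kernel's own load address and size taken from
	 * g_kernel_kmod_info -- so the same modctl bookkeeping used for
	 * kexts also covers mach_kernel.
	 */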
	struct kmod_info fake_kernel_kmod;
	memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));

	strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
	fake_kernel_kmod.id = 1;
	fake_kernel_kmod.address = g_kernel_kmod_info.address;
	fake_kernel_kmod.size = g_kernel_kmod_info.size;

	if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
		printf("dtrace_postinit: Could not register mach_kernel modctl\n");
	}

	(void)OSKextRegisterKextsWithDTrace();
}
#undef DTRACE_MAJOR

/*
 * Routines used to register interest in CPUs being added to or removed
 * from the system. On xnu these are no-op stubs kept for source
 * compatibility with the Solaris framework code; CPU configuration is
 * tracked via dtrace_install_cpu_hooks() instead.
 */
void
register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
{
#pragma unused(ignore1,ignore2)
}

void
unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
{
#pragma unused(ignore1,ignore2)
}