bsd/dev/dtrace/dtrace.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /* #pragma ident        "@(#)dtrace.c   1.65    08/07/02 SMI" */
  28
  29 /*
  30  * DTrace - Dynamic Tracing for Solaris
  31  *
  32  * This is the implementation of the Solaris Dynamic Tracing framework
  33  * (DTrace).  The user-visible interface to DTrace is described at length in
  34  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
  35  * library, the in-kernel DTrace framework, and the DTrace providers are
  36  * described in the block comments in the <sys/dtrace.h> header file.  The
  37  * internal architecture of DTrace is described in the block comments in the
  38  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
  39  * implementation very much assume mastery of all of these sources; if one has
  40  * an unanswered question about the implementation, one should consult them
  41  * first.
  42  *
  43  * The functions here are ordered roughly as follows:
  44  *
  45  *   - Probe context functions
  46  *   - Probe hashing functions
  47  *   - Non-probe context utility functions
  48  *   - Matching functions
  49  *   - Provider-to-Framework API functions
  50  *   - Probe management functions
  51  *   - DIF object functions
  52  *   - Format functions
  53  *   - Predicate functions
  54  *   - ECB functions
  55  *   - Buffer functions
  56  *   - Enabling functions
  57  *   - DOF functions
  58  *   - Anonymous enabling functions
  59  *   - Consumer state functions
  60  *   - Helper functions
  61  *   - Hook functions
  62  *   - Driver cookbook functions
  63  *
  64  * Each group of functions begins with a block comment labelled the "DTrace
  65  * [Group] Functions", allowing one to find each block by searching forward
  66  * on capital-f functions.
  67  */
  68 #if !defined(__APPLE__)
  69 #include <sys/errno.h>
  70 #include <sys/stat.h>
  71 #include <sys/modctl.h>
  72 #include <sys/conf.h>
  73 #include <sys/systm.h>
  74 #include <sys/ddi.h>
  75 #include <sys/sunddi.h>
  76 #include <sys/cpuvar.h>
  77 #include <sys/kmem.h>
  78 #include <sys/strsubr.h>
  79 #include <sys/sysmacros.h>
  80 #include <sys/dtrace_impl.h>
  81 #include <sys/atomic.h>
  82 #include <sys/cmn_err.h>
  83 #include <sys/mutex_impl.h>
  84 #include <sys/rwlock_impl.h>
  85 #include <sys/ctf_api.h>
  86 #include <sys/panic.h>
  87 #include <sys/priv_impl.h>
  88 #include <sys/policy.h>
  89 #include <sys/cred_impl.h>
  90 #include <sys/procfs_isa.h>
  91 #include <sys/taskq.h>
  92 #include <sys/mkdev.h>
  93 #include <sys/kdi.h>
  94 #include <sys/zone.h>
  95 #else
  96 #include <sys/errno.h>
  97 #include <sys/types.h>
  98 #include <sys/stat.h>
  99 #include <sys/conf.h>
 100 #include <sys/systm.h>
 101 #include <sys/dtrace_impl.h>
 102 #include <sys/param.h>
 103 #include <sys/proc_internal.h>
 104 #include <sys/ioctl.h>
 105 #include <sys/fcntl.h>
 106 #include <miscfs/devfs/devfs.h>
 107 #include <sys/malloc.h>
 108 #include <sys/kernel_types.h>
 109 #include <sys/proc_internal.h>
 110 #include <sys/uio_internal.h>
 111 #include <sys/kauth.h>
 112 #include <vm/pmap.h>
 113 #include <sys/user.h>
 114 #include <mach/exception_types.h>
 115 #include <sys/signalvar.h>
 116 #include <mach/task.h>
 117 #include <kern/zalloc.h>
 118 #include <kern/ast.h>
 119 #include <netinet/in.h>
 120
 121 #if defined(__APPLE__)
 122 #include <kern/cpu_data.h>
 123 extern uint32_t pmap_find_phys(void *, uint64_t);
 124 extern boolean_t pmap_valid_page(uint32_t);
 125 extern void OSKextRegisterKextsWithDTrace(void);
 126 extern kmod_info_t g_kernel_kmod_info;
 127 #endif /* __APPLE__ */
 128
 129
 130 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
 131 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
 132
 133 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
 134
 135 extern void dtrace_suspend(void);
 136 extern void dtrace_resume(void);
 137 extern void dtrace_init(void);
 138 extern void helper_init(void);
 139 extern void fasttrap_init(void);
 140 extern void dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
 141 extern void dtrace_lazy_dofs_destroy(proc_t *);
 142 extern void dtrace_postinit(void);
 143
 144 #include "../../../osfmk/chud/chud_dtrace.h"
 145
 146 extern kern_return_t chudxnu_dtrace_callback
 147         (uint64_t selector, uint64_t *args, uint32_t count);
 148
 149 #endif /* __APPLE__ */
 150
 151 /*
 152  * DTrace Tunable Variables
 153  *
 154  * The following variables may be tuned by adding a line to /etc/system that
 155  * includes both the name of the DTrace module ("dtrace") and the name of the
 156  * variable.  For example:
 157  *
 158  *   set dtrace:dtrace_destructive_disallow = 1
 159  *
 160  * In general, the only variables that one should be tuning this way are those
 161  * that affect system-wide DTrace behavior, and for which the default behavior
 162  * is undesirable.  Most of these variables are tunable on a per-consumer
 163  * basis using DTrace options, and need not be tuned on a system-wide basis.
 164  * When tuning these variables, avoid pathological values; while some attempt
 165  * is made to verify the integrity of these variables, they are not considered
 166  * part of the supported interface to DTrace, and they are therefore not
 167  * checked comprehensively.  Further, these variables should not be tuned
 168  * dynamically via "mdb -kw" or other means; they should only be tuned via
 169  * /etc/system.
 170  */
 171 int             dtrace_destructive_disallow = 0;
 172 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
 173 size_t          dtrace_difo_maxsize = (256 * 1024);
 174 dtrace_optval_t dtrace_dof_maxsize = (384 * 1024);
 175 size_t          dtrace_global_maxsize = (16 * 1024);
 176 size_t          dtrace_actions_max = (16 * 1024);
 177 size_t          dtrace_retain_max = 1024;
 178 dtrace_optval_t dtrace_helper_actions_max = 32;
 179 dtrace_optval_t dtrace_helper_providers_max = 64;
 180 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
 181 size_t          dtrace_strsize_default = 256;
 182 dtrace_optval_t dtrace_cleanrate_default = 9900990;             /* 101 hz */
 183 dtrace_optval_t dtrace_cleanrate_min = 200000;                  /* 5000 hz */
 184 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
 185 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
 186 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
 187 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
 188 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
 189 dtrace_optval_t dtrace_nspec_default = 1;
 190 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
 191 dtrace_optval_t dtrace_stackframes_default = 20;
 192 dtrace_optval_t dtrace_ustackframes_default = 20;
 193 dtrace_optval_t dtrace_jstackframes_default = 50;
 194 dtrace_optval_t dtrace_jstackstrsize_default = 512;
 195 int             dtrace_msgdsize_max = 128;
 196 hrtime_t        dtrace_chill_max = 500 * (NANOSEC / MILLISEC);  /* 500 ms */
 197 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
 198 int             dtrace_devdepth_max = 32;
 199 int             dtrace_err_verbose;
 200 hrtime_t        dtrace_deadman_interval = NANOSEC;              /* 1000 ms */
 201 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; /* 10 sec */
 202 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;   /* 30 sec */
 203
 204 /*
 205  * DTrace External Variables
 206  *
 207  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 208  * available to DTrace consumers via the backtick (`) syntax.  One of these,
 209  * dtrace_zero, is made deliberately so:  it is provided as a source of
 210  * well-known, zero-filled memory.  While this variable is not documented,
 211  * it is used by some translators as an implementation detail.
 212  */
 213 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
 214
 215 /*
 216  * DTrace Internal Variables
 217  */
 218 static dev_info_t       *dtrace_devi;           /* device info */
 219 static vmem_t           *dtrace_arena;          /* probe ID arena */
 220 static vmem_t           *dtrace_minor;          /* minor number arena */
 221 static taskq_t          *dtrace_taskq;          /* task queue */
 222 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
 223 static int              dtrace_nprobes;         /* number of probes */
 224 static dtrace_provider_t *dtrace_provider;      /* provider list */
 225 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
 226 static int              dtrace_opens;           /* number of opens */
 227 static int              dtrace_helpers;         /* number of helpers */
 228 static void             *dtrace_softstate;      /* softstate pointer */
 229 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
 230 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
 231 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
 232 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
 233 static int              dtrace_toxranges;       /* number of toxic ranges */
 234 static int              dtrace_toxranges_max;   /* size of toxic range array */
 235 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
 236 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
 237 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
 238 static kthread_t        *dtrace_panicked;       /* panicking thread */
 239 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
 240 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
 241 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
 242 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
 243 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
 244 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
 245 #if defined(__APPLE__)
 246 static int              dtrace_dof_mode;        /* See dtrace_impl.h for a description of Darwin's dof modes. */
 247
 248                         /*
 249                          * This doesn't quite fit as an internal variable, as it must be accessed in
 250                          * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
 251                          */
 252 int                     dtrace_kernel_symbol_mode;      /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
 253 #endif
 254
 255 #if defined(__APPLE__)
 256 /*
 257  * To save memory, some common memory allocations are given a
 258  * unique zone. For example, dtrace_probe_t is 72 bytes in size,
 259  * which means it would fall into the kalloc.128 bucket. With
 260  * 20k elements allocated, the space saved is substantial.
 261  */
 262
 263 struct zone *dtrace_probe_t_zone;
 264
 265 static int dtrace_module_unloaded(struct kmod_info *kmod);
 266 #endif /* __APPLE__ */
 267
 268 /*
 269  * DTrace Locking
 270  * DTrace is protected by three (relatively coarse-grained) locks:
 271  *
 272  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 273  *     including enabling state, probes, ECBs, consumer state, helper state,
 274  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 275  *     probe context is lock-free -- synchronization is handled via the
 276  *     dtrace_sync() cross call mechanism.
 277  *
 278  * (2) dtrace_provider_lock is required when manipulating provider state, or
 279  *     when provider state must be held constant.
 280  *
 281  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 282  *     when meta provider state must be held constant.
 283  *
 284  * The lock ordering between these three locks is dtrace_meta_lock before
 285  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 286  * several places where dtrace_provider_lock is held by the framework as it
 287  * calls into the providers -- which then call back into the framework,
 288  * grabbing dtrace_lock.)
 289  *
 290  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 291  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 292  * role as a coarse-grained lock; it is acquired before both of these locks.
 293  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 294  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 295  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 296  * acquired _between_ dtrace_provider_lock and dtrace_lock.
 297  */
 298
 299 #if !defined(__APPLE__)
 300 static kmutex_t         dtrace_lock;            /* probe state lock */
 301 static kmutex_t         dtrace_provider_lock;   /* provider state lock */
 302 static kmutex_t         dtrace_meta_lock;       /* meta-provider state lock */
 303 #else
 304 /*
 305  * APPLE NOTE:
 306  *
 307  * All kmutex_t vars have been changed to lck_mtx_t.
 308  * Note that lck_mtx_t's require explicit initialization.
 309  *
 310  * mutex_enter() becomes lck_mtx_lock()
 311  * mutex_exit() becomes lck_mtx_unlock()
 312  *
 313  * Lock asserts are changed like this:
 314  *
 315  * ASSERT(MUTEX_HELD(&cpu_lock));
 316  *      becomes:
 317  * lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 318  *
 319  * Due to the number of these changes, they are not called out explicitly.
 320  */
 321 static lck_mtx_t        dtrace_lock;            /* probe state lock */
 322 static lck_mtx_t        dtrace_provider_lock;   /* provider state lock */
 323 static lck_mtx_t        dtrace_meta_lock;       /* meta-provider state lock */
 324 static lck_rw_t         dtrace_dof_mode_lock;   /* dof mode lock */
 325 #endif /* __APPLE__ */
 326
 327 /*
 328  * DTrace Provider Variables
 329  *
 330  * These are the variables relating to DTrace as a provider (that is, the
 331  * provider of the BEGIN, END, and ERROR probes).
 332  */
 333 static dtrace_pattr_t   dtrace_provider_attr = {
 334 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
 335 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 336 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 337 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
 338 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
 339 };
 340
     /*
      * No-op callback: takes no arguments and does nothing.  It is never
      * referenced under its own type in this part of the file; instead it is
      * cast to the void-returning function-pointer types that fill most slots
      * of dtrace_provider_ops.
      */
 341 static void
 342 dtrace_nullop(void)
 343 {}
 344
     /*
      * Counterpart of dtrace_nullop() for a slot whose callback returns int:
      * it takes no arguments and always returns 0.  dtrace_provider_ops casts
      * it to int (*)(void *, dtrace_id_t, void *).
      */
 345 static int
 346 dtrace_enable_nullop(void)
 347 {
 348     return (0);
 349 }
 350
     /*
      * Operations vector used when DTrace acts as its own provider (see the
      * "DTrace Provider Variables" comment).  Every populated slot is one of
      * the two no-op functions cast to the slot's pointer type; three slots
      * are left NULL.  Initializers are positional, so their order must match
      * the member order of dtrace_pops_t.
      *
      * NOTE(review): calling a function through a pointer of a different
      * function type is undefined in ISO C; these casts rely on the platform
      * calling convention tolerating arguments that the callee ignores.
      */
 351 static dtrace_pops_t    dtrace_provider_ops = {
 352         (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
 353         (void (*)(void *, struct modctl *))dtrace_nullop,
 354         (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
 355         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 356         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 357         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 358         NULL,
 359         NULL,
 360         NULL,
 361         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
 362 };
 363
 364 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
 365 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
 366 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
 367
 368 /*
 369  * DTrace Helper Tracing Variables
 370  */
 371 uint32_t dtrace_helptrace_next = 0;
 372 uint32_t dtrace_helptrace_nlocals;
 373 char    *dtrace_helptrace_buffer;
 374 #if !defined(__APPLE__) /* Quiet compiler warning */
 375 int     dtrace_helptrace_bufsize = 512 * 1024;
 376 #else
 377 size_t  dtrace_helptrace_bufsize = 512 * 1024;
 378 #endif /* __APPLE__ */
 379
 380 #if DEBUG
 381 int     dtrace_helptrace_enabled = 1;
 382 #else
 383 int     dtrace_helptrace_enabled = 0;
 384 #endif
 385
 386 /*
 387  * DTrace Error Hashing
 388  *
 389  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 390  * table.  This is very useful for checking coverage of tests that are
 391  * expected to induce DIF or DOF processing errors, and may be useful for
 392  * debugging problems in the DIF code generator or in DOF generation.  The
 393  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 394  */
 395 #if DEBUG
 396 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 397 static const char *dtrace_errlast;
 398 static kthread_t *dtrace_errthread;
 399 static lck_mtx_t dtrace_errlock;
 400 #endif
 401
 402 /*
 403  * DTrace Macros and Constants
 404  *
 405  * These are various macros that are useful in various spots in the
 406  * implementation, along with a few random constants that have no meaning
 407  * outside of the implementation.  There is no real structure to this cpp
 408  * mishmash -- but is there ever?
 409  */
     /*
      * Probe hash accessors.  A hash describes its key and chain links purely
      * as byte offsets into the hashed object (dth_stroffs, dth_nextoffs,
      * dth_prevoffs), so each macro adds the relevant offset to the probe's
      * address and reinterprets the result.
      *
      * DTRACE_HASHSTR: passes the char * stored dth_stroffs bytes into the
      * probe to dtrace_hash_str().
      */
 410 #define DTRACE_HASHSTR(hash, probe)     \
 411         dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
 412
     /*
      * DTRACE_HASHNEXT: address of the dtrace_probe_t * link stored
      * dth_nextoffs bytes into the probe.  The expansion is a cast expression
      * that is not wrapped in parentheses.
      */
 413 #define DTRACE_HASHNEXT(hash, probe)    \
 414         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
 415
     /*
      * DTRACE_HASHPREV: same as DTRACE_HASHNEXT, using dth_prevoffs.
      */
 416 #define DTRACE_HASHPREV(hash, probe)    \
 417         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
 418
     /*
      * DTRACE_HASHEQ: non-zero when the key strings of lhs and rhs (the
      * char * found at dth_stroffs in each) compare equal under strcmp().
      */
 419 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
 420         (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
 421             *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
 422
 423 #define DTRACE_AGGHASHSIZE_SLEW         17
 424
 425 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
 426
 427 /*
 428  * The key for a thread-local variable consists of the lower 61 bits of the
 429  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 430  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 431  * equal to a variable identifier.  This is necessary (but not sufficient) to
 432  * assure that global associative arrays never collide with thread-local
 433  * variables.  To guarantee that they cannot collide, we must also define the
 434  * order for keying dynamic variables.  That order is:
 435  *
 436  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 437  *
 438  * Because the variable-key and the tls-key are in orthogonal spaces, there is
 439  * no way for a global variable key signature to match a thread-local key
 440  * signature.
 441  */
 442 #if !defined(__APPLE__)
     /*
      * DTRACE_TLS_THRKEY(where) expands to a brace-enclosed block that assigns
      * a 64-bit key to `where`: the low 61 bits come from the thread identity
      * plus DIF_VARIABLE_MAX, the top 3 bits from `intr`.
      *
      * Solaris variant: intr is the number of right shifts needed to clear
      * cpu_intr_actv >> (LOCK_LEVEL + 1); the thread identity is
      * curthread->t_did.
      */
 443 #define DTRACE_TLS_THRKEY(where) { \
 444         uint_t intr = 0; \
 445         uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
 446         for (; actv; actv >>= 1) \
 447                 intr++; \
 448         ASSERT(intr < (1 << 3)); \
 449         (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
 450             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 451 }
 452 #else
     /*
      * Darwin variants: there is no t_did here, so the value of the
      * current_thread() pointer is used as the thread identity, and intr is
      * whatever ml_at_interrupt_context() returns.
      *
      * x86_64: the key is built from the full thread pointer value.
      */
 453 #if defined(__x86_64__)
 454 /* FIXME: two function calls!! */
 455 #define DTRACE_TLS_THRKEY(where) { \
 456         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
 457         uint64_t thr = (uintptr_t)current_thread(); \
 458         ASSERT(intr < (1 << 3)); \
 459         (where) = ((thr + DIF_VARIABLE_MAX) & \
 460             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 461 }
 462 #else
     /*
      * Other architectures: the thread pointer value is shifted left by 32
      * (so only its low 32 bits survive in the 64-bit thr) and the result of
      * proc_selfpid() is OR-ed into the low half before DIF_VARIABLE_MAX is
      * added.
      */
 463 /* FIXME: three function calls!!! */
 464 #define DTRACE_TLS_THRKEY(where) { \
 465         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
 466         uint64_t thr = (uintptr_t)current_thread(); \
 467         uint_t pid = (uint_t)proc_selfpid(); \
 468         ASSERT(intr < (1 << 3)); \
 469         (where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
 470             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 471 }
 472 #endif
 473 #endif /* __APPLE__ */
 474
     /*
      * Byte-swap helpers, each built from the next narrower one: DT_BSWAP_8
      * keeps only the low byte of x, and every wider macro swaps its two
      * halves while byte-swapping each half.  These are plain expressions, so
      * x is evaluated several times and the shifts are carried out in x's
      * promoted type; DT_BSWAP_64 therefore needs a 64-bit operand for its
      * "<< 32" to be well defined.
      */
 475 #define DT_BSWAP_8(x)   ((x) & 0xff)
 476 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
 477 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
 478 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
 479
 480 #define DT_MASK_LO 0x00000000FFFFFFFFULL
 481
 482 #define DTRACE_STORE(type, tomax, offset, what) \
 483         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
 484
 485 #if !defined(__APPLE__)
 486 #ifndef __i386
 487 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
 488         if (addr & (size - 1)) {                                        \
 489                 *flags |= CPU_DTRACE_BADALIGN;                          \
 490                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 491                 return (0);                                             \
 492         }
 493 #else
 494 #define DTRACE_ALIGNCHECK(addr, size, flags)
 495 #endif
 496 #else /* __APPLE__ */
 497 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
 498         if (addr & (MIN(size,4) - 1)) {                                 \
 499                 *flags |= CPU_DTRACE_BADALIGN;                          \
 500                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 501                 return (0);                                             \
 502         }
 503 #endif /* __APPLE__ */
 504
 505 /*
 506  * Test whether a range of memory starting at testaddr of size testsz falls
 507  * within the range of memory described by baseaddr, basesz.  We take care to avoid
 508  * problems with overflow and underflow of the unsigned quantities, and
 509  * disallow all negative sizes.  Ranges of size 0 are allowed.
 510  */
 511 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
 512         ((testaddr) - (baseaddr) < (basesz) && \
 513         (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
 514         (testaddr) + (testsz) >= (testaddr))
 515
 516 /*
 517  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 518  * alloc_sz on the righthand side of the comparison in order to avoid overflow
 519  * or underflow in the comparison with it.  This is simpler than the INRANGE
 520  * check above, because we know that the dtms_scratch_ptr is valid in the
 521  * range.  Allocations of size zero are allowed.
 522  */
 523 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
 524         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
 525         (mstate)->dtms_scratch_ptr >= (alloc_sz))
 526
 527 #if !defined(__APPLE__)
     /*
      * DTRACE_LOADFUNC(bits) expands to the definition of dtrace_load<bits>(),
      * which loads a <bits>-wide unsigned integer from addr behind three
      * guards, applied in this order:
      *
      *   1. DTRACE_ALIGNCHECK may flag CPU_DTRACE_BADALIGN and return 0.
      *   2. If [addr, addr + size) overlaps any dtrace_toxrange[] entry, the
      *      function flags CPU_DTRACE_BADADDR, records addr in
      *      cpuc_dtrace_illval and returns 0.
      *   3. CPU_DTRACE_NOFAULT is set in the per-CPU flags only around the raw
      *      load; if CPU_DTRACE_FAULT is found set afterwards, 0 is returned
      *      instead of the loaded value.
      */
 528 #define DTRACE_LOADFUNC(bits)                                           \
 529 /*CSTYLED*/                                                             \
 530 uint##bits##_t                                                          \
 531 dtrace_load##bits(uintptr_t addr)                                       \
 532 {                                                                       \
 533         size_t size = bits / NBBY;                                      \
 534         /*CSTYLED*/                                                     \
 535         uint##bits##_t rval;                                            \
 536         int i;                                                          \
 537         volatile uint16_t *flags = (volatile uint16_t *)                \
 538             &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                   \
 539                                                                         \
 540         DTRACE_ALIGNCHECK(addr, size, flags);                           \
 541                                                                         \
 542         for (i = 0; i < dtrace_toxranges; i++) {                        \
 543                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
 544                         continue;                                       \
 545                                                                         \
 546                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
 547                         continue;                                       \
 548                                                                         \
 549                 /*                                                      \
 550                  * This address falls within a toxic region; return 0.  \
 551                  */                                                     \
 552                 *flags |= CPU_DTRACE_BADADDR;                           \
 553                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 554                 return (0);                                             \
 555         }                                                               \
 556                                                                         \
 557         *flags |= CPU_DTRACE_NOFAULT;                                   \
 558         /*CSTYLED*/                                                     \
 559         rval = *((volatile uint##bits##_t *)addr);                      \
 560         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
 561                                                                         \
 562         return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);               \
 563 }
 564 #else /* __APPLE__ */
     /*
      * RECOVER_LABEL(bits) expands to the statement label
      * dtraceLoadRecover<bits>, whose address is taken inside
      * DTRACE_LOADFUNC with the GNU C "&&label" extension.
      */
 565 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
 566
 567 #if (defined(__i386__) || defined (__x86_64__))
     /*
      * Darwin i386/x86_64 DTRACE_LOADFUNC(bits): emits a prototype and the
      * definition of dtrace_load<bits>().  rval starts at 0 and is returned
      * unconditionally at the end; CPU_DTRACE_FAULT is not consulted here.
      *
      * After the alignment check and the toxic-range scan (either of which
      * may return 0 early), the function:
      *   - installs the address of the recover label for current_thread()
      *     via dtrace_set_thread_recover(), keeping the previous value;
      *   - sets CPU_DTRACE_NOFAULT;
      *   - performs the load only when
      *     pmap_valid_page(pmap_find_phys(kernel_pmap, addr)) is non-zero,
      *     otherwise rval stays 0;
      *   - at the label, restores the previous recover value and clears
      *     CPU_DTRACE_NOFAULT.
      *
      * NOTE(review): presumably a fault taken during the load resumes at the
      * recover label; the fault handler is not visible in this file section.
      */
 568 #define DTRACE_LOADFUNC(bits)                                           \
 569 /*CSTYLED*/                                                             \
 570 uint##bits##_t dtrace_load##bits(uintptr_t addr);                       \
 571                                                                         \
 572 uint##bits##_t                                                          \
 573 dtrace_load##bits(uintptr_t addr)                                       \
 574 {                                                                       \
 575         size_t size = bits / NBBY;                                      \
 576         /*CSTYLED*/                                                     \
 577         uint##bits##_t rval = 0;                                        \
 578         int i;                                                          \
 579         volatile uint16_t *flags = (volatile uint16_t *)                \
 580             &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                   \
 581                                                                         \
 582         DTRACE_ALIGNCHECK(addr, size, flags);                           \
 583                                                                         \
 584         for (i = 0; i < dtrace_toxranges; i++) {                        \
 585                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
 586                         continue;                                       \
 587                                                                         \
 588                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
 589                         continue;                                       \
 590                                                                         \
 591                 /*                                                      \
 592                  * This address falls within a toxic region; return 0.  \
 593                  */                                                     \
 594                 *flags |= CPU_DTRACE_BADADDR;                           \
 595                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 596                 return (0);                                             \
 597         }                                                               \
 598                                                                         \
 599         {                                                               \
 600         volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;          \
 601         *flags |= CPU_DTRACE_NOFAULT;                                   \
 602         recover = dtrace_set_thread_recover(current_thread(), recover); \
 603         /*CSTYLED*/                                                     \
 604         /*                                                              \
 605         * PR6394061 - avoid device memory that is unpredictably         \
 606         * mapped and unmapped                                           \
 607         */                                                              \
 608         if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))         \
 609             rval = *((volatile uint##bits##_t *)addr);                  \
 610         RECOVER_LABEL(bits);                                            \
 611         (void)dtrace_set_thread_recover(current_thread(), recover);     \
 612         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
 613         }                                                               \
 614                                                                         \
 615         return (rval);                                                  \
 616 }
 617 #else /* all other architectures */
     /*
      * DTRACE_LOADFUNC(bits) expands to a prototype and a definition of
      * dtrace_load<bits>(uintptr_t addr), a load of a uint<bits>_t from addr.
      * The generated function:
      *
      *   - runs DTRACE_ALIGNCHECK on addr for an access of bits / NBBY bytes;
      *
      *   - walks dtrace_toxrange[]; if [addr, addr + size) overlaps any entry
      *     it sets CPU_DTRACE_BADADDR, records addr in cpuc_dtrace_illval and
      *     returns 0 without touching memory;
      *
      *   - otherwise sets CPU_DTRACE_NOFAULT, installs the address of the
      *     dtraceLoadRecover<bits> label with dtrace_set_thread_recover(),
      *     performs the volatile load, then restores the previous recover
      *     value and clears CPU_DTRACE_NOFAULT.
      *
      * rval starts out as 0, so 0 is what gets returned if the load never
      * stores into it.  NOTE(review): presumably the fault path resumes at
      * RECOVER_LABEL(bits) -- confirm against the trap handling code.
      */
 618 #define DTRACE_LOADFUNC(bits)                                           \
 619 /*CSTYLED*/                                                             \
 620 uint##bits##_t dtrace_load##bits(uintptr_t addr);                       \
 621                                                                         \
 622 uint##bits##_t                                                          \
 623 dtrace_load##bits(uintptr_t addr)                                       \
 624 {                                                                       \
 625         size_t size = bits / NBBY;                                      \
 626         /*CSTYLED*/                                                     \
 627         uint##bits##_t rval = 0;                                        \
 628         int i;                                                          \
 629         volatile uint16_t *flags = (volatile uint16_t *)                \
 630             &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                   \
 631                                                                         \
 632         DTRACE_ALIGNCHECK(addr, size, flags);                           \
 633                                                                         \
 634         for (i = 0; i < dtrace_toxranges; i++) {                        \
 635                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
 636                         continue;                                       \
 637                                                                         \
 638                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
 639                         continue;                                       \
 640                                                                         \
 641                 /*                                                      \
 642                  * This address falls within a toxic region; return 0.  \
 643                  */                                                     \
 644                 *flags |= CPU_DTRACE_BADADDR;                           \
 645                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 646                 return (0);                                             \
 647         }                                                               \
 648                                                                         \
 649         {                                                               \
 650         volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;          \
 651         *flags |= CPU_DTRACE_NOFAULT;                                   \
 652         recover = dtrace_set_thread_recover(current_thread(), recover); \
 653         /*CSTYLED*/     \
 654         rval = *((volatile uint##bits##_t *)addr);                      \
 655         RECOVER_LABEL(bits);                                            \
 656         (void)dtrace_set_thread_recover(current_thread(), recover);     \
 657         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
 658         }                                                               \
 659                                                                         \
 660         return (rval);                                                  \
 661 }
 662 #endif
 663 #endif /* __APPLE__ */
 664
     /*
      * dtrace_loadptr is the pointer-width safe load: it aliases the 64-bit
      * loader when __LP64__ is defined and the 32-bit loader otherwise.
      */
 665 #ifdef __LP64__
 666 #define dtrace_loadptr  dtrace_load64
 667 #else
 668 #define dtrace_loadptr  dtrace_load32
 669 #endif
 670
     /*
      * NOTE(review): presumably reserved hash values marking the state of a
      * dynamic variable on a hash chain -- confirm against dtrace_dynvar().
      */
 671 #define DTRACE_DYNHASH_FREE     0
 672 #define DTRACE_DYNHASH_SINK     1
 673 #define DTRACE_DYNHASH_VALID    2
 674
     /*
      * NOTE(review): the DTRACE_MATCH_* values look like result codes for the
      * probe matching callbacks -- confirm at their use sites.
      * DTRACE_ANCHORED(probe) is true when the probe's dtpr_func string is
      * non-empty.  NOTE(review): DTRACE_STATE_ALIGN is presumably an alignment
      * in bytes -- confirm where it is applied.
      */
 675 #define DTRACE_MATCH_FAIL       -1
 676 #define DTRACE_MATCH_NEXT       0
 677 #define DTRACE_MATCH_DONE       1
 678 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
 679 #define DTRACE_STATE_ALIGN      64
 680
     /*
      * Translate a set of CPU_DTRACE_* flag bits into a single DTRACEFLT_*
      * code.  The tests are evaluated top to bottom, so when several bits are
      * set the first match in the list below wins (CPU_DTRACE_BADADDR has the
      * highest priority).  DTRACEFLT_UNKNOWN is produced when none of the
      * listed bits is set.
      */
 681 #define DTRACE_FLAGS2FLT(flags)                                         \
 682         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
 683         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
 684         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
 685         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
 686         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
 687         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
 688         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
 689         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
 690         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
 691         DTRACEFLT_UNKNOWN)
 692
     /*
      * True when the action is a DIF expression (DTRACEACT_DIFEXPR) whose DIF
      * object has a return type of kind DIF_TYPE_STRING.
      */
 693 #define DTRACEACT_ISSTRING(act)                                         \
 694         ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
 695         (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
 696
 697
 698 #if defined (__APPLE__)
 699 /*
      * Avoid compiler warnings when assigning regs[rd] = NULL.
      *
      * If NULL is already defined it is replaced by an integer zero of type
      * uintptr_t.  From this point to the end of the translation unit NULL is
      * therefore an integer expression rather than a pointer-typed one.
      */
 700 #ifdef NULL
 701 #undef NULL
 702 #define NULL (uintptr_t)0
 703 #endif
 704 #endif /* __APPLE__ */
 705
     /*
      * Forward declarations of file-local (static) functions, so that they can
      * be referenced before the point at which they are defined.
      */
 706 static size_t dtrace_strlen(const char *, size_t);
 707 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
 708 static void dtrace_enabling_provide(dtrace_provider_t *);
 709 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
 710 static void dtrace_enabling_matchall(void);
 711 static dtrace_state_t *dtrace_anon_grab(void);
 712 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
 713     dtrace_state_t *, uint64_t, uint64_t);
 714 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
 715 static void dtrace_buffer_drop(dtrace_buffer_t *);
 716 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
 717     dtrace_state_t *, dtrace_mstate_t *);
 718 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
 719     dtrace_optval_t);
 720 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
 721 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
 722
 723 /*
 724  * DTrace Probe Context Functions
 725  *
 726  * These functions are called from probe context.  Because probe context is
 727  * any context in which C may be called, arbitrary locks may be held,
 728  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 729  * As a result, functions called from probe context may only call other DTrace
 730  * support functions -- they may not interact at all with the system at large.
 731  * (Note that the ASSERT macro is made probe-context safe by redefining it in
 732  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 733  * loads are to be performed from probe context, they _must_ be in terms of
 734  * the safe dtrace_load*() variants.
 735  *
 736  * Some functions in this block are not actually called from probe context;
 737  * for these functions, there will be a comment above the function reading
 738  * "Note:  not called from probe context."
 739  */
 740 void
 741 dtrace_panic(const char *format, ...)
 742 {
 743         va_list alist;
 744
 745         va_start(alist, format);
 746         dtrace_vpanic(format, alist);
 747         va_end(alist);
 748 }
 749
 750 int
 751 dtrace_assfail(const char *a, const char *f, int l)
 752 {
 753         dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
 754
 755         /*
 756          * We just need something here that even the most clever compiler
 757          * cannot optimize away.
 758          */
 759         return (a[(uintptr_t)f]);
 760 }
 761
 762 /*
 763  * Atomically increment a specified error counter from probe context.
 764  */
 765 static void
 766 dtrace_error(uint32_t *counter)
 767 {
 768         /*
 769          * Most counters stored to in probe context are per-CPU counters.
 770          * However, there are some error conditions that are sufficiently
 771          * arcane that they don't merit per-CPU storage.  If these counters
 772          * are incremented concurrently on different CPUs, scalability will be
 773          * adversely affected -- but we don't expect them to be white-hot in a
 774          * correctly constructed enabling...
 775          */
 776         uint32_t oval, nval;
 777
 778         do {
 779                 oval = *counter;
 780
 781                 if ((nval = oval + 1) == 0) {
 782                         /*
 783                          * If the counter would wrap, set it to 1 -- assuring
 784                          * that the counter is never zero when we have seen
 785                          * errors.  (The counter must be 32-bits because we
 786                          * aren't guaranteed a 64-bit compare&swap operation.)
 787                          * To save this code both the infamy of being fingered
 788                          * by a priggish news story and the indignity of being
 789                          * the target of a neo-puritan witch trial, we're
 790                          * carefully avoiding any colorful description of the
 791                          * likelihood of this condition -- but suffice it to
 792                          * say that it is only slightly more likely than the
 793                          * overflow of predicate cache IDs, as discussed in
 794                          * dtrace_predicate_create().
 795                          */
 796                         nval = 1;
 797                 }
 798         } while (dtrace_cas32(counter, oval, nval) != oval);
 799 }
 800
 801 /*
 802  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 803  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
      *
      * The four expansions below produce dtrace_load8(), dtrace_load16(),
      * dtrace_load32() and dtrace_load64() respectively.
 804  */
 805 DTRACE_LOADFUNC(8)
 806 DTRACE_LOADFUNC(16)
 807 DTRACE_LOADFUNC(32)
 808 DTRACE_LOADFUNC(64)
 809
 810 static int
 811 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
 812 {
 813         if (dest < mstate->dtms_scratch_base)
 814                 return (0);
 815
 816         if (dest + size < dest)
 817                 return (0);
 818
 819         if (dest + size > mstate->dtms_scratch_ptr)
 820                 return (0);
 821
 822         return (1);
 823 }
 824
 825 static int
 826 dtrace_canstore_statvar(uint64_t addr, size_t sz,
 827     dtrace_statvar_t **svars, int nsvars)
 828 {
 829         int i;
 830
 831         for (i = 0; i < nsvars; i++) {
 832                 dtrace_statvar_t *svar = svars[i];
 833
 834                 if (svar == NULL || svar->dtsv_size == 0)
 835                         continue;
 836
 837                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
 838                         return (1);
 839         }
 840
 841         return (0);
 842 }
 843
 844 /*
 845  * Check to see if the address is within a memory region to which a store may
 846  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 847  * region.  The caller of dtrace_canstore() is responsible for performing any
 848  * alignment checks that are needed before stores are actually executed.
 849  */
 850 static int
 851 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 852     dtrace_vstate_t *vstate)
 853 {
 854         /*
 855          * First, check to see if the address is in scratch space...
 856          */
 857         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
 858             mstate->dtms_scratch_size))
 859                 return (1);
 860
 861         /*
 862          * Now check to see if it's a dynamic variable.  This check will pick
 863          * up both thread-local variables and any global dynamically-allocated
 864          * variables.
 865          */
 866         if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
 867             vstate->dtvs_dynvars.dtds_size)) {
 868                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
 869                 uintptr_t base = (uintptr_t)dstate->dtds_base +
 870                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
 871                 uintptr_t chunkoffs;
 872
 873                 /*
 874                  * Before we assume that we can store here, we need to make
 875                  * sure that it isn't in our metadata -- storing to our
 876                  * dynamic variable metadata would corrupt our state.  For
 877                  * the range to not include any dynamic variable metadata,
 878                  * it must:
 879                  *
 880                  *      (1) Start above the hash table that is at the base of
 881                  *      the dynamic variable space
 882                  *
 883                  *      (2) Have a starting chunk offset that is beyond the
 884                  *      dtrace_dynvar_t that is at the base of every chunk
 885                  *
 886                  *      (3) Not span a chunk boundary
 887                  *
 888                  */
 889                 if (addr < base)
 890                         return (0);
 891
 892                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
 893
 894                 if (chunkoffs < sizeof (dtrace_dynvar_t))
 895                         return (0);
 896
 897                 if (chunkoffs + sz > dstate->dtds_chunksize)
 898                         return (0);
 899
 900                 return (1);
 901         }
 902
 903         /*
 904          * Finally, check the static local and global variables.  These checks
 905          * take the longest, so we perform them last.
 906          */
 907         if (dtrace_canstore_statvar(addr, sz,
 908             vstate->dtvs_locals, vstate->dtvs_nlocals))
 909                 return (1);
 910
 911         if (dtrace_canstore_statvar(addr, sz,
 912             vstate->dtvs_globals, vstate->dtvs_nglobals))
 913                 return (1);
 914
 915         return (0);
 916 }
 917
 918
 919 /*
 920  * Convenience routine to check to see if the address is within a memory
 921  * region in which a load may be issued given the user's privilege level;
 922  * if not, it sets the appropriate error flags and loads 'addr' into the
 923  * illegal value slot.
 924  *
 925  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 926  * appropriate memory access protection.
 927  */
 928 static int
 929 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 930     dtrace_vstate_t *vstate)
 931 {
 932 #if !defined(__APPLE__)  /* Quiet compiler warning - matches dtrace_dif_emulate */
 933         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
 934 #else
 935         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
 936 #endif /* __APPLE */
 937
 938         /*
 939          * If we hold the privilege to read from kernel memory, then
 940          * everything is readable.
 941          */
 942         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 943                 return (1);
 944
 945         /*
 946          * You can obviously read that which you can store.
 947          */
 948         if (dtrace_canstore(addr, sz, mstate, vstate))
 949                 return (1);
 950
 951         /*
 952          * We're allowed to read from our own string table.
 953          */
 954         if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
 955             mstate->dtms_difo->dtdo_strlen))
 956                 return (1);
 957
 958         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 959         *illval = addr;
 960         return (0);
 961 }
 962
 963 /*
 964  * Convenience routine to check to see if a given string is within a memory
 965  * region in which a load may be issued given the user's privilege level;
 966  * this exists so that we don't need to issue unnecessary dtrace_strlen()
 967  * calls in the event that the user has all privileges.
 968  */
 969 static int
 970 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 971     dtrace_vstate_t *vstate)
 972 {
 973         size_t strsz;
 974
 975         /*
 976          * If we hold the privilege to read from kernel memory, then
 977          * everything is readable.
 978          */
 979         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 980                 return (1);
 981
 982         strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
 983         if (dtrace_canload(addr, strsz, mstate, vstate))
 984                 return (1);
 985
 986         return (0);
 987 }
 988
 989 /*
 990  * Convenience routine to check to see if a given variable is within a memory
 991  * region in which a load may be issued given the user's privilege level.
 992  */
 993 static int
 994 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
 995     dtrace_vstate_t *vstate)
 996 {
 997         size_t sz;
 998         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 999
1000         /*
1001          * If we hold the privilege to read from kernel memory, then
1002          * everything is readable.
1003          */
1004         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
1005                 return (1);
1006
1007         if (type->dtdt_kind == DIF_TYPE_STRING)
1008                 sz = dtrace_strlen(src,
1009                     vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
1010         else
1011                 sz = type->dtdt_size;
1012
1013         return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
1014 }
1015
1016 /*
1017  * Compare two strings using safe loads.
1018  */
1019 static int
1020 dtrace_strncmp(char *s1, char *s2, size_t limit)
1021 {
1022         uint8_t c1, c2;
1023         volatile uint16_t *flags;
1024
1025         if (s1 == s2 || limit == 0)
1026                 return (0);
1027
1028         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1029
1030         do {
1031                 if (s1 == NULL) {
1032                         c1 = '\0';
1033                 } else {
1034                         c1 = dtrace_load8((uintptr_t)s1++);
1035                 }
1036
1037                 if (s2 == NULL) {
1038                         c2 = '\0';
1039                 } else {
1040                         c2 = dtrace_load8((uintptr_t)s2++);
1041                 }
1042
1043                 if (c1 != c2)
1044                         return (c1 - c2);
1045         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1046
1047         return (0);
1048 }
1049
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter bounds the scan so that it always completes: at most lim
 * bytes are examined and the value returned never exceeds lim.
 *
 * The counter has the same type as lim.  With a narrower counter, a limit
 * beyond the counter's range could never be reached (the counter would wrap
 * first) and the returned length would be truncated.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
        size_t len;

        for (len = 0; len != lim; len++) {
                if (dtrace_load8((uintptr_t)s++) == '\0')
                        break;
        }

        return (len);
}
1066
1067 /*
1068  * Check if an address falls within a toxic region.
1069  */
1070 static int
1071 dtrace_istoxic(uintptr_t kaddr, size_t size)
1072 {
1073         uintptr_t taddr, tsize;
1074         int i;
1075
1076         for (i = 0; i < dtrace_toxranges; i++) {
1077                 taddr = dtrace_toxrange[i].dtt_base;
1078                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1079
1080                 if (kaddr - taddr < tsize) {
1081                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1082                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1083                         return (1);
1084                 }
1085
1086                 if (taddr - kaddr < size) {
1087                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1088                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1089                         return (1);
1090                 }
1091         }
1092
1093         return (0);
1094 }
1095
/*
 * Copy 'len' bytes from src to dst.  Source bytes are fetched with safe
 * loads because src is unsafe memory named by the DIF program; dst is
 * DTrace-managed memory and is written directly.  Overlapping regions are
 * handled as with standard bcopy.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
        uint8_t *to = dst;
        const uint8_t *from = src;

        if (len == 0)
                return;

        if (to <= from) {
                /* Destination at or below the source: copy ascending. */
                while (len-- != 0)
                        *to++ = dtrace_load8((uintptr_t)from++);
        } else {
                /* Destination above the source: copy descending. */
                from += len;
                to += len;

                while (len-- != 0)
                        *--to = dtrace_load8((uintptr_t)--from);
        }
}
1123
/*
 * Copy bytes from src to dst until either 'len' bytes have been copied or a
 * nul byte has been copied, whichever comes first.  Source bytes are fetched
 * with safe loads because src is unsafe memory named by the DIF program; dst
 * is DTrace-managed memory and is written directly.  Unlike dtrace_bcopy(),
 * overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
        uint8_t *out = dst;
        const uint8_t *in = src;

        for (; len != 0; len--) {
                uint8_t ch = dtrace_load8((uintptr_t)in++);

                *out++ = ch;

                if (ch == '\0')
                        break;
        }
}
1143
1144 /*
1145  * Copy src to dst, deriving the size and type from the specified (BYREF)
1146  * variable type.  The src is assumed to be unsafe memory specified by the DIF
1147  * program.  The dst is assumed to be DTrace variable memory that is of the
1148  * specified type; we assume that we can store to directly.
1149  */
1150 static void
1151 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1152 {
1153         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1154
1155         if (type->dtdt_kind == DIF_TYPE_STRING) {
1156                 dtrace_strcpy(src, dst, type->dtdt_size);
1157         } else {
1158                 dtrace_bcopy(src, dst, type->dtdt_size);
1159 }
1160 }
1161
1162 /*
1163  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1164  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1165  * safe memory that we can access directly because it is managed by DTrace.
1166  */
1167 static int
1168 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1169 {
1170         volatile uint16_t *flags;
1171
1172         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1173
1174         if (s1 == s2)
1175                 return (0);
1176
1177         if (s1 == NULL || s2 == NULL)
1178                 return (1);
1179
1180         if (s1 != s2 && len != 0) {
1181                 const uint8_t *ps1 = s1;
1182                 const uint8_t *ps2 = s2;
1183
1184                 do {
1185                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1186                                 return (1);
1187                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1188         }
1189         return (0);
1190 }
1191
1192 /*
1193  * Zero the specified region using a simple byte-by-byte loop.  Note that this
1194  * is for safe DTrace-managed memory only.
1195  */
1196 static void
1197 dtrace_bzero(void *dst, size_t len)
1198 {
1199         uchar_t *cp;
1200
1201         for (cp = dst; len != 0; len--)
1202                 *cp++ = 0;
1203 }
1204
/*
 * Add two 128-bit values, each held as a pair of 64-bit words with the low
 * word at index 0, and store the result in 'sum'.  All inputs are read
 * before 'sum' is written, so 'sum' may alias either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
        const uint64_t lo1 = addend1[0];
        const uint64_t lo2 = addend2[0];
        const uint64_t lo = lo1 + lo2;
        /* The low word wrapped exactly when it is smaller than an input. */
        const uint64_t carry = (lo < lo1 || lo < lo2) ? 1 : 0;
        const uint64_t hi = addend1[1] + addend2[1] + carry;

        sum[0] = lo;
        sum[1] = hi;
}
1217
/*
 * Shift the 128-bit value in a by b.  If b is positive, shift left.
 * If b is negative, shift right.
 *
 * The value is held as two 64-bit words with the low word in a[0] and the
 * high word in a[1]; the shift is logical (zero filling) and is done in
 * place.  A magnitude of 128 or more clears the value, so no 64-bit word is
 * ever shifted by 64 or more and b is never negated when it is INT_MIN.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
        if (b == 0)
                return;

        if (b <= -128 || b >= 128) {
                a[0] = 0;
                a[1] = 0;
                return;
        }

        if (b < 0) {
                b = -b;
                if (b >= 64) {
                        a[0] = a[1] >> (b - 64);
                        a[1] = 0;
                } else {
                        /*
                         * The low b bits of the high word become the top b
                         * bits of the low word.  Shifting the high word left
                         * by (64 - b) puts them there and discards the rest,
                         * so no mask is needed.
                         */
                        a[0] = (a[0] >> b) | (a[1] << (64 - b));
                        a[1] >>= b;
                }
        } else {
                if (b >= 64) {
                        a[1] = a[0] << (b - 64);
                        a[0] = 0;
                } else {
                        /*
                         * The top b bits of the low word become the low b
                         * bits of the high word.
                         */
                        a[1] = (a[1] << b) | (a[0] >> (64 - b));
                        a[0] <<= b;
                }
        }
}
1254
1255 /*
1256  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1257  * use native multiplication on those, and then re-combine into the
1258  * resulting 128-bit value.
1259  *
1260  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1261  *     hi1 * hi2 << 64 +
1262  *     hi1 * lo2 << 32 +
1263  *     hi2 * lo1 << 32 +
1264  *     lo1 * lo2
1265  */
1266 static void
1267 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1268 {
1269         uint64_t hi1, hi2, lo1, lo2;
1270         uint64_t tmp[2];
1271
1272         hi1 = factor1 >> 32;
1273         hi2 = factor2 >> 32;
1274
1275         lo1 = factor1 & DT_MASK_LO;
1276         lo2 = factor2 & DT_MASK_LO;
1277
1278         product[0] = lo1 * lo2;
1279         product[1] = hi1 * hi2;
1280
1281         tmp[0] = hi1 * lo2;
1282         tmp[1] = 0;
1283         dtrace_shift_128(tmp, 32);
1284         dtrace_add_128(product, tmp, product);
1285
1286         tmp[0] = hi2 * lo1;
1287         tmp[1] = 0;
1288         dtrace_shift_128(tmp, 32);
1289         dtrace_add_128(product, tmp, product);
1290 }
1291
1292 /*
1293  * This privilege check should be used by actions and subroutines to
1294  * verify that the user credentials of the process that enabled the
1295  * invoking ECB match the target credentials
1296  */
1297 static int
1298 dtrace_priv_proc_common_user(dtrace_state_t *state)
1299 {
1300         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1301
1302         /*
1303          * We should always have a non-NULL state cred here, since if cred
1304          * is null (anonymous tracing), we fast-path bypass this routine.
1305          */
1306         ASSERT(s_cr != NULL);
1307
1308 #if !defined(__APPLE__)
1309         if ((cr = CRED()) != NULL &&
1310 #else
1311         if ((cr = dtrace_CRED()) != NULL &&
1312 #endif /* __APPLE__ */
1313             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1314             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1315             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1316             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1317             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1318             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1319                 return (1);
1320
1321         return (0);
1322 }
1323
1324 /*
1325  * This privilege check should be used by actions and subroutines to
1326  * verify that the zone of the process that enabled the invoking ECB
1327  * matches the target credentials
1328  */
1329 static int
1330 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1331 {
1332         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1333 #pragma unused(cr, s_cr) /* __APPLE__ */
1334
1335         /*
1336          * We should always have a non-NULL state cred here, since if cred
1337          * is null (anonymous tracing), we fast-path bypass this routine.
1338          */
1339         ASSERT(s_cr != NULL);
1340
1341 #if !defined(__APPLE__)
1342         if ((cr = CRED()) != NULL &&
1343             s_cr->cr_zone == cr->cr_zone)
1344                 return (1);
1345
1346         return (0);
1347 #else
1348 #pragma unused(state)
1349
1350         return 1; /* Darwin doesn't do zones. */
1351 #endif /* __APPLE__ */
1352 }
1353
1354 /*
1355  * This privilege check should be used by actions and subroutines to
1356  * verify that the process has not setuid or changed credentials.
1357  */
1358 #if !defined(__APPLE__)
1359 static int
1360 dtrace_priv_proc_common_nocd()
1361 {
1362         proc_t *proc;
1363
1364         if ((proc = ttoproc(curthread)) != NULL &&
1365             !(proc->p_flag & SNOCD))
1366                 return (1);
1367
1368         return (0);
1369 }
1370 #else
1371 static int
1372 dtrace_priv_proc_common_nocd(void)
1373 {
1374         return 1; /* Darwin omits "No Core Dump" flag. */
1375 }
1376 #endif /* __APPLE__ */
1377
1378 static int
1379 dtrace_priv_proc_destructive(dtrace_state_t *state)
1380 {
1381         int action = state->dts_cred.dcr_action;
1382
1383 #if defined(__APPLE__)
1384         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1385                 goto bad;
1386 #endif /* __APPLE__ */
1387
1388         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1389             dtrace_priv_proc_common_zone(state) == 0)
1390                 goto bad;
1391
1392         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1393             dtrace_priv_proc_common_user(state) == 0)
1394                 goto bad;
1395
1396         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1397             dtrace_priv_proc_common_nocd() == 0)
1398                 goto bad;
1399
1400         return (1);
1401
1402 bad:
1403         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1404
1405         return (0);
1406 }
1407
1408 static int
1409 dtrace_priv_proc_control(dtrace_state_t *state)
1410 {
1411 #if defined(__APPLE__)
1412         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1413                 goto bad;
1414 #endif /* __APPLE__ */
1415
1416         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1417                 return (1);
1418
1419         if (dtrace_priv_proc_common_zone(state) &&
1420             dtrace_priv_proc_common_user(state) &&
1421             dtrace_priv_proc_common_nocd())
1422                 return (1);
1423
1424 #if defined(__APPLE__)
1425 bad:
1426 #endif /* __APPLE__ */
1427         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1428
1429         return (0);
1430 }
1431
1432 static int
1433 dtrace_priv_proc(dtrace_state_t *state)
1434 {
1435 #if defined(__APPLE__)
1436         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1437                 goto bad;
1438 #endif /* __APPLE__ */
1439
1440         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1441                 return (1);
1442
1443 #if defined(__APPLE__)
1444 bad:
1445 #endif /* __APPLE__ */
1446         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1447
1448         return (0);
1449 }
1450
1451 #if defined(__APPLE__)
1452 /* dtrace_priv_proc() omitting the P_LNOATTACH check. For PID and EXECNAME accesses. */
1453 static int
1454 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1455 {
1456
1457         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1458                 return (1);
1459
1460         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1461
1462         return (0);
1463 }
1464 #endif /* __APPLE__ */
1465
1466 static int
1467 dtrace_priv_kernel(dtrace_state_t *state)
1468 {
1469         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1470                 return (1);
1471
1472         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1473
1474         return (0);
1475 }
1476
1477 static int
1478 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1479 {
1480         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1481                 return (1);
1482
1483         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1484
1485         return (0);
1486 }
1487
1488 /*
1489  * Note:  not called from probe context.  This function is called
1490  * asynchronously (and at a regular interval) from outside of probe context to
1491  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1492  * cleaning is explained in detail in <sys/dtrace_impl.h>.
      *
      * The work is done in two passes over the NCPU per-CPU slots of dstate:
      *
      *   1. For every CPU whose dirty list is non-NULL and whose clean list is
      *      NULL, the dirty list is atomically moved to that CPU's rinsing list.
      *      If no CPU qualified, the function returns without touching
      *      dtds_state.
      *
      *   2. After a dtrace_sync(), every non-NULL rinsing list becomes that
      *      CPU's clean list and the rinsing pointer is reset to NULL.
      *
      * After a second dtrace_sync(), dtds_state is set to DTRACE_DSTATE_CLEAN.
1493  */
1494 #if defined(__APPLE__) /* Quiet compiler warning. */
1495 static
1496 #endif /* __APPLE__ */
1497 void
1498 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1499 {
1500         dtrace_dynvar_t *dirty;
1501         dtrace_dstate_percpu_t *dcpu;
1502         int i, work = 0;        /* work: set once any dirty list is moved */
1503
             /*
              * Pass 1:  move each eligible dirty list to the rinsing list.
              */
1504         for (i = 0; i < (int)NCPU; i++) {
1505                 dcpu = &dstate->dtds_percpu[i];
1506
1507                 ASSERT(dcpu->dtdsc_rinsing == NULL);
1508
1509                 /*
1510                  * If the dirty list is NULL, there is no dirty work to do.
1511                  */
1512                 if (dcpu->dtdsc_dirty == NULL)
1513                         continue;
1514
1515                 /*
1516                  * If the clean list is non-NULL, then we're not going to do
1517                  * any work for this CPU -- it means that there has not been
1518                  * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1519                  * since the last time we cleaned house.
1520                  */
1521                 if (dcpu->dtdsc_clean != NULL)
1522                         continue;
1523
1524                 work = 1;
1525
1526                 /*
1527                  * Atomically move the dirty list aside.
                      * The loop re-reads the dirty pointer and retries whenever
                      * the compare-and-swap observes that it changed underneath us.
1528                  */
1529                 do {
1530                         dirty = dcpu->dtdsc_dirty;
1531
1532                         /*
1533                          * Before we zap the dirty list, set the rinsing list.
1534                          * (This allows for a potential assertion in
1535                          * dtrace_dynvar():  if a free dynamic variable appears
1536                          * on a hash chain, either the dirty list or the
1537                          * rinsing list for some CPU must be non-NULL.)
1538                          */
1539                         dcpu->dtdsc_rinsing = dirty;
1540                         dtrace_membar_producer();
1541                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1542                     dirty, NULL) != dirty);
1543         }
1544
1545         if (!work) {
1546                 /*
1547                  * We have no work to do; we can simply return.
1548                  */
1549                 return;
1550         }
1551
1552         dtrace_sync();
1553
             /*
              * Pass 2:  promote every rinsing list to its CPU's clean list.
              */
1554         for (i = 0; i < (int)NCPU; i++) {
1555                 dcpu = &dstate->dtds_percpu[i];
1556
1557                 if (dcpu->dtdsc_rinsing == NULL)
1558                         continue;
1559
1560                 /*
1561                  * We are now guaranteed that no hash chain contains a pointer
1562                  * into this dirty list; we can make it clean.
1563                  */
1564                 ASSERT(dcpu->dtdsc_clean == NULL);
1565                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1566                 dcpu->dtdsc_rinsing = NULL;
1567         }
1568
1569         /*
1570          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1571          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1572          * This prevents a race whereby a CPU incorrectly decides that
1573          * the state should be something other than DTRACE_DSTATE_CLEAN
1574          * after dtrace_dynvar_clean() has completed.
1575          */
1576         dtrace_sync();
1577
1578         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1579 }
1580
1581 /*
1582  * Depending on the value of the op parameter, this function looks-up,
1583  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1584  * allocation is requested, this function will return a pointer to a
1585  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1586  * variable can be allocated.  If NULL is returned, the appropriate counter
1587  * will be incremented.
      *
      * Parameters:
      *   dstate  dynamic variable state; supplies the hash table (dtds_hash,
      *           dtds_hashsize), the chunk size (dtds_chunksize), the per-CPU
      *           free/clean/dirty/rinsing lists and dtds_state.
      *   nkeys   number of entries in key[]; must be non-zero.
      *   key     the tuple.  An entry with dttk_size == 0 is compared and
      *           hashed by value (dttk_value itself); otherwise dttk_value is
      *           treated as the address of dttk_size bytes of key data.
      *   dsize   size of the referred-to data; used only in the check that the
      *           variable fits in one chunk.
      *   op      DTRACE_DYNVAR_ALLOC, DTRACE_DYNVAR_NOALLOC or
      *           DTRACE_DYNVAR_DEALLOC.
      *   mstate, vstate
      *           handed to dtrace_canload() when by-reference keys are hashed.
      *
      * Return value:
      *   - For ALLOC and NOALLOC, a matching variable already on the hash chain
      *     is returned as-is.
      *   - For NOALLOC with no match, NULL.
      *   - For DEALLOC, always NULL (whether or not a match was unlinked).
      *   - For ALLOC with no match, the newly inserted variable, or NULL when
      *     the tuple does not fit in a chunk or no free chunk could be found.
      *   - NULL for any op when CPU_DTRACE_FAULT is set after hashing the key.
1588  */
1589 #if defined(__APPLE__) /* Quiet compiler warning. */
1590 static
1591 #endif /* __APPLE__ */
1592 dtrace_dynvar_t *
1593 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1594     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1595     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1596 {
1597         uint64_t hashval = DTRACE_DYNHASH_VALID;
1598         dtrace_dynhash_t *hash = dstate->dtds_hash;
1599         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1600         processorid_t me = CPU->cpu_id, cpu = me;
1601         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1602         size_t bucket, ksize;
1603         size_t chunksize = dstate->dtds_chunksize;
1604         uintptr_t kdata, lock, nstate;
1605         uint_t i;
1606
1607         ASSERT(nkeys != 0);
1608
1609         /*
1610          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1611          * algorithm.  For the by-value portions, we perform the algorithm in
1612          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1613          * bit, and seems to have only a minute effect on distribution.  For
1614          * the by-reference data, we perform "One-at-a-time" iterating (safely)
1615          * over each referenced byte.  It's painful to do this, but it's much
1616          * better than pathological hash distribution.  The efficacy of the
1617          * hashing algorithm (and a comparison with other algorithms) may be
1618          * found by running the ::dtrace_dynstat MDB dcmd.
1619          */
1620         for (i = 0; i < nkeys; i++) {
1621                 if (key[i].dttk_size == 0) {
                             /*
                              * By-value key:  mix in the four 16-bit pieces of
                              * the value, most significant first.
                              */
1622                         uint64_t val = key[i].dttk_value;
1623
1624                         hashval += (val >> 48) & 0xffff;
1625                         hashval += (hashval << 10);
1626                         hashval ^= (hashval >> 6);
1627
1628                         hashval += (val >> 32) & 0xffff;
1629                         hashval += (hashval << 10);
1630                         hashval ^= (hashval >> 6);
1631
1632                         hashval += (val >> 16) & 0xffff;
1633                         hashval += (hashval << 10);
1634                         hashval ^= (hashval >> 6);
1635
1636                         hashval += val & 0xffff;
1637                         hashval += (hashval << 10);
1638                         hashval ^= (hashval >> 6);
1639                 } else {
1640                         /*
1641                          * This is incredibly painful, but it beats the hell
1642                          * out of the alternative.
1643                          */
1644                         uint64_t j, size = key[i].dttk_size;
1645                         uintptr_t base = (uintptr_t)key[i].dttk_value;
1646
                             /*
                              * Stop hashing if the key data may not be loaded.
                              * NOTE(review): presumably dtrace_canload() flags
                              * a fault on refusal so that the
                              * CPU_DTRACE_FAULT check below returns NULL --
                              * confirm against dtrace_canload().
                              */
1647                         if (!dtrace_canload(base, size, mstate, vstate))
1648                                 break;
1649
1650                         for (j = 0; j < size; j++) {
1651                                 hashval += dtrace_load8(base + j);
1652                                 hashval += (hashval << 10);
1653                                 hashval ^= (hashval >> 6);
1654                         }
1655                 }
1656         }
1657
             /*
              * A fault flag is set on this CPU; give up without touching the
              * hash table.
              */
1658         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1659                 return (NULL);
1660
             /*
              * Final mixing steps of the hash.
              */
1661         hashval += (hashval << 3);
1662         hashval ^= (hashval >> 11);
1663         hashval += (hashval << 15);
1664
1665         /*
1666          * There is a remote chance (ideally, 1 in 2^31) that our hashval
1667          * comes out to be one of our two sentinel hash values.  If this
1668          * actually happens, we set the hashval to be a value known to be a
1669          * non-sentinel value.
1670          */
1671         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1672                 hashval = DTRACE_DYNHASH_VALID;
1673
1674         /*
1675          * Yes, it's painful to do a divide here.  If the cycle count becomes
1676          * important here, tricks can be pulled to reduce it.  (However, it's
1677          * critical that hash collisions be kept to an absolute minimum;
1678          * they're much more painful than a divide.)  It's better to have a
1679          * solution that generates few collisions and still keeps things
1680          * relatively simple.
1681          */
1682         bucket = hashval % dstate->dtds_hashsize;
1683
1684         if (op == DTRACE_DYNVAR_DEALLOC) {
                     /*
                      * Deallocation takes the per-bucket lock:  spin while the
                      * low bit of the lock word is set, then compare-and-swap
                      * the word from its even value to that value plus one.
                      * The matching increment on the way out makes it even
                      * again.
                      */
1685                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1686
1687                 for (;;) {
1688                         while ((lock = *lockp) & 1)
1689                                 continue;
1690
1691 #if !defined(__APPLE__)  /* Quiet compiler warning */
1692                         if (dtrace_casptr((void *)lockp,
1693                             (void *)lock, (void *)(lock + 1)) == (void *)lock)
1694                                 break;
1695 #else
1696                         if (dtrace_casptr((void *)(uintptr_t)lockp,
1697                             (void *)lock, (void *)(lock + 1)) == (void *)lock)
1698                                 break;
1699 #endif /* __APPLE__ */
1700                 }
1701
1702                 dtrace_membar_producer();
1703         }
1704
1705 top:
             /*
              * (Re)start the chain walk.  The lock word is sampled before the
              * chain head is read; the NOALLOC path below compares it again and
              * restarts from here if it changed during the walk.
              */
1706         prev = NULL;
1707         lock = hash[bucket].dtdh_lock;
1708
1709         dtrace_membar_consumer();
1710
1711         start = hash[bucket].dtdh_chain;
1712         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1713             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1714             op != DTRACE_DYNVAR_DEALLOC));
1715
1716         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1717                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1718                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1719
1720                 if (dvar->dtdv_hashval != hashval) {
1721                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1722                                 /*
1723                                  * We've reached the sink, and therefore the
1724                                  * end of the hash chain; we can kick out of
1725                                  * the loop knowing that we have seen a valid
1726                                  * snapshot of state.
1727                                  */
1728                                 ASSERT(dvar->dtdv_next == NULL);
1729                                 ASSERT(dvar == &dtrace_dynhash_sink);
1730                                 break;
1731                         }
1732
1733                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1734                                 /*
1735                                  * We've gone off the rails:  somewhere along
1736                                  * the line, one of the members of this hash
1737                                  * chain was deleted.  Note that we could also
1738                                  * detect this by simply letting this loop run
1739                                  * to completion, as we would eventually hit
1740                                  * the end of the dirty list.  However, we
1741                                  * want to avoid running the length of the
1742                                  * dirty list unnecessarily (it might be quite
1743                                  * long), so we catch this as early as
1744                                  * possible by detecting the hash marker.  In
1745                                  * this case, we simply set dvar to NULL and
1746                                  * break; the conditional after the loop will
1747                                  * send us back to top.
1748                                  */
1749                                 dvar = NULL;
1750                                 break;
1751                         }
1752
1753                         goto next;
1754                 }
1755
                     /*
                      * Same hash value:  now compare the tuple itself, first the
                      * key count, then each key by size and by content.
                      */
1756                 if (dtuple->dtt_nkeys != nkeys)
1757                         goto next;
1758
1759                 for (i = 0; i < nkeys; i++, dkey++) {
1760                         if (dkey->dttk_size != key[i].dttk_size)
1761                                 goto next; /* size or type mismatch */
1762
1763                         if (dkey->dttk_size != 0) {
1764                                 if (dtrace_bcmp(
1765                                     (void *)(uintptr_t)key[i].dttk_value,
1766                                     (void *)(uintptr_t)dkey->dttk_value,
1767                                     dkey->dttk_size))
1768                                         goto next;
1769                         } else {
1770                                 if (dkey->dttk_value != key[i].dttk_value)
1771                                         goto next;
1772                         }
1773                 }
1774
                     /*
                      * The tuple matches.  Lookups and allocations simply return
                      * the existing variable; only deallocation continues.
                      */
1775                 if (op != DTRACE_DYNVAR_DEALLOC)
1776                         return (dvar);
1777
1778                 ASSERT(dvar->dtdv_next == NULL ||
1779                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1780
                     /*
                      * Unlink dvar from the chain:  a plain store when it has a
                      * predecessor, a compare-and-swap on the bucket head when it
                      * is the first element.
                      */
1781                 if (prev != NULL) {
1782                         ASSERT(hash[bucket].dtdh_chain != dvar);
1783                         ASSERT(start != dvar);
1784                         ASSERT(prev->dtdv_next == dvar);
1785                         prev->dtdv_next = dvar->dtdv_next;
1786                 } else {
1787                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
1788                             start, dvar->dtdv_next) != start) {
1789                                 /*
1790                                  * We have failed to atomically swing the
1791                                  * hash table head pointer, presumably because
1792                                  * of a conflicting allocation on another CPU.
1793                                  * We need to reread the hash chain and try
1794                                  * again.
1795                                  */
1796                                 goto top;
1797                         }
1798                 }
1799
1800                 dtrace_membar_producer();
1801
1802                 /*
1803                  * Now set the hash value to indicate that it's free.
1804                  */
1805                 ASSERT(hash[bucket].dtdh_chain != dvar);
1806                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1807
1808                 dtrace_membar_producer();
1809
1810                 /*
1811                  * Set the next pointer to point at the dirty list, and
1812                  * atomically swing the dirty pointer to the newly freed dvar.
1813                  */
1814                 do {
1815                         next = dcpu->dtdsc_dirty;
1816                         dvar->dtdv_next = next;
1817                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1818
1819                 /*
1820                  * Finally, unlock this hash bucket.
1821                  */
1822                 ASSERT(hash[bucket].dtdh_lock == lock);
1823                 ASSERT(lock & 1);
1824                 hash[bucket].dtdh_lock++;
1825
1826                 return (NULL);
1827 next:
                     /*
                      * No match on this element; remember it as the predecessor
                      * in case the next element has to be unlinked.
                      */
1828                 prev = dvar;
1829                 continue;
1830         }
1831
1832         if (dvar == NULL) {
1833                 /*
1834                  * If dvar is NULL, it is because we went off the rails:
1835                  * one of the elements that we traversed in the hash chain
1836                  * was deleted while we were traversing it.  In this case,
1837                  * we assert that we aren't doing a dealloc (deallocs lock
1838                  * the hash bucket to prevent themselves from racing with
1839                  * one another), and retry the hash chain traversal.
1840                  */
1841                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1842                 goto top;
1843         }
1844
1845         if (op != DTRACE_DYNVAR_ALLOC) {
1846                 /*
1847                  * If we are not to allocate a new variable, we want to
1848                  * return NULL now.  Before we return, check that the value
1849                  * of the lock word hasn't changed.  If it has, we may have
1850                  * seen an inconsistent snapshot.
1851                  */
1852                 if (op == DTRACE_DYNVAR_NOALLOC) {
1853                         if (hash[bucket].dtdh_lock != lock)
1854                                 goto top;
1855                 } else {
                             /*
                              * Deallocation of a variable that was not found:
                              * just release the bucket lock taken above.
                              */
1856                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1857                         ASSERT(hash[bucket].dtdh_lock == lock);
1858                         ASSERT(lock & 1);
1859                         hash[bucket].dtdh_lock++;
1860                 }
1861
1862                 return (NULL);
1863         }
1864
1865         /*
1866          * We need to allocate a new dynamic variable.  The size we need is the
1867          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1868          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1869          * the size of any referred-to data (dsize).  We then round the final
1870          * size up to the chunksize for allocation.
1871          */
1872         for (ksize = 0, i = 0; i < nkeys; i++)
1873                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1874
1875         /*
1876          * This should be pretty much impossible, but could happen if, say,
1877          * strange DIF specified the tuple.  Ideally, this should be an
1878          * assertion and not an error condition -- but that requires that the
1879          * chunksize calculation in dtrace_difo_chunksize() be absolutely
1880          * bullet-proof.  (That is, it must not be able to be fooled by
1881          * malicious DIF.)  Given the lack of backwards branches in DIF,
1882          * solving this would presumably not amount to solving the Halting
1883          * Problem -- but it still seems awfully hard.
1884          */
1885         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1886             ksize + dsize > chunksize) {
1887                 dcpu->dtdsc_drops++;
1888                 return (NULL);
1889         }
1890
             /*
              * nstate accumulates the state to publish in dtds_state if a full
              * lap over the CPUs finds no free and no clean chunk anywhere.
              */
1891         nstate = DTRACE_DSTATE_EMPTY;
1892
1893         do {
1894 retry:
1895                 free = dcpu->dtdsc_free;
1896
1897                 if (free == NULL) {
1898                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
1899                         void *rval;
1900
1901                         if (clean == NULL) {
1902                                 /*
1903                                  * We're out of dynamic variable space on
1904                                  * this CPU.  Unless we have tried all CPUs,
1905                                  * we'll try to allocate from a different
1906                                  * CPU.
1907                                  */
1908                                 switch (dstate->dtds_state) {
1909                                 case DTRACE_DSTATE_CLEAN: {
1910                                         void *sp = &dstate->dtds_state;
1911
                                             /*
                                              * Step to the next CPU slot,
                                              * wrapping around at NCPU.
                                              */
1912                                         if (++cpu >= (int)NCPU)
1913                                                 cpu = 0;
1914
                                             /*
                                              * Note what the slot we are
                                              * leaving still holds:  rinsing
                                              * overrides dirty, and dirty only
                                              * replaces the initial empty
                                              * value.
                                              */
1915                                         if (dcpu->dtdsc_dirty != NULL &&
1916                                             nstate == DTRACE_DSTATE_EMPTY)
1917                                                 nstate = DTRACE_DSTATE_DIRTY;
1918
1919                                         if (dcpu->dtdsc_rinsing != NULL)
1920                                                 nstate = DTRACE_DSTATE_RINSING;
1921
1922                                         dcpu = &dstate->dtds_percpu[cpu];
1923
1924                                         if (cpu != me)
1925                                                 goto retry;
1926
                                             /*
                                              * We are back at the CPU we
                                              * started on:  try to move
                                              * dtds_state from CLEAN to nstate.
                                              */
1927                                         (void) dtrace_cas32(sp,
1928                                             DTRACE_DSTATE_CLEAN, nstate);
1929
1930                                         /*
1931                                          * To increment the correct bean
1932                                          * counter, take another lap.
1933                                          */
1934                                         goto retry;
1935                                 }
1936
1937                                 case DTRACE_DSTATE_DIRTY:
1938                                         dcpu->dtdsc_dirty_drops++;
1939                                         break;
1940
1941                                 case DTRACE_DSTATE_RINSING:
1942                                         dcpu->dtdsc_rinsing_drops++;
1943                                         break;
1944
1945                                 case DTRACE_DSTATE_EMPTY:
1946                                         dcpu->dtdsc_drops++;
1947                                         break;
1948                                 }
1949
1950                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
1951                                 return (NULL);
1952                         }
1953
1954                         /*
1955                          * The clean list appears to be non-empty.  We want to
1956                          * move the clean list to the free list; we start by
1957                          * moving the clean pointer aside.
1958                          */
1959                         if (dtrace_casptr(&dcpu->dtdsc_clean,
1960                             clean, NULL) != clean) {
1961                                 /*
1962                                  * We are in one of two situations:
1963                                  *
1964                                  *  (a) The clean list was switched to the
1965                                  *      free list by another CPU.
1966                                  *
1967                                  *  (b) The clean list was added to by the
1968                                  *      cleansing cyclic.
1969                                  *
1970                                  * In either of these situations, we can
1971                                  * just reattempt the free list allocation.
1972                                  */
1973                                 goto retry;
1974                         }
1975
1976                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
1977
1978                         /*
1979                          * Now we'll move the clean list to the free list.
1980                          * It's impossible for this to fail:  the only way
1981                          * the free list can be updated is through this
1982                          * code path, and only one CPU can own the clean list.
1983                          * Thus, it would only be possible for this to fail if
1984                          * this code were racing with dtrace_dynvar_clean().
1985                          * (That is, if dtrace_dynvar_clean() updated the clean
1986                          * list, and we ended up racing to update the free
1987                          * list.)  This race is prevented by the dtrace_sync()
1988                          * in dtrace_dynvar_clean() -- which flushes the
1989                          * owners of the clean lists out before resetting
1990                          * the clean lists.
1991                          */
1992                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
1993                         ASSERT(rval == NULL);
1994                         goto retry;
1995                 }
1996
                     /*
                      * Pop the head of the free list; the compare-and-swap in the
                      * loop condition retries if the head changed meanwhile.
                      */
1997                 dvar = free;
1998                 new_free = dvar->dtdv_next;
1999         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2000
2001         /*
2002          * We have now allocated a new chunk.  We copy the tuple keys into the
2003          * tuple array and copy any referenced key data into the data space
2004          * following the tuple array.  As we do this, we relocate dttk_value
2005          * in the final tuple to point to the key data address in the chunk.
2006          */
2007         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2008         dvar->dtdv_data = (void *)(kdata + ksize);
2009         dvar->dtdv_tuple.dtt_nkeys = nkeys;
2010
2011         for (i = 0; i < nkeys; i++) {
2012                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2013                 size_t kesize = key[i].dttk_size;
2014
2015                 if (kesize != 0) {
2016                         dtrace_bcopy(
2017                             (const void *)(uintptr_t)key[i].dttk_value,
2018                             (void *)kdata, kesize);
2019                         dkey->dttk_value = kdata;
2020                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2021                 } else {
2022                         dkey->dttk_value = key[i].dttk_value;
2023                 }
2024
2025                 dkey->dttk_size = kesize;
2026         }
2027
             /*
              * Publish the new variable:  give it its hash value, point it at
              * the chain head we walked from, and try to install it as the new
              * head of the bucket.
              */
2028         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2029         dvar->dtdv_hashval = hashval;
2030         dvar->dtdv_next = start;
2031
2032         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2033                 return (dvar);
2034
2035         /*
2036          * The cas has failed.  Either another CPU is adding an element to
2037          * this hash chain, or another CPU is deleting an element from this
2038          * hash chain.  The simplest way to deal with both of these cases
2039          * (though not necessarily the most efficient) is to free our
2040          * allocated block and tail-call ourselves.  Note that the free is
2041          * to the dirty list and _not_ to the free list.  This is to prevent
2042          * races with allocators, above.
2043          */
2044         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2045
2046         dtrace_membar_producer();
2047
             /*
              * Push the chunk onto the dirty list of the CPU slot it was
              * allocated from (dcpu may no longer be this CPU's slot).
              */
2048         do {
2049                 free = dcpu->dtdsc_dirty;
2050                 dvar->dtdv_next = free;
2051         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2052
2053         return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2054 }
2055
/*
 * min() aggregating action:  keep in *oval the smaller of the stored value
 * and nval.  Both are compared as signed 64-bit quantities.  The arg
 * parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const int64_t candidate = (int64_t)nval;
        const int64_t current = (int64_t)*oval;

        if (current > candidate)
                *oval = nval;
}
2064
/*
 * max() aggregating action:  keep in *oval the larger of the stored value
 * and nval.  Both are compared as signed 64-bit quantities.  The arg
 * parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const int64_t candidate = (int64_t)nval;
        const int64_t current = (int64_t)*oval;

        if (current < candidate)
                *oval = nval;
}
2073
2074 static void
2075 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2076 {
2077         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2078         int64_t val = (int64_t)nval;
2079
2080         if (val < 0) {
2081                 for (i = 0; i < zero; i++) {
2082                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2083                                 quanta[i] += incr;
2084                                 return;
2085                         }
2086                 }
2087         } else {
2088                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2089                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2090                                 quanta[i - 1] += incr;
2091                                 return;
2092                         }
2093                 }
2094
2095                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2096                 return;
2097         }
2098
2099         ASSERT(0);
2100 }
2101
/*
 * lquantize() aggregating action.
 *
 * lquanta[0] holds the encoded parameters (base, step and number of levels,
 * decoded with the DTRACE_LQUANTIZE_* macros).  The buckets follow it:  one
 * underflow bucket, `levels' linear buckets and one overflow bucket.  incr
 * is added to the bucket that nval selects.
 *
 * nval is narrowed to a signed 32-bit value before it is bucketed.  The
 * distance from base is computed in 64-bit arithmetic:  with a negative base
 * and a large value, the 32-bit subtraction can overflow (undefined
 * behaviour), and a wrapped, negative level would pass the `level < levels'
 * test and index memory in front of the bucket array.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int32_t val = (int32_t)nval;
        int64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * val >= base here, so the widened difference is non-negative and
         * at most 2^32 - 1; it cannot overflow an int64_t.
         */
        level = ((int64_t)val - (int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
2134
/*
 * avg() aggregating action:  data[0] counts the samples and data[1]
 * accumulates their sum.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        uint64_t *count = &data[0];
        uint64_t *total = &data[1];

        *count += 1;
        *total += nval;
}
2143
/*
 * stddev() aggregating action.  data[0] counts the samples, data[1]
 * accumulates their sum, and the two words starting at data[2] accumulate
 * the sum of squares as a 128-bit quantity.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        uint64_t magnitude = nval;
        uint64_t tmp[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic.
         *
         * The square only needs the absolute value of nval.  It is taken
         * with unsigned negation because negating the most negative signed
         * 64-bit value as a signed quantity is undefined behaviour, whereas
         * the unsigned form is well defined and yields 2^63 for it.
         */
        if ((int64_t)nval < 0)
                magnitude = 0 - nval;

        dtrace_multiply_128(magnitude, magnitude, tmp);
        dtrace_add_128(data + 2, tmp, data + 2);
}
2169
/*
 * count() aggregating action:  increment the stored value by one.  Neither
 * nval nor arg is used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
        ++(*oval);
}
2177
/*
 * sum() aggregating action:  add nval to the stored value (unsigned 64-bit
 * addition).  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const uint64_t total = *oval + nval;

        *oval = total;
}
2185
2186 /*
2187  * Aggregate given the tuple in the principal data buffer, and the aggregating
2188  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2189  * buffer is specified as the buf parameter.  This routine does not return
2190  * failure; if there is no space in the aggregation buffer, the data will be
2191  * dropped, and a corresponding counter incremented.
2192  */
2193 static void
2194 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2195     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2196 {
2197 #pragma unused(arg)
2198         dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2199         uint32_t i, ndx, size, fsize;
2200         uint32_t align = sizeof (uint64_t) - 1;
2201         dtrace_aggbuffer_t *agb;
2202         dtrace_aggkey_t *key;
2203         uint32_t hashval = 0, limit, isstr;
2204         caddr_t tomax, data, kdata;
2205         dtrace_actkind_t action;
2206         dtrace_action_t *act;
2207         uintptr_t offs;
2208
2209         if (buf == NULL)
2210                 return;
2211
2212         if (!agg->dtag_hasarg) {
2213                 /*
2214                  * Currently, only quantize() and lquantize() take additional
2215                  * arguments, and they have the same semantics:  an increment
2216                  * value that defaults to 1 when not present.  If additional
2217                  * aggregating actions take arguments, the setting of the
2218                  * default argument value will presumably have to become more
2219                  * sophisticated...
2220                  */
2221                 arg = 1;
2222         }
2223
2224         action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2225         size = rec->dtrd_offset - agg->dtag_base;
2226         fsize = size + rec->dtrd_size;
2227
2228         ASSERT(dbuf->dtb_tomax != NULL);
2229         data = dbuf->dtb_tomax + offset + agg->dtag_base;
2230
2231         if ((tomax = buf->dtb_tomax) == NULL) {
2232                 dtrace_buffer_drop(buf);
2233                 return;
2234         }
2235
2236         /*
2237          * The metastructure is always at the bottom of the buffer.
2238          */
2239         agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2240             sizeof (dtrace_aggbuffer_t));
2241
2242         if (buf->dtb_offset == 0) {
2243                 /*
2244                  * We just kludge up approximately 1/8th of the size to be
2245                  * buckets.  If this guess ends up being routinely
2246                  * off-the-mark, we may need to dynamically readjust this
2247                  * based on past performance.
2248                  */
2249                 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2250
2251                 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2252                     (uintptr_t)tomax || hashsize == 0) {
2253                         /*
2254                          * We've been given a ludicrously small buffer;
2255                          * increment our drop count and leave.
2256                          */
2257                         dtrace_buffer_drop(buf);
2258                         return;
2259                 }
2260
2261                 /*
2262                  * And now, a pathetic attempt to try to get a an odd (or
2263                  * perchance, a prime) hash size for better hash distribution.
2264                  */
2265                 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2266                         hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2267
2268                 agb->dtagb_hashsize = hashsize;
2269                 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2270                     agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2271                 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2272
2273                 for (i = 0; i < agb->dtagb_hashsize; i++)
2274                         agb->dtagb_hash[i] = NULL;
2275         }
2276
2277         ASSERT(agg->dtag_first != NULL);
2278         ASSERT(agg->dtag_first->dta_intuple);
2279
2280         /*
2281          * Calculate the hash value based on the key.  Note that we _don't_
2282          * include the aggid in the hashing (but we will store it as part of
2283          * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2284          * algorithm: a simple, quick algorithm that has no known funnels, and
2285          * gets good distribution in practice.  The efficacy of the hashing
2286          * algorithm (and a comparison with other algorithms) may be found by
2287          * running the ::dtrace_aggstat MDB dcmd.
2288          */
2289         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2290                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2291                 limit = i + act->dta_rec.dtrd_size;
2292                 ASSERT(limit <= size);
2293                 isstr = DTRACEACT_ISSTRING(act);
2294
2295                 for (; i < limit; i++) {
2296                         hashval += data[i];
2297                         hashval += (hashval << 10);
2298                         hashval ^= (hashval >> 6);
2299
2300                         if (isstr && data[i] == '\0')
2301                                 break;
2302                 }
2303         }
2304
2305         hashval += (hashval << 3);
2306         hashval ^= (hashval >> 11);
2307         hashval += (hashval << 15);
2308
2309         /*
2310          * Yes, the divide here is expensive -- but it's generally the least
2311          * of the performance issues given the amount of data that we iterate
2312          * over to compute hash values, compare data, etc.
2313          */
2314         ndx = hashval % agb->dtagb_hashsize;
2315
2316         for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2317                 ASSERT((caddr_t)key >= tomax);
2318                 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2319
2320                 if (hashval != key->dtak_hashval || key->dtak_size != size)
2321                         continue;
2322
2323                 kdata = key->dtak_data;
2324                 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2325
2326                 for (act = agg->dtag_first; act->dta_intuple;
2327                     act = act->dta_next) {
2328                         i = act->dta_rec.dtrd_offset - agg->dtag_base;
2329                         limit = i + act->dta_rec.dtrd_size;
2330                         ASSERT(limit <= size);
2331                         isstr = DTRACEACT_ISSTRING(act);
2332
2333                         for (; i < limit; i++) {
2334                                 if (kdata[i] != data[i])
2335                                         goto next;
2336
2337                                 if (isstr && data[i] == '\0')
2338                                         break;
2339                         }
2340                 }
2341
2342                 if (action != key->dtak_action) {
2343                         /*
2344                          * We are aggregating on the same value in the same
2345                          * aggregation with two different aggregating actions.
2346                          * (This should have been picked up in the compiler,
2347                          * so we may be dealing with errant or devious DIF.)
2348                          * This is an error condition; we indicate as much,
2349                          * and return.
2350                          */
2351                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2352                         return;
2353                 }
2354
2355                 /*
2356                  * This is a hit:  we need to apply the aggregator to
2357                  * the value at this key.
2358                  */
2359                 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2360                 return;
2361 next:
2362                 continue;
2363         }
2364
2365         /*
2366          * We didn't find it.  We need to allocate some zero-filled space,
2367          * link it into the hash table appropriately, and apply the aggregator
2368          * to the (zero-filled) value.
2369          */
2370         offs = buf->dtb_offset;
2371         while (offs & (align - 1))
2372                 offs += sizeof (uint32_t);
2373
2374         /*
2375          * If we don't have enough room to both allocate a new key _and_
2376          * its associated data, increment the drop count and return.
2377          */
2378         if ((uintptr_t)tomax + offs + fsize >
2379             agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2380                 dtrace_buffer_drop(buf);
2381                 return;
2382         }
2383
2384         /*CONSTCOND*/
2385         ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2386         key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2387         agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2388
2389         key->dtak_data = kdata = tomax + offs;
2390         buf->dtb_offset = offs + fsize;
2391
2392         /*
2393          * Now copy the data across.
2394          */
2395         *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2396
2397         for (i = sizeof (dtrace_aggid_t); i < size; i++)
2398                 kdata[i] = data[i];
2399
2400         /*
2401          * Because strings are not zeroed out by default, we need to iterate
2402          * looking for actions that store strings, and we need to explicitly
2403          * pad these strings out with zeroes.
2404          */
2405         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2406                 int nul;
2407
2408                 if (!DTRACEACT_ISSTRING(act))
2409                         continue;
2410
2411                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2412                 limit = i + act->dta_rec.dtrd_size;
2413                 ASSERT(limit <= size);
2414
2415                 for (nul = 0; i < limit; i++) {
2416                         if (nul) {
2417                                 kdata[i] = '\0';
2418                                 continue;
2419                         }
2420
2421                         if (data[i] != '\0')
2422                                 continue;
2423
2424                         nul = 1;
2425                 }
2426         }
2427
2428         for (i = size; i < fsize; i++)
2429                 kdata[i] = 0;
2430
2431         key->dtak_hashval = hashval;
2432         key->dtak_size = size;
2433         key->dtak_action = action;
2434         key->dtak_next = agb->dtagb_hash[ndx];
2435         agb->dtagb_hash[ndx] = key;
2436
2437         /*
2438          * Finally, apply the aggregator.
2439          */
2440         *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2441         agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2442 }
2443
2444 /*
2445  * Given consumer state, this routine finds a speculation in the INACTIVE
2446  * state and transitions it into the ACTIVE state.  If there is no speculation
2447  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2448  * incremented -- it is up to the caller to take appropriate action.
2449  */
2450 static int
2451 dtrace_speculation(dtrace_state_t *state)
2452 {
2453         int i = 0;
2454         dtrace_speculation_state_t current;
2455         uint32_t *stat = &state->dts_speculations_unavail, count;
2456
2457         while (i < state->dts_nspeculations) {
2458                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2459
2460                 current = spec->dtsp_state;
2461
2462                 if (current != DTRACESPEC_INACTIVE) {
2463                         if (current == DTRACESPEC_COMMITTINGMANY ||
2464                             current == DTRACESPEC_COMMITTING ||
2465                             current == DTRACESPEC_DISCARDING)
2466                                 stat = &state->dts_speculations_busy;
2467                         i++;
2468                         continue;
2469                 }
2470
2471                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2472                     current, DTRACESPEC_ACTIVE) == current)
2473                         return (i + 1);
2474         }
2475
2476         /*
2477          * We couldn't find a speculation.  If we found as much as a single
2478          * busy speculation buffer, we'll attribute this failure as "busy"
2479          * instead of "unavail".
2480          */
2481         do {
2482                 count = *stat;
2483         } while (dtrace_cas32(stat, count, count + 1) != count);
2484
2485         return (0);
2486 }
2487
2488 /*
2489  * This routine commits an active speculation.  If the specified speculation
2490  * is not in a valid state to perform a commit(), this routine will silently do
2491  * nothing.  The state of the specified speculation is transitioned according
2492  * to the state transition diagram outlined in <sys/dtrace_impl.h>
2493  */
2494 static void
2495 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2496     dtrace_specid_t which)
2497 {
2498         dtrace_speculation_t *spec;
2499         dtrace_buffer_t *src, *dest;
2500         uintptr_t daddr, saddr, dlimit;
2501 #if !defined(__APPLE__)  /* Quiet compiler warning */
2502         dtrace_speculation_state_t current, new;
2503 #else
2504         dtrace_speculation_state_t current,  new = DTRACESPEC_INACTIVE;
2505 #endif /* __APPLE__ */
2506         intptr_t offs;
2507
2508         if (which == 0)
2509                 return;
2510
2511 #if !defined(__APPLE__)  /* Quiet compiler warning */
2512         if (which > state->dts_nspeculations) {
2513                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2514                 return;
2515         }
2516 #else
2517         if (which > (dtrace_specid_t)state->dts_nspeculations) {
2518                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2519                 return;
2520         }
2521 #endif /* __APPLE__ */
2522
2523         spec = &state->dts_speculations[which - 1];
2524         src = &spec->dtsp_buffer[cpu];
2525         dest = &state->dts_buffer[cpu];
2526
2527         do {
2528                 current = spec->dtsp_state;
2529
2530                 if (current == DTRACESPEC_COMMITTINGMANY)
2531                         break;
2532
2533                 switch (current) {
2534                 case DTRACESPEC_INACTIVE:
2535                 case DTRACESPEC_DISCARDING:
2536                         return;
2537
2538                 case DTRACESPEC_COMMITTING:
2539                         /*
2540                          * This is only possible if we are (a) commit()'ing
2541                          * without having done a prior speculate() on this CPU
2542                          * and (b) racing with another commit() on a different
2543                          * CPU.  There's nothing to do -- we just assert that
2544                          * our offset is 0.
2545                          */
2546                         ASSERT(src->dtb_offset == 0);
2547                         return;
2548
2549                 case DTRACESPEC_ACTIVE:
2550                         new = DTRACESPEC_COMMITTING;
2551                         break;
2552
2553                 case DTRACESPEC_ACTIVEONE:
2554                         /*
2555                          * This speculation is active on one CPU.  If our
2556                          * buffer offset is non-zero, we know that the one CPU
2557                          * must be us.  Otherwise, we are committing on a
2558                          * different CPU from the speculate(), and we must
2559                          * rely on being asynchronously cleaned.
2560                          */
2561                         if (src->dtb_offset != 0) {
2562                                 new = DTRACESPEC_COMMITTING;
2563                                 break;
2564                         }
2565                         /*FALLTHROUGH*/
2566
2567                 case DTRACESPEC_ACTIVEMANY:
2568                         new = DTRACESPEC_COMMITTINGMANY;
2569                         break;
2570
2571                 default:
2572                         ASSERT(0);
2573                 }
2574         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2575             current, new) != current);
2576
2577         /*
2578          * We have set the state to indicate that we are committing this
2579          * speculation.  Now reserve the necessary space in the destination
2580          * buffer.
2581          */
2582         if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2583             sizeof (uint64_t), state, NULL)) < 0) {
2584                 dtrace_buffer_drop(dest);
2585                 goto out;
2586         }
2587
2588         /*
2589          * We have the space; copy the buffer across.  (Note that this is a
2590          * highly subobtimal bcopy(); in the unlikely event that this becomes
2591          * a serious performance issue, a high-performance DTrace-specific
2592          * bcopy() should obviously be invented.)
2593          */
2594         daddr = (uintptr_t)dest->dtb_tomax + offs;
2595         dlimit = daddr + src->dtb_offset;
2596         saddr = (uintptr_t)src->dtb_tomax;
2597
2598         /*
2599          * First, the aligned portion.
2600          */
2601         while (dlimit - daddr >= sizeof (uint64_t)) {
2602                 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2603
2604                 daddr += sizeof (uint64_t);
2605                 saddr += sizeof (uint64_t);
2606         }
2607
2608         /*
2609          * Now any left-over bit...
2610          */
2611         while (dlimit - daddr)
2612                 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2613
2614         /*
2615          * Finally, commit the reserved space in the destination buffer.
2616          */
2617         dest->dtb_offset = offs + src->dtb_offset;
2618
2619 out:
2620         /*
2621          * If we're lucky enough to be the only active CPU on this speculation
2622          * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2623          */
2624         if (current == DTRACESPEC_ACTIVE ||
2625             (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2626                 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2627                     DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2628 #pragma unused(rval) /* __APPLE__ */
2629
2630                 ASSERT(rval == DTRACESPEC_COMMITTING);
2631         }
2632
2633         src->dtb_offset = 0;
2634         src->dtb_xamot_drops += src->dtb_drops;
2635         src->dtb_drops = 0;
2636 }
2637
2638 /*
2639  * This routine discards an active speculation.  If the specified speculation
2640  * is not in a valid state to perform a discard(), this routine will silently
2641  * do nothing.  The state of the specified speculation is transitioned
2642  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2643  */
2644 static void
2645 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2646     dtrace_specid_t which)
2647 {
2648         dtrace_speculation_t *spec;
2649 #if !defined(__APPLE__)  /* Quiet compiler warning */
2650         dtrace_speculation_state_t current, new;
2651 #else
2652         dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2653 #endif /* __APPLE__ */
2654         dtrace_buffer_t *buf;
2655
2656         if (which == 0)
2657                 return;
2658
2659 #if !defined(__APPLE__)  /* Quiet compiler warning */
2660         if (which > state->dts_nspeculations) {
2661                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2662                 return;
2663         }
2664 #else
2665         if (which > (dtrace_specid_t)state->dts_nspeculations) {
2666                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2667                 return;
2668         }
2669 #endif /* __APPLE__ */
2670
2671         spec = &state->dts_speculations[which - 1];
2672         buf = &spec->dtsp_buffer[cpu];
2673
2674         do {
2675                 current = spec->dtsp_state;
2676
2677                 switch (current) {
2678                 case DTRACESPEC_INACTIVE:
2679                 case DTRACESPEC_COMMITTINGMANY:
2680                 case DTRACESPEC_COMMITTING:
2681                 case DTRACESPEC_DISCARDING:
2682                         return;
2683
2684                 case DTRACESPEC_ACTIVE:
2685                 case DTRACESPEC_ACTIVEMANY:
2686                         new = DTRACESPEC_DISCARDING;
2687                         break;
2688
2689                 case DTRACESPEC_ACTIVEONE:
2690                         if (buf->dtb_offset != 0) {
2691                                 new = DTRACESPEC_INACTIVE;
2692                         } else {
2693                                 new = DTRACESPEC_DISCARDING;
2694                         }
2695                         break;
2696
2697                 default:
2698                         ASSERT(0);
2699                 }
2700         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2701             current, new) != current);
2702
2703         buf->dtb_offset = 0;
2704         buf->dtb_drops = 0;
2705 }
2706
2707 /*
2708  * Note:  not called from probe context.  This function is called
2709  * asynchronously from cross call context to clean any speculations that are
2710  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2711  * transitioned back to the INACTIVE state until all CPUs have cleaned the
2712  * speculation.
2713  */
2714 static void
2715 dtrace_speculation_clean_here(dtrace_state_t *state)
2716 {
2717         dtrace_icookie_t cookie;
2718         processorid_t cpu = CPU->cpu_id;
2719         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2720         dtrace_specid_t i;
2721
2722         cookie = dtrace_interrupt_disable();
2723
2724         if (dest->dtb_tomax == NULL) {
2725                 dtrace_interrupt_enable(cookie);
2726                 return;
2727         }
2728
2729 #if !defined(__APPLE__)  /* Quiet compiler warning */
2730         for (i = 0; i < state->dts_nspeculations; i++) {
2731 #else
2732         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2733 #endif /* __APPLE__ */
2734                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2735                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2736
2737                 if (src->dtb_tomax == NULL)
2738                         continue;
2739
2740                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2741                         src->dtb_offset = 0;
2742                         continue;
2743                 }
2744
2745                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2746                         continue;
2747
2748                 if (src->dtb_offset == 0)
2749                         continue;
2750
2751                 dtrace_speculation_commit(state, cpu, i + 1);
2752         }
2753
2754         dtrace_interrupt_enable(cookie);
2755 }
2756
2757 /*
2758  * Note:  not called from probe context.  This function is called
2759  * asynchronously (and at a regular interval) to clean any speculations that
2760  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2761  * is work to be done, it cross calls all CPUs to perform that work;
2762  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2763  * INACTIVE state until they have been cleaned by all CPUs.
2764  */
2765 static void
2766 dtrace_speculation_clean(dtrace_state_t *state)
2767 {
2768 #if !defined(__APPLE__)  /* Quiet compiler warning */
2769         int work = 0, rv;
2770 #else
2771         int work = 0;
2772         uint32_t rv;
2773 #endif /* __APPLE__ */
2774         dtrace_specid_t i;
2775
2776 #if !defined(__APPLE__)  /* Quiet compiler warning */
2777         for (i = 0; i < state->dts_nspeculations; i++) {
2778 #else
2779         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2780 #endif /* __APPLE__ */
2781                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2782
2783                 ASSERT(!spec->dtsp_cleaning);
2784
2785                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2786                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2787                         continue;
2788
2789                 work++;
2790                 spec->dtsp_cleaning = 1;
2791         }
2792
2793         if (!work)
2794                 return;
2795
2796         dtrace_xcall(DTRACE_CPUALL,
2797             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2798
2799         /*
2800          * We now know that all CPUs have committed or discarded their
2801          * speculation buffers, as appropriate.  We can now set the state
2802          * to inactive.
2803          */
2804 #if !defined(__APPLE__)  /* Quiet compiler warning */
2805         for (i = 0; i < state->dts_nspeculations; i++) {
2806 #else
2807         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
2808 #endif /* __APPLE__ */
2809                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2810                 dtrace_speculation_state_t current, new;
2811
2812                 if (!spec->dtsp_cleaning)
2813                         continue;
2814
2815                 current = spec->dtsp_state;
2816                 ASSERT(current == DTRACESPEC_DISCARDING ||
2817                     current == DTRACESPEC_COMMITTINGMANY);
2818
2819                 new = DTRACESPEC_INACTIVE;
2820
2821                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2822                 ASSERT(rv == current);
2823                 spec->dtsp_cleaning = 0;
2824         }
2825 }
2826
2827 /*
2828  * Called as part of a speculate() to get the speculative buffer associated
2829  * with a given speculation.  Returns NULL if the specified speculation is not
2830  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2831  * the active CPU is not the specified CPU -- the speculation will be
2832  * atomically transitioned into the ACTIVEMANY state.
2833  */
2834 static dtrace_buffer_t *
2835 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2836     dtrace_specid_t which)
2837 {
2838         dtrace_speculation_t *spec;
2839 #if !defined(__APPLE__)  /* Quiet compiler warning */
2840         dtrace_speculation_state_t current, new;
2841 #else
2842         dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2843 #endif /* __APPLE__ */
2844         dtrace_buffer_t *buf;
2845
2846         if (which == 0)
2847                 return (NULL);
2848
2849 #if !defined(__APPLE__)  /* Quiet compiler warning */
2850         if (which > state->dts_nspeculations) {
2851 #else
2852         if (which > (dtrace_specid_t)state->dts_nspeculations) {
2853 #endif /* __APPLE__ */
2854                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2855                 return (NULL);
2856         }
2857
2858         spec = &state->dts_speculations[which - 1];
2859         buf = &spec->dtsp_buffer[cpuid];
2860
2861         do {
2862                 current = spec->dtsp_state;
2863
2864                 switch (current) {
2865                 case DTRACESPEC_INACTIVE:
2866                 case DTRACESPEC_COMMITTINGMANY:
2867                 case DTRACESPEC_DISCARDING:
2868                         return (NULL);
2869
2870                 case DTRACESPEC_COMMITTING:
2871                         ASSERT(buf->dtb_offset == 0);
2872                         return (NULL);
2873
2874                 case DTRACESPEC_ACTIVEONE:
2875                         /*
2876                          * This speculation is currently active on one CPU.
2877                          * Check the offset in the buffer; if it's non-zero,
2878                          * that CPU must be us (and we leave the state alone).
2879                          * If it's zero, assume that we're starting on a new
2880                          * CPU -- and change the state to indicate that the
2881                          * speculation is active on more than one CPU.
2882                          */
2883                         if (buf->dtb_offset != 0)
2884                                 return (buf);
2885
2886                         new = DTRACESPEC_ACTIVEMANY;
2887                         break;
2888
2889                 case DTRACESPEC_ACTIVEMANY:
2890                         return (buf);
2891
2892                 case DTRACESPEC_ACTIVE:
2893                         new = DTRACESPEC_ACTIVEONE;
2894                         break;
2895
2896                 default:
2897                         ASSERT(0);
2898                 }
2899         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2900             current, new) != current);
2901
2902         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
2903         return (buf);
2904 }
2905
2906 /*
2907  * Return a string.  In the event that the user lacks the privilege to access
2908  * arbitrary kernel memory, we copy the string out to scratch memory so that we
2909  * don't fail access checking.
2910  *
2911  * dtrace_dif_variable() uses this routine as a helper for various
2912  * builtin values such as 'execname' and 'probefunc.'
2913  */
2914 #if defined(__APPLE__) /* Quiet compiler warning. */
2915 static
2916 #endif /* __APPLE__ */
2917 uintptr_t
2918 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
2919     dtrace_mstate_t *mstate)
2920 {
2921         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
2922         uintptr_t ret;
2923         size_t strsz;
2924
2925         /*
2926          * The easy case: this probe is allowed to read all of memory, so
2927          * we can just return this as a vanilla pointer.
2928          */
2929         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
2930                 return (addr);
2931
2932         /*
2933          * This is the tougher case: we copy the string in question from
2934          * kernel memory into scratch memory and return it that way: this
2935          * ensures that we won't trip up when access checking tests the
2936          * BYREF return value.
2937          */
2938         strsz = dtrace_strlen((char *)addr, size) + 1;
2939
2940         if (mstate->dtms_scratch_ptr + strsz >
2941             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
2942                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
2943                 return (NULL);
2944         }
2945
2946         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
2947             strsz);
2948         ret = mstate->dtms_scratch_ptr;
2949         mstate->dtms_scratch_ptr += strsz;
2950         return (ret);
2951 }
2952
2953 /*
2954  * This function implements the DIF emulator's variable lookups.  The emulator
2955  * passes a reserved variable identifier and optional built-in array index.
2956  */
2957 static uint64_t
2958 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
2959     uint64_t ndx)
2960 {
2961         /*
2962          * If we're accessing one of the uncached arguments, we'll turn this
2963          * into a reference in the args array.
2964          */
2965         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
2966                 ndx = v - DIF_VAR_ARG0;
2967                 v = DIF_VAR_ARGS;
2968         }
2969
2970         switch (v) {
2971         case DIF_VAR_ARGS:
2972                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
2973                 if (ndx >= sizeof (mstate->dtms_arg) /
2974                     sizeof (mstate->dtms_arg[0])) {
2975 #if !defined(__APPLE__)
2976                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
2977 #else
2978                         /* Account for introduction of __dtrace_probe() on xnu. */
2979                         int aframes = mstate->dtms_probe->dtpr_aframes + 3;
2980 #endif /* __APPLE__ */
2981                         dtrace_provider_t *pv;
2982                         uint64_t val;
2983
2984                         pv = mstate->dtms_probe->dtpr_provider;
2985                         if (pv->dtpv_pops.dtps_getargval != NULL)
2986                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
2987                                     mstate->dtms_probe->dtpr_id,
2988                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
2989 #if defined(__APPLE__)
2990                         /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
2991                         else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
2992                                 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
2993                         }
2994 #endif /* __APPLE__ */
2995                         else
2996                                 val = dtrace_getarg(ndx, aframes);
2997
2998                         /*
2999                          * This is regrettably required to keep the compiler
3000                          * from tail-optimizing the call to dtrace_getarg().
3001                          * The condition always evaluates to true, but the
3002                          * compiler has no way of figuring that out a priori.
3003                          * (None of this would be necessary if the compiler
3004                          * could be relied upon to _always_ tail-optimize
3005                          * the call to dtrace_getarg() -- but it can't.)
3006                          */
3007                         if (mstate->dtms_probe != NULL)
3008                                 return (val);
3009
3010                         ASSERT(0);
3011                 }
3012
3013                 return (mstate->dtms_arg[ndx]);
3014
3015 #if !defined(__APPLE__)
3016         case DIF_VAR_UREGS: {
3017                 klwp_t *lwp;
3018
3019                 if (!dtrace_priv_proc(state))
3020                         return (0);
3021
3022                 if ((lwp = curthread->t_lwp) == NULL) {
3023                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3024                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = NULL;
3025                         return (0);
3026                 }
3027
3028                 return (dtrace_getreg(lwp->lwp_regs, ndx));
3029         }
3030 #else
3031         case DIF_VAR_UREGS: {
3032                 thread_t thread;
3033
3034                 if (!dtrace_priv_proc(state))
3035                         return (0);
3036
3037                 if ((thread = current_thread()) == NULL) {
3038                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3039                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3040                         return (0);
3041                 }
3042
3043                 return (dtrace_getreg(find_user_regs(thread), ndx));
3044         }
3045 #endif /* __APPLE__ */
3046
3047 #if !defined(__APPLE__)
3048         case DIF_VAR_CURTHREAD:
3049                 if (!dtrace_priv_kernel(state))
3050                         return (0);
3051                 return ((uint64_t)(uintptr_t)curthread);
3052 #else
3053         case DIF_VAR_CURTHREAD:
3054                 if (!dtrace_priv_kernel(state))
3055                         return (0);
3056
3057                 return ((uint64_t)(uintptr_t)current_thread());
3058 #endif /* __APPLE__ */
3059
3060         case DIF_VAR_TIMESTAMP:
3061                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3062                         mstate->dtms_timestamp = dtrace_gethrtime();
3063                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3064                 }
3065                 return (mstate->dtms_timestamp);
3066
3067 #if !defined(__APPLE__)
3068         case DIF_VAR_VTIMESTAMP:
3069                 ASSERT(dtrace_vtime_references != 0);
3070                 return (curthread->t_dtrace_vtime);
3071 #else
3072         case DIF_VAR_VTIMESTAMP:
3073                 ASSERT(dtrace_vtime_references != 0);
3074                 return (dtrace_get_thread_vtime(current_thread()));
3075 #endif /* __APPLE__ */
3076
3077         case DIF_VAR_WALLTIMESTAMP:
3078                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3079                         mstate->dtms_walltimestamp = dtrace_gethrestime();
3080                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3081                 }
3082                 return (mstate->dtms_walltimestamp);
3083
3084         case DIF_VAR_IPL:
3085                 if (!dtrace_priv_kernel(state))
3086                         return (0);
3087                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3088                         mstate->dtms_ipl = dtrace_getipl();
3089                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
3090                 }
3091                 return (mstate->dtms_ipl);
3092
3093         case DIF_VAR_EPID:
3094                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3095                 return (mstate->dtms_epid);
3096
3097         case DIF_VAR_ID:
3098                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3099                 return (mstate->dtms_probe->dtpr_id);
3100
3101         case DIF_VAR_STACKDEPTH:
3102                 if (!dtrace_priv_kernel(state))
3103                         return (0);
3104                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3105 #if !defined(__APPLE__)
3106                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3107 #else
3108                         /* Account for introduction of __dtrace_probe() on xnu. */
3109                         int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3110 #endif /* __APPLE__ */
3111
3112                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3113                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3114                 }
3115                 return (mstate->dtms_stackdepth);
3116
3117         case DIF_VAR_USTACKDEPTH:
3118                 if (!dtrace_priv_proc(state))
3119                         return (0);
3120                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3121                         /*
3122                          * See comment in DIF_VAR_PID.
3123                          */
3124                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3125                             CPU_ON_INTR(CPU)) {
3126                                 mstate->dtms_ustackdepth = 0;
3127                         } else {
3128                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3129                                 mstate->dtms_ustackdepth =
3130                                     dtrace_getustackdepth();
3131                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3132                         }
3133                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3134                 }
3135                 return (mstate->dtms_ustackdepth);
3136
3137         case DIF_VAR_CALLER:
3138                 if (!dtrace_priv_kernel(state))
3139                         return (0);
3140                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3141 #if !defined(__APPLE__)
3142                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3143 #else
3144                         /* Account for introduction of __dtrace_probe() on xnu. */
3145                         int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3146 #endif /* __APPLE__ */
3147
3148                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3149                                 /*
3150                                  * If this is an unanchored probe, we are
3151                                  * required to go through the slow path:
3152                                  * dtrace_caller() only guarantees correct
3153                                  * results for anchored probes.
3154                                  */
3155                                 pc_t caller[2];
3156
3157                                 dtrace_getpcstack(caller, 2, aframes,
3158                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3159                                 mstate->dtms_caller = caller[1];
3160                         } else if ((mstate->dtms_caller =
3161 #if !defined(__APPLE__)  /* Quiet compiler warnings */
3162                             dtrace_caller(aframes)) == -1) {
3163 #else
3164                             dtrace_caller(aframes)) == (uintptr_t)-1) {
3165 #endif /* __APPLE__ */
3166                                 /*
3167                                  * We have failed to do this the quick way;
3168                                  * we must resort to the slower approach of
3169                                  * calling dtrace_getpcstack().
3170                                  */
3171                                 pc_t caller;
3172
3173                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
3174                                 mstate->dtms_caller = caller;
3175                         }
3176
3177                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3178                 }
3179                 return (mstate->dtms_caller);
3180
3181         case DIF_VAR_UCALLER:
3182                 if (!dtrace_priv_proc(state))
3183                         return (0);
3184
3185                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3186                         uint64_t ustack[3];
3187
3188                         /*
3189                          * dtrace_getupcstack() fills in the first uint64_t
3190                          * with the current PID.  The second uint64_t will
3191                          * be the program counter at user-level.  The third
3192                          * uint64_t will contain the caller, which is what
3193                          * we're after.
3194                          */
3195                         ustack[2] = NULL;
3196                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3197                         dtrace_getupcstack(ustack, 3);
3198                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3199                         mstate->dtms_ucaller = ustack[2];
3200                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3201                 }
3202
3203                 return (mstate->dtms_ucaller);
3204
3205         case DIF_VAR_PROBEPROV:
3206                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3207                 return (dtrace_dif_varstr(
3208                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3209                     state, mstate));
3210
3211         case DIF_VAR_PROBEMOD:
3212                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3213                 return (dtrace_dif_varstr(
3214                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3215                     state, mstate));
3216
3217         case DIF_VAR_PROBEFUNC:
3218                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3219                 return (dtrace_dif_varstr(
3220                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3221                     state, mstate));
3222
3223         case DIF_VAR_PROBENAME:
3224                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3225                 return (dtrace_dif_varstr(
3226                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3227                     state, mstate));
3228
3229 #if !defined(__APPLE__)
3230         case DIF_VAR_PID:
3231                 if (!dtrace_priv_proc(state))
3232                         return (0);
3233
3234                 /*
3235                  * Note that we are assuming that an unanchored probe is
3236                  * always due to a high-level interrupt.  (And we're assuming
3237                  * that there is only a single high level interrupt.)
3238                  */
3239                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3240                         return (pid0.pid_id);
3241
3242                 /*
3243                  * It is always safe to dereference one's own t_procp pointer:
3244                  * it always points to a valid, allocated proc structure.
3245                  * Further, it is always safe to dereference the p_pidp member
3246                  * of one's own proc structure.  (These are truisms because
3247                  * threads and processes don't clean up their own state --
3248                  * they leave that task to whomever reaps them.)
3249                  */
3250                 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3251
3252 #else
3253         case DIF_VAR_PID:
3254                 if (!dtrace_priv_proc_relaxed(state))
3255                         return (0);
3256
3257                 /*
3258                  * Note that we are assuming that an unanchored probe is
3259                  * always due to a high-level interrupt.  (And we're assuming
3260                  * that there is only a single high level interrupt.)
3261                  */
3262                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3263                         /* Anchored probe that fires while on an interrupt accrues to process 0 */
3264                         return 0;
3265
3266                 return ((uint64_t)proc_selfpid());
3267 #endif /* __APPLE__ */
3268
3269 #if !defined(__APPLE__)
3270         case DIF_VAR_PPID:
3271                 if (!dtrace_priv_proc(state))
3272                         return (0);
3273
3274                 /*
3275                  * See comment in DIF_VAR_PID.
3276                  */
3277                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3278                         return (pid0.pid_id);
3279
3280                 /*
3281                  * It is always safe to dereference one's own t_procp pointer:
3282                  * it always points to a valid, allocated proc structure.
3283                  * (This is true because threads don't clean up their own
3284                  * state -- they leave that task to whomever reaps them.)
3285                  */
3286                 return ((uint64_t)curthread->t_procp->p_ppid);
3287 #else
3288         case DIF_VAR_PPID:
3289                 if (!dtrace_priv_proc_relaxed(state))
3290                         return (0);
3291
3292                 /*
3293                  * See comment in DIF_VAR_PID.
3294                  */
3295                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3296                         return (0);
3297
3298                 return ((uint64_t)proc_selfppid());
3299 #endif /* __APPLE__ */
3300
3301 #if !defined(__APPLE__)
3302         case DIF_VAR_TID:
3303                 /*
3304                  * See comment in DIF_VAR_PID.
3305                  */
3306                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3307                         return (0);
3308
3309                 return ((uint64_t)curthread->t_tid);
3310 #else
3311         case DIF_VAR_TID:
3312                 /* We do not need to check for null current_thread() */
3313                 return thread_tid(current_thread()); /* globally unique */
3314
3315         case DIF_VAR_PTHREAD_SELF:
3316                 if (!dtrace_priv_proc(state))
3317                         return (0);
3318
3319                 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3320                 return 0;
3321
3322         case DIF_VAR_DISPATCHQADDR:
3323                 if (!dtrace_priv_proc(state))
3324                         return (0);
3325
3326                 /* We do not need to check for null current_thread() */
3327                 return thread_dispatchqaddr(current_thread());
3328 #endif /* __APPLE__ */
3329
3330 #if !defined(__APPLE__)
3331         case DIF_VAR_EXECNAME:
3332                 if (!dtrace_priv_proc(state))
3333                         return (0);
3334
3335                 /*
3336                  * See comment in DIF_VAR_PID.
3337                  */
3338                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3339                         return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3340
3341                 /*
3342                  * It is always safe to dereference one's own t_procp pointer:
3343                  * it always points to a valid, allocated proc structure.
3344                  * (This is true because threads don't clean up their own
3345                  * state -- they leave that task to whomever reaps them.)
3346                  */
3347                 return (dtrace_dif_varstr(
3348                     (uintptr_t)curthread->t_procp->p_user.u_comm,
3349                     state, mstate));
3350 #else
3351         case DIF_VAR_EXECNAME:
3352         {
3353                 char *xname = (char *)mstate->dtms_scratch_ptr;
3354                 size_t scratch_size = MAXCOMLEN+1;
3355
3356                 /* The scratch allocation's lifetime is that of the clause. */
3357                 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3358                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3359                         return 0;
3360                 }
3361
3362                 if (!dtrace_priv_proc_relaxed(state))
3363                         return (0);
3364
3365                 mstate->dtms_scratch_ptr += scratch_size;
3366                 proc_selfname( xname, MAXCOMLEN );
3367
3368                 return ((uint64_t)(uintptr_t)xname);
3369         }
3370 #endif /* __APPLE__ */
3371 #if !defined(__APPLE__)
3372         case DIF_VAR_ZONENAME:
3373                 if (!dtrace_priv_proc(state))
3374                         return (0);
3375
3376                 /*
3377                  * See comment in DIF_VAR_PID.
3378                  */
3379                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3380                         return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3381
3382                 /*
3383                  * It is always safe to dereference one's own t_procp pointer:
3384                  * it always points to a valid, allocated proc structure.
3385                  * (This is true because threads don't clean up their own
3386                  * state -- they leave that task to whomever reaps them.)
3387                  */
3388                 return (dtrace_dif_varstr(
3389                     (uintptr_t)curthread->t_procp->p_zone->zone_name,
3390                     state, mstate));
3391
3392 #else
3393         case DIF_VAR_ZONENAME:
3394                 if (!dtrace_priv_proc(state))
3395                         return (0);
3396
3397                 /* FIXME: return e.g. "global" allocated from scratch a la execname. */
3398                 return ((uint64_t)(uintptr_t)NULL); /* Darwin doesn't do "zones" */
3399 #endif /* __APPLE__ */
3400
3401 #if !defined(__APPLE__)
3402         case DIF_VAR_UID:
3403                 if (!dtrace_priv_proc(state))
3404                         return (0);
3405
3406                 /*
3407                  * See comment in DIF_VAR_PID.
3408                  */
3409                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3410                         return ((uint64_t)p0.p_cred->cr_uid);
3411
3412                 /*
3413                  * It is always safe to dereference one's own t_procp pointer:
3414                  * it always points to a valid, allocated proc structure.
3415                  * (This is true because threads don't clean up their own
3416                  * state -- they leave that task to whomever reaps them.)
3417                  *
3418                  * Additionally, it is safe to dereference one's own process
3419                  * credential, since this is never NULL after process birth.
3420                  */
3421                 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3422 #else
3423         case DIF_VAR_UID:
3424                 if (!dtrace_priv_proc(state))
3425                         return (0);
3426
3427                 /*
3428                  * See comment in DIF_VAR_PID.
3429                  */
3430                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3431                         return (0);
3432
3433                 if (dtrace_CRED() != NULL)
3434                         /* Credential does not require lazy initialization. */
3435                         return ((uint64_t)kauth_getuid());
3436                 else {
3437                         /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3438                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3439                         return -1ULL;
3440                 }
3441 #endif /* __APPLE__ */
3442
3443 #if !defined(__APPLE__)
3444         case DIF_VAR_GID:
3445                 if (!dtrace_priv_proc(state))
3446                         return (0);
3447
3448                 /*
3449                  * See comment in DIF_VAR_PID.
3450                  */
3451                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3452                         return ((uint64_t)p0.p_cred->cr_gid);
3453
3454                 /*
3455                  * It is always safe to dereference one's own t_procp pointer:
3456                  * it always points to a valid, allocated proc structure.
3457                  * (This is true because threads don't clean up their own
3458                  * state -- they leave that task to whomever reaps them.)
3459                  *
3460                  * Additionally, it is safe to dereference one's own process
3461                  * credential, since this is never NULL after process birth.
3462                  */
3463                 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3464 #else
3465         case DIF_VAR_GID:
3466                 if (!dtrace_priv_proc(state))
3467                         return (0);
3468
3469                 /*
3470                  * See comment in DIF_VAR_PID.
3471                  */
3472                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3473                         return (0);
3474
3475                 if (dtrace_CRED() != NULL)
3476                         /* Credential does not require lazy initialization. */
3477                         return ((uint64_t)kauth_getgid());
3478                 else {
3479                         /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3480                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3481                         return -1ULL;
3482                 }
3483 #endif /* __APPLE__ */
3484
3485 #if !defined(__APPLE__)
3486         case DIF_VAR_ERRNO: {
3487                 klwp_t *lwp;
3488                 if (!dtrace_priv_proc(state))
3489                         return (0);
3490
3491                 /*
3492                  * See comment in DIF_VAR_PID.
3493                  */
3494                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3495                         return (0);
3496
3497                 /*
3498                  * It is always safe to dereference one's own t_lwp pointer in
3499                  * the event that this pointer is non-NULL.  (This is true
3500                  * because threads and lwps don't clean up their own state --
3501                  * they leave that task to whomever reaps them.)
3502                  */
3503                 if ((lwp = curthread->t_lwp) == NULL)
3504                         return (0);
3505
3506                 return ((uint64_t)lwp->lwp_errno);
3507         }
3508 #else
3509         case DIF_VAR_ERRNO: {
3510                 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3511                 if (!dtrace_priv_proc(state))
3512                         return (0);
3513
3514                 /*
3515                  * See comment in DIF_VAR_PID.
3516                  */
3517                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3518                         return (0);
3519
3520                 if (uthread)
3521                         return (uint64_t)uthread->t_dtrace_errno;
3522                 else {
3523                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3524                         return -1ULL;
3525                 }
3526         }
3527 #endif /* __APPLE__ */
3528
3529         default:
3530                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3531                 return (0);
3532         }
3533 }
3534
3535 /*
3536  * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
3537  * Notice that we don't bother validating the proper number of arguments or
3538  * their types in the tuple stack.  This isn't needed because all argument
3539  * interpretation is safe because of our load safety -- the worst that can
3540  * happen is that a bogus program can obtain bogus results.
3541  */
3542 static void
3543 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3544     dtrace_key_t *tupregs, int nargs,
3545     dtrace_mstate_t *mstate, dtrace_state_t *state)
3546 {
3547         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3548 #if !defined(__APPLE__)
3549         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3550 #else
3551         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3552 #endif /* __APPLE__ */
3553         dtrace_vstate_t *vstate = &state->dts_vstate;
3554
3555 #if !defined(__APPLE__)
3556         union {
3557                 mutex_impl_t mi;
3558                 uint64_t mx;
3559         } m;
3560
3561         union {
3562                 krwlock_t ri;
3563                 uintptr_t rw;
3564         } r;
3565 #else
3566 /* FIXME: awaits lock/mutex work */
3567 #endif /* __APPLE__ */
3568
3569         switch (subr) {
3570         case DIF_SUBR_RAND:
3571                 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3572                 break;
3573
3574 #if !defined(__APPLE__)
3575         case DIF_SUBR_MUTEX_OWNED:
3576                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3577                     mstate, vstate)) {
3578                         regs[rd] = NULL;
3579                         break;
3580                 }
3581
3582                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3583                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3584                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3585                 else
3586                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3587                 break;
3588
3589         case DIF_SUBR_MUTEX_OWNER:
3590                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3591                     mstate, vstate)) {
3592                         regs[rd] = NULL;
3593                         break;
3594                 }
3595
3596                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3597                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3598                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3599                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3600                 else
3601                         regs[rd] = 0;
3602                 break;
3603
3604         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3605                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3606                     mstate, vstate)) {
3607                         regs[rd] = NULL;
3608                         break;
3609                 }
3610
3611                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3612                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3613                 break;
3614
3615         case DIF_SUBR_MUTEX_TYPE_SPIN:
3616                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3617                     mstate, vstate)) {
3618                         regs[rd] = NULL;
3619                         break;
3620                 }
3621
3622                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3623                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3624                 break;
3625
3626         case DIF_SUBR_RW_READ_HELD: {
3627                 uintptr_t tmp;
3628
3629                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3630                     mstate, vstate)) {
3631                         regs[rd] = NULL;
3632                         break;
3633                 }
3634
3635                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3636                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3637                 break;
3638         }
3639
3640         case DIF_SUBR_RW_WRITE_HELD:
3641                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3642                     mstate, vstate)) {
3643                         regs[rd] = NULL;
3644                         break;
3645                 }
3646
3647                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3648                 regs[rd] = _RW_WRITE_HELD(&r.ri);
3649                 break;
3650
3651         case DIF_SUBR_RW_ISWRITER:
3652                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3653                     mstate, vstate)) {
3654                         regs[rd] = NULL;
3655                         break;
3656                 }
3657
3658                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3659                 regs[rd] = _RW_ISWRITER(&r.ri);
3660                 break;
3661 #else
3662 /* FIXME: awaits lock/mutex work */
3663 #endif /* __APPLE__ */
3664
3665         case DIF_SUBR_BCOPY: {
3666                 /*
3667                  * We need to be sure that the destination is in the scratch
3668                  * region -- no other region is allowed.
3669                  */
3670                 uintptr_t src = tupregs[0].dttk_value;
3671                 uintptr_t dest = tupregs[1].dttk_value;
3672                 size_t size = tupregs[2].dttk_value;
3673
3674                 if (!dtrace_inscratch(dest, size, mstate)) {
3675                         *flags |= CPU_DTRACE_BADADDR;
3676                         *illval = regs[rd];
3677                         break;
3678                 }
3679
3680                 if (!dtrace_canload(src, size, mstate, vstate)) {
3681                         regs[rd] = NULL;
3682                         break;
3683                 }
3684
3685                 dtrace_bcopy((void *)src, (void *)dest, size);
3686                 break;
3687         }
3688
3689         case DIF_SUBR_ALLOCA:
3690         case DIF_SUBR_COPYIN: {
3691                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3692                 uint64_t size =
3693                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
3694                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3695
3696                 /*
3697                  * This action doesn't require any credential checks since
3698                  * probes will not activate in user contexts to which the
3699                  * enabling user does not have permissions.
3700                  */
3701
3702                 /*
3703                  * Rounding up the user allocation size could have overflowed
3704                  * a large, bogus allocation (like -1ULL) to 0.
3705                  */
3706                 if (scratch_size < size ||
3707                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
3708                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3709                         regs[rd] = NULL;
3710                         break;
3711                 }
3712
3713                 if (subr == DIF_SUBR_COPYIN) {
3714                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3715 #if !defined(__APPLE__)
3716                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3717 #else
3718                         if (dtrace_priv_proc(state))
3719                                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3720 #endif /* __APPLE__ */
3721                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3722                 }
3723
3724                 mstate->dtms_scratch_ptr += scratch_size;
3725                 regs[rd] = dest;
3726                 break;
3727         }
3728
3729         case DIF_SUBR_COPYINTO: {
                     /*
                      * copyinto(): copy `size' bytes (tupregs[1]) from the user
                      * address in tupregs[0] to the destination address in
                      * tupregs[2].  The destination range must pass
                      * dtrace_inscratch(); otherwise CPU_DTRACE_BADADDR is raised
                      * and nothing is copied.  This subroutine does not store a
                      * result in regs[rd].
                      */
3730                 uint64_t size = tupregs[1].dttk_value;
3731                 uintptr_t dest = tupregs[2].dttk_value;
3732
3733                 /*
3734                  * This action doesn't require any credential checks since
3735                  * probes will not activate in user contexts to which the
3736                  * enabling user does not have permissions.
3737                  */
3738                 if (!dtrace_inscratch(dest, size, mstate)) {
3739                         *flags |= CPU_DTRACE_BADADDR;
                             /*
                              * Report the destination that was rejected.  regs[rd]
                              * is never assigned by this subroutine, so it does
                              * not identify the offending address.
                              */
3740                         *illval = dest;
3741                         break;
3742                 }
3743
                     /*
                      * Perform the user-space read with CPU_DTRACE_NOFAULT set
                      * around the copy; any error is reported through `flags'.
                      */
3744                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3745 #if !defined(__APPLE__)
3746                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3747 #else
                     /* On Darwin the copy is skipped unless dtrace_priv_proc() agrees. */
3748                 if (dtrace_priv_proc(state))
3749                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3750 #endif /* __APPLE__ */
3751                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3752                 break;
3753         }
3754
3755         case DIF_SUBR_COPYINSTR: {
                     /*
                      * copyinstr(): copy a string from the user address in
                      * tupregs[0] into newly reserved scratch space and return
                      * the scratch address in regs[rd].  The buffer is `size'
                      * bytes: the DTRACEOPT_STRSIZE option by default, or the
                      * optional second argument plus one (room for the
                      * terminator) when that argument is smaller than the option.
                      */
3756                 uintptr_t dest = mstate->dtms_scratch_ptr;
3757                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3758
3759                 if (nargs > 1 && tupregs[1].dttk_value < size)
3760                         size = tupregs[1].dttk_value + 1;
3761
3762                 /*
3763                  * This action doesn't require any credential checks since
3764                  * probes will not activate in user contexts to which the
3765                  * enabling user does not have permissions.
                      *
                      * NOTE(review): the Darwin branch below nevertheless gates
                      * the copy on dtrace_priv_proc(); when that check fails the
                      * buffer is still terminated, reserved and returned without
                      * having been filled by dtrace_copyinstr().
3766                  */
3767                 if (!DTRACE_INSCRATCH(mstate, size)) {
3768                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3769                         regs[rd] = NULL;
3770                         break;
3771                 }
3772
3773                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3774 #if !defined(__APPLE__)
3775                 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3776 #else
3777                 if (dtrace_priv_proc(state))
3778                         dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3779 #endif /* __APPLE__ */
3780                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3781
                     /*
                      * Unconditionally terminate the last byte of the buffer so
                      * the result is a NUL-terminated string whatever was copied,
                      * then consume the `size' bytes of scratch.
                      */
3782                 ((char *)dest)[size - 1] = '\0';
3783                 mstate->dtms_scratch_ptr += size;
3784                 regs[rd] = dest;
3785                 break;
3786         }
3787
3788 #if !defined(__APPLE__)
3789         case DIF_SUBR_MSGSIZE:
3790         case DIF_SUBR_MSGDSIZE: {
                     /*
                      * msgsize()/msgdsize(): starting at the mblk_t whose address
                      * is in tupregs[0], follow the b_cont chain and sum
                      * (b_wptr - b_rptr) for each block.  For msgdsize() only
                      * blocks whose dblk_t db_type is M_DATA are counted.  The
                      * total is stored in regs[rd] unless a fault was flagged.
                      */
3791                 uintptr_t baddr = tupregs[0].dttk_value, daddr;
3792                 uintptr_t wptr, rptr;
3793                 size_t count = 0;
3794                 int cont = 0;
3795
3796                 while (baddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
3797
3798                         if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
3799                             vstate)) {
3800                                 regs[rd] = NULL;
3801                                 break;
3802                         }
3803
3804                         wptr = dtrace_loadptr(baddr +
3805                             offsetof(mblk_t, b_wptr));
3806
3807                         rptr = dtrace_loadptr(baddr +
3808                             offsetof(mblk_t, b_rptr));
3809
                             /* A write pointer behind the read pointer is rejected. */
3810                         if (wptr < rptr) {
3811                                 *flags |= CPU_DTRACE_BADADDR;
3812                                 *illval = tupregs[0].dttk_value;
3813                                 break;
3814                         }
3815
3816                         daddr = dtrace_loadptr(baddr +
3817                             offsetof(mblk_t, b_datap));
3818
                             /*
                              * Advance to the next block now; the `continue' below
                              * relies on baddr already pointing at the successor.
                              */
3819                         baddr = dtrace_loadptr(baddr +
3820                             offsetof(mblk_t, b_cont));
3821
3822                         /*
3823                          * We want to prevent against denial-of-service here,
3824                          * so we're only going to search the list for
3825                          * dtrace_msgdsize_max mblks.
3826                          */
3827                         if (cont++ > dtrace_msgdsize_max) {
3828                                 *flags |= CPU_DTRACE_ILLOP;
3829                                 break;
3830                         }
3831
3832                         if (subr == DIF_SUBR_MSGDSIZE) {
3833                                 if (dtrace_load8(daddr +
3834                                     offsetof(dblk_t, db_type)) != M_DATA)
3835                                         continue;
3836                         }
3837
3838                         count += wptr - rptr;
3839                 }
3840
3841                 if (!(*flags & CPU_DTRACE_FAULT))
3842                         regs[rd] = count;
3843
3844                 break;
3845         }
3846 #else
3847         case DIF_SUBR_MSGSIZE:
3848         case DIF_SUBR_MSGDSIZE: {
3849                 /* Darwin does not implement SysV streams messages */
                     /* Both subroutines flag CPU_DTRACE_ILLOP and yield 0 here. */
3850                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3851                 regs[rd] = 0;
3852                 break;
3853         }
3854 #endif /* __APPLE__ */
3855
3856 #if !defined(__APPLE__)
3857         case DIF_SUBR_PROGENYOF: {
                     /*
                      * progenyof(pid): regs[rd] becomes 1 if the current process
                      * or one of its ancestors (followed through p_parent) has
                      * the process ID in tupregs[0], and 0 otherwise.  The walk
                      * runs with CPU_DTRACE_NOFAULT set.
                      */
3858                 pid_t pid = tupregs[0].dttk_value;
3859                 proc_t *p;
3860                 int rval = 0;
3861
3862                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3863
3864                 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
3865                         if (p->p_pidp->pid_id == pid) {
3866                                 rval = 1;
3867                                 break;
3868                         }
3869                 }
3870
3871                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3872
3873                 regs[rd] = rval;
3874                 break;
3875         }
3876 #else
3877         case DIF_SUBR_PROGENYOF: {
                     /*
                      * progenyof(pid), Darwin flavour: starting at current_proc(),
                      * read each process's p_pid with dtrace_load32() and climb
                      * via p_pptr with dtrace_loadptr().  regs[rd] becomes 1 if a
                      * visited process has the ID in tupregs[0], else 0.  The
                      * climb is limited to nprocs iterations and stops on a
                      * flagged fault or on reaching pid 0.
                      */
3878                 pid_t pid = tupregs[0].dttk_value;
3879                 struct proc *p = current_proc();
3880                 int rval = 0, lim = nprocs;
3881
3882                 while(p && (lim-- > 0)) {
3883                         pid_t ppid;
3884
                             /* Despite its name, ppid holds the pid of `p' itself. */
3885                         ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3886                         if (*flags & CPU_DTRACE_FAULT)
3887                                 break;
3888
3889                         if (ppid == pid) {
3890                                 rval = 1;
3891                                 break;
3892                         }
3893
3894                         if (ppid == 0)
3895                                 break; /* Can't climb process tree any further. */
3896
3897                         p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3898                         if (*flags & CPU_DTRACE_FAULT)
3899                                 break;
3900                 }
3901
3902                 regs[rd] = rval;
3903                 break;
3904         }
3905 #endif /* __APPLE__ */
3906
3907         case DIF_SUBR_SPECULATION:
                     /* speculation(): regs[rd] receives dtrace_speculation(state). */
3908                 regs[rd] = dtrace_speculation(state);
3909                 break;
3910
3911 #if !defined(__APPLE__)
3912         case DIF_SUBR_COPYOUT: {
                     /*
                      * copyout(kaddr, uaddr, size): copy `size' bytes from the
                      * kernel address in tupregs[0] to the user address in
                      * tupregs[1].  Nothing is copied unless destructive actions
                      * are allowed, dtrace_priv_proc_control() agrees, the source
                      * is not toxic, and the enabling is permitted to load the
                      * source range.
                      */
3913                 uintptr_t kaddr = tupregs[0].dttk_value;
3914                 uintptr_t uaddr = tupregs[1].dttk_value;
3915                 uint64_t size = tupregs[2].dttk_value;
3916
                     /*
                      * The source is kernel memory that ends up in user space, so
                      * it must pass the same dtrace_canload() check as every
                      * other subroutine that reads on the enabling's behalf.
                      */
3917                 if (!dtrace_destructive_disallow &&
3918                     dtrace_priv_proc_control(state) &&
3919                     !dtrace_istoxic(kaddr, size) &&
                         dtrace_canload(kaddr, size, mstate, vstate)) {
3920                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3921                         dtrace_copyout(kaddr, uaddr, size, flags);
3922                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3923                 }
3924                 break;
3925         }
3926
3927         case DIF_SUBR_COPYOUTSTR: {
                     /*
                      * copyoutstr(kaddr, uaddr, size): string flavour of
                      * copyout(); the source is checked with dtrace_strcanload()
                      * before dtrace_copyoutstr() is invoked.
                      */
3928                 uintptr_t kaddr = tupregs[0].dttk_value;
3929                 uintptr_t uaddr = tupregs[1].dttk_value;
3930                 uint64_t size = tupregs[2].dttk_value;
3931
3932                 if (!dtrace_destructive_disallow &&
3933                     dtrace_priv_proc_control(state) &&
3934                     !dtrace_istoxic(kaddr, size) &&
                         dtrace_strcanload(kaddr, size, mstate, vstate)) {
3935                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3936                         dtrace_copyoutstr(kaddr, uaddr, size, flags);
3937                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3938                 }
3939                 break;
3940         }
3941 #else
3942         case DIF_SUBR_COPYOUT: {
                     /*
                      * copyout(kaddr, uaddr, size), Darwin flavour: identical to
                      * the branch above except that the user address is a
                      * user_addr_t.  The kernel source range must pass
                      * dtrace_canload() before anything is copied out.
                      */
3943                 uintptr_t kaddr = tupregs[0].dttk_value;
3944                 user_addr_t uaddr = tupregs[1].dttk_value;
3945                 uint64_t size = tupregs[2].dttk_value;
3946
3947                 if (!dtrace_destructive_disallow &&
3948                     dtrace_priv_proc_control(state) &&
3949                     !dtrace_istoxic(kaddr, size) &&
                         dtrace_canload(kaddr, size, mstate, vstate)) {
3950                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3951                         dtrace_copyout(kaddr, uaddr, size, flags);
3952                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3953                 }
3954                 break;
3955         }
3956
3957         case DIF_SUBR_COPYOUTSTR: {
                     /*
                      * copyoutstr(kaddr, uaddr, size), Darwin flavour: the source
                      * string must pass dtrace_strcanload() before
                      * dtrace_copyoutstr() is invoked.
                      */
3958                 uintptr_t kaddr = tupregs[0].dttk_value;
3959                 user_addr_t uaddr = tupregs[1].dttk_value;
3960                 uint64_t size = tupregs[2].dttk_value;
3961
3962                 if (!dtrace_destructive_disallow &&
3963                     dtrace_priv_proc_control(state) &&
3964                     !dtrace_istoxic(kaddr, size) &&
                         dtrace_strcanload(kaddr, size, mstate, vstate)) {
3965                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3966                         dtrace_copyoutstr(kaddr, uaddr, size, flags);
3967                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3968                 }
3969                 break;
3970         }
3971 #endif /* __APPLE__ */
3972
3973         case DIF_SUBR_STRLEN: {
                     /*
                      * strlen(): measure the string at tupregs[0] with
                      * dtrace_strlen(), bounded by the DTRACEOPT_STRSIZE option.
                      * The length is stored in regs[rd] only if the measured
                      * bytes plus one pass dtrace_canload(); otherwise regs[rd]
                      * is set to NULL.
                      */
3974                 size_t sz;
3975                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3976                 sz = dtrace_strlen((char *)addr,
3977                     state->dts_options[DTRACEOPT_STRSIZE]);
3978
3979                 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
3980                         regs[rd] = NULL;
3981                         break;
3982                 }
3983
3984                 regs[rd] = sz;
3985
3986                 break;
3987         }
3988
3989         case DIF_SUBR_STRCHR:
3990         case DIF_SUBR_STRRCHR: {
3991                 /*
3992                  * We're going to iterate over the string looking for the
3993                  * specified character.  We will iterate until we have reached
3994                  * the string length or we have found the character.  If this
3995                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3996                  * of the specified character instead of the first.
                      *
                      * The result in regs[rd] is the address of the matching
                      * byte, or NULL when no match was found.
3997                  */
3998                 uintptr_t saddr = tupregs[0].dttk_value;
3999                 uintptr_t addr = tupregs[0].dttk_value;
4000                 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4001                 char c, target = (char)tupregs[1].dttk_value;
4002
4003                 for (regs[rd] = NULL; addr < limit; addr++) {
4004                         if ((c = dtrace_load8(addr)) == target) {
4005                                 regs[rd] = addr;
4006
                                     /* strchr() stops at the first hit; strrchr() keeps going. */
4007                                 if (subr == DIF_SUBR_STRCHR)
4008                                         break;
4009                         }
4010
4011                         if (c == '\0')
4012                                 break;
4013                 }
4014
                     /*
                      * The access check happens after the scan and covers the
                      * bytes from saddr up to (not including) addr; if it fails,
                      * any match recorded above is discarded.
                      */
4015                 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4016                         regs[rd] = NULL;
4017                         break;
4018                 }
4019
4020                 break;
4021         }
4022
4023         case DIF_SUBR_STRSTR:
4024         case DIF_SUBR_INDEX:
4025         case DIF_SUBR_RINDEX: {
4026                 /*
4027                  * We're going to iterate over the string looking for the
4028                  * specified string.  We will iterate until we have reached
4029                  * the string length or we have found the string.  (Yes, this
4030                  * is done in the most naive way possible -- but considering
4031                  * that the string we're searching for is likely to be
4032                  * relatively short, the complexity of Rabin-Karp or similar
4033                  * hardly seems merited.)
                      *
                      * Result in regs[rd]: strstr() yields the address of the
                      * match, or 0 when there is none; index() and rindex() yield
                      * the zero-based offset of the match, or -1 when there is
                      * none.  If either string fails dtrace_canload(), regs[rd]
                      * is set to NULL for all three.
4034                  */
4035                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4036                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4037                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4038                 size_t len = dtrace_strlen(addr, size);
4039                 size_t sublen = dtrace_strlen(substr, size);
4040                 char *limit = addr + len, *orig = addr;
4041                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4042                 int inc = 1;
4043
4044                 regs[rd] = notfound;
4045
4046                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4047                         regs[rd] = NULL;
4048                         break;
4049                 }
4050
4051                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4052                     vstate)) {
4053                         regs[rd] = NULL;
4054                         break;
4055                 }
4056
4057                 /*
4058                  * strstr() and index()/rindex() have similar semantics if
4059                  * both strings are the empty string: strstr() returns a
4060                  * pointer to the (empty) string, and index() and rindex()
4061                  * both return index 0 (regardless of any position argument).
4062                  */
4063                 if (sublen == 0 && len == 0) {
4064                         if (subr == DIF_SUBR_STRSTR)
4065                                 regs[rd] = (uintptr_t)addr;
4066                         else
4067                                 regs[rd] = 0;
4068                         break;
4069                 }
4070
4071                 if (subr != DIF_SUBR_STRSTR) {
                             /*
                              * rindex() scans backwards: it starts at the end of
                              * the string and stops one position before its start.
                              */
4072                         if (subr == DIF_SUBR_RINDEX) {
4073                                 limit = orig - 1;
4074                                 addr += len;
4075                                 inc = -1;
4076                         }
4077
4078                         /*
4079                          * Both index() and rindex() take an optional position
4080                          * argument that denotes the starting position.
4081                          */
4082                         if (nargs == 3) {
4083                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
4084
4085                                 /*
4086                                  * If the position argument to index() is
4087                                  * negative, Perl implicitly clamps it at
4088                                  * zero.  This semantic is a little surprising
4089                                  * given the special meaning of negative
4090                                  * positions to similar Perl functions like
4091                                  * substr(), but it appears to reflect a
4092                                  * notion that index() can start from a
4093                                  * negative index and increment its way up to
4094                                  * the string.  Given this notion, Perl's
4095                                  * rindex() is at least self-consistent in
4096                                  * that it implicitly clamps positions greater
4097                                  * than the string length to be the string
4098                                  * length.  Where Perl completely loses
4099                                  * coherence, however, is when the specified
4100                                  * substring is the empty string ("").  In
4101                                  * this case, even if the position is
4102                                  * negative, rindex() returns 0 -- and even if
4103                                  * the position is greater than the length,
4104                                  * index() returns the string length.  These
4105                                  * semantics violate the notion that index()
4106                                  * should never return a value less than the
4107                                  * specified position and that rindex() should
4108                                  * never return a value greater than the
4109                                  * specified position.  (One assumes that
4110                                  * these semantics are artifacts of Perl's
4111                                  * implementation and not the results of
4112                                  * deliberate design -- it beggars belief that
4113                                  * even Larry Wall could desire such oddness.)
4114                                  * While in the abstract one would wish for
4115                                  * consistent position semantics across
4116                                  * substr(), index() and rindex() -- or at the
4117                                  * very least self-consistent position
4118                                  * semantics for index() and rindex() -- we
4119                                  * instead opt to keep with the extant Perl
4120                                  * semantics, in all their broken glory.  (Do
4121                                  * we have more desire to maintain Perl's
4122                                  * semantics than Perl does?  Probably.)
4123                                  */
4124                                 if (subr == DIF_SUBR_RINDEX) {
4125                                         if (pos < 0) {
4126                                                 if (sublen == 0)
4127                                                         regs[rd] = 0;
4128                                                 break;
4129                                         }
4130
4131 #if !defined(__APPLE__)  /* Quiet compiler warnings */
4132                                         if (pos > len)
4133 #else
4134                                         if ((size_t)pos > len)
4135 #endif /* __APPLE__ */
4136                                                 pos = len;
4137                                 } else {
4138                                         if (pos < 0)
4139                                                 pos = 0;
4140
4141 #if !defined(__APPLE__)  /* Quiet compiler warnings */
4142                                         if (pos >= len) {
4143 #else
4144                                         if ((size_t)pos >= len) {
4145 #endif /* __APPLE__ */
4146                                                 if (sublen == 0)
4147                                                         regs[rd] = len;
4148                                                 break;
4149                                         }
4150                                 }
4151
4152                                 addr = orig + pos;
4153                         }
4154                 }
4155
                     /*
                      * Step from addr towards limit by inc (forwards for strstr()
                      * and index(), backwards for rindex()), comparing sublen
                      * bytes at each position; the first position that compares
                      * equal is the result.
                      */
4156                 for (regs[rd] = notfound; addr != limit; addr += inc) {
4157                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
4158                                 if (subr != DIF_SUBR_STRSTR) {
4159                                         /*
4160                                          * As D index() and rindex() are
4161                                          * modeled on Perl (and not on awk),
4162                                          * we return a zero-based (and not a
4163                                          * one-based) index.  (For you Perl
4164                                          * weenies: no, we're not going to add
4165                                          * $[ -- and shouldn't you be at a con
4166                                          * or something?)
4167                                          */
4168                                         regs[rd] = (uintptr_t)(addr - orig);
4169                                         break;
4170                                 }
4171
4172                                 ASSERT(subr == DIF_SUBR_STRSTR);
4173                                 regs[rd] = (uintptr_t)addr;
4174                                 break;
4175                         }
4176                 }
4177
4178                 break;
4179         }
4180
4181         case DIF_SUBR_STRTOK: {
                     /*
                      * strtok(str, delims): skip leading bytes of the input that
                      * appear in the delimiter string at tupregs[1], copy the
                      * following run of non-delimiter bytes into `size'
                      * (DTRACEOPT_STRSIZE) bytes of scratch, and return the
                      * scratch address in regs[rd].  The position where scanning
                      * stopped is saved in mstate->dtms_strtok so that a later
                      * call with a NULL first argument resumes from it.  NULL is
                      * returned when only delimiters (or nothing) remain.
                      */
4182                 uintptr_t addr = tupregs[0].dttk_value;
4183                 uintptr_t tokaddr = tupregs[1].dttk_value;
4184                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4185                 uintptr_t limit, toklimit = tokaddr + size;
4186                 char *dest = (char *)mstate->dtms_scratch_ptr;
4187 #if !defined(__APPLE__)   /* Quiet compiler warnings */
4188                 uint8_t c, tokmap[32];   /* 256 / 8 */
4189                 int i;
4190 #else
4191                 uint8_t c='\0', tokmap[32];      /* 256 / 8 */
4192                 uint64_t i = 0;
4193 #endif /* __APPLE__ */
4194
4195                 /*
4196                  * Check both the token buffer and (later) the input buffer,
4197                  * since both could be non-scratch addresses.
4198                  */
4199                 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4200                         regs[rd] = NULL;
4201                         break;
4202                 }
4203
4204                 if (!DTRACE_INSCRATCH(mstate, size)) {
4205                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4206                         regs[rd] = NULL;
4207                         break;
4208                 }
4209
4210                 if (addr == NULL) {
4211                         /*
4212                          * If the address specified is NULL, we use our saved
4213                          * strtok pointer from the mstate.  Note that this
4214                          * means that the saved strtok pointer is _only_
4215                          * valid within multiple enablings of the same probe --
4216                          * it behaves like an implicit clause-local variable.
                              *
                              * NOTE(review): no access check is made on this path,
                              * and the scan limit below is recomputed as
                              * addr + size from the saved pointer rather than from
                              * the originally checked string -- confirm this
                              * cannot read past the range that was checked.
4217                          */
4218                         addr = mstate->dtms_strtok;
4219                 } else {
4220                         /*
4221                          * If the user-specified address is non-NULL we must
4222                          * access check it.  This is the only time we have
4223                          * a chance to do so, since this address may reside
4224                          * in the string table of this clause-- future calls
4225                          * (when we fetch addr from mstate->dtms_strtok)
4226                          * would fail this access check.
4227                          */
4228                         if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4229                                 regs[rd] = NULL;
4230                                 break;
4231                 }
4232                 }
4233
4234                 /*
4235                  * First, zero the token map, and then process the token
4236                  * string -- setting a bit in the map for every character
4237                  * found in the token string.
                      *
                      * tokmap is a 256-bit set: byte value c is represented by
                      * bit (c & 0x7) of tokmap[c >> 3].
4238                  */
4239                 for (i = 0; i < (int)sizeof (tokmap); i++)
4240                         tokmap[i] = 0;
4241
4242                 for (; tokaddr < toklimit; tokaddr++) {
4243                         if ((c = dtrace_load8(tokaddr)) == '\0')
4244                                 break;
4245
4246                         ASSERT((c >> 3) < sizeof (tokmap));
4247                         tokmap[c >> 3] |= (1 << (c & 0x7));
4248                 }
4249
4250                 for (limit = addr + size; addr < limit; addr++) {
4251                         /*
4252                          * We're looking for a character that is _not_ contained
4253                          * in the token string.
4254                          */
4255                         if ((c = dtrace_load8(addr)) == '\0')
4256                                 break;
4257
4258                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4259                                 break;
4260                 }
4261
4262                 if (c == '\0') {
4263                         /*
4264                          * We reached the end of the string without finding
4265                          * any character that was not in the token string.
4266                          * We return NULL in this case, and we set the saved
4267                          * address to NULL as well.
4268                          */
4269                         regs[rd] = NULL;
4270                         mstate->dtms_strtok = NULL;
4271                         break;
4272                 }
4273
4274                 /*
4275                  * From here on, we're copying into the destination string.
                      * At most size - 1 bytes are copied so that the terminator
                      * written after the loop stays inside the reserved scratch.
4276                  */
4277                 for (i = 0; addr < limit && i < size - 1; addr++) {
4278                         if ((c = dtrace_load8(addr)) == '\0')
4279                                 break;
4280
4281                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
4282                                 break;
4283
4284                         ASSERT(i < size);
4285                         dest[i++] = c;
4286                 }
4287
4288                 ASSERT(i < size);
4289                 dest[i] = '\0';
4290                 regs[rd] = (uintptr_t)dest;
4291                 mstate->dtms_scratch_ptr += size;
4292                 mstate->dtms_strtok = addr;
4293                 break;
4294         }
4295
4296         case DIF_SUBR_SUBSTR: {
                     /*
                      * substr(s, index[, length]): copy part of the string at
                      * tupregs[0] into `size' (DTRACEOPT_STRSIZE) bytes of scratch
                      * and return the scratch address in regs[rd].  A negative
                      * index counts back from the end of the string; a negative
                      * length leaves that many characters off the end.  When the
                      * length is omitted (nargs <= 2) it defaults to size.  NULL
                      * is returned if the source cannot be loaded or scratch is
                      * exhausted.
                      */
4297                 uintptr_t s = tupregs[0].dttk_value;
4298                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4299                 char *d = (char *)mstate->dtms_scratch_ptr;
4300                 int64_t index = (int64_t)tupregs[1].dttk_value;
4301                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4302                 size_t len = dtrace_strlen((char *)s, size);
4303                 int64_t i = 0;
4304
4305                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4306                         regs[rd] = NULL;
4307                         break;
4308                 }
4309
4310                 if (!DTRACE_INSCRATCH(mstate, size)) {
4311                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4312                         regs[rd] = NULL;
4313                         break;
4314                 }
4315
4316                 if (nargs <= 2)
4317                         remaining = (int64_t)size;
4318
                     /*
                      * Turn a negative index into an offset from the start.  If
                      * it still precedes the start but the requested span
                      * reaches into the string, trim the span and begin at 0.
                      */
4319                 if (index < 0) {
4320                         index += len;
4321
4322                         if (index < 0 && index + remaining > 0) {
4323                                 remaining += index;
4324                                 index = 0;
4325                         }
4326                 }
4327
4328 #if !defined(__APPLE__)   /* Quiet compiler warnings */
4329                 if (index >= len || index < 0) {
4330                         remaining = 0;
4331                 } else if (remaining < 0) {
4332                         remaining += len - index;
4333                 } else if (index + remaining > size) {
4334                         remaining = size - index;
4335                 }
4336 #else
4337                 if ((size_t)index >= len || index < 0) {
4338                         remaining = 0;
4339                 } else if (remaining < 0) {
4340                         remaining += len - index;
4341                 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4342                         remaining = size - index;
4343                 }
4344 #endif /* __APPLE__ */
                     /*
                      * Only `size' bytes of scratch were reserved and the
                      * terminator below is stored at d[i], so never copy more
                      * than size - 1 characters.  Without this, index == 0 with
                      * remaining == size and a source holding no NUL in its
                      * first `size' bytes would store the terminator at d[size].
                      */
                     if (remaining >= (int64_t)size)
                             remaining = (int64_t)size - 1;

4345                 for (i = 0; i < remaining; i++) {
4346                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4347                                 break;
4348                 }
4349
4350                 d[i] = '\0';
4351
4352                 mstate->dtms_scratch_ptr += size;
4353                 regs[rd] = (uintptr_t)d;
4354                 break;
4355         }
4356
4357 #if !defined(__APPLE__)
4358         case DIF_SUBR_GETMAJOR:
                     /*
                      * getmajor(): shift the device value in tupregs[0] right by
                      * the minor-number width and mask with the major-number
                      * maximum (64-bit constants under _LP64).
                      */
4359 #ifdef _LP64
4360                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4361 #else
4362                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4363 #endif
4364                 break;
4365
4366 #else  /* __APPLE__ */
4367         case DIF_SUBR_GETMAJOR:
                     /* getmajor(), Darwin: apply major() to tupregs[0] cast to dev_t. */
4368                 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4369                 break;
4370 #endif /* __APPLE__ */
4371
4372 #if !defined(__APPLE__)
4373         case DIF_SUBR_GETMINOR:
                     /*
                      * getminor(): mask the device value in tupregs[0] with the
                      * minor-number maximum (64-bit constant under _LP64).
                      */
4374 #ifdef _LP64
4375                 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4376 #else
4377                 regs[rd] = tupregs[0].dttk_value & MAXMIN;
4378 #endif
4379                 break;
4380
4381 #else  /* __APPLE__ */
4382         case DIF_SUBR_GETMINOR:
                     /* getminor(), Darwin: apply minor() to tupregs[0] cast to dev_t. */
4383                 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4384                 break;
4385 #endif /* __APPLE__ */
4386
4387 #if !defined(__APPLE__)
4388         case DIF_SUBR_DDI_PATHNAME: {
4389                 /*
4390                  * This one is a galactic mess.  We are going to roughly
4391                  * emulate ddi_pathname(), but it's made more complicated
4392                  * by the fact that we (a) want to include the minor name and
4393                  * (b) must proceed iteratively instead of recursively.
4394                  */
4395                 uintptr_t dest = mstate->dtms_scratch_ptr;
4396                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4397                 char *start = (char *)dest, *end = start + size - 1;
4398                 uintptr_t daddr = tupregs[0].dttk_value;
4399                 int64_t minor = (int64_t)tupregs[1].dttk_value;
4400                 char *s;
4401                 int i, len, depth = 0;
4402
4403                 /*
4404                  * Due to all the pointer jumping we do and context we must
4405                  * rely upon, we just mandate that the user must have kernel
4406                  * read privileges to use this routine.
4407                  */
4408                 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4409                         *flags |= CPU_DTRACE_KPRIV;
4410                         *illval = daddr;
4411                         regs[rd] = NULL;
4412                 }
4413
4414                 if (!DTRACE_INSCRATCH(mstate, size)) {
4415                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4416                         regs[rd] = NULL;
4417                         break;
4418                 }
4419
4420                 *end = '\0';
4421
4422                 /*
4423                  * We want to have a name for the minor.  In order to do this,
4424                  * we need to walk the minor list from the devinfo.  We want
4425                  * to be sure that we don't infinitely walk a circular list,
4426                  * so we check for circularity by sending a scout pointer
4427                  * ahead two elements for every element that we iterate over;
4428                  * if the list is circular, these will ultimately point to the
4429                  * same element.  You may recognize this little trick as the
4430                  * answer to a stupid interview question -- one that always
4431                  * seems to be asked by those who had to have it laboriously
4432                  * explained to them, and who can't even concisely describe
4433                  * the conditions under which one would be forced to resort to
4434                  * this technique.  Needless to say, those conditions are
4435                  * found here -- and probably only here.  Is this the only use
4436                  * of this infamous trick in shipping, production code?  If it
4437                  * isn't, it probably should be...
4438                  */
4439                 if (minor != -1) {
4440                         uintptr_t maddr = dtrace_loadptr(daddr +
4441                             offsetof(struct dev_info, devi_minor));
4442
4443                         uintptr_t next = offsetof(struct ddi_minor_data, next);
4444                         uintptr_t name = offsetof(struct ddi_minor_data,
4445                             d_minor) + offsetof(struct ddi_minor, name);
4446                         uintptr_t dev = offsetof(struct ddi_minor_data,
4447                             d_minor) + offsetof(struct ddi_minor, dev);
4448                         uintptr_t scout;
4449
4450                         if (maddr != NULL)
4451                                 scout = dtrace_loadptr(maddr + next);
4452
4453                         while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4454                                 uint64_t m;
4455 #ifdef _LP64
4456                                 m = dtrace_load64(maddr + dev) & MAXMIN64;
4457 #else
4458                                 m = dtrace_load32(maddr + dev) & MAXMIN;
4459 #endif
4460                                 if (m != minor) {
4461                                         maddr = dtrace_loadptr(maddr + next);
4462
4463                                         if (scout == NULL)
4464                                                 continue;
4465
4466                                         scout = dtrace_loadptr(scout + next);
4467
4468                                         if (scout == NULL)
4469                                                 continue;
4470
4471                                         scout = dtrace_loadptr(scout + next);
4472
4473                                         if (scout == NULL)
4474                                                 continue;
4475
4476                                         if (scout == maddr) {
4477                                                 *flags |= CPU_DTRACE_ILLOP;
4478                                                 break;
4479                                         }
4480
4481                                         continue;
4482                                 }
4483
4484                                 /*
4485                                  * We have the minor data.  Now we need to
4486                                  * copy the minor's name into the end of the
4487                                  * pathname.
4488                                  */
4489                                 s = (char *)dtrace_loadptr(maddr + name);
4490                                 len = dtrace_strlen(s, size);
4491
4492                                 if (*flags & CPU_DTRACE_FAULT)
4493                                         break;
4494
4495                                 if (len != 0) {
4496                                         if ((end -= (len + 1)) < start)
4497                                                 break;
4498
4499                                         *end = ':';
4500                                 }
4501
4502                                 for (i = 1; i <= len; i++)
4503                                         end[i] = dtrace_load8((uintptr_t)s++);
4504                                 break;
4505                         }
4506                 }
4507
4508                 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
4509                         ddi_node_state_t devi_state;
4510
4511                         devi_state = dtrace_load32(daddr +
4512                             offsetof(struct dev_info, devi_node_state));
4513
4514                         if (*flags & CPU_DTRACE_FAULT)
4515                                 break;
4516
4517                         if (devi_state >= DS_INITIALIZED) {
4518                                 s = (char *)dtrace_loadptr(daddr +
4519                                     offsetof(struct dev_info, devi_addr));
4520                                 len = dtrace_strlen(s, size);
4521
4522                                 if (*flags & CPU_DTRACE_FAULT)
4523                                         break;
4524
4525                                 if (len != 0) {
4526                                         if ((end -= (len + 1)) < start)
4527                                                 break;
4528
4529                                         *end = '@';
4530                                 }
4531
4532                                 for (i = 1; i <= len; i++)
4533                                         end[i] = dtrace_load8((uintptr_t)s++);
4534                         }
4535
4536                         /*
4537                          * Now for the node name...
4538                          */
4539                         s = (char *)dtrace_loadptr(daddr +
4540                             offsetof(struct dev_info, devi_node_name));
4541
4542                         daddr = dtrace_loadptr(daddr +
4543                             offsetof(struct dev_info, devi_parent));
4544
4545                         /*
4546                          * If our parent is NULL (that is, if we're the root
4547                          * node), we're going to use the special path
4548                          * "devices".
4549                          */
4550                         if (daddr == NULL)
4551                                 s = "devices";
4552
4553                         len = dtrace_strlen(s, size);
4554                         if (*flags & CPU_DTRACE_FAULT)
4555                                 break;
4556
4557                         if ((end -= (len + 1)) < start)
4558                                 break;
4559
4560                         for (i = 1; i <= len; i++)
4561                                 end[i] = dtrace_load8((uintptr_t)s++);
4562                         *end = '/';
4563
4564                         if (depth++ > dtrace_devdepth_max) {
4565                                 *flags |= CPU_DTRACE_ILLOP;
4566                                 break;
4567                         }
4568                 }
4569
4570                 if (end < start)
4571                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4572
4573                 if (daddr == NULL) {
4574                         regs[rd] = (uintptr_t)end;
4575                         mstate->dtms_scratch_ptr += size;
4576                 }
4577
4578                 break;
4579         }
4580 #else
4581         case DIF_SUBR_DDI_PATHNAME: {
                     /*
                      * Stub arm of ddi_pathname(): no path is constructed here.
                      * The subroutine unconditionally flags CPU_DTRACE_ILLOP and
                      * leaves NULL in the destination register.
                      */
4582                 /* FIXME: awaits galactic disentanglement ;-} */
4583                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4584                 regs[rd] = NULL;
4585                 break;
4586         }
4587 #endif /* __APPLE__ */
4588
4589         case DIF_SUBR_STRJOIN: {
                     /*
                      * strjoin(s1, s2): concatenate the two string arguments into
                      * scratch space.  The result, including its terminating NUL,
                      * may occupy at most DTRACEOPT_STRSIZE bytes.  If it does not
                      * fit, CPU_DTRACE_NOSCRATCH is set and the result is NULL.
                      * On success only the bytes actually written are consumed
                      * from scratch.
                      */
4590                 char *d = (char *)mstate->dtms_scratch_ptr;
4591                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4592                 uintptr_t s1 = tupregs[0].dttk_value;
4593                 uintptr_t s2 = tupregs[1].dttk_value;
4594 #if !defined(__APPLE__)   /* Quiet compiler warnings */
4595                 int i = 0;
4596 #else
4597                 uint64_t i = 0;
4598 #endif /* __APPLE__ */
                     int terminated = 0;     /* final NUL has been stored in d[] */
4599
4600                 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
4601                     !dtrace_strcanload(s2, size, mstate, vstate)) {
4602                         regs[rd] = NULL;
4603                         break;
4604                 }
4605
4606                 if (!DTRACE_INSCRATCH(mstate, size)) {
4607                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4608                         regs[rd] = NULL;
4609                         break;
4610                 }
4611
                     /*
                      * Copy s1.  Its NUL is stored and then "un-counted" (i--) so
                      * that the first byte of s2 overwrites it.  On overflow the
                      * break only leaves this loop; the second loop then trips
                      * the same check immediately because i is unchanged.
                      */
4612                 for (;;) {
4613                         if (i >= size) {
4614                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4615                                 regs[rd] = NULL;
4616                                 break;
4617                         }
4618
4619                         if ((d[i++] = dtrace_load8(s1++)) == '\0') {
4620                                 i--;
4621                                 break;
4622                         }
4623                 }
4624
                     /*
                      * Append s2, this time keeping its NUL in the count.
                      */
4625                 for (;;) {
4626                         if (i >= size) {
4627                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4628                                 regs[rd] = NULL;
4629                                 break;
4630                         }
4631
4632                         if ((d[i++] = dtrace_load8(s2++)) == '\0') {
                                     terminated = 1;
4633                                 break;
                             }
4634                 }
4635
                     /*
                      * Commit only when the terminator was actually stored.  The
                      * byte count alone cannot decide this: a result that fills
                      * the buffer exactly ends with i == size, which is the same
                      * value i has on the overflow path.  Testing "i < size" here
                      * therefore dropped the exact-fit result and left regs[rd]
                      * unassigned.  A store only happens at an index below size,
                      * so i <= size whenever terminated is set.
                      */
4636                 if (terminated) {
4637                         mstate->dtms_scratch_ptr += i;
4638                         regs[rd] = (uintptr_t)d;
4639                 }
4640
4641                 break;
4642         }
4643
4644         case DIF_SUBR_LLTOSTR: {
                     /*
                      * lltostr(): render the signed 64-bit argument as a decimal
                      * string in scratch space.  Digits are produced least
                      * significant first and written backwards from the end of a
                      * fixed 22-byte buffer, so the returned pointer lies inside
                      * the buffer rather than at its start.  The whole 22 bytes
                      * are consumed from scratch regardless of the string length.
                      */
4645                 int64_t i = (int64_t)tupregs[0].dttk_value;
                     /*
                      * Take the magnitude in unsigned arithmetic.  Negating
                      * INT64_MIN as an int64_t overflows; val then stayed negative
                      * and the digit loop below stored non-digit characters.
                      * Unsigned negation wraps and yields 2^63 as required.
                      */
4646                 uint64_t val = i < 0 ? 0 - (uint64_t)i : (uint64_t)i;
4647                 uint64_t size = 22;     /* enough room for 2^64 in decimal */
4648                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4649
4650                 if (!DTRACE_INSCRATCH(mstate, size)) {
4651                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4652                         regs[rd] = NULL;
4653                         break;
4654                 }
4655
                     /* Terminator first, then digits from right to left. */
4656                 for (*end-- = '\0'; val; val /= 10)
4657                         *end-- = '0' + (val % 10);
4658
                     /* The loop above emits nothing for zero. */
4659                 if (i == 0)
4660                         *end-- = '0';
4661
4662                 if (i < 0)
4663                         *end-- = '-';
4664
                     /* end was post-decremented past the first character. */
4665                 regs[rd] = (uintptr_t)end + 1;
4666                 mstate->dtms_scratch_ptr += size;
4667                 break;
4668         }
4669
             /*
              * Host/network byte-order conversions for 16-, 32- and 64-bit
              * values.  Each direction pair shares one arm because the operation
              * is its own inverse: when _BIG_ENDIAN is defined the argument is
              * only truncated to the target width, otherwise it is truncated and
              * then byte-swapped with the matching DT_BSWAP_* macro.
              */
4670         case DIF_SUBR_HTONS:
4671         case DIF_SUBR_NTOHS:
4672 #ifdef _BIG_ENDIAN
4673                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4674 #else
4675                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4676 #endif
4677                 break;
4678
4679
4680         case DIF_SUBR_HTONL:
4681         case DIF_SUBR_NTOHL:
4682 #ifdef _BIG_ENDIAN
4683                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4684 #else
4685                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4686 #endif
4687                 break;
4688
4689
4690         case DIF_SUBR_HTONLL:
4691         case DIF_SUBR_NTOHLL:
4692 #ifdef _BIG_ENDIAN
4693                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4694 #else
4695                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4696 #endif
4697                 break;
4698
4699
4700         case DIF_SUBR_DIRNAME:
4701         case DIF_SUBR_BASENAME: {
                     /*
                      * dirname() / basename(): copy either the directory part or
                      * the final component of the string argument into scratch
                      * space.  Three backward scans over the source establish the
                      * indices lastbase, firstbase and lastdir (-1 meaning "not
                      * found"); the tail of the block turns those into a
                      * [start, end] range that is copied out.
                      *
                      * NOTE(review): dtrace_strlen() reads src in the declaration
                      * below, before the dtrace_canload() check has run -- confirm
                      * that measuring the string ahead of the access check is
                      * intended.
                      */
4702                 char *dest = (char *)mstate->dtms_scratch_ptr;
4703                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4704                 uintptr_t src = tupregs[0].dttk_value;
4705                 int i, j, len = dtrace_strlen((char *)src, size);
4706                 int lastbase = -1, firstbase = -1, lastdir = -1;
4707                 int start, end;
4708
4709                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4710                         regs[rd] = NULL;
4711                         break;
4712                 }
4713
4714                 if (!DTRACE_INSCRATCH(mstate, size)) {
4715                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4716                         regs[rd] = NULL;
4717                         break;
4718                 }
4719
4720                 /*
4721                  * The basename and dirname for a zero-length string is
4722                  * defined to be "."
4723                  */
4724                 if (len == 0) {
4725                         len = 1;
4726                         src = (uintptr_t)".";
4727                 }
4728
4729                 /*
4730                  * Start from the back of the string, moving back toward the
4731                  * front until we see a character that isn't a slash.  That
4732                  * character is the last character in the basename.
4733                  */
4734                 for (i = len - 1; i >= 0; i--) {
4735                         if (dtrace_load8(src + i) != '/')
4736                                 break;
4737                 }
4738
4739                 if (i >= 0)
4740                         lastbase = i;
4741
4742                 /*
4743                  * Starting from the last character in the basename, move
4744                  * towards the front until we find a slash.  The character
4745                  * that we processed immediately before that is the first
4746                  * character in the basename.
4747                  */
4748                 for (; i >= 0; i--) {
4749                         if (dtrace_load8(src + i) == '/')
4750                                 break;
4751                 }
4752
4753                 if (i >= 0)
4754                         firstbase = i + 1;
4755
4756                 /*
4757                  * Now keep going until we find a non-slash character.  That
4758                  * character is the last character in the dirname.
4759                  */
4760                 for (; i >= 0; i--) {
4761                         if (dtrace_load8(src + i) != '/')
4762                                 break;
4763                 }
4764
4765                 if (i >= 0)
4766                         lastdir = i;
4767
                     /*
                      * The scans run strictly right to left, so a later index
                      * can only have been found if the earlier one was.
                      */
4768                 ASSERT(!(lastbase == -1 && firstbase != -1));
4769                 ASSERT(!(firstbase == -1 && lastdir != -1));
4770
4771                 if (lastbase == -1) {
4772                         /*
4773                          * We didn't find a non-slash character.  We know that
4774                          * the length is non-zero, so the whole string must be
4775                          * slashes.  In either the dirname or the basename
4776                          * case, we return '/'.
4777                          */
4778                         ASSERT(firstbase == -1);
4779                         firstbase = lastbase = lastdir = 0;
4780                 }
4781
4782                 if (firstbase == -1) {
4783                         /*
4784                          * The entire string consists only of a basename
4785                          * component.  If we're looking for dirname, we need
4786                          * to change our string to be just "."; if we're
4787                          * looking for a basename, we'll just set the first
4788                          * character of the basename to be 0.
4789                          */
4790                         if (subr == DIF_SUBR_DIRNAME) {
4791                                 ASSERT(lastdir == -1);
4792                                 src = (uintptr_t)".";
4793                                 lastdir = 0;
4794                         } else {
4795                                 firstbase = 0;
4796                         }
4797                 }
4798
4799                 if (subr == DIF_SUBR_DIRNAME) {
4800                         if (lastdir == -1) {
4801                                 /*
4802                                  * We know that we have a slash in the name --
4803                                  * or lastdir would be set to 0, above.  And
4804                                  * because lastdir is -1, we know that this
4805                                  * slash must be the first character.  (That
4806                                  * is, the full string must be of the form
4807                                  * "/basename".)  In this case, the last
4808                                  * character of the directory name is 0.
4809                                  */
4810                                 lastdir = 0;
4811                         }
4812
4813                         start = 0;
4814                         end = lastdir;
4815                 } else {
4816                         ASSERT(subr == DIF_SUBR_BASENAME);
4817                         ASSERT(firstbase != -1 && lastbase != -1);
4818                         start = firstbase;
4819                         end = lastbase;
4820                 }
4821
                     /*
                      * Copy src[start..end] inclusive, stopping early at size - 1
                      * characters so that the terminating NUL always fits.
                      */
4822 #if !defined(__APPLE__)   /* Quiet compiler warnings */
4823                 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
4824                         dest[j] = dtrace_load8(src + i);
4825 #else
4826                 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4827                         dest[j] = dtrace_load8(src + i);
4828 #endif /* __APPLE__ */
4829
4830                 dest[j] = '\0';
4831                 regs[rd] = (uintptr_t)dest;
                     /* The full string size is consumed, not just j + 1 bytes. */
4832                 mstate->dtms_scratch_ptr += size;
4833                 break;
4834         }
4835
4836         case DIF_SUBR_CLEANPATH: {
                     /*
                      * cleanpath(): copy the string argument into scratch space
                      * while collapsing "//", "/./" and "/../" sequences.  i is
                      * the read index into src and j the write index into dest.
                      *
                      * The loop body can be entered two ways: from the top, which
                      * loads a fresh character, or through the "next" label,
                      * which re-runs the body on a character that has already
                      * been loaded into c.  Every "continue" goes to the loop
                      * condition, so the loop ends once a NUL has been loaded.
                      */
4837                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4838                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4839                 uintptr_t src = tupregs[0].dttk_value;
4840                 int i = 0, j = 0;
4841
4842                 if (!dtrace_strcanload(src, size, mstate, vstate)) {
4843                         regs[rd] = NULL;
4844                         break;
4845                 }
4846
4847                 if (!DTRACE_INSCRATCH(mstate, size)) {
4848                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4849                         regs[rd] = NULL;
4850                         break;
4851                 }
4852
4853                 /*
4854                  * Move forward, loading each character.
4855                  */
4856                 do {
4857                         c = dtrace_load8(src + i++);
4858 next:
                             /*
                              * One pass stores at most four characters, so stop
                              * (truncating the output) while that many plus the
                              * terminator still fit.
                              */
4859 #if !defined(__APPLE__)   /* Quiet compiler warnings */
4860                         if (j + 5 >= size)      /* 5 = strlen("/..c\0") */
4861                                 break;
4862 #else
4863                         if ((uint64_t)(j + 5) >= size)  /* 5 = strlen("/..c\0") */
4864                                 break;
4865 #endif /* __APPLE__ */
4866
                             /*
                              * Ordinary characters, including the terminating
                              * NUL, are copied straight through.
                              */
4867                         if (c != '/') {
4868                                 dest[j++] = c;
4869                                 continue;
4870                         }
4871
4872                         c = dtrace_load8(src + i++);
4873
4874                         if (c == '/') {
4875                                 /*
4876                                  * We have two slashes -- we can just advance
4877                                  * to the next character.
4878                                  */
4879                                 goto next;
4880                         }
4881
4882                         if (c != '.') {
4883                                 /*
4884                                  * This is not "." and it's not ".." -- we can
4885                                  * just store the "/" and this character and
4886                                  * drive on.
4887                                  */
4888                                 dest[j++] = '/';
4889                                 dest[j++] = c;
4890                                 continue;
4891                         }
4892
4893                         c = dtrace_load8(src + i++);
4894
4895                         if (c == '/') {
4896                                 /*
4897                                  * This is a "/./" component.  We're not going
4898                                  * to store anything in the destination buffer;
4899                                  * we're just going to go to the next component.
4900                                  */
4901                                 goto next;
4902                         }
4903
4904                         if (c != '.') {
4905                                 /*
4906                                  * This is not ".." -- we can just store the
4907                                  * "/." and this character and continue
4908                                  * processing.
4909                                  */
4910                                 dest[j++] = '/';
4911                                 dest[j++] = '.';
4912                                 dest[j++] = c;
4913                                 continue;
4914                         }
4915
4916                         c = dtrace_load8(src + i++);
4917
4918                         if (c != '/' && c != '\0') {
4919                                 /*
4920                                  * This is not ".." -- it's "..[mumble]".
4921                                  * We'll store the "/.." and this character
4922                                  * and continue processing.
4923                                  */
4924                                 dest[j++] = '/';
4925                                 dest[j++] = '.';
4926                                 dest[j++] = '.';
4927                                 dest[j++] = c;
4928                                 continue;
4929                         }
4930
4931                         /*
4932                          * This is "/../" or "/..\0".  We need to back up
4933                          * our destination pointer until we find a "/".
4934                          */
                             /*
                              * Un-read the '/' or NUL that ended the ".." so the
                              * next pass from the top of the loop loads it again.
                              */
4935                         i--;
4936                         while (j != 0 && dest[--j] != '/')
4937                                 continue;
4938
4939                         if (c == '\0')
4940                                 dest[++j] = '/';
4941                 } while (c != '\0');
4942
4943                 dest[j] = '\0';
4944                 regs[rd] = (uintptr_t)dest;
                     /* The full string size is consumed, not just j + 1 bytes. */
4945                 mstate->dtms_scratch_ptr += size;
4946                 break;
4947         }
4948
4949         case DIF_SUBR_INET_NTOA:
4950         case DIF_SUBR_INET_NTOA6:
4951         case DIF_SUBR_INET_NTOP: {
                     /*
                      * inet_ntoa() / inet_ntoa6() / inet_ntop(): format an IPv4 or
                      * IPv6 address as a string in scratch space.  For inet_ntop()
                      * argument 0 is the address family and argument 1 points to
                      * the address; for the other two the family is implied by
                      * the subroutine and argument 0 points to the address.
                      *
                      * The string is built right to left from the end of a buffer
                      * of the maximum length for the family, so the value returned
                      * (end + 1) generally points into the middle of that buffer.
                      * The whole buffer is consumed from scratch.
                      */
4952                 size_t size;
4953                 int af, argi, i;
4954                 char *base, *end;
4955
4956                 if (subr == DIF_SUBR_INET_NTOP) {
4957                         af = (int)tupregs[0].dttk_value;
4958                         argi = 1;
4959                 } else {
4960                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4961                         argi = 0;
4962                 }
4963
4964                 if (af == AF_INET) {
4965 #if !defined(__APPLE__)
4966                         ipaddr_t ip4;
4967 #else
4968                         uint32_t ip4;
4969 #endif /* __APPLE__ */
4970                         uint8_t *ptr8, val;
4971
                             /*
                              * Verify that the caller may read the address before
                              * touching it, as the string subroutines do for
                              * their source arguments.
                              */
                             if (!dtrace_canload(tupregs[argi].dttk_value,
                                 sizeof (ip4), mstate, vstate)) {
                                     regs[rd] = NULL;
                                     break;
                             }

4972                         /*
4973                          * Safely load the IPv4 address.
4974                          */
4975 #if !defined(__APPLE__)
4976                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
4977 #else
4978                         dtrace_bcopy(
4979                             (void *)(uintptr_t)tupregs[argi].dttk_value,
4980                             (void *)(uintptr_t)&ip4, sizeof (ip4));
4981 #endif /* __APPLE__ */
4982                         /*
4983                          * Check an IPv4 string will fit in scratch.
4984                          */
4985 #if !defined(__APPLE__)
4986                         size = INET_ADDRSTRLEN;
4987 #else
4988                         size = MAX_IPv4_STR_LEN;
4989 #endif /* __APPLE__ */
4990                         if (!DTRACE_INSCRATCH(mstate, size)) {
4991                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4992                                 regs[rd] = NULL;
4993                                 break;
4994                         }
4995                         base = (char *)mstate->dtms_scratch_ptr;
4996                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4997
4998                         /*
4999                          * Stringify as a dotted decimal quad.
5000                          */
                             /*
                              * Bytes are taken in memory order 3..0 because the
                              * string grows leftwards: byte 0 must end up first.
                              */
5001                         *end-- = '\0';
5002                         ptr8 = (uint8_t *)&ip4;
5003                         for (i = 3; i >= 0; i--) {
5004                                 val = ptr8[i];
5005
5006                                 if (val == 0) {
5007                                         *end-- = '0';
5008                                 } else {
5009                                         for (; val; val /= 10) {
5010                                                 *end-- = '0' + (val % 10);
5011                                         }
5012                                 }
5013
5014                                 if (i > 0)
5015                                         *end-- = '.';
5016                         }
5017                         ASSERT(end + 1 >= base);
5018
5019                 } else if (af == AF_INET6) {
5020 #if defined(__APPLE__)
5021 #define _S6_un __u6_addr
5022 #define _S6_u8 __u6_addr8
5023 #endif /* __APPLE__ */
5024                         struct in6_addr ip6;
5025                         int firstzero, tryzero, numzero, v6end;
5026                         uint16_t val;
5027                         const char digits[] = "0123456789abcdef";
5028
5029                         /*
5030                          * Stringify using RFC 1884 convention 2 - 16 bit
5031                          * hexadecimal values with a zero-run compression.
5032                          * Lower case hexadecimal digits are used.
5033                          *      eg, fe80::214:4fff:fe0b:76c8.
5034                          * The IPv4 embedded form is returned for inet_ntop,
5035                          * just the IPv4 string is returned for inet_ntoa6.
5036                          */
5037
                             /*
                              * Verify that the caller may read the address before
                              * touching it.
                              */
                             if (!dtrace_canload(tupregs[argi].dttk_value,
                                 sizeof (struct in6_addr), mstate, vstate)) {
                                     regs[rd] = NULL;
                                     break;
                             }

5038                         /*
5039                          * Safely load the IPv6 address.
5040                          */
5041                         dtrace_bcopy(
5042                             (void *)(uintptr_t)tupregs[argi].dttk_value,
5043                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5044
5045                         /*
5046                          * Check an IPv6 string will fit in scratch.
5047                          */
5048                         size = INET6_ADDRSTRLEN;
5049                         if (!DTRACE_INSCRATCH(mstate, size)) {
5050                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5051                                 regs[rd] = NULL;
5052                                 break;
5053                         }
5054                         base = (char *)mstate->dtms_scratch_ptr;
5055                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5056                         *end-- = '\0';
5057
5058                         /*
5059                          * Find the longest run of 16 bit zero values
5060                          * for the single allowed zero compression - "::".
5061                          */
                             /*
                              * firstzero and numzero are byte offsets/counts into
                              * the address; tryzero marks the start of the run
                              * currently being measured.  numzero starts at 1 so
                              * that a run has to exceed one byte to be recorded.
                              */
5062                         firstzero = -1;
5063                         tryzero = -1;
5064                         numzero = 1;
5065 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5066                         for (i = 0; i < sizeof (struct in6_addr); i++) {
5067 #else
5068                         for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5069 #endif /* __APPLE__ */
5070                                 if (ip6._S6_un._S6_u8[i] == 0 &&
5071                                     tryzero == -1 && i % 2 == 0) {
5072                                         tryzero = i;
5073                                         continue;
5074                                 }
5075
5076                                 if (tryzero != -1 &&
5077                                     (ip6._S6_un._S6_u8[i] != 0 ||
5078                                     i == sizeof (struct in6_addr) - 1)) {
5079
5080                                         if (i - tryzero <= numzero) {
5081                                                 tryzero = -1;
5082                                                 continue;
5083                                         }
5084
5085                                         firstzero = tryzero;
5086                                         numzero = i - i % 2 - tryzero;
5087                                         tryzero = -1;
5088
5089                                         if (ip6._S6_un._S6_u8[i] == 0 &&
5090                                             i == sizeof (struct in6_addr) - 1)
5091                                                 numzero += 2;
5092                                 }
5093                         }
5094 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5095                         ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5096 #else
5097                         ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5098 #endif /* __APPLE__ */
5099
5100                         /*
5101                          * Check for an IPv4 embedded address.
5102                          */
5103                         v6end = sizeof (struct in6_addr) - 2;
5104                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5105                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
5106 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5107                                 for (i = sizeof (struct in6_addr) - 1;
5108                                     i >= DTRACE_V4MAPPED_OFFSET; i--) {
5109 #else
5110                                 for (i = sizeof (struct in6_addr) - 1;
5111                                      i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5112 #endif /* __APPLE__ */
5113                                         ASSERT(end >= base);
5114
5115                                         val = ip6._S6_un._S6_u8[i];
5116
5117                                         if (val == 0) {
5118                                                 *end-- = '0';
5119                                         } else {
5120                                                 for (; val; val /= 10) {
5121                                                         *end-- = '0' + val % 10;
5122                                                 }
5123                                         }
5124
5125 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5126                                         if (i > DTRACE_V4MAPPED_OFFSET)
5127                                                 *end-- = '.';
5128 #else
5129                                         if (i > (int)DTRACE_V4MAPPED_OFFSET)
5130                                                 *end-- = '.';
5131 #endif /* __APPLE__ */
5132                                 }
5133
5134                                 if (subr == DIF_SUBR_INET_NTOA6)
5135                                         goto inetout;
5136
5137                                 /*
5138                                  * Set v6end to skip the IPv4 address that
5139                                  * we have already stringified.
5140                                  */
5141                                 v6end = 10;
5142                         }
5143
5144                         /*
5145                          * Build the IPv6 string by working through the
5146                          * address in reverse.
5147                          */
5148                         for (i = v6end; i >= 0; i -= 2) {
5149                                 ASSERT(end >= base);
5150
                                     /*
                                      * At the last group of the compressed zero
                                      * run, emit "::" and jump to the start of
                                      * the run; the loop step then moves to the
                                      * group before it.
                                      */
5151                                 if (i == firstzero + numzero - 2) {
5152                                         *end-- = ':';
5153                                         *end-- = ':';
5154                                         i -= numzero - 2;
5155                                         continue;
5156                                 }
5157
5158                                 if (i < 14 && i != firstzero - 2)
5159                                         *end-- = ':';
5160
5161                                 val = (ip6._S6_un._S6_u8[i] << 8) +
5162                                     ip6._S6_un._S6_u8[i + 1];
5163
5164                                 if (val == 0) {
5165                                         *end-- = '0';
5166                                 } else {
5167                                         for (; val; val /= 16) {
5168                                                 *end-- = digits[val % 16];
5169                                         }
5170                                 }
5171                         }
5172                         ASSERT(end + 1 >= base);
5173
5174 #if defined(__APPLE__)
5175 #undef _S6_un
5176 #undef _S6_u8
5177 #endif /* __APPLE__ */
5178                 } else {
5179                         /*
5180                          * The user didn't use AF_INET or AF_INET6.
5181                          */
5182                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5183                         regs[rd] = NULL;
5184                         break;
5185                 }
5186
5187 inetout:        regs[rd] = (uintptr_t)end + 1;
5188                 mstate->dtms_scratch_ptr += size;
5189                 break;
5190         }
5191
5192 #ifdef __APPLE__
5193
5194         /* CoreProfile callback ('core_profile(uint64_t, [uint64_t], [uint64_t] ...)') */
5195         case DIF_SUBR_COREPROFILE: {
5196                 uint64_t selector = tupregs[0].dttk_value;
5197                 uint64_t args[DIF_DTR_NREGS-1] = {0ULL};
5198                 uint32_t ii;
5199                 uint32_t count = (uint32_t)nargs;
5200
5201                 if (count < 1) {
5202                     regs[rd] = KERN_FAILURE;
5203                     break;
5204                 }
5205
5206                 if(count > DIF_DTR_NREGS)
5207                     count = DIF_DTR_NREGS;
5208
5209                 /* copy in any variadic argument list, bounded by DIF_DTR_NREGS */
5210                 for(ii = 0; ii < count-1; ii++) {
5211                         args[ii] = tupregs[ii+1].dttk_value;
5212                 }
5213
5214                 kern_return_t ret =
5215                         chudxnu_dtrace_callback(selector, args, count-1);
5216                 if(KERN_SUCCESS != ret) {
5217                         /* error */
5218                 }
5219
5220                 regs[rd] = ret;
5221                 break;
5222         }
5223
5224 #endif /* __APPLE__ */
5225
5226         }
5227 }
5228
5229 /*
5230  * Emulate the execution of DTrace IR instructions specified by the given
5231  * DIF object.  It deliberately does no validation of the DIF itself, as all
5232  * necessary checks are handled by a call to dtrace_difo_validate().
5233  */
5234 static uint64_t
5235 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5236     dtrace_vstate_t *vstate, dtrace_state_t *state)
5237 {
5238         const dif_instr_t *text = difo->dtdo_buf;
5239         const uint_t textlen = difo->dtdo_len;
5240         const char *strtab = difo->dtdo_strtab;
5241         const uint64_t *inttab = difo->dtdo_inttab;
5242
5243         uint64_t rval = 0;
5244         dtrace_statvar_t *svar;
5245         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5246         dtrace_difv_t *v;
5247         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5248 #if !defined(__APPLE__)
5249         volatile uintptr_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5250 #else
5251         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5252 #endif /* __APPLE__ */
5253
5254         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5255         uint64_t regs[DIF_DIR_NREGS];
5256         uint64_t *tmp;
5257
5258         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5259         int64_t cc_r;
5260 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5261         uint_t pc = 0, id, opc;
5262 #else
5263         uint_t pc = 0, id, opc = 0;
5264 #endif /* __APPLE__ */
5265         uint8_t ttop = 0;
5266         dif_instr_t instr;
5267         uint_t r1, r2, rd;
5268
5269         /*
5270          * We stash the current DIF object into the machine state: we need it
5271          * for subsequent access checking.
5272          */
5273         mstate->dtms_difo = difo;
5274
5275         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
5276
5277         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5278                 opc = pc;
5279
5280                 instr = text[pc++];
5281                 r1 = DIF_INSTR_R1(instr);
5282                 r2 = DIF_INSTR_R2(instr);
5283                 rd = DIF_INSTR_RD(instr);
5284
5285                 switch (DIF_INSTR_OP(instr)) {
5286                 case DIF_OP_OR:
5287                         regs[rd] = regs[r1] | regs[r2];
5288                         break;
5289                 case DIF_OP_XOR:
5290                         regs[rd] = regs[r1] ^ regs[r2];
5291                         break;
5292                 case DIF_OP_AND:
5293                         regs[rd] = regs[r1] & regs[r2];
5294                         break;
5295                 case DIF_OP_SLL:
5296                         regs[rd] = regs[r1] << regs[r2];
5297                         break;
5298                 case DIF_OP_SRL:
5299                         regs[rd] = regs[r1] >> regs[r2];
5300                         break;
5301                 case DIF_OP_SUB:
5302                         regs[rd] = regs[r1] - regs[r2];
5303                         break;
5304                 case DIF_OP_ADD:
5305                         regs[rd] = regs[r1] + regs[r2];
5306                         break;
5307                 case DIF_OP_MUL:
5308                         regs[rd] = regs[r1] * regs[r2];
5309                         break;
5310                 case DIF_OP_SDIV:
5311                         if (regs[r2] == 0) {
5312                                 regs[rd] = 0;
5313                                 *flags |= CPU_DTRACE_DIVZERO;
5314                         } else {
5315                                 regs[rd] = (int64_t)regs[r1] /
5316                                     (int64_t)regs[r2];
5317                         }
5318                         break;
5319
5320                 case DIF_OP_UDIV:
5321                         if (regs[r2] == 0) {
5322                                 regs[rd] = 0;
5323                                 *flags |= CPU_DTRACE_DIVZERO;
5324                         } else {
5325                                 regs[rd] = regs[r1] / regs[r2];
5326                         }
5327                         break;
5328
5329                 case DIF_OP_SREM:
5330                         if (regs[r2] == 0) {
5331                                 regs[rd] = 0;
5332                                 *flags |= CPU_DTRACE_DIVZERO;
5333                         } else {
5334                                 regs[rd] = (int64_t)regs[r1] %
5335                                     (int64_t)regs[r2];
5336                         }
5337                         break;
5338
5339                 case DIF_OP_UREM:
5340                         if (regs[r2] == 0) {
5341                                 regs[rd] = 0;
5342                                 *flags |= CPU_DTRACE_DIVZERO;
5343                         } else {
5344                                 regs[rd] = regs[r1] % regs[r2];
5345                         }
5346                         break;
5347
5348                 case DIF_OP_NOT:
5349                         regs[rd] = ~regs[r1];
5350                         break;
5351                 case DIF_OP_MOV:
5352                         regs[rd] = regs[r1];
5353                         break;
5354                 case DIF_OP_CMP:
5355                         cc_r = regs[r1] - regs[r2];
5356                         cc_n = cc_r < 0;
5357                         cc_z = cc_r == 0;
5358                         cc_v = 0;
5359                         cc_c = regs[r1] < regs[r2];
5360                         break;
5361                 case DIF_OP_TST:
5362                         cc_n = cc_v = cc_c = 0;
5363                         cc_z = regs[r1] == 0;
5364                         break;
5365                 case DIF_OP_BA:
5366                         pc = DIF_INSTR_LABEL(instr);
5367                         break;
5368                 case DIF_OP_BE:
5369                         if (cc_z)
5370                                 pc = DIF_INSTR_LABEL(instr);
5371                         break;
5372                 case DIF_OP_BNE:
5373                         if (cc_z == 0)
5374                                 pc = DIF_INSTR_LABEL(instr);
5375                         break;
5376                 case DIF_OP_BG:
5377                         if ((cc_z | (cc_n ^ cc_v)) == 0)
5378                                 pc = DIF_INSTR_LABEL(instr);
5379                         break;
5380                 case DIF_OP_BGU:
5381                         if ((cc_c | cc_z) == 0)
5382                                 pc = DIF_INSTR_LABEL(instr);
5383                         break;
5384                 case DIF_OP_BGE:
5385                         if ((cc_n ^ cc_v) == 0)
5386                                 pc = DIF_INSTR_LABEL(instr);
5387                         break;
5388                 case DIF_OP_BGEU:
5389                         if (cc_c == 0)
5390                                 pc = DIF_INSTR_LABEL(instr);
5391                         break;
5392                 case DIF_OP_BL:
5393                         if (cc_n ^ cc_v)
5394                                 pc = DIF_INSTR_LABEL(instr);
5395                         break;
5396                 case DIF_OP_BLU:
5397                         if (cc_c)
5398                                 pc = DIF_INSTR_LABEL(instr);
5399                         break;
5400                 case DIF_OP_BLE:
5401                         if (cc_z | (cc_n ^ cc_v))
5402                                 pc = DIF_INSTR_LABEL(instr);
5403                         break;
5404                 case DIF_OP_BLEU:
5405                         if (cc_c | cc_z)
5406                                 pc = DIF_INSTR_LABEL(instr);
5407                         break;
5408                 case DIF_OP_RLDSB:
5409                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5410                                 *flags |= CPU_DTRACE_KPRIV;
5411                                 *illval = regs[r1];
5412                                 break;
5413                         }
5414                         /*FALLTHROUGH*/
5415                 case DIF_OP_LDSB:
5416                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5417                         break;
5418                 case DIF_OP_RLDSH:
5419                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5420                                 *flags |= CPU_DTRACE_KPRIV;
5421                                 *illval = regs[r1];
5422                                 break;
5423                         }
5424                         /*FALLTHROUGH*/
5425                 case DIF_OP_LDSH:
5426                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5427                         break;
5428                 case DIF_OP_RLDSW:
5429                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5430                                 *flags |= CPU_DTRACE_KPRIV;
5431                                 *illval = regs[r1];
5432                                 break;
5433                         }
5434                         /*FALLTHROUGH*/
5435                 case DIF_OP_LDSW:
5436                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5437                         break;
5438                 case DIF_OP_RLDUB:
5439                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5440                                 *flags |= CPU_DTRACE_KPRIV;
5441                                 *illval = regs[r1];
5442                                 break;
5443                         }
5444                         /*FALLTHROUGH*/
5445                 case DIF_OP_LDUB:
5446                         regs[rd] = dtrace_load8(regs[r1]);
5447                         break;
5448                 case DIF_OP_RLDUH:
5449                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5450                                 *flags |= CPU_DTRACE_KPRIV;
5451                                 *illval = regs[r1];
5452                                 break;
5453                         }
5454                         /*FALLTHROUGH*/
5455                 case DIF_OP_LDUH:
5456                         regs[rd] = dtrace_load16(regs[r1]);
5457                         break;
5458                 case DIF_OP_RLDUW:
5459                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5460                                 *flags |= CPU_DTRACE_KPRIV;
5461                                 *illval = regs[r1];
5462                                 break;
5463                         }
5464                         /*FALLTHROUGH*/
5465                 case DIF_OP_LDUW:
5466                         regs[rd] = dtrace_load32(regs[r1]);
5467                         break;
5468                 case DIF_OP_RLDX:
5469                         if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5470                                 *flags |= CPU_DTRACE_KPRIV;
5471                                 *illval = regs[r1];
5472                                 break;
5473                         }
5474                         /*FALLTHROUGH*/
5475                 case DIF_OP_LDX:
5476                         regs[rd] = dtrace_load64(regs[r1]);
5477                         break;
5478 #if !defined(__APPLE__)
5479                 case DIF_OP_ULDSB:
5480                         regs[rd] = (int8_t)
5481                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5482                         break;
5483                 case DIF_OP_ULDSH:
5484                         regs[rd] = (int16_t)
5485                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5486                         break;
5487                 case DIF_OP_ULDSW:
5488                         regs[rd] = (int32_t)
5489                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5490                         break;
5491                 case DIF_OP_ULDUB:
5492                         regs[rd] =
5493                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
5494                         break;
5495                 case DIF_OP_ULDUH:
5496                         regs[rd] =
5497                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
5498                         break;
5499                 case DIF_OP_ULDUW:
5500                         regs[rd] =
5501                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
5502                         break;
5503                 case DIF_OP_ULDX:
5504                         regs[rd] =
5505                             dtrace_fuword64((void *)(uintptr_t)regs[r1]);
5506                         break;
5507 #else /* Darwin 32-bit kernel may fetch from 64-bit user. Don't want uintptr_t cast. */
5508                 case DIF_OP_ULDSB:
5509                         regs[rd] = (int8_t)
5510                             dtrace_fuword8(regs[r1]);
5511                         break;
5512                 case DIF_OP_ULDSH:
5513                         regs[rd] = (int16_t)
5514                             dtrace_fuword16(regs[r1]);
5515                         break;
5516                 case DIF_OP_ULDSW:
5517                         regs[rd] = (int32_t)
5518                             dtrace_fuword32(regs[r1]);
5519                         break;
5520                 case DIF_OP_ULDUB:
5521                         regs[rd] =
5522                             dtrace_fuword8(regs[r1]);
5523                         break;
5524                 case DIF_OP_ULDUH:
5525                         regs[rd] =
5526                             dtrace_fuword16(regs[r1]);
5527                         break;
5528                 case DIF_OP_ULDUW:
5529                         regs[rd] =
5530                             dtrace_fuword32(regs[r1]);
5531                         break;
5532                 case DIF_OP_ULDX:
5533                         regs[rd] =
5534                             dtrace_fuword64(regs[r1]);
5535 #endif /* __APPLE__ */
5536                         break;
5537                 case DIF_OP_RET:
5538                         rval = regs[rd];
5539                         pc = textlen;
5540                         break;
5541                 case DIF_OP_NOP:
5542                         break;
5543                 case DIF_OP_SETX:
5544                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5545                         break;
5546                 case DIF_OP_SETS:
5547                         regs[rd] = (uint64_t)(uintptr_t)
5548                             (strtab + DIF_INSTR_STRING(instr));
5549                         break;
5550                 case DIF_OP_SCMP: {
5551                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5552                         uintptr_t s1 = regs[r1];
5553                         uintptr_t s2 = regs[r2];
5554
5555                         if (s1 != NULL &&
5556                             !dtrace_strcanload(s1, sz, mstate, vstate))
5557                                 break;
5558                         if (s2 != NULL &&
5559                             !dtrace_strcanload(s2, sz, mstate, vstate))
5560                                 break;
5561
5562                         cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
5563
5564                         cc_n = cc_r < 0;
5565                         cc_z = cc_r == 0;
5566                         cc_v = cc_c = 0;
5567                         break;
5568                 }
5569                 case DIF_OP_LDGA:
5570                         regs[rd] = dtrace_dif_variable(mstate, state,
5571                             r1, regs[r2]);
5572                         break;
5573                 case DIF_OP_LDGS:
5574                         id = DIF_INSTR_VAR(instr);
5575
5576                         if (id >= DIF_VAR_OTHER_UBASE) {
5577                                 uintptr_t a;
5578
5579                                 id -= DIF_VAR_OTHER_UBASE;
5580                                 svar = vstate->dtvs_globals[id];
5581                                 ASSERT(svar != NULL);
5582                                 v = &svar->dtsv_var;
5583
5584                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5585                                         regs[rd] = svar->dtsv_data;
5586                                         break;
5587                                 }
5588
5589                                 a = (uintptr_t)svar->dtsv_data;
5590
5591                                 if (*(uint8_t *)a == UINT8_MAX) {
5592                                         /*
5593                                          * If the 0th byte is set to UINT8_MAX
5594                                          * then this is to be treated as a
5595                                          * reference to a NULL variable.
5596                                          */
5597                                         regs[rd] = NULL;
5598                                 } else {
5599                                         regs[rd] = a + sizeof (uint64_t);
5600                                 }
5601
5602                                 break;
5603                         }
5604
5605                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5606                         break;
5607
5608                 case DIF_OP_STGS:
5609                         id = DIF_INSTR_VAR(instr);
5610
5611                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5612                         id -= DIF_VAR_OTHER_UBASE;
5613
5614                         svar = vstate->dtvs_globals[id];
5615                         ASSERT(svar != NULL);
5616                         v = &svar->dtsv_var;
5617
5618                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5619                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5620
5621                                 ASSERT(a != NULL);
5622                                 ASSERT(svar->dtsv_size != 0);
5623
5624                                 if (regs[rd] == NULL) {
5625                                         *(uint8_t *)a = UINT8_MAX;
5626                                         break;
5627                                 } else {
5628                                         *(uint8_t *)a = 0;
5629                                         a += sizeof (uint64_t);
5630                                 }
5631                                 if (!dtrace_vcanload(
5632                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5633                                     mstate, vstate))
5634                                         break;
5635
5636                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5637                                     (void *)a, &v->dtdv_type);
5638                                 break;
5639                         }
5640
5641                         svar->dtsv_data = regs[rd];
5642                         break;
5643
5644                 case DIF_OP_LDTA:
5645                         /*
5646                          * There are no DTrace built-in thread-local arrays at
5647                          * present.  This opcode is saved for future work.
5648                          */
5649                         *flags |= CPU_DTRACE_ILLOP;
5650                         regs[rd] = 0;
5651                         break;
5652
5653                 case DIF_OP_LDLS:
5654                         id = DIF_INSTR_VAR(instr);
5655
5656                         if (id < DIF_VAR_OTHER_UBASE) {
5657                                 /*
5658                                  * For now, this has no meaning.
5659                                  */
5660                                 regs[rd] = 0;
5661                                 break;
5662                         }
5663
5664                         id -= DIF_VAR_OTHER_UBASE;
5665
5666 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5667                         ASSERT(id < vstate->dtvs_nlocals);
5668 #else
5669                         ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5670 #endif /* __APPLE__ */
5671                         ASSERT(vstate->dtvs_locals != NULL);
5672
5673                         svar = vstate->dtvs_locals[id];
5674                         ASSERT(svar != NULL);
5675                         v = &svar->dtsv_var;
5676
5677                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5678                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5679                                 size_t sz = v->dtdv_type.dtdt_size;
5680
5681                                 sz += sizeof (uint64_t);
5682                                 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5683                                 a += CPU->cpu_id * sz;
5684
5685                                 if (*(uint8_t *)a == UINT8_MAX) {
5686                                         /*
5687                                          * If the 0th byte is set to UINT8_MAX
5688                                          * then this is to be treated as a
5689                                          * reference to a NULL variable.
5690                                          */
5691                                         regs[rd] = NULL;
5692                                 } else {
5693                                         regs[rd] = a + sizeof (uint64_t);
5694                                 }
5695
5696                                 break;
5697                         }
5698
5699                         ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5700                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5701                         regs[rd] = tmp[CPU->cpu_id];
5702                         break;
5703
5704                 case DIF_OP_STLS:
5705                         id = DIF_INSTR_VAR(instr);
5706
5707                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5708                         id -= DIF_VAR_OTHER_UBASE;
5709 #if !defined(__APPLE__)   /* Quiet compiler warnings */
5710                         ASSERT(id < vstate->dtvs_nlocals);
5711 #else
5712                         ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5713 #endif /* __APPLE__ */
5714
5715                         ASSERT(vstate->dtvs_locals != NULL);
5716                         svar = vstate->dtvs_locals[id];
5717                         ASSERT(svar != NULL);
5718                         v = &svar->dtsv_var;
5719
5720                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5721                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5722                                 size_t sz = v->dtdv_type.dtdt_size;
5723
5724                                 sz += sizeof (uint64_t);
5725                                 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5726                                 a += CPU->cpu_id * sz;
5727
5728                                 if (regs[rd] == NULL) {
5729                                         *(uint8_t *)a = UINT8_MAX;
5730                                         break;
5731                                 } else {
5732                                         *(uint8_t *)a = 0;
5733                                         a += sizeof (uint64_t);
5734                                 }
5735
5736                                 if (!dtrace_vcanload(
5737                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5738                                     mstate, vstate))
5739                                         break;
5740
5741                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5742                                     (void *)a, &v->dtdv_type);
5743                                 break;
5744                         }
5745
5746                         ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5747                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5748                         tmp[CPU->cpu_id] = regs[rd];
5749                         break;
5750
5751                 case DIF_OP_LDTS: {
5752                         dtrace_dynvar_t *dvar;
5753                         dtrace_key_t *key;
5754
5755                         id = DIF_INSTR_VAR(instr);
5756                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5757                         id -= DIF_VAR_OTHER_UBASE;
5758                         v = &vstate->dtvs_tlocals[id];
5759
5760                         key = &tupregs[DIF_DTR_NREGS];
5761                         key[0].dttk_value = (uint64_t)id;
5762                         key[0].dttk_size = 0;
5763                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5764                         key[1].dttk_size = 0;
5765
5766                         dvar = dtrace_dynvar(dstate, 2, key,
5767                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5768                             mstate, vstate);
5769
5770                         if (dvar == NULL) {
5771                                 regs[rd] = 0;
5772                                 break;
5773                         }
5774
5775                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5776                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5777                         } else {
5778                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5779                         }
5780
5781                         break;
5782                 }
5783
5784                 case DIF_OP_STTS: {
5785                         dtrace_dynvar_t *dvar;
5786                         dtrace_key_t *key;
5787
5788                         id = DIF_INSTR_VAR(instr);
5789                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5790                         id -= DIF_VAR_OTHER_UBASE;
5791
5792                         key = &tupregs[DIF_DTR_NREGS];
5793                         key[0].dttk_value = (uint64_t)id;
5794                         key[0].dttk_size = 0;
5795                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5796                         key[1].dttk_size = 0;
5797                         v = &vstate->dtvs_tlocals[id];
5798
5799                         dvar = dtrace_dynvar(dstate, 2, key,
5800                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5801                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5802                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5803                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5804
5805                         /*
5806                          * Given that we're storing to thread-local data,
5807                          * we need to flush our predicate cache.
5808                          */
5809 #if !defined(__APPLE__)
5810                         curthread->t_predcache = NULL;
5811 #else
5812                         dtrace_set_thread_predcache(current_thread(), 0);
5813 #endif /* __APPLE__ */
5814
5815                         if (dvar == NULL)
5816                                 break;
5817
5818                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5819                                 if (!dtrace_vcanload(
5820                                     (void *)(uintptr_t)regs[rd],
5821                                     &v->dtdv_type, mstate, vstate))
5822                                         break;
5823
5824                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5825                                     dvar->dtdv_data, &v->dtdv_type);
5826                         } else {
5827                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5828                         }
5829
5830                         break;
5831                 }
5832
5833                 case DIF_OP_SRA:
5834                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
5835                         break;
5836
5837                 case DIF_OP_CALL:
5838                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5839                             regs, tupregs, ttop, mstate, state);
5840                         break;
5841
5842                 case DIF_OP_PUSHTR:
5843                         if (ttop == DIF_DTR_NREGS) {
5844                                 *flags |= CPU_DTRACE_TUPOFLOW;
5845                                 break;
5846                         }
5847
5848                         if (r1 == DIF_TYPE_STRING) {
5849                                 /*
5850                                  * If this is a string type and the size is 0,
5851                                  * we'll use the system-wide default string
5852                                  * size.  Note that we are _not_ looking at
5853                                  * the value of the DTRACEOPT_STRSIZE option;
5854                                  * had this been set, we would expect to have
5855                                  * a non-zero size value in the "pushtr".
5856                                  */
5857                                 tupregs[ttop].dttk_size =
5858                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
5859                                     regs[r2] ? regs[r2] :
5860                                     dtrace_strsize_default) + 1;
5861                         } else {
5862                                 tupregs[ttop].dttk_size = regs[r2];
5863                         }
5864
5865                         tupregs[ttop++].dttk_value = regs[rd];
5866                         break;
5867
5868                 case DIF_OP_PUSHTV:
5869                         if (ttop == DIF_DTR_NREGS) {
5870                                 *flags |= CPU_DTRACE_TUPOFLOW;
5871                                 break;
5872                         }
5873
5874                         tupregs[ttop].dttk_value = regs[rd];
5875                         tupregs[ttop++].dttk_size = 0;
5876                         break;
5877
5878                 case DIF_OP_POPTS:
5879                         if (ttop != 0)
5880                                 ttop--;
5881                         break;
5882
5883                 case DIF_OP_FLUSHTS:
5884                         ttop = 0;
5885                         break;
5886
5887                 case DIF_OP_LDGAA:
5888                 case DIF_OP_LDTAA: {
5889                         dtrace_dynvar_t *dvar;
5890                         dtrace_key_t *key = tupregs;
5891                         uint_t nkeys = ttop;
5892
5893                         id = DIF_INSTR_VAR(instr);
5894                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5895                         id -= DIF_VAR_OTHER_UBASE;
5896
5897                         key[nkeys].dttk_value = (uint64_t)id;
5898                         key[nkeys++].dttk_size = 0;
5899
5900                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5901                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5902                                 key[nkeys++].dttk_size = 0;
5903                                 v = &vstate->dtvs_tlocals[id];
5904                         } else {
5905                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5906                         }
5907
5908                         dvar = dtrace_dynvar(dstate, nkeys, key,
5909                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5910                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5911                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5912
5913                         if (dvar == NULL) {
5914                                 regs[rd] = 0;
5915                                 break;
5916                         }
5917
5918                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5919                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5920                         } else {
5921                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5922                         }
5923
5924                         break;
5925                 }
5926
5927                 case DIF_OP_STGAA:
5928                 case DIF_OP_STTAA: {
5929                         dtrace_dynvar_t *dvar;
5930                         dtrace_key_t *key = tupregs;
5931                         uint_t nkeys = ttop;
5932
5933                         id = DIF_INSTR_VAR(instr);
5934                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5935                         id -= DIF_VAR_OTHER_UBASE;
5936
5937                         key[nkeys].dttk_value = (uint64_t)id;
5938                         key[nkeys++].dttk_size = 0;
5939
5940                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5941                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5942                                 key[nkeys++].dttk_size = 0;
5943                                 v = &vstate->dtvs_tlocals[id];
5944                         } else {
5945                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5946                         }
5947
5948                         dvar = dtrace_dynvar(dstate, nkeys, key,
5949                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5950                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5951                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5952                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5953
5954                         if (dvar == NULL)
5955                                 break;
5956
5957                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5958                                 if (!dtrace_vcanload(
5959                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5960                                     mstate, vstate))
5961                                         break;
5962
5963                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5964                                     dvar->dtdv_data, &v->dtdv_type);
5965                         } else {
5966                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5967                         }
5968
5969                         break;
5970                 }
5971
5972                 case DIF_OP_ALLOCS: {
5973                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5974                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5975
5976                         /*
5977                          * Rounding up the user allocation size could have
5978                          * overflowed large, bogus allocations (like -1ULL) to
5979                          * 0.
5980                          */
5981                         if (size < regs[r1] ||
5982                             !DTRACE_INSCRATCH(mstate, size)) {
5983                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5984                                 regs[rd] = NULL;
5985                                 break;
5986                         }
5987
5988                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5989                                 mstate->dtms_scratch_ptr += size;
5990                                 regs[rd] = ptr;
5991                         break;
5992                 }
5993
5994                 case DIF_OP_COPYS:
5995                         if (!dtrace_canstore(regs[rd], regs[r2],
5996                             mstate, vstate)) {
5997                                 *flags |= CPU_DTRACE_BADADDR;
5998                                 *illval = regs[rd];
5999                                 break;
6000                         }
6001
6002                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6003                                 break;
6004
6005                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
6006                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6007                         break;
6008
6009                 case DIF_OP_STB:
6010                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6011                                 *flags |= CPU_DTRACE_BADADDR;
6012                                 *illval = regs[rd];
6013                                 break;
6014                         }
6015                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6016                         break;
6017
6018                 case DIF_OP_STH:
6019                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6020                                 *flags |= CPU_DTRACE_BADADDR;
6021                                 *illval = regs[rd];
6022                                 break;
6023                         }
6024                         if (regs[rd] & 1) {
6025                                 *flags |= CPU_DTRACE_BADALIGN;
6026                                 *illval = regs[rd];
6027                                 break;
6028                         }
6029                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6030                         break;
6031
6032                 case DIF_OP_STW:
6033                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6034                                 *flags |= CPU_DTRACE_BADADDR;
6035                                 *illval = regs[rd];
6036                                 break;
6037                         }
6038                         if (regs[rd] & 3) {
6039                                 *flags |= CPU_DTRACE_BADALIGN;
6040                                 *illval = regs[rd];
6041                                 break;
6042                         }
6043                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6044                         break;
6045
6046                 case DIF_OP_STX:
6047                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6048                                 *flags |= CPU_DTRACE_BADADDR;
6049                                 *illval = regs[rd];
6050                                 break;
6051                         }
6052 #if !defined(__APPLE__)
6053                         if (regs[rd] & 7) {
6054 #else
6055                         if (regs[rd] & 3) { /* Darwin kmem_zalloc() called from dtrace_difo_init() is 4-byte aligned. */
6056 #endif /* __APPLE__ */
6057                                 *flags |= CPU_DTRACE_BADALIGN;
6058                                 *illval = regs[rd];
6059                                 break;
6060                         }
6061                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6062                         break;
6063                 }
6064         }
6065
6066         if (!(*flags & CPU_DTRACE_FAULT))
6067                 return (rval);
6068
6069         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6070         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6071
6072         return (0);
6073 }
6074
6075 static void
6076 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6077 {
6078         dtrace_probe_t *probe = ecb->dte_probe;
6079         dtrace_provider_t *prov = probe->dtpr_provider;
6080         char c[DTRACE_FULLNAMELEN + 80], *str;
6081 #if !defined(__APPLE__)   /* Quiet compiler warnings */
6082         char *msg = "dtrace: breakpoint action at probe ";
6083         char *ecbmsg = " (ecb ";
6084 #else
6085         const char *msg = "dtrace: breakpoint action at probe ";
6086         const char *ecbmsg = " (ecb ";
6087 #endif /* __APPLE__ */
6088         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6089         uintptr_t val = (uintptr_t)ecb;
6090         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6091
6092         if (dtrace_destructive_disallow)
6093                 return;
6094
6095         /*
6096          * It's impossible to be taking action on the NULL probe.
6097          */
6098         ASSERT(probe != NULL);
6099
6100         /*
6101          * This is a poor man's (destitute man's?) sprintf():  we want to
6102          * print the provider name, module name, function name and name of
6103          * the probe, along with the hex address of the ECB with the breakpoint
6104          * action -- all of which we must place in the character buffer by
6105          * hand.
6106          */
6107         while (*msg != '\0')
6108                 c[i++] = *msg++;
6109
6110         for (str = prov->dtpv_name; *str != '\0'; str++)
6111                 c[i++] = *str;
6112         c[i++] = ':';
6113
6114         for (str = probe->dtpr_mod; *str != '\0'; str++)
6115                 c[i++] = *str;
6116         c[i++] = ':';
6117
6118         for (str = probe->dtpr_func; *str != '\0'; str++)
6119                 c[i++] = *str;
6120         c[i++] = ':';
6121
6122         for (str = probe->dtpr_name; *str != '\0'; str++)
6123                 c[i++] = *str;
6124
6125         while (*ecbmsg != '\0')
6126                 c[i++] = *ecbmsg++;
6127
6128         while (shift >= 0) {
6129                 mask = (uintptr_t)0xf << shift;
6130
6131                 if (val >= ((uintptr_t)1 << shift))
6132                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6133                 shift -= 4;
6134         }
6135
6136         c[i++] = ')';
6137         c[i] = '\0';
6138
6139         debug_enter(c);
6140 }
6141
6142 static void
6143 dtrace_action_panic(dtrace_ecb_t *ecb)
6144 {
6145         dtrace_probe_t *probe = ecb->dte_probe;
6146
6147         /*
6148          * It's impossible to be taking action on the NULL probe.
6149          */
6150         ASSERT(probe != NULL);
6151
6152         if (dtrace_destructive_disallow)
6153                 return;
6154
6155         if (dtrace_panicked != NULL)
6156                 return;
6157
6158 #if !defined(__APPLE__)
6159         if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6160                 return;
6161 #else
6162         if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6163                 return;
6164 #endif /* __APPLE__ */
6165
6166         /*
6167          * We won the right to panic.  (We want to be sure that only one
6168          * thread calls panic() from dtrace_probe(), and that panic() is
6169          * called exactly once.)
6170          */
6171         dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6172             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6173             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6174
6175 #if defined(__APPLE__)
6176         /* Mac OS X debug feature -- can return from panic() */
6177         dtrace_panicked = NULL;
6178 #endif /* __APPLE__ */
6179 }
6180
6181 static void
6182 dtrace_action_raise(uint64_t sig)
6183 {
6184         if (dtrace_destructive_disallow)
6185                 return;
6186
6187         if (sig >= NSIG) {
6188                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6189                 return;
6190         }
6191
6192 #if !defined(__APPLE__)
6193         /*
6194          * raise() has a queue depth of 1 -- we ignore all subsequent
6195          * invocations of the raise() action.
6196          */
6197         if (curthread->t_dtrace_sig == 0)
6198                 curthread->t_dtrace_sig = (uint8_t)sig;
6199
6200         curthread->t_sig_check = 1;
6201         aston(curthread);
6202 #else
6203         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6204
6205         if (uthread && uthread->t_dtrace_sig == 0) {
6206                 uthread->t_dtrace_sig = sig;
6207                 act_set_astbsd(current_thread());
6208         }
6209 #endif /* __APPLE__ */
6210 }
6211
6212 static void
6213 dtrace_action_stop(void)
6214 {
6215         if (dtrace_destructive_disallow)
6216                 return;
6217
6218 #if !defined(__APPLE__)
6219         if (!curthread->t_dtrace_stop) {
6220                 curthread->t_dtrace_stop = 1;
6221                 curthread->t_sig_check = 1;
6222                 aston(curthread);
6223         }
6224 #else
6225         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6226         if (uthread) {
6227                 /*
6228                  * The currently running process will be set to task_suspend
6229                  * when it next leaves the kernel.
6230                 */
6231                 uthread->t_dtrace_stop = 1;
6232                 act_set_astbsd(current_thread());
6233         }
6234
6235 #endif /* __APPLE__ */
6236 }
6237
#if defined(__APPLE__)
/*
 * Carry out the pidresume() destructive action: record pid in the current
 * thread's t_dtrace_resumepid.
 *
 * Does nothing when dtrace_destructive_disallow is set.  If
 * kauth_cred_issuser() returns 0 for the current credential,
 * CPU_DTRACE_ILLOP is set instead.
 */
static void
dtrace_action_pidresume(uint64_t pid)
{
        uthread_t ut;

        if (dtrace_destructive_disallow)
                return;

        if (kauth_cred_issuser(kauth_cred_get()) == 0) {
                DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
                return;
        }

        ut = (uthread_t)get_bsdthread_info(current_thread());

        /*
         * The pidresume() queue of the currently running process is one
         * deep: once a pid has been recorded, further invocations of the
         * action are ignored.  A pid of 0 is never recorded.
         */
        if (pid == 0 || !ut || ut->t_dtrace_resumepid != 0)
                return;

        /*
         * On leaving the kernel, the currently running process attempts to
         * task_resume the process denoted by pid, provided that pid appears
         * to have been stopped by dtrace_action_stop().
         */
        ut->t_dtrace_resumepid = pid;
        act_set_astbsd(current_thread());
}
#endif /* __APPLE__ */
6266
6267
6268 static void
6269 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6270 {
6271         hrtime_t now;
6272         volatile uint16_t *flags;
6273         dtrace_cpu_t *cpu = CPU;
6274
6275         if (dtrace_destructive_disallow)
6276                 return;
6277
6278         flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6279
6280         now = dtrace_gethrtime();
6281
6282         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6283                 /*
6284                  * We need to advance the mark to the current time.
6285                  */
6286                 cpu->cpu_dtrace_chillmark = now;
6287                 cpu->cpu_dtrace_chilled = 0;
6288         }
6289
6290         /*
6291          * Now check to see if the requested chill time would take us over
6292          * the maximum amount of time allowed in the chill interval.  (Or
6293          * worse, if the calculation itself induces overflow.)
6294          */
6295         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6296             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6297                 *flags |= CPU_DTRACE_ILLOP;
6298                 return;
6299         }
6300
6301         while (dtrace_gethrtime() - now < val)
6302                 continue;
6303
6304         /*
6305          * Normally, we assure that the value of the variable "timestamp" does
6306          * not change within an ECB.  The presence of chill() represents an
6307          * exception to this rule, however.
6308          */
6309         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6310         cpu->cpu_dtrace_chilled += val;
6311 }
6312
6313 static void
6314 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6315     uint64_t *buf, uint64_t arg)
6316 {
6317         int nframes = DTRACE_USTACK_NFRAMES(arg);
6318         int strsize = DTRACE_USTACK_STRSIZE(arg);
6319         uint64_t *pcs = &buf[1], *fps;
6320         char *str = (char *)&pcs[nframes];
6321         int size, offs = 0, i, j;
6322         uintptr_t old = mstate->dtms_scratch_ptr, saved;
6323         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6324         char *sym;
6325
6326         /*
6327          * Should be taking a faster path if string space has not been
6328          * allocated.
6329          */
6330         ASSERT(strsize != 0);
6331
6332         /*
6333          * We will first allocate some temporary space for the frame pointers.
6334          */
6335         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6336         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6337             (nframes * sizeof (uint64_t));
6338
6339 #if !defined(__APPLE__)   /* Quiet compiler warnings */
6340         if (!DTRACE_INSCRATCH(mstate, size)) {
6341 #else
6342         if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6343 #endif /* __APPLE__ */
6344                 /*
6345                  * Not enough room for our frame pointers -- need to indicate
6346                  * that we ran out of scratch space.
6347                  */
6348                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6349                 return;
6350         }
6351
6352         mstate->dtms_scratch_ptr += size;
6353         saved = mstate->dtms_scratch_ptr;
6354
6355         /*
6356          * Now get a stack with both program counters and frame pointers.
6357          */
6358         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6359         dtrace_getufpstack(buf, fps, nframes + 1);
6360         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6361
6362         /*
6363          * If that faulted, we're cooked.
6364          */
6365         if (*flags & CPU_DTRACE_FAULT)
6366                 goto out;
6367
6368         /*
6369          * Now we want to walk up the stack, calling the USTACK helper.  For
6370          * each iteration, we restore the scratch pointer.
6371          */
6372         for (i = 0; i < nframes; i++) {
6373                 mstate->dtms_scratch_ptr = saved;
6374
6375                 if (offs >= strsize)
6376                         break;
6377
6378                 sym = (char *)(uintptr_t)dtrace_helper(
6379                     DTRACE_HELPER_ACTION_USTACK,
6380                     mstate, state, pcs[i], fps[i]);
6381
6382                 /*
6383                  * If we faulted while running the helper, we're going to
6384                  * clear the fault and null out the corresponding string.
6385                  */
6386                 if (*flags & CPU_DTRACE_FAULT) {
6387                         *flags &= ~CPU_DTRACE_FAULT;
6388                         str[offs++] = '\0';
6389                         continue;
6390                 }
6391
6392                 if (sym == NULL) {
6393                         str[offs++] = '\0';
6394                         continue;
6395                 }
6396
6397                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6398
6399                 /*
6400                  * Now copy in the string that the helper returned to us.
6401                  */
6402                 for (j = 0; offs + j < strsize; j++) {
6403                         if ((str[offs + j] = sym[j]) == '\0')
6404                                 break;
6405                 }
6406
6407                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6408
6409                 offs += j + 1;
6410         }
6411
6412         if (offs >= strsize) {
6413                 /*
6414                  * If we didn't have room for all of the strings, we don't
6415                  * abort processing -- this needn't be a fatal error -- but we
6416                  * still want to increment a counter (dts_stkstroverflows) to
6417                  * allow this condition to be warned about.  (If this is from
6418                  * a jstack() action, it is easily tuned via jstackstrsize.)
6419                  */
6420                 dtrace_error(&state->dts_stkstroverflows);
6421         }
6422
6423         while (offs < strsize)
6424                 str[offs++] = '\0';
6425
6426 out:
6427         mstate->dtms_scratch_ptr = old;
6428 }
6429
6430 /*
6431  * If you're looking for the epicenter of DTrace, you just found it.  This
6432  * is the function called by the provider to fire a probe -- from which all
6433  * subsequent probe-context DTrace activity emanates.
6434  */
6435 #if !defined(__APPLE__)
6436 void
6437 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
6438     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
6439 #else
6440 static void
6441 __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6442     uint64_t arg2, uint64_t arg3, uint64_t arg4)
6443 #endif /* __APPLE__ */
6444 {
6445         processorid_t cpuid;
6446         dtrace_icookie_t cookie;
6447         dtrace_probe_t *probe;
6448         dtrace_mstate_t mstate;
6449         dtrace_ecb_t *ecb;
6450         dtrace_action_t *act;
6451         intptr_t offs;
6452         size_t size;
6453         int vtime, onintr;
6454         volatile uint16_t *flags;
6455         hrtime_t now;
6456
6457 #if !defined(__APPLE__)
6458         /*
6459          * Kick out immediately if this CPU is still being born (in which case
6460          * curthread will be set to -1) or the current thread can't allow
6461          * probes in its current context.
6462          */
6463         if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
6464                 return;
6465 #else
6466         /* Not a concern for Darwin */
6467 #endif /* __APPLE__ */
6468
6469         cookie = dtrace_interrupt_disable();
6470         probe = dtrace_probes[id - 1];
6471         cpuid = CPU->cpu_id;
6472         onintr = CPU_ON_INTR(CPU);
6473
6474 #if !defined(__APPLE__)
6475         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6476             probe->dtpr_predcache == curthread->t_predcache) {
6477 #else
6478         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6479             probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6480 #endif /* __APPLE__ */
6481                 /*
6482                  * We have hit in the predicate cache; we know that
6483                  * this predicate would evaluate to be false.
6484                  */
6485                 dtrace_interrupt_enable(cookie);
6486                 return;
6487         }
6488
6489         if (panic_quiesce) {
6490                 /*
6491                  * We don't trace anything if we're panicking.
6492                  */
6493                 dtrace_interrupt_enable(cookie);
6494                 return;
6495         }
6496
6497 #if !defined(__APPLE__)
6498         now = dtrace_gethrtime();
6499         vtime = dtrace_vtime_references != 0;
6500
6501         if (vtime && curthread->t_dtrace_start)
6502                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6503 #else
6504         /* FIXME: the time spent entering DTrace and arriving to this point is attributed
6505            to the current thread. Instead it should accrue to DTrace. */
6506         vtime = dtrace_vtime_references != 0;
6507
6508         if (vtime)
6509         {
6510                 int64_t dtrace_accum_time, recent_vtime;
6511                 thread_t thread = current_thread();
6512
6513                 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6514
6515                 if (dtrace_accum_time >= 0) {
6516                         recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6517
6518                         recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6519
6520                         dtrace_set_thread_vtime(thread, recent_vtime);
6521                 }
6522         }
6523
6524         now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6525 #endif /* __APPLE__ */
6526
6527 #if defined(__APPLE__)
6528         /*
6529          * A provider may call dtrace_probe_error() in lieu of dtrace_probe() in some circumstances.
6530          * See, e.g. fasttrap_isa.c. However the provider has no access to ECB context, so passes
6531          * 0 through "arg0" and the probe_id of the overridden probe as arg1. Detect that here
6532          * and cons up a viable state (from the probe_id).
6533          */
6534         if (dtrace_probeid_error == id && 0 == arg0) {
6535                 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6536                 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6537                 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
6538
6539                 if (NULL != ftp_ecb) {
6540                         dtrace_state_t *ftp_state = ftp_ecb->dte_state;
6541
6542                         arg0 = (uint64_t)(uintptr_t)ftp_state;
6543                         arg1 = ftp_ecb->dte_epid;
6544                         /*
6545                          * args[2-4] established by caller.
6546                          */
6547                         ftp_state->dts_arg_error_illval = -1; /* arg5 */
6548                 }
6549         }
6550 #endif /* __APPLE__ */
6551
6552         mstate.dtms_difo = NULL;
6553         mstate.dtms_probe = probe;
6554         mstate.dtms_strtok = NULL;
6555         mstate.dtms_arg[0] = arg0;
6556         mstate.dtms_arg[1] = arg1;
6557         mstate.dtms_arg[2] = arg2;
6558         mstate.dtms_arg[3] = arg3;
6559         mstate.dtms_arg[4] = arg4;
6560
6561         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
6562
6563         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
6564                 dtrace_predicate_t *pred = ecb->dte_predicate;
6565                 dtrace_state_t *state = ecb->dte_state;
6566                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
6567                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
6568                 dtrace_vstate_t *vstate = &state->dts_vstate;
6569                 dtrace_provider_t *prov = probe->dtpr_provider;
6570                 int committed = 0;
6571                 caddr_t tomax;
6572
6573                 /*
6574                  * A little subtlety with the following (seemingly innocuous)
6575                  * declaration of the automatic 'val':  by looking at the
6576                  * code, you might think that it could be declared in the
6577                  * action processing loop, below.  (That is, it's only used in
6578                  * the action processing loop.)  However, it must be declared
6579                  * out of that scope because in the case of DIF expression
6580                  * arguments to aggregating actions, one iteration of the
6581                  * action loop will use the last iteration's value.
6582                  */
6583 #ifdef lint
6584                 uint64_t val = 0;
6585 #else
6586                 uint64_t val = 0;
6587 #endif
6588
6589                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6590                 *flags &= ~CPU_DTRACE_ERROR;
6591
6592                 if (prov == dtrace_provider) {
6593                         /*
6594                          * If dtrace itself is the provider of this probe,
6595                          * we're only going to continue processing the ECB if
6596                          * arg0 (the dtrace_state_t) is equal to the ECB's
6597                          * creating state.  (This prevents disjoint consumers
6598                          * from seeing one another's metaprobes.)
6599                          */
6600                         if (arg0 != (uint64_t)(uintptr_t)state)
6601                                 continue;
6602                 }
6603
6604                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6605                         /*
6606                          * We're not currently active.  If our provider isn't
6607                          * the dtrace pseudo provider, we're not interested.
6608                          */
6609                         if (prov != dtrace_provider)
6610                                 continue;
6611
6612                         /*
6613                          * Now we must further check if we are in the BEGIN
6614                          * probe.  If we are, we will only continue processing
6615                          * if we're still in WARMUP -- if one BEGIN enabling
6616                          * has invoked the exit() action, we don't want to
6617                          * evaluate subsequent BEGIN enablings.
6618                          */
6619                         if (probe->dtpr_id == dtrace_probeid_begin &&
6620                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6621                                 ASSERT(state->dts_activity ==
6622                                     DTRACE_ACTIVITY_DRAINING);
6623                                 continue;
6624                         }
6625                 }
6626
6627                 if (ecb->dte_cond) {
6628                         /*
6629                          * If the dte_cond bits indicate that this
6630                          * consumer is only allowed to see user-mode firings
6631                          * of this probe, call the provider's dtps_usermode()
6632                          * entry point to check that the probe was fired
6633                          * while in a user context. Skip this ECB if that's
6634                          * not the case.
6635                          */
6636                         if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6637                             prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6638                             probe->dtpr_id, probe->dtpr_arg) == 0)
6639                                 continue;
6640
6641                         /*
6642                          * This is more subtle than it looks. We have to be
6643                          * absolutely certain that CRED() isn't going to
6644                          * change out from under us so it's only legit to
6645                          * examine that structure if we're in constrained
6646                          * situations. Currently, the only time we'll perform
6647                          * this check is if a non-super-user has enabled the
6648                          * profile or syscall providers -- providers that
6649                          * allow visibility of all processes. For the
6650                          * profile case, the check above will ensure that
6651                          * we're examining a user context.
6652                          */
6653                         if (ecb->dte_cond & DTRACE_COND_OWNER) {
6654                                 cred_t *cr;
6655                                 cred_t *s_cr =
6656                                     ecb->dte_state->dts_cred.dcr_cred;
6657                                 proc_t *proc;
6658 #pragma unused(proc) /* __APPLE__ */
6659
6660                                 ASSERT(s_cr != NULL);
6661
6662                         /*
6663                          * XXX this is hackish, but so is setting a variable
6664                          * XXX in a McCarthy OR...
6665                          */
6666 #if !defined(__APPLE__)
6667                                 if ((cr = CRED()) == NULL ||
6668 #else
6669                                 if ((cr = dtrace_CRED()) == NULL ||
6670 #endif /* __APPLE__ */
6671                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6672                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6673                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6674                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6675                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6676                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6677 #if !defined(__APPLE__)
6678                                     (proc = ttoproc(curthread)) == NULL ||
6679                                     (proc->p_flag & SNOCD))
6680 #else
6681                                         1) /* Darwin omits "No Core Dump" flag. */
6682 #endif /* __APPLE__ */
6683                                         continue;
6684                         }
6685
6686                         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6687                                 cred_t *cr;
6688                                 cred_t *s_cr =
6689                                     ecb->dte_state->dts_cred.dcr_cred;
6690 #pragma unused(cr, s_cr) /* __APPLE__ */
6691
6692                                 ASSERT(s_cr != NULL);
6693
6694 #if !defined(__APPLE__)
6695                                 if ((cr = CRED()) == NULL ||
6696                                     s_cr->cr_zone->zone_id !=
6697                                     cr->cr_zone->zone_id)
6698                                         continue;
6699 #else
6700                                 /* Darwin doesn't do zones. */
6701 #endif /* __APPLE__ */
6702                         }
6703                 }
6704
6705                 if (now - state->dts_alive > dtrace_deadman_timeout) {
6706                         /*
6707                          * We seem to be dead.  Unless we (a) have kernel
6708                          * destructive permissions (b) have explicitly enabled
6709                          * destructive actions and (c) destructive actions have
6710                          * not been disabled, we're going to transition into
6711                          * the KILLED state, from which no further processing
6712                          * on this state will be performed.
6713                          */
6714                         if (!dtrace_priv_kernel_destructive(state) ||
6715                             !state->dts_cred.dcr_destructive ||
6716                             dtrace_destructive_disallow) {
6717                                 void *activity = &state->dts_activity;
6718                                 dtrace_activity_t current;
6719
6720                                 do {
6721                                         current = state->dts_activity;
6722                                 } while (dtrace_cas32(activity, current,
6723                                     DTRACE_ACTIVITY_KILLED) != current);
6724
6725                                 continue;
6726                         }
6727                 }
6728
6729                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6730                     ecb->dte_alignment, state, &mstate)) < 0)
6731                         continue;
6732
6733                 tomax = buf->dtb_tomax;
6734                 ASSERT(tomax != NULL);
6735
6736                 if (ecb->dte_size != 0)
6737                         DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
6738
6739                 mstate.dtms_epid = ecb->dte_epid;
6740                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6741
6742                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6743                         mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6744                 else
6745                         mstate.dtms_access = 0;
6746
6747                 if (pred != NULL) {
6748                         dtrace_difo_t *dp = pred->dtp_difo;
6749                         int rval;
6750
6751                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6752
6753                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6754                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
6755
6756                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6757                                         /*
6758                                          * Update the predicate cache...
6759                                          */
6760                                         ASSERT(cid == pred->dtp_cacheid);
6761 #if !defined(__APPLE__)
6762                                         curthread->t_predcache = cid;
6763 #else
6764                                         dtrace_set_thread_predcache(current_thread(), cid);
6765 #endif /* __APPLE__ */
6766                                 }
6767
6768                                 continue;
6769                         }
6770                 }
6771
6772                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6773                     act != NULL; act = act->dta_next) {
6774                         size_t valoffs;
6775                         dtrace_difo_t *dp;
6776                         dtrace_recdesc_t *rec = &act->dta_rec;
6777
6778                         size = rec->dtrd_size;
6779                         valoffs = offs + rec->dtrd_offset;
6780
6781                         if (DTRACEACT_ISAGG(act->dta_kind)) {
6782                                 uint64_t v = 0xbad;
6783                                 dtrace_aggregation_t *agg;
6784
6785                                 agg = (dtrace_aggregation_t *)act;
6786
6787                                 if ((dp = act->dta_difo) != NULL)
6788                                         v = dtrace_dif_emulate(dp,
6789                                             &mstate, vstate, state);
6790
6791                                 if (*flags & CPU_DTRACE_ERROR)
6792                                         continue;
6793
6794                                 /*
6795                                  * Note that we always pass the expression
6796                                  * value from the previous iteration of the
6797                                  * action loop.  This value will only be used
6798                                  * if there is an expression argument to the
6799                                  * aggregating action, denoted by the
6800                                  * dtag_hasarg field.
6801                                  */
6802                                 dtrace_aggregate(agg, buf,
6803                                     offs, aggbuf, v, val);
6804                                 continue;
6805                         }
6806
6807                         switch (act->dta_kind) {
6808                         case DTRACEACT_STOP:
6809                                 if (dtrace_priv_proc_destructive(state))
6810                                         dtrace_action_stop();
6811                                 continue;
6812
6813                         case DTRACEACT_BREAKPOINT:
6814                                 if (dtrace_priv_kernel_destructive(state))
6815                                         dtrace_action_breakpoint(ecb);
6816                                 continue;
6817
6818                         case DTRACEACT_PANIC:
6819                                 if (dtrace_priv_kernel_destructive(state))
6820                                         dtrace_action_panic(ecb);
6821                                 continue;
6822
6823                         case DTRACEACT_STACK:
6824                                 if (!dtrace_priv_kernel(state))
6825                                         continue;
6826
6827 #if !defined(__APPLE__) /* Quiet compiler warnings */
6828                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6829                                     size / sizeof (pc_t), probe->dtpr_aframes,
6830                                     DTRACE_ANCHORED(probe) ? NULL :
6831                                     (uint32_t *)arg0);
6832 #else
6833                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6834                                     size / sizeof (pc_t), probe->dtpr_aframes,
6835                                     DTRACE_ANCHORED(probe) ? NULL :
6836                                   (uint32_t *)(uintptr_t)arg0);
6837 #endif /* __APPLE__ */
6838
6839                                 continue;
6840
6841                         case DTRACEACT_JSTACK:
6842                         case DTRACEACT_USTACK:
6843                                 if (!dtrace_priv_proc(state))
6844                                         continue;
6845
6846                                 /*
6847                                  * See comment in DIF_VAR_PID.
6848                                  */
6849                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6850                                     CPU_ON_INTR(CPU)) {
6851                                         int depth = DTRACE_USTACK_NFRAMES(
6852                                             rec->dtrd_arg) + 1;
6853
6854                                         dtrace_bzero((void *)(tomax + valoffs),
6855                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6856                                             + depth * sizeof (uint64_t));
6857
6858                                         continue;
6859                                 }
6860
6861                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6862                                     curproc->p_dtrace_helpers != NULL) {
6863                                         /*
6864                                          * This is the slow path -- we have
6865                                          * allocated string space, and we're
6866                                          * getting the stack of a process that
6867                                          * has helpers.  Call into a separate
6868                                          * routine to perform this processing.
6869                                          */
6870                                         dtrace_action_ustack(&mstate, state,
6871                                             (uint64_t *)(tomax + valoffs),
6872                                             rec->dtrd_arg);
6873                                         continue;
6874                                 }
6875
6876                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6877                                 dtrace_getupcstack((uint64_t *)
6878                                     (tomax + valoffs),
6879                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6880                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6881                                 continue;
6882
6883                         default:
6884                                 break;
6885                         }
6886
6887                         dp = act->dta_difo;
6888                         ASSERT(dp != NULL);
6889
6890                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6891
6892                         if (*flags & CPU_DTRACE_ERROR)
6893                                 continue;
6894
6895                         switch (act->dta_kind) {
6896                         case DTRACEACT_SPECULATE:
6897                                 ASSERT(buf == &state->dts_buffer[cpuid]);
6898                                 buf = dtrace_speculation_buffer(state,
6899                                     cpuid, val);
6900
6901                                 if (buf == NULL) {
6902                                         *flags |= CPU_DTRACE_DROP;
6903                                         continue;
6904                                 }
6905
6906                                 offs = dtrace_buffer_reserve(buf,
6907                                     ecb->dte_needed, ecb->dte_alignment,
6908                                     state, NULL);
6909
6910                                 if (offs < 0) {
6911                                         *flags |= CPU_DTRACE_DROP;
6912                                         continue;
6913                                 }
6914
6915                                 tomax = buf->dtb_tomax;
6916                                 ASSERT(tomax != NULL);
6917
6918                                 if (ecb->dte_size != 0)
6919                                         DTRACE_STORE(uint32_t, tomax, offs,
6920                                             ecb->dte_epid);
6921                                 continue;
6922
6923                         case DTRACEACT_CHILL:
6924                                 if (dtrace_priv_kernel_destructive(state))
6925                                         dtrace_action_chill(&mstate, val);
6926                                 continue;
6927
6928                         case DTRACEACT_RAISE:
6929                                 if (dtrace_priv_proc_destructive(state))
6930                                         dtrace_action_raise(val);
6931                                 continue;
6932
6933 #if defined(__APPLE__)
6934                         case DTRACEACT_PIDRESUME:
6935                                 if (dtrace_priv_proc_destructive(state))
6936                                         dtrace_action_pidresume(val);
6937                                 continue;
6938 #endif /* __APPLE__ */
6939
6940                         case DTRACEACT_COMMIT:
6941                                 ASSERT(!committed);
6942
6943                                 /*
6944                                  * We need to commit our buffer state.
6945                                  */
6946                                 if (ecb->dte_size)
6947                                         buf->dtb_offset = offs + ecb->dte_size;
6948                                 buf = &state->dts_buffer[cpuid];
6949                                 dtrace_speculation_commit(state, cpuid, val);
6950                                 committed = 1;
6951                                 continue;
6952
6953                         case DTRACEACT_DISCARD:
6954                                 dtrace_speculation_discard(state, cpuid, val);
6955                                 continue;
6956
6957                         case DTRACEACT_DIFEXPR:
6958                         case DTRACEACT_LIBACT:
6959                         case DTRACEACT_PRINTF:
6960                         case DTRACEACT_PRINTA:
6961                         case DTRACEACT_SYSTEM:
6962                         case DTRACEACT_FREOPEN:
6963 #if defined(__APPLE__)
6964                         case DTRACEACT_APPLEBINARY:
6965 #endif /* __APPLE__ */
6966                                 break;
6967
6968                         case DTRACEACT_SYM:
6969                         case DTRACEACT_MOD:
6970                                 if (!dtrace_priv_kernel(state))
6971                                         continue;
6972                                 break;
6973
6974 #if !defined(__APPLE__)
6975                         case DTRACEACT_USYM:
6976                         case DTRACEACT_UMOD:
6977                         case DTRACEACT_UADDR: {
6978                                 struct pid *pid = curthread->t_procp->p_pidp;
6979
6980                                 if (!dtrace_priv_proc(state))
6981                                         continue;
6982
6983                                 DTRACE_STORE(uint64_t, tomax,
6984                                     valoffs, (uint64_t)pid->pid_id);
6985                                 DTRACE_STORE(uint64_t, tomax,
6986                                     valoffs + sizeof (uint64_t), val);
6987
6988                                 continue;
6989                         }
6990 #else
6991                         case DTRACEACT_USYM:
6992                         case DTRACEACT_UMOD:
6993                         case DTRACEACT_UADDR: {
6994                                 if (!dtrace_priv_proc(state))
6995                                         continue;
6996
6997                                 DTRACE_STORE(uint64_t, tomax,
6998                                     valoffs, (uint64_t)proc_selfpid());
6999                                 DTRACE_STORE(uint64_t, tomax,
7000                                     valoffs + sizeof (uint64_t), val);
7001
7002                                 continue;
7003                         }
7004 #endif /* __APPLE__ */
7005
7006                         case DTRACEACT_EXIT: {
7007                                 /*
7008                                  * For the exit action, we are going to attempt
7009                                  * to atomically set our activity to be
7010                                  * draining.  If this fails (either because
7011                                  * another CPU has beat us to the exit action,
7012                                  * or because our current activity is something
7013                                  * other than ACTIVE or WARMUP), we will
7014                                  * continue.  This assures that the exit action
7015                                  * can be successfully recorded at most once
7016                                  * when we're in the ACTIVE state.  If we're
7017                                  * encountering the exit() action while in
7018                                  * COOLDOWN, however, we want to honor the new
7019                                  * status code.  (We know that we're the only
7020                                  * thread in COOLDOWN, so there is no race.)
7021                                  */
7022                                 void *activity = &state->dts_activity;
7023                                 dtrace_activity_t current = state->dts_activity;
7024
7025                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
7026                                         break;
7027
7028                                 if (current != DTRACE_ACTIVITY_WARMUP)
7029                                         current = DTRACE_ACTIVITY_ACTIVE;
7030
7031                                 if (dtrace_cas32(activity, current,
7032                                     DTRACE_ACTIVITY_DRAINING) != current) {
7033                                         *flags |= CPU_DTRACE_DROP;
7034                                         continue;
7035                                 }
7036
7037                                 break;
7038                         }
7039
7040                         default:
7041                                 ASSERT(0);
7042                         }
7043
7044                         if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
7045                                 uintptr_t end = valoffs + size;
7046
7047                                 if (!dtrace_vcanload((void *)(uintptr_t)val,
7048                                     &dp->dtdo_rtype, &mstate, vstate))
7049                                         continue;
7050
7051                                 /*
7052                                  * If this is a string, we're going to only
7053                                  * load until we find the zero byte -- after
7054                                  * which we'll store zero bytes.
7055                                  */
7056                                 if (dp->dtdo_rtype.dtdt_kind ==
7057                                     DIF_TYPE_STRING) {
7058                                         char c = '\0' + 1;
7059                                         int intuple = act->dta_intuple;
7060                                         size_t s;
7061
7062                                         for (s = 0; s < size; s++) {
7063                                                 if (c != '\0')
7064                                                         c = dtrace_load8(val++);
7065
7066                                                 DTRACE_STORE(uint8_t, tomax,
7067                                                     valoffs++, c);
7068
7069                                                 if (c == '\0' && intuple)
7070                                                         break;
7071                                         }
7072
7073                                         continue;
7074                                 }
7075
7076                                 while (valoffs < end) {
7077                                         DTRACE_STORE(uint8_t, tomax, valoffs++,
7078                                             dtrace_load8(val++));
7079                                 }
7080
7081                                 continue;
7082                         }
7083
7084                         switch (size) {
7085                         case 0:
7086                                 break;
7087
7088                         case sizeof (uint8_t):
7089                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7090                                 break;
7091                         case sizeof (uint16_t):
7092                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7093                                 break;
7094                         case sizeof (uint32_t):
7095                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7096                                 break;
7097                         case sizeof (uint64_t):
7098                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7099                                 break;
7100                         default:
7101                                 /*
7102                                  * Any other size should have been returned by
7103                                  * reference, not by value.
7104                                  */
7105                                 ASSERT(0);
7106                                 break;
7107                         }
7108                 }
7109
7110                 if (*flags & CPU_DTRACE_DROP)
7111                         continue;
7112
7113                 if (*flags & CPU_DTRACE_FAULT) {
7114                         int ndx;
7115                         dtrace_action_t *err;
7116
7117                         buf->dtb_errors++;
7118
7119                         if (probe->dtpr_id == dtrace_probeid_error) {
7120                                 /*
7121                                  * There's nothing we can do -- we had an
7122                                  * error on the error probe.  We bump an
7123                                  * error counter to at least indicate that
7124                                  * this condition happened.
7125                                  */
7126                                 dtrace_error(&state->dts_dblerrors);
7127                                 continue;
7128                         }
7129
7130                         if (vtime) {
7131                                 /*
7132                                  * Before recursing on dtrace_probe(), we
7133                                  * need to explicitly clear out our start
7134                                  * time to prevent it from being accumulated
7135                                  * into t_dtrace_vtime.
7136                                  */
7137 #if !defined(__APPLE__)
7138                                 curthread->t_dtrace_start = 0;
7139 #else
7140                                 /* Set the sign bit on t_dtrace_tracing to suspend accumulation to it. */
7141                                 dtrace_set_thread_tracing(current_thread(),
7142                                                         (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7143 #endif /* __APPLE__ */
7144                         }
7145
7146                         /*
7147                          * Iterate over the actions to figure out which action
7148                          * we were processing when we experienced the error.
7149                          * Note that act points _past_ the faulting action; if
7150                          * act is ecb->dte_action, the fault was in the
7151                          * predicate, if it's ecb->dte_action->dta_next it's
7152                          * in action #1, and so on.
7153                          */
7154                         for (err = ecb->dte_action, ndx = 0;
7155                             err != act; err = err->dta_next, ndx++)
7156                                 continue;
7157
7158                         dtrace_probe_error(state, ecb->dte_epid, ndx,
7159                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7160                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7161                             cpu_core[cpuid].cpuc_dtrace_illval);
7162
7163                         continue;
7164                 }
7165
7166                 if (!committed)
7167                         buf->dtb_offset = offs + ecb->dte_size;
7168         }
7169
7170 #if !defined(__APPLE__)
7171         if (vtime)
7172                 curthread->t_dtrace_start = dtrace_gethrtime();
7173 #else
7174         /* FIXME: the time spent leaving DTrace from this point to the rti is attributed
7175            to the current thread. Instead it should accrue to DTrace. */
7176         if (vtime) {
7177                 thread_t thread = current_thread();
7178                 int64_t t = dtrace_get_thread_tracing(thread);
7179
7180                 if (t >= 0) {
7181                         /* Usual case, accumulate time spent here into t_dtrace_tracing */
7182                         dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7183                 } else {
7184                         /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7185                         dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7186                 }
7187         }
7188 #endif /* __APPLE__ */
7189
7190         dtrace_interrupt_enable(cookie);
7191 }
7192
7193 #if defined(__APPLE__)
7194 /* Don't allow a thread to re-enter dtrace_probe(). This could occur if a probe is encountered
7195    on some function in the transitive closure of the call to dtrace_probe(). Solaris has some
7196    strong guarantees that this won't happen; the Darwin implementation is not yet mature enough
7197    to make those guarantees. */
7198
7199 void
7200 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
7201     uint64_t arg2, uint64_t arg3, uint64_t arg4)
7202 {
7203         thread_t thread = current_thread();
7204         disable_preemption();
7205         if (id == dtrace_probeid_error) {
7206                 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7207                 dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
7208         } else if (!dtrace_get_thread_reentering(thread)) {
7209                 dtrace_set_thread_reentering(thread, TRUE);
7210                 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7211                 dtrace_set_thread_reentering(thread, FALSE);
7212         }
7213 #if DEBUG
7214         else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
7215 #endif
7216         enable_preemption();
7217 }
7218 #endif /* __APPLE__ */
7219
7220 /*
7221  * DTrace Probe Hashing Functions
7222  *
7223  * The functions in this section (and indeed, the functions in remaining
7224  * sections) are not _called_ from probe context.  (Any exceptions to this are
7225  * marked with a "Note:".)  Rather, they are called from elsewhere in the
7226  * DTrace framework to look-up probes in, add probes to and remove probes from
7227  * the DTrace probe hashes.  (Each probe is hashed by each element of the
7228  * probe tuple -- allowing for fast lookups, regardless of what was
7229  * specified.)
7230  */
7231 static uint_t
7232 #if !defined(__APPLE__)  /* Quiet compiler warnings */
7233 dtrace_hash_str(char *p)
7234 #else
7235 dtrace_hash_str(const char *p)
7236 #endif /* __APPLE__ */
7237 {
7238         unsigned int g;
7239         uint_t hval = 0;
7240
7241         while (*p) {
7242                 hval = (hval << 4) + *p++;
7243                 if ((g = (hval & 0xf0000000)) != 0)
7244                         hval ^= g >> 24;
7245                 hval &= ~g;
7246         }
7247         return (hval);
7248 }
7249
7250 static dtrace_hash_t *
7251 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7252 {
7253         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7254
7255         hash->dth_stroffs = stroffs;
7256         hash->dth_nextoffs = nextoffs;
7257         hash->dth_prevoffs = prevoffs;
7258
7259         hash->dth_size = 1;
7260         hash->dth_mask = hash->dth_size - 1;
7261
7262         hash->dth_tab = kmem_zalloc(hash->dth_size *
7263             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7264
7265         return (hash);
7266 }
7267
7268 #if !defined(__APPLE__) /* Unused. Quiet compiler warning. */
7269 static void
7270 dtrace_hash_destroy(dtrace_hash_t *hash)
7271 {
7272 #if DEBUG
7273         int i;
7274
7275         for (i = 0; i < hash->dth_size; i++)
7276                 ASSERT(hash->dth_tab[i] == NULL);
7277 #endif
7278
7279         kmem_free(hash->dth_tab,
7280             hash->dth_size * sizeof (dtrace_hashbucket_t *));
7281         kmem_free(hash, sizeof (dtrace_hash_t));
7282 }
7283 #endif /* __APPLE__ */
7284
7285 static void
7286 dtrace_hash_resize(dtrace_hash_t *hash)
7287 {
7288         int size = hash->dth_size, i, ndx;
7289         int new_size = hash->dth_size << 1;
7290         int new_mask = new_size - 1;
7291         dtrace_hashbucket_t **new_tab, *bucket, *next;
7292
7293         ASSERT((new_size & new_mask) == 0);
7294
7295         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7296
7297         for (i = 0; i < size; i++) {
7298                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7299                         dtrace_probe_t *probe = bucket->dthb_chain;
7300
7301                         ASSERT(probe != NULL);
7302                         ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7303
7304                         next = bucket->dthb_next;
7305                         bucket->dthb_next = new_tab[ndx];
7306                         new_tab[ndx] = bucket;
7307                 }
7308         }
7309
7310         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7311         hash->dth_tab = new_tab;
7312         hash->dth_size = new_size;
7313         hash->dth_mask = new_mask;
7314 }
7315
7316 static void
7317 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7318 {
7319         int hashval = DTRACE_HASHSTR(hash, new);
7320         int ndx = hashval & hash->dth_mask;
7321         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7322         dtrace_probe_t **nextp, **prevp;
7323
7324         for (; bucket != NULL; bucket = bucket->dthb_next) {
7325                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7326                         goto add;
7327         }
7328
7329         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7330                 dtrace_hash_resize(hash);
7331                 dtrace_hash_add(hash, new);
7332                 return;
7333         }
7334
7335         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7336         bucket->dthb_next = hash->dth_tab[ndx];
7337         hash->dth_tab[ndx] = bucket;
7338         hash->dth_nbuckets++;
7339
7340 add:
7341         nextp = DTRACE_HASHNEXT(hash, new);
7342         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7343         *nextp = bucket->dthb_chain;
7344
7345         if (bucket->dthb_chain != NULL) {
7346                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7347                 ASSERT(*prevp == NULL);
7348                 *prevp = new;
7349         }
7350
7351         bucket->dthb_chain = new;
7352         bucket->dthb_len++;
7353 }
7354
7355 static dtrace_probe_t *
7356 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7357 {
7358         int hashval = DTRACE_HASHSTR(hash, template);
7359         int ndx = hashval & hash->dth_mask;
7360         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7361
7362         for (; bucket != NULL; bucket = bucket->dthb_next) {
7363                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7364                         return (bucket->dthb_chain);
7365         }
7366
7367         return (NULL);
7368 }
7369
7370 static int
7371 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7372 {
7373         int hashval = DTRACE_HASHSTR(hash, template);
7374         int ndx = hashval & hash->dth_mask;
7375         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7376
7377         for (; bucket != NULL; bucket = bucket->dthb_next) {
7378                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7379                         return (bucket->dthb_len);
7380         }
7381
7382         return (NULL);
7383 }
7384
7385 static void
7386 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7387 {
7388         int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7389         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7390
7391         dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7392         dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7393
7394         /*
7395          * Find the bucket that we're removing this probe from.
7396          */
7397         for (; bucket != NULL; bucket = bucket->dthb_next) {
7398                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7399                         break;
7400         }
7401
7402         ASSERT(bucket != NULL);
7403
7404         if (*prevp == NULL) {
7405                 if (*nextp == NULL) {
7406                         /*
7407                          * The removed probe was the only probe on this
7408                          * bucket; we need to remove the bucket.
7409                          */
7410                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7411
7412                         ASSERT(bucket->dthb_chain == probe);
7413                         ASSERT(b != NULL);
7414
7415                         if (b == bucket) {
7416                                 hash->dth_tab[ndx] = bucket->dthb_next;
7417                         } else {
7418                                 while (b->dthb_next != bucket)
7419                                         b = b->dthb_next;
7420                                 b->dthb_next = bucket->dthb_next;
7421                         }
7422
7423                         ASSERT(hash->dth_nbuckets > 0);
7424                         hash->dth_nbuckets--;
7425                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7426                         return;
7427                 }
7428
7429                 bucket->dthb_chain = *nextp;
7430         } else {
7431                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7432         }
7433
7434         if (*nextp != NULL)
7435                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7436 }
7437
7438 /*
7439  * DTrace Utility Functions
7440  *
7441  * These are random utility functions that are _not_ called from probe context.
7442  */
7443 static int
7444 dtrace_badattr(const dtrace_attribute_t *a)
7445 {
7446         return (a->dtat_name > DTRACE_STABILITY_MAX ||
7447             a->dtat_data > DTRACE_STABILITY_MAX ||
7448             a->dtat_class > DTRACE_CLASS_MAX);
7449 }
7450
7451 /*
7452  * Return a duplicate copy of a string.  If the specified string is NULL,
7453  * this function returns a zero-length string.
7454  */
7455 #if !defined(__APPLE__)
7456 static char *
7457 dtrace_strdup(const char *str)
7458 {
7459         char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
7460
7461         if (str != NULL)
7462                 (void) strcpy(new, str);
7463
7464         return (new);
7465 }
7466 #else /* Employ size bounded string operation. */
7467 static char *
7468 dtrace_strdup(const char *str)
7469 {
7470         size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7471         char *new = kmem_zalloc(bufsize, KM_SLEEP);
7472
7473         if (str != NULL)
7474                 (void) strlcpy(new, str, bufsize);
7475
7476         return (new);
7477 }
7478 #endif /* __APPLE__ */
7479
#define DTRACE_ISALPHA(c)       \
	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Validate a name.  Returns 0 if the name is acceptable and 1 if it is not.
 * NULL and the empty string are accepted.  The first character must be a
 * letter, '-', '_' or '.'; every later character may additionally be a
 * decimal digit or a backquote.
 */
static int
dtrace_badname(const char *s)
{
	const char *cp;
	char ch;

	if (s == NULL)
		return (0);

	ch = s[0];
	if (ch == '\0')
		return (0);

	if (!(DTRACE_ISALPHA(ch) || ch == '-' || ch == '_' || ch == '.'))
		return (1);

	for (cp = s + 1; *cp != '\0'; cp++) {
		ch = *cp;

		if (DTRACE_ISALPHA(ch) || (ch >= '0' && ch <= '9'))
			continue;

		if (ch == '-' || ch == '_' || ch == '.' || ch == '`')
			continue;

		return (1);
	}

	return (0);
}
7502
7503 static void
7504 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7505 {
7506         uint32_t priv;
7507
7508         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7509                 /*
7510                  * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
7511                  */
7512                 priv = DTRACE_PRIV_ALL;
7513         } else {
7514                 *uidp = crgetuid(cr);
7515                 *zoneidp = crgetzoneid(cr);
7516
7517                 priv = 0;
7518                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7519                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7520                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7521                         priv |= DTRACE_PRIV_USER;
7522                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7523                         priv |= DTRACE_PRIV_PROC;
7524                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7525                         priv |= DTRACE_PRIV_OWNER;
7526                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7527                         priv |= DTRACE_PRIV_ZONEOWNER;
7528         }
7529
7530         *privp = priv;
7531 }
7532
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of the error message 'str' in dtrace_errhash, an
 * open-addressed table probed linearly and keyed by the message pointer
 * (messages are compared by address, not by content).  Also remembers the
 * most recent message and the thread that reported it.  Panics if the table
 * has no free slot left.
 */
static void
dtrace_errdebug(const char *str)
{
#if !defined(__APPLE__) /* Quiet compiler warnings */
	int slot = dtrace_hash_str((char *)str) % DTRACE_ERRHASHSZ;
#else
	int slot = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
#endif /* __APPLE__ */
	int tries;

	lck_mtx_lock(&dtrace_errlock);
	dtrace_errlast = str;
#if !defined(__APPLE__)
	dtrace_errthread = curthread;
#else
	dtrace_errthread = (kthread_t *)current_thread();
#endif /* __APPLE__ */

	for (tries = 0; tries < DTRACE_ERRHASHSZ; tries++) {
		if (dtrace_errhash[slot].dter_msg == str) {
			/* Seen before: bump its count. */
			dtrace_errhash[slot].dter_count++;
			goto out;
		}

		if (dtrace_errhash[slot].dter_msg == NULL) {
			/* Free slot: claim it for this message. */
			dtrace_errhash[slot].dter_msg = str;
			dtrace_errhash[slot].dter_count = 1;
			goto out;
		}

		/* Occupied by a different message; try the next slot. */
		slot = (slot + 1) % DTRACE_ERRHASHSZ;
	}

	panic("dtrace: undersized error hash");
out:
	lck_mtx_unlock(&dtrace_errlock);
}
#endif
7573
7574 /*
7575  * DTrace Matching Functions
7576  *
7577  * These functions are used to match groups of probes, given some elements of
7578  * a probe tuple, or some globbed expressions for elements of a probe tuple.
7579  */
7580 static int
7581 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7582     zoneid_t zoneid)
7583 {
7584         if (priv != DTRACE_PRIV_ALL) {
7585                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7586                 uint32_t match = priv & ppriv;
7587
7588                 /*
7589                  * No PRIV_DTRACE_* privileges...
7590                  */
7591                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7592                     DTRACE_PRIV_KERNEL)) == 0)
7593                         return (0);
7594
7595                 /*
7596                  * No matching bits, but there were bits to match...
7597                  */
7598                 if (match == 0 && ppriv != 0)
7599                         return (0);
7600
7601                 /*
7602                  * Need to have permissions to the process, but don't...
7603                  */
7604                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7605                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7606                         return (0);
7607                 }
7608
7609                 /*
7610                  * Need to be in the same zone unless we possess the
7611                  * privilege to examine all zones.
7612                  */
7613                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7614                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7615                         return (0);
7616                 }
7617         }
7618
7619         return (1);
7620 }
7621
7622 /*
7623  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7624  * consists of input pattern strings and an ops-vector to evaluate them.
7625  * This function returns >0 for match, 0 for no match, and <0 for error.
7626  */
7627 static int
7628 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7629     uint32_t priv, uid_t uid, zoneid_t zoneid)
7630 {
7631         dtrace_provider_t *pvp = prp->dtpr_provider;
7632         int rv;
7633
7634         if (pvp->dtpv_defunct)
7635                 return (0);
7636
7637         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7638                 return (rv);
7639
7640         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7641                 return (rv);
7642
7643         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7644                 return (rv);
7645
7646         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7647                 return (rv);
7648
7649         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7650                 return (0);
7651
7652         return (rv);
7653 }
7654
7655 /*
7656  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7657  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
7658  * libc's version, the kernel version only applies to 8-bit ASCII strings.
7659  * In addition, all of the recursion cases except for '*' matching have been
7660  * unwound.  For '*', we still implement recursive evaluation, but a depth
7661  * counter is maintained and matching is aborted if we recurse too deep.
7662  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
7663  */
7664 static int
7665 dtrace_match_glob(const char *s, const char *p, int depth)
7666 {
7667         const char *olds;
7668         char s1, c;
7669         int gs;
7670
7671         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7672                 return (-1);
7673
7674         if (s == NULL)
7675                 s = ""; /* treat NULL as empty string */
7676
7677 top:
7678         olds = s;
7679         s1 = *s++;
7680
7681         if (p == NULL)
7682                 return (0);
7683
7684         if ((c = *p++) == '\0')
7685                 return (s1 == '\0');
7686
7687         switch (c) {
7688         case '[': {
7689                 int ok = 0, notflag = 0;
7690                 char lc = '\0';
7691
7692                 if (s1 == '\0')
7693                         return (0);
7694
7695                 if (*p == '!') {
7696                         notflag = 1;
7697                         p++;
7698                 }
7699
7700                 if ((c = *p++) == '\0')
7701                         return (0);
7702
7703                 do {
7704                         if (c == '-' && lc != '\0' && *p != ']') {
7705                                 if ((c = *p++) == '\0')
7706                                         return (0);
7707                                 if (c == '\\' && (c = *p++) == '\0')
7708                                         return (0);
7709
7710                                 if (notflag) {
7711                                         if (s1 < lc || s1 > c)
7712                                                 ok++;
7713                                         else
7714                                                 return (0);
7715                                 } else if (lc <= s1 && s1 <= c)
7716                                         ok++;
7717
7718                         } else if (c == '\\' && (c = *p++) == '\0')
7719                                 return (0);
7720
7721                         lc = c; /* save left-hand 'c' for next iteration */
7722
7723                         if (notflag) {
7724                                 if (s1 != c)
7725                                         ok++;
7726                                 else
7727                                         return (0);
7728                         } else if (s1 == c)
7729                                 ok++;
7730
7731                         if ((c = *p++) == '\0')
7732                                 return (0);
7733
7734                 } while (c != ']');
7735
7736                 if (ok)
7737                         goto top;
7738
7739                 return (0);
7740         }
7741
7742         case '\\':
7743                 if ((c = *p++) == '\0')
7744                         return (0);
7745                 /*FALLTHRU*/
7746
7747         default:
7748                 if (c != s1)
7749                         return (0);
7750                 /*FALLTHRU*/
7751
7752         case '?':
7753                 if (s1 != '\0')
7754                         goto top;
7755                 return (0);
7756
7757         case '*':
7758                 while (*p == '*')
7759                         p++; /* consecutive *'s are identical to a single one */
7760
7761                 if (*p == '\0')
7762                         return (1);
7763
7764                 for (s = olds; *s != '\0'; s++) {
7765                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7766                                 return (gs);
7767                 }
7768
7769                 return (0);
7770         }
7771 }
7772
/*
 * Exact string match: returns 1 if 's' is non-NULL and identical to the
 * pattern 'p', and 0 otherwise.  'depth' is unused.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

#if !defined(__APPLE__)
	return (strcmp(s, p) == 0);
#else /* Employ size bounded string operation. */
	return (strncmp(s, p, strlen(s) + 1) == 0);
#endif /* __APPLE__ */
}
7784
/*
 * Matching function for the empty pattern: every input matches, so this
 * returns 1 unconditionally and ignores all of its arguments.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
#pragma unused(s, p, depth) /* __APPLE__ */
	return (1);
}
7792
/*
 * Matching function that accepts any non-empty input: returns 1 if 's' is
 * non-NULL and has at least one character, and 0 otherwise.  The pattern and
 * depth arguments are ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
#pragma unused(p, depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (*s != '\0');
}
7800
7801 static int
7802 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7803     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
7804 {
7805         dtrace_probe_t template, *probe;
7806         dtrace_hash_t *hash = NULL;
7807         int len, rc, best = INT_MAX, nmatched = 0;
7808         dtrace_id_t i;
7809
7810         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7811
7812         /*
7813          * If the probe ID is specified in the key, just lookup by ID and
7814          * invoke the match callback once if a matching probe is found.
7815          */
7816         if (pkp->dtpk_id != DTRACE_IDNONE) {
7817                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7818                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7819                         if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL)
7820                                return (DTRACE_MATCH_FAIL);
7821                         nmatched++;
7822                 }
7823                 return (nmatched);
7824         }
7825
7826 #if !defined(__APPLE__)   /* Quiet compiler warnings */
7827         template.dtpr_mod = (char *)pkp->dtpk_mod;
7828         template.dtpr_func = (char *)pkp->dtpk_func;
7829         template.dtpr_name = (char *)pkp->dtpk_name;
7830 #else
7831         template.dtpr_mod =  (char *)(uintptr_t)pkp->dtpk_mod;
7832         template.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func;
7833         template.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name;
7834 #endif /* __APPLE__ */
7835
7836         /*
7837          * We want to find the most distinct of the module name, function
7838          * name, and name.  So for each one that is not a glob pattern or
7839          * empty string, we perform a lookup in the corresponding hash and
7840          * use the hash table with the fewest collisions to do our search.
7841          */
7842         if (pkp->dtpk_mmatch == &dtrace_match_string &&
7843             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7844                 best = len;
7845                 hash = dtrace_bymod;
7846         }
7847
7848         if (pkp->dtpk_fmatch == &dtrace_match_string &&
7849             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7850                 best = len;
7851                 hash = dtrace_byfunc;
7852         }
7853
7854         if (pkp->dtpk_nmatch == &dtrace_match_string &&
7855             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7856                 best = len;
7857                 hash = dtrace_byname;
7858         }
7859
7860         /*
7861          * If we did not select a hash table, iterate over every probe and
7862          * invoke our callback for each one that matches our input probe key.
7863          */
7864         if (hash == NULL) {
7865 #if !defined(__APPLE__)  /* Quiet compiler warning */
7866                 for (i = 0; i < dtrace_nprobes; i++) {
7867 #else
7868                 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7869 #endif /* __APPLE__ */
7870                         if ((probe = dtrace_probes[i]) == NULL ||
7871                             dtrace_match_probe(probe, pkp, priv, uid,
7872                             zoneid) <= 0)
7873                                 continue;
7874
7875                         nmatched++;
7876
7877                        if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7878                                if (rc == DTRACE_MATCH_FAIL)
7879                                        return (DTRACE_MATCH_FAIL);
7880                                break;
7881                        }
7882                 }
7883
7884                 return (nmatched);
7885         }
7886
7887         /*
7888          * If we selected a hash table, iterate over each probe of the same key
7889          * name and invoke the callback for every probe that matches the other
7890          * attributes of our input probe key.
7891          */
7892         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7893             probe = *(DTRACE_HASHNEXT(hash, probe))) {
7894
7895                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7896                         continue;
7897
7898                 nmatched++;
7899
7900                 if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) {
7901                     if (rc == DTRACE_MATCH_FAIL)
7902                         return (DTRACE_MATCH_FAIL);
7903                     break;
7904                 }
7905         }
7906
7907         return (nmatched);
7908 }
7909
7910 /*
7911  * Return the function pointer dtrace_probecmp() should use to compare the
7912  * specified pattern with a string.  For NULL or empty patterns, we select
7913  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
7914  * For non-empty non-glob strings, we use dtrace_match_string().
7915  */
7916 static dtrace_probekey_f *
7917 dtrace_probekey_func(const char *p)
7918 {
7919         char c;
7920
7921         if (p == NULL || *p == '\0')
7922                 return (&dtrace_match_nul);
7923
7924         while ((c = *p++) != '\0') {
7925                 if (c == '[' || c == '?' || c == '*' || c == '\\')
7926                         return (&dtrace_match_glob);
7927         }
7928
7929         return (&dtrace_match_string);
7930 }
7931
7932 /*
7933  * Build a probe comparison key for use with dtrace_match_probe() from the
7934  * given probe description.  By convention, a null key only matches anchored
7935  * probes: if each field is the empty string, reset dtpk_fmatch to
7936  * dtrace_match_nonzero().
7937  */
7938 static void
7939 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7940 {
7941         pkp->dtpk_prov = pdp->dtpd_provider;
7942         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7943
7944         pkp->dtpk_mod = pdp->dtpd_mod;
7945         pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
7946
7947         pkp->dtpk_func = pdp->dtpd_func;
7948         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7949
7950         pkp->dtpk_name = pdp->dtpd_name;
7951         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7952
7953         pkp->dtpk_id = pdp->dtpd_id;
7954
7955         if (pkp->dtpk_id == DTRACE_IDNONE &&
7956             pkp->dtpk_pmatch == &dtrace_match_nul &&
7957             pkp->dtpk_mmatch == &dtrace_match_nul &&
7958             pkp->dtpk_fmatch == &dtrace_match_nul &&
7959             pkp->dtpk_nmatch == &dtrace_match_nul)
7960                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7961 }
7962
7963 /*
7964  * DTrace Provider-to-Framework API Functions
7965  *
7966  * These functions implement much of the Provider-to-Framework API, as
7967  * described in <sys/dtrace.h>.  The parts of the API not in this section are
7968  * the functions in the API for probe management (found below), and
7969  * dtrace_probe() itself (found above).
7970  */
7971
7972 /*
7973  * Register the calling provider with the DTrace framework.  This should
7974  * generally be called by DTrace providers in their attach(9E) entry point.
7975  */
7976 int
7977 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7978     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7979 {
7980         dtrace_provider_t *provider;
7981
7982         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7983                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7984                     "arguments", name ? name : "<NULL>");
7985                 return (EINVAL);
7986         }
7987
7988         if (name[0] == '\0' || dtrace_badname(name)) {
7989                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7990                     "provider name", name);
7991                 return (EINVAL);
7992         }
7993
7994         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7995             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7996             pops->dtps_destroy == NULL ||
7997             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7998                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7999                     "provider ops", name);
8000                 return (EINVAL);
8001         }
8002
8003         if (dtrace_badattr(&pap->dtpa_provider) ||
8004             dtrace_badattr(&pap->dtpa_mod) ||
8005             dtrace_badattr(&pap->dtpa_func) ||
8006             dtrace_badattr(&pap->dtpa_name) ||
8007             dtrace_badattr(&pap->dtpa_args)) {
8008                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8009                     "provider attributes", name);
8010                 return (EINVAL);
8011         }
8012
8013         if (priv & ~DTRACE_PRIV_ALL) {
8014                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8015                     "privilege attributes", name);
8016                 return (EINVAL);
8017         }
8018
8019         if ((priv & DTRACE_PRIV_KERNEL) &&
8020             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8021             pops->dtps_usermode == NULL) {
8022                 cmn_err(CE_WARN, "failed to register provider '%s': need "
8023                     "dtps_usermode() op for given privilege attributes", name);
8024                 return (EINVAL);
8025         }
8026
8027         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8028 #if !defined(__APPLE__)
8029         provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8030         (void) strcpy(provider->dtpv_name, name);
8031 #else /* Employ size bounded string operation. */
8032         {
8033         size_t bufsize = strlen(name) + 1;
8034         provider->dtpv_name = kmem_alloc(bufsize, KM_SLEEP);
8035         (void) strlcpy(provider->dtpv_name, name, bufsize);
8036         }
8037 #endif /* __APPLE__ */
8038
8039         provider->dtpv_attr = *pap;
8040         provider->dtpv_priv.dtpp_flags = priv;
8041         if (cr != NULL) {
8042                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8043                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8044         }
8045         provider->dtpv_pops = *pops;
8046
8047         if (pops->dtps_provide == NULL) {
8048                 ASSERT(pops->dtps_provide_module != NULL);
8049                 provider->dtpv_pops.dtps_provide =
8050                     (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
8051         }
8052
8053         if (pops->dtps_provide_module == NULL) {
8054                 ASSERT(pops->dtps_provide != NULL);
8055                 provider->dtpv_pops.dtps_provide_module =
8056                     (void (*)(void *, struct modctl *))dtrace_nullop;
8057         }
8058
8059         if (pops->dtps_suspend == NULL) {
8060                 ASSERT(pops->dtps_resume == NULL);
8061                 provider->dtpv_pops.dtps_suspend =
8062                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8063                 provider->dtpv_pops.dtps_resume =
8064                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8065         }
8066
8067         provider->dtpv_arg = arg;
8068         *idp = (dtrace_provider_id_t)provider;
8069
8070         if (pops == &dtrace_provider_ops) {
8071                 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8072                 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8073                 ASSERT(dtrace_anon.dta_enabling == NULL);
8074
8075                 /*
8076                  * We make sure that the DTrace provider is at the head of
8077                  * the provider chain.
8078                  */
8079                 provider->dtpv_next = dtrace_provider;
8080                 dtrace_provider = provider;
8081                 return (0);
8082         }
8083
8084         lck_mtx_lock(&dtrace_provider_lock);
8085         lck_mtx_lock(&dtrace_lock);
8086
8087         /*
8088          * If there is at least one provider registered, we'll add this
8089          * provider after the first provider.
8090          */
8091         if (dtrace_provider != NULL) {
8092                 provider->dtpv_next = dtrace_provider->dtpv_next;
8093                 dtrace_provider->dtpv_next = provider;
8094         } else {
8095                 dtrace_provider = provider;
8096         }
8097
8098         if (dtrace_retained != NULL) {
8099                 dtrace_enabling_provide(provider);
8100
8101                 /*
8102                  * Now we need to call dtrace_enabling_matchall() -- which
8103                  * will acquire cpu_lock and dtrace_lock.  We therefore need
8104                  * to drop all of our locks before calling into it...
8105                  */
8106                 lck_mtx_unlock(&dtrace_lock);
8107                 lck_mtx_unlock(&dtrace_provider_lock);
8108                 dtrace_enabling_matchall();
8109
8110                 return (0);
8111         }
8112
8113         lck_mtx_unlock(&dtrace_lock);
8114         lck_mtx_unlock(&dtrace_provider_lock);
8115
8116         return (0);
8117 }
8118
8119 /*
8120  * Unregister the specified provider from the DTrace framework.  This should
8121  * generally be called by DTrace providers in their detach(9E) entry point.
8122  */
8123 int
8124 dtrace_unregister(dtrace_provider_id_t id)
8125 {
8126         dtrace_provider_t *old = (dtrace_provider_t *)id;
8127         dtrace_provider_t *prev = NULL;
8128         int i, self = 0;
8129         dtrace_probe_t *probe, *first = NULL;
8130
8131         if (old->dtpv_pops.dtps_enable ==
8132             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8133                 /*
8134                  * If DTrace itself is the provider, we're called with locks
8135                  * already held.
8136                  */
8137                 ASSERT(old == dtrace_provider);
8138                 ASSERT(dtrace_devi != NULL);
8139                 lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8140                 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8141                 self = 1;
8142
8143                 if (dtrace_provider->dtpv_next != NULL) {
8144                         /*
8145                          * There's another provider here; return failure.
8146                          */
8147                         return (EBUSY);
8148                 }
8149         } else {
8150                 lck_mtx_lock(&dtrace_provider_lock);
8151                 lck_mtx_lock(&mod_lock);
8152                 lck_mtx_lock(&dtrace_lock);
8153         }
8154
8155         /*
8156          * If anyone has /dev/dtrace open, or if there are anonymous enabled
8157          * probes, we refuse to let providers slither away, unless this
8158          * provider has already been explicitly invalidated.
8159          */
8160         if (!old->dtpv_defunct &&
8161             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8162             dtrace_anon.dta_state->dts_necbs > 0))) {
8163                 if (!self) {
8164                         lck_mtx_unlock(&dtrace_lock);
8165                         lck_mtx_unlock(&mod_lock);
8166                         lck_mtx_unlock(&dtrace_provider_lock);
8167                 }
8168                 return (EBUSY);
8169         }
8170
8171         /*
8172          * Attempt to destroy the probes associated with this provider.
8173          */
8174         for (i = 0; i < dtrace_nprobes; i++) {
8175                 if ((probe = dtrace_probes[i]) == NULL)
8176                         continue;
8177
8178                 if (probe->dtpr_provider != old)
8179                         continue;
8180
8181                 if (probe->dtpr_ecb == NULL)
8182                         continue;
8183
8184                 /*
8185                  * We have at least one ECB; we can't remove this provider.
8186                  */
8187                 if (!self) {
8188                         lck_mtx_unlock(&dtrace_lock);
8189                         lck_mtx_unlock(&mod_lock);
8190                         lck_mtx_unlock(&dtrace_provider_lock);
8191                 }
8192                 return (EBUSY);
8193         }
8194
8195         /*
8196          * All of the probes for this provider are disabled; we can safely
8197          * remove all of them from their hash chains and from the probe array.
8198          */
8199         for (i = 0; i < dtrace_nprobes; i++) {
8200                 if ((probe = dtrace_probes[i]) == NULL)
8201                         continue;
8202
8203                 if (probe->dtpr_provider != old)
8204                         continue;
8205
8206                 dtrace_probes[i] = NULL;
8207
8208                 dtrace_hash_remove(dtrace_bymod, probe);
8209                 dtrace_hash_remove(dtrace_byfunc, probe);
8210                 dtrace_hash_remove(dtrace_byname, probe);
8211
8212                 if (first == NULL) {
8213                         first = probe;
8214                         probe->dtpr_nextmod = NULL;
8215                 } else {
8216                         probe->dtpr_nextmod = first;
8217                         first = probe;
8218                 }
8219         }
8220
8221         /*
8222          * The provider's probes have been removed from the hash chains and
8223          * from the probe array.  Now issue a dtrace_sync() to be sure that
8224          * everyone has cleared out from any probe array processing.
8225          */
8226         dtrace_sync();
8227
8228         for (probe = first; probe != NULL; probe = first) {
8229                 first = probe->dtpr_nextmod;
8230
8231                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8232                     probe->dtpr_arg);
8233                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8234                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8235                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8236                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8237 #if !defined(__APPLE__)
8238                 kmem_free(probe, sizeof (dtrace_probe_t));
8239 #else
8240                 zfree(dtrace_probe_t_zone, probe);
8241 #endif
8242         }
8243
8244         if ((prev = dtrace_provider) == old) {
8245                 ASSERT(self || dtrace_devi == NULL);
8246                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8247                 dtrace_provider = old->dtpv_next;
8248         } else {
8249                 while (prev != NULL && prev->dtpv_next != old)
8250                         prev = prev->dtpv_next;
8251
8252                 if (prev == NULL) {
8253                         panic("attempt to unregister non-existent "
8254                             "dtrace provider %p\n", (void *)id);
8255                 }
8256
8257                 prev->dtpv_next = old->dtpv_next;
8258         }
8259
8260         if (!self) {
8261                 lck_mtx_unlock(&dtrace_lock);
8262                 lck_mtx_unlock(&mod_lock);
8263                 lck_mtx_unlock(&dtrace_provider_lock);
8264         }
8265
8266         kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8267         kmem_free(old, sizeof (dtrace_provider_t));
8268
8269         return (0);
8270 }
8271
8272 /*
8273  * Invalidate the specified provider.  All subsequent probe lookups for the
8274  * specified provider will fail, but its probes will not be removed.
8275  */
8276 void
8277 dtrace_invalidate(dtrace_provider_id_t id)
8278 {
8279         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8280
8281         ASSERT(pvp->dtpv_pops.dtps_enable !=
8282             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8283
8284         lck_mtx_lock(&dtrace_provider_lock);
8285         lck_mtx_lock(&dtrace_lock);
8286
8287         pvp->dtpv_defunct = 1;
8288
8289         lck_mtx_unlock(&dtrace_lock);
8290         lck_mtx_unlock(&dtrace_provider_lock);
8291 }
8292
8293 /*
8294  * Indicate whether or not DTrace has attached.
8295  */
8296 int
8297 dtrace_attached(void)
8298 {
8299         /*
8300          * dtrace_provider will be non-NULL iff the DTrace driver has
8301          * attached.  (It's non-NULL because DTrace is always itself a
8302          * provider.)
8303          */
8304         return (dtrace_provider != NULL);
8305 }
8306
8307 /*
8308  * Remove all the unenabled probes for the given provider.  This function is
8309  * not unlike dtrace_unregister(), except that it doesn't remove the provider
8310  * -- just as many of its associated probes as it can.
8311  */
8312 int
8313 dtrace_condense(dtrace_provider_id_t id)
8314 {
8315         dtrace_provider_t *prov = (dtrace_provider_t *)id;
8316         int i;
8317         dtrace_probe_t *probe;
8318
8319         /*
8320          * Make sure this isn't the dtrace provider itself.
8321          */
8322         ASSERT(prov->dtpv_pops.dtps_enable !=
8323           (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8324
8325         lck_mtx_lock(&dtrace_provider_lock);
8326         lck_mtx_lock(&dtrace_lock);
8327
8328         /*
8329          * Attempt to destroy the probes associated with this provider.
8330          */
8331         for (i = 0; i < dtrace_nprobes; i++) {
8332                 if ((probe = dtrace_probes[i]) == NULL)
8333                         continue;
8334
8335                 if (probe->dtpr_provider != prov)
8336                         continue;
8337
8338                 if (probe->dtpr_ecb != NULL)
8339                         continue;
8340
8341                 dtrace_probes[i] = NULL;
8342
8343                 dtrace_hash_remove(dtrace_bymod, probe);
8344                 dtrace_hash_remove(dtrace_byfunc, probe);
8345                 dtrace_hash_remove(dtrace_byname, probe);
8346
8347                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8348                     probe->dtpr_arg);
8349                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8350                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8351                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8352 #if !defined(__APPLE__)
8353                 kmem_free(probe, sizeof (dtrace_probe_t));
8354 #else
8355                 zfree(dtrace_probe_t_zone, probe);
8356 #endif
8357                 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8358         }
8359
8360         lck_mtx_unlock(&dtrace_lock);
8361         lck_mtx_unlock(&dtrace_provider_lock);
8362
8363         return (0);
8364 }
8365
8366 /*
8367  * DTrace Probe Management Functions
8368  *
8369  * The functions in this section perform the DTrace probe management,
8370  * including functions to create probes, look-up probes, and call into the
8371  * providers to request that probes be provided.  Some of these functions are
8372  * in the Provider-to-Framework API; these functions can be identified by the
8373  * fact that they are not declared "static".
8374  */
8375
8376 /*
8377  * Create a probe with the specified module name, function name, and name.
8378  */
8379 dtrace_id_t
8380 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8381     const char *func, const char *name, int aframes, void *arg)
8382 {
8383         dtrace_probe_t *probe, **probes;
8384         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8385         dtrace_id_t id;
8386
8387         if (provider == dtrace_provider) {
8388                 lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8389         } else {
8390                 lck_mtx_lock(&dtrace_lock);
8391         }
8392
8393         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8394             VM_BESTFIT | VM_SLEEP);
8395 #if !defined(__APPLE__)
8396         probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8397 #else
8398         probe = zalloc(dtrace_probe_t_zone);
8399         bzero(probe, sizeof (dtrace_probe_t));
8400 #endif
8401
8402         probe->dtpr_id = id;
8403         probe->dtpr_gen = dtrace_probegen++;
8404         probe->dtpr_mod = dtrace_strdup(mod);
8405         probe->dtpr_func = dtrace_strdup(func);
8406         probe->dtpr_name = dtrace_strdup(name);
8407         probe->dtpr_arg = arg;
8408         probe->dtpr_aframes = aframes;
8409         probe->dtpr_provider = provider;
8410
8411         dtrace_hash_add(dtrace_bymod, probe);
8412         dtrace_hash_add(dtrace_byfunc, probe);
8413         dtrace_hash_add(dtrace_byname, probe);
8414
8415 #if !defined(__APPLE__)  /* Quiet compiler warning */
8416         if (id - 1 >= dtrace_nprobes) {
8417 #else
8418         if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8419 #endif /* __APPLE__ */
8420                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8421                 size_t nsize = osize << 1;
8422
8423                 if (nsize == 0) {
8424                         ASSERT(osize == 0);
8425                         ASSERT(dtrace_probes == NULL);
8426                         nsize = sizeof (dtrace_probe_t *);
8427                 }
8428
8429                 probes = kmem_zalloc(nsize, KM_SLEEP);
8430
8431                 if (dtrace_probes == NULL) {
8432                         ASSERT(osize == 0);
8433                         dtrace_probes = probes;
8434                         dtrace_nprobes = 1;
8435                 } else {
8436                         dtrace_probe_t **oprobes = dtrace_probes;
8437
8438                         bcopy(oprobes, probes, osize);
8439                         dtrace_membar_producer();
8440                         dtrace_probes = probes;
8441
8442                         dtrace_sync();
8443
8444                         /*
8445                          * All CPUs are now seeing the new probes array; we can
8446                          * safely free the old array.
8447                          */
8448                         kmem_free(oprobes, osize);
8449                         dtrace_nprobes <<= 1;
8450                 }
8451
8452 #if !defined(__APPLE__)  /* Quiet compiler warning */
8453                 ASSERT(id - 1 < dtrace_nprobes);
8454 #else
8455                 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8456 #endif /* __APPLE__ */
8457         }
8458
8459         ASSERT(dtrace_probes[id - 1] == NULL);
8460         dtrace_probes[id - 1] = probe;
8461
8462         if (provider != dtrace_provider)
8463                 lck_mtx_unlock(&dtrace_lock);
8464
8465         return (id);
8466 }
8467
8468 static dtrace_probe_t *
8469 dtrace_probe_lookup_id(dtrace_id_t id)
8470 {
8471         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8472
8473 #if !defined(__APPLE__)  /* Quiet compiler warning */
8474         if (id == 0 || id > dtrace_nprobes)
8475                 return (NULL);
8476 #else
8477         if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8478                 return (NULL);
8479 #endif /* __APPLE__ */
8480
8481         return (dtrace_probes[id - 1]);
8482 }
8483
8484 static int
8485 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8486 {
8487         *((dtrace_id_t *)arg) = probe->dtpr_id;
8488
8489         return (DTRACE_MATCH_DONE);
8490 }
8491
8492 /*
8493  * Look up a probe based on provider and one or more of module name, function
8494  * name and probe name.
8495  */
8496 dtrace_id_t
8497 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
8498     const char *func, const char *name)
8499 {
8500         dtrace_probekey_t pkey;
8501         dtrace_id_t id;
8502         int match;
8503
8504         pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
8505         pkey.dtpk_pmatch = &dtrace_match_string;
8506         pkey.dtpk_mod = mod;
8507         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8508         pkey.dtpk_func = func;
8509         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8510         pkey.dtpk_name = name;
8511         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8512         pkey.dtpk_id = DTRACE_IDNONE;
8513
8514         lck_mtx_lock(&dtrace_lock);
8515         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8516             dtrace_probe_lookup_match, &id);
8517         lck_mtx_unlock(&dtrace_lock);
8518
8519         ASSERT(match == 1 || match == 0);
8520         return (match ? id : 0);
8521 }
8522
8523 /*
8524  * Returns the probe argument associated with the specified probe.
8525  */
8526 void *
8527 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8528 {
8529         dtrace_probe_t *probe;
8530         void *rval = NULL;
8531
8532         lck_mtx_lock(&dtrace_lock);
8533
8534         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8535             probe->dtpr_provider == (dtrace_provider_t *)id)
8536                 rval = probe->dtpr_arg;
8537
8538         lck_mtx_unlock(&dtrace_lock);
8539
8540         return (rval);
8541 }
8542
8543 /*
8544  * Copy a probe into a probe description.
8545  */
8546 static void
8547 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8548 {
8549         bzero(pdp, sizeof (dtrace_probedesc_t));
8550         pdp->dtpd_id = prp->dtpr_id;
8551
8552 #if !defined(__APPLE__)
8553         (void) strncpy(pdp->dtpd_provider,
8554             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
8555
8556         (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
8557         (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
8558         (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
8559 #else /* Employ size bounded string operation. */
8560         (void) strlcpy(pdp->dtpd_provider,
8561             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
8562
8563         (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
8564         (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
8565         (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
8566 #endif /* __APPLE__ */
8567 }
8568
8569 /*
8570  * Called to indicate that a probe -- or probes -- should be provided by a
8571  * specfied provider.  If the specified description is NULL, the provider will
8572  * be told to provide all of its probes.  (This is done whenever a new
8573  * consumer comes along, or whenever a retained enabling is to be matched.) If
8574  * the specified description is non-NULL, the provider is given the
8575  * opportunity to dynamically provide the specified probe, allowing providers
8576  * to support the creation of probes on-the-fly.  (So-called _autocreated_
8577  * probes.)  If the provider is NULL, the operations will be applied to all
8578  * providers; if the provider is non-NULL the operations will only be applied
8579  * to the specified provider.  The dtrace_provider_lock must be held, and the
8580  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8581  * will need to grab the dtrace_lock when it reenters the framework through
8582  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8583  */
8584 static void
8585 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8586 {
8587         struct modctl *ctl;
8588         int all = 0;
8589
8590         lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8591
8592         if (prv == NULL) {
8593                 all = 1;
8594                 prv = dtrace_provider;
8595         }
8596
8597         do {
8598                 /*
8599                  * First, call the blanket provide operation.
8600                  */
8601                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8602
8603                 /*
8604                  * Now call the per-module provide operation.  We will grab
8605                  * mod_lock to prevent the list from being modified.  Note
8606                  * that this also prevents the mod_busy bits from changing.
8607                  * (mod_busy can only be changed with mod_lock held.)
8608                  */
8609                 lck_mtx_lock(&mod_lock);
8610
8611 #if !defined(__APPLE__)
8612                 ctl = &modules;
8613                 do {
8614                         if (ctl->mod_busy || ctl->mod_mp == NULL)
8615                                 continue;
8616
8617                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8618
8619                 } while ((ctl = ctl->mod_next) != &modules);
8620 #else
8621                 ctl = dtrace_modctl_list;
8622                 while (ctl) {
8623                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8624                         ctl = ctl->mod_next;
8625                 }
8626 #endif
8627
8628                 lck_mtx_unlock(&mod_lock);
8629         } while (all && (prv = prv->dtpv_next) != NULL);
8630 }
8631
8632 /*
8633  * Iterate over each probe, and call the Framework-to-Provider API function
8634  * denoted by offs.
8635  */
8636 static void
8637 dtrace_probe_foreach(uintptr_t offs)
8638 {
8639         dtrace_provider_t *prov;
8640         void (*func)(void *, dtrace_id_t, void *);
8641         dtrace_probe_t *probe;
8642         dtrace_icookie_t cookie;
8643         int i;
8644
8645         /*
8646          * We disable interrupts to walk through the probe array.  This is
8647          * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8648          * won't see stale data.
8649          */
8650         cookie = dtrace_interrupt_disable();
8651
8652         for (i = 0; i < dtrace_nprobes; i++) {
8653                 if ((probe = dtrace_probes[i]) == NULL)
8654                         continue;
8655
8656                 if (probe->dtpr_ecb == NULL) {
8657                         /*
8658                          * This probe isn't enabled -- don't call the function.
8659                          */
8660                         continue;
8661                 }
8662
8663                 prov = probe->dtpr_provider;
8664                 func = *((void(**)(void *, dtrace_id_t, void *))
8665                     ((uintptr_t)&prov->dtpv_pops + offs));
8666
8667                 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8668         }
8669
8670         dtrace_interrupt_enable(cookie);
8671 }
8672
8673 static int
8674 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
8675 {
8676         dtrace_probekey_t pkey;
8677         uint32_t priv;
8678         uid_t uid;
8679         zoneid_t zoneid;
8680
8681         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8682
8683         dtrace_ecb_create_cache = NULL;
8684
8685         if (desc == NULL) {
8686                 /*
8687                  * If we're passed a NULL description, we're being asked to
8688                  * create an ECB with a NULL probe.
8689                  */
8690                 (void) dtrace_ecb_create_enable(NULL, enab);
8691                 return (0);
8692         }
8693
8694         dtrace_probekey(desc, &pkey);
8695         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8696             &priv, &uid, &zoneid);
8697
8698         return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
8699             enab));
8700 }
8701
8702 /*
8703  * DTrace Helper Provider Functions
8704  */
8705 static void
8706 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8707 {
8708         attr->dtat_name = DOF_ATTR_NAME(dofattr);
8709         attr->dtat_data = DOF_ATTR_DATA(dofattr);
8710         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8711 }
8712
8713 static void
8714 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8715     const dof_provider_t *dofprov, char *strtab)
8716 {
8717         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8718         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8719             dofprov->dofpv_provattr);
8720         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8721             dofprov->dofpv_modattr);
8722         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8723             dofprov->dofpv_funcattr);
8724         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8725             dofprov->dofpv_nameattr);
8726         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8727             dofprov->dofpv_argsattr);
8728 }
8729
8730 static void
8731 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8732 {
8733         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8734         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8735         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8736         dof_provider_t *provider;
8737         dof_probe_t *probe;
8738         uint32_t *off, *enoff;
8739         uint8_t *arg;
8740         char *strtab;
8741         uint_t i, nprobes;
8742         dtrace_helper_provdesc_t dhpv;
8743         dtrace_helper_probedesc_t dhpb;
8744         dtrace_meta_t *meta = dtrace_meta_pid;
8745         dtrace_mops_t *mops = &meta->dtm_mops;
8746         void *parg;
8747
8748         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8749         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8750             provider->dofpv_strtab * dof->dofh_secsize);
8751         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8752             provider->dofpv_probes * dof->dofh_secsize);
8753         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8754             provider->dofpv_prargs * dof->dofh_secsize);
8755         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8756             provider->dofpv_proffs * dof->dofh_secsize);
8757
8758         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8759         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8760         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8761         enoff = NULL;
8762
8763         /*
8764          * See dtrace_helper_provider_validate().
8765          */
8766         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8767             provider->dofpv_prenoffs != DOF_SECT_NONE) {
8768                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8769                     provider->dofpv_prenoffs * dof->dofh_secsize);
8770                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8771         }
8772
8773         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8774
8775         /*
8776          * Create the provider.
8777          */
8778         dtrace_dofprov2hprov(&dhpv, provider, strtab);
8779
8780         if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
8781                 return;
8782
8783         meta->dtm_count++;
8784
8785         /*
8786          * Create the probes.
8787          */
8788         for (i = 0; i < nprobes; i++) {
8789                 probe = (dof_probe_t *)(uintptr_t)(daddr +
8790                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8791
8792                 dhpb.dthpb_mod = dhp->dofhp_mod;
8793                 dhpb.dthpb_func = strtab + probe->dofpr_func;
8794                 dhpb.dthpb_name = strtab + probe->dofpr_name;
8795 #if !defined(__APPLE__)
8796                 dhpb.dthpb_base = probe->dofpr_addr;
8797 #else
8798                 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8799 #endif
8800 #if !defined(__APPLE__)  /* Quiet compiler warning */
8801                 dhpb.dthpb_offs = off + probe->dofpr_offidx;
8802 #else
8803                 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8804 #endif /* __APPLE__ */
8805                 dhpb.dthpb_noffs = probe->dofpr_noffs;
8806                 if (enoff != NULL) {
8807 #if !defined(__APPLE__)  /* Quiet compiler warning */
8808                         dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
8809 #else
8810                         dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8811 #endif /* __APPLE__ */
8812                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8813                 } else {
8814                         dhpb.dthpb_enoffs = NULL;
8815                         dhpb.dthpb_nenoffs = 0;
8816                 }
8817                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8818                 dhpb.dthpb_nargc = probe->dofpr_nargc;
8819                 dhpb.dthpb_xargc = probe->dofpr_xargc;
8820                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8821                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8822
8823                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8824         }
8825 }
8826
8827 static void
8828 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
8829 {
8830         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8831         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8832 #if !defined(__APPLE__)  /* Quiet compiler warning */
8833         int i;
8834 #else
8835         uint32_t i;
8836 #endif /* __APPLE__ */
8837
8838         lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8839
8840         for (i = 0; i < dof->dofh_secnum; i++) {
8841                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8842                     dof->dofh_secoff + i * dof->dofh_secsize);
8843
8844                 if (sec->dofs_type != DOF_SECT_PROVIDER)
8845                         continue;
8846
8847                 dtrace_helper_provide_one(dhp, sec, pid);
8848         }
8849
8850         /*
8851          * We may have just created probes, so we must now rematch against
8852          * any retained enablings.  Note that this call will acquire both
8853          * cpu_lock and dtrace_lock; the fact that we are holding
8854          * dtrace_meta_lock now is what defines the ordering with respect to
8855          * these three locks.
8856          */
8857         dtrace_enabling_matchall();
8858 }
8859
8860 static void
8861 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
8862 {
8863         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8864         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8865         dof_sec_t *str_sec;
8866         dof_provider_t *provider;
8867         char *strtab;
8868         dtrace_helper_provdesc_t dhpv;
8869         dtrace_meta_t *meta = dtrace_meta_pid;
8870         dtrace_mops_t *mops = &meta->dtm_mops;
8871
8872         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8873         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8874             provider->dofpv_strtab * dof->dofh_secsize);
8875
8876         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8877
8878         /*
8879          * Create the provider.
8880          */
8881         dtrace_dofprov2hprov(&dhpv, provider, strtab);
8882
8883         mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
8884
8885         meta->dtm_count--;
8886 }
8887
8888 static void
8889 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
8890 {
8891         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8892         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8893 #if !defined(__APPLE__)  /* Quiet compiler warning */
8894         int i;
8895 #else
8896         uint32_t i;
8897 #endif /* __APPLE__ */
8898
8899         lck_mtx_assert(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8900
8901         for (i = 0; i < dof->dofh_secnum; i++) {
8902                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8903                     dof->dofh_secoff + i * dof->dofh_secsize);
8904
8905                 if (sec->dofs_type != DOF_SECT_PROVIDER)
8906                         continue;
8907
8908                 dtrace_helper_provider_remove_one(dhp, sec, pid);
8909         }
8910 }
8911
8912 /*
8913  * DTrace Meta Provider-to-Framework API Functions
8914  *
8915  * These functions implement the Meta Provider-to-Framework API, as described
8916  * in <sys/dtrace.h>.
8917  */
8918 int
8919 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8920     dtrace_meta_provider_id_t *idp)
8921 {
8922         dtrace_meta_t *meta;
8923         dtrace_helpers_t *help, *next;
8924 #if !defined(__APPLE__)  /* Quiet compiler warning */
8925         int i;
8926 #else
8927         uint_t i;
8928 #endif /* __APPLE__ */
8929
8930         *idp = DTRACE_METAPROVNONE;
8931
8932         /*
8933          * We strictly don't need the name, but we hold onto it for
8934          * debuggability. All hail error queues!
8935          */
8936         if (name == NULL) {
8937                 cmn_err(CE_WARN, "failed to register meta-provider: "
8938                     "invalid name");
8939                 return (EINVAL);
8940         }
8941
8942         if (mops == NULL ||
8943             mops->dtms_create_probe == NULL ||
8944             mops->dtms_provide_pid == NULL ||
8945             mops->dtms_remove_pid == NULL) {
8946                 cmn_err(CE_WARN, "failed to register meta-register %s: "
8947                     "invalid ops", name);
8948                 return (EINVAL);
8949         }
8950
8951         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8952         meta->dtm_mops = *mops;
8953 #if !defined(__APPLE__)
8954         meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8955         (void) strcpy(meta->dtm_name, name);
8956 #else /* Employ size bounded string operation. */
8957         {
8958         size_t bufsize = strlen(name) + 1;
8959         meta->dtm_name = kmem_alloc(bufsize, KM_SLEEP);
8960         (void) strlcpy(meta->dtm_name, name, bufsize);
8961         }
8962 #endif /* __APPLE__ */
8963         meta->dtm_arg = arg;
8964
8965         lck_mtx_lock(&dtrace_meta_lock);
8966         lck_mtx_lock(&dtrace_lock);
8967
8968         if (dtrace_meta_pid != NULL) {
8969                 lck_mtx_unlock(&dtrace_lock);
8970                 lck_mtx_unlock(&dtrace_meta_lock);
8971                 cmn_err(CE_WARN, "failed to register meta-register %s: "
8972                     "user-land meta-provider exists", name);
8973                 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
8974                 kmem_free(meta, sizeof (dtrace_meta_t));
8975                 return (EINVAL);
8976         }
8977
8978         dtrace_meta_pid = meta;
8979         *idp = (dtrace_meta_provider_id_t)meta;
8980
8981         /*
8982          * If there are providers and probes ready to go, pass them
8983          * off to the new meta provider now.
8984          */
8985
8986         help = dtrace_deferred_pid;
8987         dtrace_deferred_pid = NULL;
8988
8989         lck_mtx_unlock(&dtrace_lock);
8990
8991         while (help != NULL) {
8992                 for (i = 0; i < help->dthps_nprovs; i++) {
8993                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8994                             help->dthps_pid);
8995                 }
8996
8997                 next = help->dthps_next;
8998                 help->dthps_next = NULL;
8999                 help->dthps_prev = NULL;
9000                 help->dthps_deferred = 0;
9001                 help = next;
9002         }
9003
9004         lck_mtx_unlock(&dtrace_meta_lock);
9005
9006         return (0);
9007 }
9008
9009 int
9010 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9011 {
9012         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9013
9014         lck_mtx_lock(&dtrace_meta_lock);
9015         lck_mtx_lock(&dtrace_lock);
9016
9017         if (old == dtrace_meta_pid) {
9018                 pp = &dtrace_meta_pid;
9019         } else {
9020                 panic("attempt to unregister non-existent "
9021                     "dtrace meta-provider %p\n", (void *)old);
9022         }
9023
9024         if (old->dtm_count != 0) {
9025                 lck_mtx_unlock(&dtrace_lock);
9026                 lck_mtx_unlock(&dtrace_meta_lock);
9027                 return (EBUSY);
9028         }
9029
9030         *pp = NULL;
9031
9032         lck_mtx_unlock(&dtrace_lock);
9033         lck_mtx_unlock(&dtrace_meta_lock);
9034
9035         kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9036         kmem_free(old, sizeof (dtrace_meta_t));
9037
9038         return (0);
9039 }
9040
9041
9042 /*
9043  * DTrace DIF Object Functions
9044  */
9045 static int
9046 dtrace_difo_err(uint_t pc, const char *format, ...)
9047 {
9048         if (dtrace_err_verbose) {
9049                 va_list alist;
9050
9051                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9052                 va_start(alist, format);
9053                 (void) vuprintf(format, alist);
9054                 va_end(alist);
9055         }
9056
9057 #ifdef DTRACE_ERRDEBUG
9058         dtrace_errdebug(format);
9059 #endif
9060         return (1);
9061 }
9062
9063 /*
9064  * Validate a DTrace DIF object by checking the IR instructions.  The following
9065  * rules are currently enforced by dtrace_difo_validate():
9066  *
9067  * 1. Each instruction must have a valid opcode
9068  * 2. Each register, string, variable, or subroutine reference must be valid
9069  * 3. No instruction can modify register %r0 (must be zero)
9070  * 4. All instruction reserved bits must be set to zero
9071  * 5. The last instruction must be a "ret" instruction
9072  * 6. All branch targets must reference a valid instruction _after_ the branch
9073  */
9074 static int
9075 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9076     cred_t *cr)
9077 {
9078 #if !defined(__APPLE__)  /* Quiet compiler warnings */
9079         int err = 0, i;
9080 #else
9081         int err = 0;
9082         uint_t i;
9083 #endif /* __APPLE__ */
9084         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9085         int kcheckload;
9086         uint_t pc;
9087
9088         kcheckload = cr == NULL ||
9089             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9090
9091         dp->dtdo_destructive = 0;
9092
9093         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9094                 dif_instr_t instr = dp->dtdo_buf[pc];
9095
9096                 uint_t r1 = DIF_INSTR_R1(instr);
9097                 uint_t r2 = DIF_INSTR_R2(instr);
9098                 uint_t rd = DIF_INSTR_RD(instr);
9099                 uint_t rs = DIF_INSTR_RS(instr);
9100                 uint_t label = DIF_INSTR_LABEL(instr);
9101                 uint_t v = DIF_INSTR_VAR(instr);
9102                 uint_t subr = DIF_INSTR_SUBR(instr);
9103                 uint_t type = DIF_INSTR_TYPE(instr);
9104                 uint_t op = DIF_INSTR_OP(instr);
9105
9106                 switch (op) {
9107                 case DIF_OP_OR:
9108                 case DIF_OP_XOR:
9109                 case DIF_OP_AND:
9110                 case DIF_OP_SLL:
9111                 case DIF_OP_SRL:
9112                 case DIF_OP_SRA:
9113                 case DIF_OP_SUB:
9114                 case DIF_OP_ADD:
9115                 case DIF_OP_MUL:
9116                 case DIF_OP_SDIV:
9117                 case DIF_OP_UDIV:
9118                 case DIF_OP_SREM:
9119                 case DIF_OP_UREM:
9120                 case DIF_OP_COPYS:
9121                         if (r1 >= nregs)
9122                                 err += efunc(pc, "invalid register %u\n", r1);
9123                         if (r2 >= nregs)
9124                                 err += efunc(pc, "invalid register %u\n", r2);
9125                         if (rd >= nregs)
9126                                 err += efunc(pc, "invalid register %u\n", rd);
9127                         if (rd == 0)
9128                                 err += efunc(pc, "cannot write to %r0\n");
9129                         break;
9130                 case DIF_OP_NOT:
9131                 case DIF_OP_MOV:
9132                 case DIF_OP_ALLOCS:
9133                         if (r1 >= nregs)
9134                                 err += efunc(pc, "invalid register %u\n", r1);
9135                         if (r2 != 0)
9136                                 err += efunc(pc, "non-zero reserved bits\n");
9137                         if (rd >= nregs)
9138                                 err += efunc(pc, "invalid register %u\n", rd);
9139                         if (rd == 0)
9140                                 err += efunc(pc, "cannot write to %r0\n");
9141                         break;
9142                 case DIF_OP_LDSB:
9143                 case DIF_OP_LDSH:
9144                 case DIF_OP_LDSW:
9145                 case DIF_OP_LDUB:
9146                 case DIF_OP_LDUH:
9147                 case DIF_OP_LDUW:
9148                 case DIF_OP_LDX:
9149                         if (r1 >= nregs)
9150                                 err += efunc(pc, "invalid register %u\n", r1);
9151                         if (r2 != 0)
9152                                 err += efunc(pc, "non-zero reserved bits\n");
9153                         if (rd >= nregs)
9154                                 err += efunc(pc, "invalid register %u\n", rd);
9155                         if (rd == 0)
9156                                 err += efunc(pc, "cannot write to %r0\n");
9157                         if (kcheckload)
9158                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9159                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9160                         break;
9161                 case DIF_OP_RLDSB:
9162                 case DIF_OP_RLDSH:
9163                 case DIF_OP_RLDSW:
9164                 case DIF_OP_RLDUB:
9165                 case DIF_OP_RLDUH:
9166                 case DIF_OP_RLDUW:
9167                 case DIF_OP_RLDX:
9168                         if (r1 >= nregs)
9169                                 err += efunc(pc, "invalid register %u\n", r1);
9170                         if (r2 != 0)
9171                                 err += efunc(pc, "non-zero reserved bits\n");
9172                         if (rd >= nregs)
9173                                 err += efunc(pc, "invalid register %u\n", rd);
9174                         if (rd == 0)
9175                                 err += efunc(pc, "cannot write to %r0\n");
9176                         break;
9177                 case DIF_OP_ULDSB:
9178                 case DIF_OP_ULDSH:
9179                 case DIF_OP_ULDSW:
9180                 case DIF_OP_ULDUB:
9181                 case DIF_OP_ULDUH:
9182                 case DIF_OP_ULDUW:
9183                 case DIF_OP_ULDX:
9184                         if (r1 >= nregs)
9185                                 err += efunc(pc, "invalid register %u\n", r1);
9186                         if (r2 != 0)
9187                                 err += efunc(pc, "non-zero reserved bits\n");
9188                         if (rd >= nregs)
9189                                 err += efunc(pc, "invalid register %u\n", rd);
9190                         if (rd == 0)
9191                                 err += efunc(pc, "cannot write to %r0\n");
9192                         break;
9193                 case DIF_OP_STB:
9194                 case DIF_OP_STH:
9195                 case DIF_OP_STW:
9196                 case DIF_OP_STX:
9197                         if (r1 >= nregs)
9198                                 err += efunc(pc, "invalid register %u\n", r1);
9199                         if (r2 != 0)
9200                                 err += efunc(pc, "non-zero reserved bits\n");
9201                         if (rd >= nregs)
9202                                 err += efunc(pc, "invalid register %u\n", rd);
9203                         if (rd == 0)
9204                                 err += efunc(pc, "cannot write to 0 address\n");
9205                         break;
9206                 case DIF_OP_CMP:
9207                 case DIF_OP_SCMP:
9208                         if (r1 >= nregs)
9209                                 err += efunc(pc, "invalid register %u\n", r1);
9210                         if (r2 >= nregs)
9211                                 err += efunc(pc, "invalid register %u\n", r2);
9212                         if (rd != 0)
9213                                 err += efunc(pc, "non-zero reserved bits\n");
9214                         break;
9215                 case DIF_OP_TST:
9216                         if (r1 >= nregs)
9217                                 err += efunc(pc, "invalid register %u\n", r1);
9218                         if (r2 != 0 || rd != 0)
9219                                 err += efunc(pc, "non-zero reserved bits\n");
9220                         break;
9221                 case DIF_OP_BA:
9222                 case DIF_OP_BE:
9223                 case DIF_OP_BNE:
9224                 case DIF_OP_BG:
9225                 case DIF_OP_BGU:
9226                 case DIF_OP_BGE:
9227                 case DIF_OP_BGEU:
9228                 case DIF_OP_BL:
9229                 case DIF_OP_BLU:
9230                 case DIF_OP_BLE:
9231                 case DIF_OP_BLEU:
9232                         if (label >= dp->dtdo_len) {
9233                                 err += efunc(pc, "invalid branch target %u\n",
9234                                     label);
9235                         }
9236                         if (label <= pc) {
9237                                 err += efunc(pc, "backward branch to %u\n",
9238                                     label);
9239                         }
9240                         break;
9241                 case DIF_OP_RET:
9242                         if (r1 != 0 || r2 != 0)
9243                                 err += efunc(pc, "non-zero reserved bits\n");
9244                         if (rd >= nregs)
9245                                 err += efunc(pc, "invalid register %u\n", rd);
9246                         break;
9247                 case DIF_OP_NOP:
9248                 case DIF_OP_POPTS:
9249                 case DIF_OP_FLUSHTS:
9250                         if (r1 != 0 || r2 != 0 || rd != 0)
9251                                 err += efunc(pc, "non-zero reserved bits\n");
9252                         break;
9253                 case DIF_OP_SETX:
9254                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9255                                 err += efunc(pc, "invalid integer ref %u\n",
9256                                     DIF_INSTR_INTEGER(instr));
9257                         }
9258                         if (rd >= nregs)
9259                                 err += efunc(pc, "invalid register %u\n", rd);
9260                         if (rd == 0)
9261                                 err += efunc(pc, "cannot write to %r0\n");
9262                         break;
9263                 case DIF_OP_SETS:
9264                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9265                                 err += efunc(pc, "invalid string ref %u\n",
9266                                     DIF_INSTR_STRING(instr));
9267                         }
9268                         if (rd >= nregs)
9269                                 err += efunc(pc, "invalid register %u\n", rd);
9270                         if (rd == 0)
9271                                 err += efunc(pc, "cannot write to %r0\n");
9272                         break;
9273                 case DIF_OP_LDGA:
9274                 case DIF_OP_LDTA:
9275                         if (r1 > DIF_VAR_ARRAY_MAX)
9276                                 err += efunc(pc, "invalid array %u\n", r1);
9277                         if (r2 >= nregs)
9278                                 err += efunc(pc, "invalid register %u\n", r2);
9279                         if (rd >= nregs)
9280                                 err += efunc(pc, "invalid register %u\n", rd);
9281                         if (rd == 0)
9282                                 err += efunc(pc, "cannot write to %r0\n");
9283                         break;
9284                 case DIF_OP_LDGS:
9285                 case DIF_OP_LDTS:
9286                 case DIF_OP_LDLS:
9287                 case DIF_OP_LDGAA:
9288                 case DIF_OP_LDTAA:
9289                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9290                                 err += efunc(pc, "invalid variable %u\n", v);
9291                         if (rd >= nregs)
9292                                 err += efunc(pc, "invalid register %u\n", rd);
9293                         if (rd == 0)
9294                                 err += efunc(pc, "cannot write to %r0\n");
9295                         break;
9296                 case DIF_OP_STGS:
9297                 case DIF_OP_STTS:
9298                 case DIF_OP_STLS:
9299                 case DIF_OP_STGAA:
9300                 case DIF_OP_STTAA:
9301                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9302                                 err += efunc(pc, "invalid variable %u\n", v);
9303                         if (rs >= nregs)
9304                                 err += efunc(pc, "invalid register %u\n", rd);
9305                         break;
9306                 case DIF_OP_CALL:
9307                         if (subr > DIF_SUBR_MAX)
9308                                 err += efunc(pc, "invalid subr %u\n", subr);
9309                         if (rd >= nregs)
9310                                 err += efunc(pc, "invalid register %u\n", rd);
9311                         if (rd == 0)
9312                                 err += efunc(pc, "cannot write to %r0\n");
9313
9314                         if (subr == DIF_SUBR_COPYOUT ||
9315                             subr == DIF_SUBR_COPYOUTSTR) {
9316                                 dp->dtdo_destructive = 1;
9317                         }
9318                         break;
9319                 case DIF_OP_PUSHTR:
9320                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9321                                 err += efunc(pc, "invalid ref type %u\n", type);
9322                         if (r2 >= nregs)
9323                                 err += efunc(pc, "invalid register %u\n", r2);
9324                         if (rs >= nregs)
9325                                 err += efunc(pc, "invalid register %u\n", rs);
9326                         break;
9327                 case DIF_OP_PUSHTV:
9328                         if (type != DIF_TYPE_CTF)
9329                                 err += efunc(pc, "invalid val type %u\n", type);
9330                         if (r2 >= nregs)
9331                                 err += efunc(pc, "invalid register %u\n", r2);
9332                         if (rs >= nregs)
9333                                 err += efunc(pc, "invalid register %u\n", rs);
9334                         break;
9335                 default:
9336                         err += efunc(pc, "invalid opcode %u\n",
9337                             DIF_INSTR_OP(instr));
9338                 }
9339         }
9340
9341         if (dp->dtdo_len != 0 &&
9342             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9343                 err += efunc(dp->dtdo_len - 1,
9344                     "expected 'ret' as last DIF instruction\n");
9345         }
9346
9347         if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
9348                 /*
9349                  * If we're not returning by reference, the size must be either
9350                  * 0 or the size of one of the base types.
9351                  */
9352                 switch (dp->dtdo_rtype.dtdt_size) {
9353                 case 0:
9354                 case sizeof (uint8_t):
9355                 case sizeof (uint16_t):
9356                 case sizeof (uint32_t):
9357                 case sizeof (uint64_t):
9358                         break;
9359
9360                 default:
9361                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
9362                 }
9363         }
9364
9365         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9366                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9367                 dtrace_diftype_t *vt, *et;
9368 #if !defined(__APPLE__) /* Quiet compiler warnings */
9369                 uint_t id, ndx;
9370 #else
9371                 uint_t id;
9372                 int ndx;
9373 #endif /* __APPLE__ */
9374
9375                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9376                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
9377                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9378                         err += efunc(i, "unrecognized variable scope %d\n",
9379                             v->dtdv_scope);
9380                         break;
9381                 }
9382
9383                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9384                     v->dtdv_kind != DIFV_KIND_SCALAR) {
9385                         err += efunc(i, "unrecognized variable type %d\n",
9386                             v->dtdv_kind);
9387                         break;
9388                 }
9389
9390                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9391                         err += efunc(i, "%d exceeds variable id limit\n", id);
9392                         break;
9393                 }
9394
9395                 if (id < DIF_VAR_OTHER_UBASE)
9396                         continue;
9397
9398                 /*
9399                  * For user-defined variables, we need to check that this
9400                  * definition is identical to any previous definition that we
9401                  * encountered.
9402                  */
9403                 ndx = id - DIF_VAR_OTHER_UBASE;
9404
9405                 switch (v->dtdv_scope) {
9406                 case DIFV_SCOPE_GLOBAL:
9407                         if (ndx < vstate->dtvs_nglobals) {
9408                                 dtrace_statvar_t *svar;
9409
9410                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9411                                         existing = &svar->dtsv_var;
9412                         }
9413
9414                         break;
9415
9416                 case DIFV_SCOPE_THREAD:
9417                         if (ndx < vstate->dtvs_ntlocals)
9418                                 existing = &vstate->dtvs_tlocals[ndx];
9419                         break;
9420
9421                 case DIFV_SCOPE_LOCAL:
9422                         if (ndx < vstate->dtvs_nlocals) {
9423                                 dtrace_statvar_t *svar;
9424
9425                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9426                                         existing = &svar->dtsv_var;
9427                         }
9428
9429                         break;
9430                 }
9431
9432                 vt = &v->dtdv_type;
9433
9434                 if (vt->dtdt_flags & DIF_TF_BYREF) {
9435                         if (vt->dtdt_size == 0) {
9436                                 err += efunc(i, "zero-sized variable\n");
9437                                 break;
9438                         }
9439
9440                         if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9441                             vt->dtdt_size > dtrace_global_maxsize) {
9442                                 err += efunc(i, "oversized by-ref global\n");
9443                                 break;
9444                         }
9445                 }
9446
9447                 if (existing == NULL || existing->dtdv_id == 0)
9448                         continue;
9449
9450                 ASSERT(existing->dtdv_id == v->dtdv_id);
9451                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9452
9453                 if (existing->dtdv_kind != v->dtdv_kind)
9454                         err += efunc(i, "%d changed variable kind\n", id);
9455
9456                 et = &existing->dtdv_type;
9457
9458                 if (vt->dtdt_flags != et->dtdt_flags) {
9459                         err += efunc(i, "%d changed variable type flags\n", id);
9460                         break;
9461                 }
9462
9463                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9464                         err += efunc(i, "%d changed variable type size\n", id);
9465                         break;
9466                 }
9467         }
9468
9469         return (err);
9470 }
9471
9472 /*
9473  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
9474  * are much more constrained than normal DIFOs.  Specifically, they may
9475  * not:
9476  *
9477  * 1. Make calls to subroutines other than copyin(), copyinstr() or
9478  *    miscellaneous string routines
9479  * 2. Access DTrace variables other than the args[] array, and the
9480  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9481  * 3. Have thread-local variables.
9482  * 4. Have dynamic variables.
9483  */
9484 static int
9485 dtrace_difo_validate_helper(dtrace_difo_t *dp)
9486 {
9487         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9488         int err = 0;
9489         uint_t pc;
9490
9491         for (pc = 0; pc < dp->dtdo_len; pc++) {
9492                 dif_instr_t instr = dp->dtdo_buf[pc];
9493
9494                 uint_t v = DIF_INSTR_VAR(instr);
9495                 uint_t subr = DIF_INSTR_SUBR(instr);
9496                 uint_t op = DIF_INSTR_OP(instr);
9497
9498                 switch (op) {
9499                 case DIF_OP_OR:
9500                 case DIF_OP_XOR:
9501                 case DIF_OP_AND:
9502                 case DIF_OP_SLL:
9503                 case DIF_OP_SRL:
9504                 case DIF_OP_SRA:
9505                 case DIF_OP_SUB:
9506                 case DIF_OP_ADD:
9507                 case DIF_OP_MUL:
9508                 case DIF_OP_SDIV:
9509                 case DIF_OP_UDIV:
9510                 case DIF_OP_SREM:
9511                 case DIF_OP_UREM:
9512                 case DIF_OP_COPYS:
9513                 case DIF_OP_NOT:
9514                 case DIF_OP_MOV:
9515                 case DIF_OP_RLDSB:
9516                 case DIF_OP_RLDSH:
9517                 case DIF_OP_RLDSW:
9518                 case DIF_OP_RLDUB:
9519                 case DIF_OP_RLDUH:
9520                 case DIF_OP_RLDUW:
9521                 case DIF_OP_RLDX:
9522                 case DIF_OP_ULDSB:
9523                 case DIF_OP_ULDSH:
9524                 case DIF_OP_ULDSW:
9525                 case DIF_OP_ULDUB:
9526                 case DIF_OP_ULDUH:
9527                 case DIF_OP_ULDUW:
9528                 case DIF_OP_ULDX:
9529                 case DIF_OP_STB:
9530                 case DIF_OP_STH:
9531                 case DIF_OP_STW:
9532                 case DIF_OP_STX:
9533                 case DIF_OP_ALLOCS:
9534                 case DIF_OP_CMP:
9535                 case DIF_OP_SCMP:
9536                 case DIF_OP_TST:
9537                 case DIF_OP_BA:
9538                 case DIF_OP_BE:
9539                 case DIF_OP_BNE:
9540                 case DIF_OP_BG:
9541                 case DIF_OP_BGU:
9542                 case DIF_OP_BGE:
9543                 case DIF_OP_BGEU:
9544                 case DIF_OP_BL:
9545                 case DIF_OP_BLU:
9546                 case DIF_OP_BLE:
9547                 case DIF_OP_BLEU:
9548                 case DIF_OP_RET:
9549                 case DIF_OP_NOP:
9550                 case DIF_OP_POPTS:
9551                 case DIF_OP_FLUSHTS:
9552                 case DIF_OP_SETX:
9553                 case DIF_OP_SETS:
9554                 case DIF_OP_LDGA:
9555                 case DIF_OP_LDLS:
9556                 case DIF_OP_STGS:
9557                 case DIF_OP_STLS:
9558                 case DIF_OP_PUSHTR:
9559                 case DIF_OP_PUSHTV:
9560                         break;
9561
9562                 case DIF_OP_LDGS:
9563                         if (v >= DIF_VAR_OTHER_UBASE)
9564                                 break;
9565
9566                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9567                                 break;
9568
9569                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9570                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9571                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9572                             v == DIF_VAR_UID || v == DIF_VAR_GID)
9573                                 break;
9574
9575                         err += efunc(pc, "illegal variable %u\n", v);
9576                         break;
9577
9578                 case DIF_OP_LDTA:
9579                 case DIF_OP_LDTS:
9580                 case DIF_OP_LDGAA:
9581                 case DIF_OP_LDTAA:
9582                         err += efunc(pc, "illegal dynamic variable load\n");
9583                         break;
9584
9585                 case DIF_OP_STTS:
9586                 case DIF_OP_STGAA:
9587                 case DIF_OP_STTAA:
9588                         err += efunc(pc, "illegal dynamic variable store\n");
9589                         break;
9590
9591                 case DIF_OP_CALL:
9592                         if (subr == DIF_SUBR_ALLOCA ||
9593                             subr == DIF_SUBR_BCOPY ||
9594                             subr == DIF_SUBR_COPYIN ||
9595                             subr == DIF_SUBR_COPYINTO ||
9596                             subr == DIF_SUBR_COPYINSTR ||
9597                             subr == DIF_SUBR_INDEX ||
9598                             subr == DIF_SUBR_INET_NTOA ||
9599                             subr == DIF_SUBR_INET_NTOA6 ||
9600                             subr == DIF_SUBR_INET_NTOP ||
9601                             subr == DIF_SUBR_LLTOSTR ||
9602                             subr == DIF_SUBR_RINDEX ||
9603                             subr == DIF_SUBR_STRCHR ||
9604                             subr == DIF_SUBR_STRJOIN ||
9605                             subr == DIF_SUBR_STRRCHR ||
9606                             subr == DIF_SUBR_STRSTR ||
9607 #if defined(__APPLE__)
9608                             subr == DIF_SUBR_COREPROFILE ||
9609 #endif /* __APPLE__ */
9610                             subr == DIF_SUBR_HTONS ||
9611                             subr == DIF_SUBR_HTONL ||
9612                             subr == DIF_SUBR_HTONLL ||
9613                             subr == DIF_SUBR_NTOHS ||
9614                             subr == DIF_SUBR_NTOHL ||
9615                             subr == DIF_SUBR_NTOHLL)
9616                                 break;
9617
9618                         err += efunc(pc, "invalid subr %u\n", subr);
9619                         break;
9620
9621                 default:
9622                         err += efunc(pc, "invalid opcode %u\n",
9623                             DIF_INSTR_OP(instr));
9624                 }
9625         }
9626
9627         return (err);
9628 }
9629
9630 /*
9631  * Returns 1 if the expression in the DIF object can be cached on a per-thread
9632  * basis; 0 if not.
9633  */
9634 static int
9635 dtrace_difo_cacheable(dtrace_difo_t *dp)
9636 {
9637 #if !defined(__APPLE__) /* Quiet compiler warnings */
9638         int i;
9639 #else
9640         uint_t i;
9641 #endif /* __APPLE__ */
9642
9643         if (dp == NULL)
9644                 return (0);
9645
9646         for (i = 0; i < dp->dtdo_varlen; i++) {
9647                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9648
9649                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9650                         continue;
9651
9652                 switch (v->dtdv_id) {
9653                 case DIF_VAR_CURTHREAD:
9654                 case DIF_VAR_PID:
9655                 case DIF_VAR_TID:
9656                 case DIF_VAR_EXECNAME:
9657                 case DIF_VAR_ZONENAME:
9658                         break;
9659
9660                 default:
9661                         return (0);
9662                 }
9663         }
9664
9665         /*
9666          * This DIF object may be cacheable.  Now we need to look for any
9667          * array loading instructions, any memory loading instructions, or
9668          * any stores to thread-local variables.
9669          */
9670         for (i = 0; i < dp->dtdo_len; i++) {
9671                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9672
9673                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9674                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9675                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9676                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
9677                         return (0);
9678         }
9679
9680         return (1);
9681 }
9682
9683 static void
9684 dtrace_difo_hold(dtrace_difo_t *dp)
9685 {
9686 #if !defined(__APPLE__) /* Quiet compiler warnings */
9687         int i;
9688 #else
9689         uint_t i;
9690 #endif /* __APPLE__ */
9691
9692         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9693
9694         dp->dtdo_refcnt++;
9695         ASSERT(dp->dtdo_refcnt != 0);
9696
9697         /*
9698          * We need to check this DIF object for references to the variable
9699          * DIF_VAR_VTIMESTAMP.
9700          */
9701         for (i = 0; i < dp->dtdo_varlen; i++) {
9702                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9703
9704                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9705                         continue;
9706
9707                 if (dtrace_vtime_references++ == 0)
9708                         dtrace_vtime_enable();
9709         }
9710 }
9711
9712 /*
9713  * This routine calculates the dynamic variable chunksize for a given DIF
9714  * object.  The calculation is not fool-proof, and can probably be tricked by
9715  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
9716  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9717  * if a dynamic variable size exceeds the chunksize.
9718  */
9719 static void
9720 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9721 {
9722 #if !defined(__APPLE__) /* Quiet compiler warnings */
9723         uint64_t sval;
9724 #else
9725         uint64_t sval = 0;
9726 #endif /* __APPLE__ */
9727         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9728         const dif_instr_t *text = dp->dtdo_buf;
9729         uint_t pc, srd = 0;
9730         uint_t ttop = 0;
9731         size_t size, ksize;
9732         uint_t id, i;
9733
9734         for (pc = 0; pc < dp->dtdo_len; pc++) {
9735                 dif_instr_t instr = text[pc];
9736                 uint_t op = DIF_INSTR_OP(instr);
9737                 uint_t rd = DIF_INSTR_RD(instr);
9738                 uint_t r1 = DIF_INSTR_R1(instr);
9739                 uint_t nkeys = 0;
9740                 uchar_t scope;
9741
9742                 dtrace_key_t *key = tupregs;
9743
9744                 switch (op) {
9745                 case DIF_OP_SETX:
9746                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9747                         srd = rd;
9748                         continue;
9749
9750                 case DIF_OP_STTS:
9751                         key = &tupregs[DIF_DTR_NREGS];
9752                         key[0].dttk_size = 0;
9753                         key[1].dttk_size = 0;
9754                         nkeys = 2;
9755                         scope = DIFV_SCOPE_THREAD;
9756                         break;
9757
9758                 case DIF_OP_STGAA:
9759                 case DIF_OP_STTAA:
9760                         nkeys = ttop;
9761
9762                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9763                                 key[nkeys++].dttk_size = 0;
9764
9765                         key[nkeys++].dttk_size = 0;
9766
9767                         if (op == DIF_OP_STTAA) {
9768                                 scope = DIFV_SCOPE_THREAD;
9769                         } else {
9770                                 scope = DIFV_SCOPE_GLOBAL;
9771                         }
9772
9773                         break;
9774
9775                 case DIF_OP_PUSHTR:
9776                         if (ttop == DIF_DTR_NREGS)
9777                                 return;
9778
9779                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9780                                 /*
9781                                  * If the register for the size of the "pushtr"
9782                                  * is %r0 (or the value is 0) and the type is
9783                                  * a string, we'll use the system-wide default
9784                                  * string size.
9785                                  */
9786                                 tupregs[ttop++].dttk_size =
9787                                     dtrace_strsize_default;
9788                         } else {
9789                                 if (srd == 0)
9790                                         return;
9791
9792                                 tupregs[ttop++].dttk_size = sval;
9793                         }
9794
9795                         break;
9796
9797                 case DIF_OP_PUSHTV:
9798                         if (ttop == DIF_DTR_NREGS)
9799                                 return;
9800
9801                         tupregs[ttop++].dttk_size = 0;
9802                         break;
9803
9804                 case DIF_OP_FLUSHTS:
9805                         ttop = 0;
9806                         break;
9807
9808                 case DIF_OP_POPTS:
9809                         if (ttop != 0)
9810                                 ttop--;
9811                         break;
9812                 }
9813
9814                 sval = 0;
9815                 srd = 0;
9816
9817                 if (nkeys == 0)
9818                         continue;
9819
9820                 /*
9821                  * We have a dynamic variable allocation; calculate its size.
9822                  */
9823                 for (ksize = 0, i = 0; i < nkeys; i++)
9824                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9825
9826                 size = sizeof (dtrace_dynvar_t);
9827                 size += sizeof (dtrace_key_t) * (nkeys - 1);
9828                 size += ksize;
9829
9830                 /*
9831                  * Now we need to determine the size of the stored data.
9832                  */
9833                 id = DIF_INSTR_VAR(instr);
9834
9835                 for (i = 0; i < dp->dtdo_varlen; i++) {
9836                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
9837
9838                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
9839                                 size += v->dtdv_type.dtdt_size;
9840                                 break;
9841                         }
9842                 }
9843
9844                 if (i == dp->dtdo_varlen)
9845                         return;
9846
9847                 /*
9848                  * We have the size.  If this is larger than the chunk size
9849                  * for our dynamic variable state, reset the chunk size.
9850                  */
9851                 size = P2ROUNDUP(size, sizeof (uint64_t));
9852
9853                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9854                         vstate->dtvs_dynvars.dtds_chunksize = size;
9855         }
9856 }
9857
9858 static void
9859 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9860 {
9861 #if !defined(__APPLE__) /* Quiet compiler warnings */
9862         int i, oldsvars, osz, nsz, otlocals, ntlocals;
9863         uint_t id;
9864 #else
9865         int oldsvars, osz, nsz, otlocals, ntlocals;
9866         uint_t i, id;
9867 #endif /* __APPLE__ */
9868
9869         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9870         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9871
9872         for (i = 0; i < dp->dtdo_varlen; i++) {
9873                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9874 #if !defined(__APPLE__) /* Quiet compiler warnings */
9875                 dtrace_statvar_t *svar, ***svarp;
9876 #else
9877                 dtrace_statvar_t *svar;
9878                 dtrace_statvar_t ***svarp = NULL;
9879 #endif /* __APPLE__ */
9880                 size_t dsize = 0;
9881                 uint8_t scope = v->dtdv_scope;
9882                 int *np = (int *)NULL;
9883
9884                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9885                         continue;
9886
9887                 id -= DIF_VAR_OTHER_UBASE;
9888
9889                 switch (scope) {
9890                 case DIFV_SCOPE_THREAD:
9891 #if !defined(__APPLE__) /* Quiet compiler warnings */
9892                         while (id >= (otlocals = vstate->dtvs_ntlocals)) {
9893 #else
9894                         while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9895 #endif /* __APPLE__ */
9896                                 dtrace_difv_t *tlocals;
9897
9898                                 if ((ntlocals = (otlocals << 1)) == 0)
9899                                         ntlocals = 1;
9900
9901                                 osz = otlocals * sizeof (dtrace_difv_t);
9902                                 nsz = ntlocals * sizeof (dtrace_difv_t);
9903
9904                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9905
9906                                 if (osz != 0) {
9907                                         bcopy(vstate->dtvs_tlocals,
9908                                             tlocals, osz);
9909                                         kmem_free(vstate->dtvs_tlocals, osz);
9910                                 }
9911
9912                                 vstate->dtvs_tlocals = tlocals;
9913                                 vstate->dtvs_ntlocals = ntlocals;
9914                         }
9915
9916                         vstate->dtvs_tlocals[id] = *v;
9917                         continue;
9918
9919                 case DIFV_SCOPE_LOCAL:
9920                         np = &vstate->dtvs_nlocals;
9921                         svarp = &vstate->dtvs_locals;
9922
9923                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9924                                 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9925                                     sizeof (uint64_t));
9926                         else
9927                                 dsize = (int)NCPU * sizeof (uint64_t);
9928
9929                         break;
9930
9931                 case DIFV_SCOPE_GLOBAL:
9932                         np = &vstate->dtvs_nglobals;
9933                         svarp = &vstate->dtvs_globals;
9934
9935                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9936                                 dsize = v->dtdv_type.dtdt_size +
9937                                     sizeof (uint64_t);
9938
9939                         break;
9940
9941                 default:
9942                         ASSERT(0);
9943                 }
9944
9945 #if !defined(__APPLE__) /* Quiet compiler warnings */
9946                 while (id >= (oldsvars = *np)) {
9947 #else
9948                 while (id >= (uint_t)(oldsvars = *np)) {
9949 #endif /* __APPLE__ */
9950                         dtrace_statvar_t **statics;
9951                         int newsvars, oldsize, newsize;
9952
9953                         if ((newsvars = (oldsvars << 1)) == 0)
9954                                 newsvars = 1;
9955
9956                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9957                         newsize = newsvars * sizeof (dtrace_statvar_t *);
9958
9959                         statics = kmem_zalloc(newsize, KM_SLEEP);
9960
9961                         if (oldsize != 0) {
9962                                 bcopy(*svarp, statics, oldsize);
9963                                 kmem_free(*svarp, oldsize);
9964                         }
9965
9966                         *svarp = statics;
9967                         *np = newsvars;
9968                 }
9969
9970                 if ((svar = (*svarp)[id]) == NULL) {
9971                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9972                         svar->dtsv_var = *v;
9973
9974                         if ((svar->dtsv_size = dsize) != 0) {
9975                                 svar->dtsv_data = (uint64_t)(uintptr_t)
9976                                     kmem_zalloc(dsize, KM_SLEEP);
9977                         }
9978
9979                         (*svarp)[id] = svar;
9980                 }
9981
9982                 svar->dtsv_refcnt++;
9983         }
9984
9985         dtrace_difo_chunksize(dp, vstate);
9986         dtrace_difo_hold(dp);
9987 }
9988
9989 static dtrace_difo_t *
9990 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9991 {
9992         dtrace_difo_t *new;
9993         size_t sz;
9994
9995         ASSERT(dp->dtdo_buf != NULL);
9996         ASSERT(dp->dtdo_refcnt != 0);
9997
9998         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9999
10000         ASSERT(dp->dtdo_buf != NULL);
10001         sz = dp->dtdo_len * sizeof (dif_instr_t);
10002         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10003         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10004         new->dtdo_len = dp->dtdo_len;
10005
10006         if (dp->dtdo_strtab != NULL) {
10007                 ASSERT(dp->dtdo_strlen != 0);
10008                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10009                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10010                 new->dtdo_strlen = dp->dtdo_strlen;
10011         }
10012
10013         if (dp->dtdo_inttab != NULL) {
10014                 ASSERT(dp->dtdo_intlen != 0);
10015                 sz = dp->dtdo_intlen * sizeof (uint64_t);
10016                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10017                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10018                 new->dtdo_intlen = dp->dtdo_intlen;
10019         }
10020
10021         if (dp->dtdo_vartab != NULL) {
10022                 ASSERT(dp->dtdo_varlen != 0);
10023                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10024                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10025                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10026                 new->dtdo_varlen = dp->dtdo_varlen;
10027         }
10028
10029         dtrace_difo_init(new, vstate);
10030         return (new);
10031 }
10032
10033 static void
10034 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10035 {
10036 #if !defined(__APPLE__) /* Quiet compiler warnings */
10037         int i;
10038 #else
10039         uint_t i;
10040 #endif /* __APPLE__ */
10041
10042         ASSERT(dp->dtdo_refcnt == 0);
10043
10044         for (i = 0; i < dp->dtdo_varlen; i++) {
10045                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10046 #if !defined(__APPLE__) /* Quiet compiler warnings */
10047                 dtrace_statvar_t *svar, **svarp;
10048                 uint_t id;
10049                 uint8_t scope = v->dtdv_scope;
10050                 int *np;
10051 #else
10052                 dtrace_statvar_t *svar;
10053                 dtrace_statvar_t **svarp = NULL;
10054                 uint_t id;
10055                 uint8_t scope = v->dtdv_scope;
10056                 int *np = NULL;
10057 #endif /* __APPLE__ */
10058
10059                 switch (scope) {
10060                 case DIFV_SCOPE_THREAD:
10061                         continue;
10062
10063                 case DIFV_SCOPE_LOCAL:
10064                         np = &vstate->dtvs_nlocals;
10065                         svarp = vstate->dtvs_locals;
10066                         break;
10067
10068                 case DIFV_SCOPE_GLOBAL:
10069                         np = &vstate->dtvs_nglobals;
10070                         svarp = vstate->dtvs_globals;
10071                         break;
10072
10073                 default:
10074                         ASSERT(0);
10075                 }
10076
10077                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10078                         continue;
10079
10080                 id -= DIF_VAR_OTHER_UBASE;
10081
10082 #if !defined(__APPLE__) /* Quiet compiler warnings */
10083                 ASSERT(id < *np);
10084 #else
10085                 ASSERT(id < (uint_t)*np);
10086 #endif /* __APPLE__ */
10087
10088                 svar = svarp[id];
10089                 ASSERT(svar != NULL);
10090                 ASSERT(svar->dtsv_refcnt > 0);
10091
10092                 if (--svar->dtsv_refcnt > 0)
10093                         continue;
10094
10095                 if (svar->dtsv_size != 0) {
10096                         ASSERT(svar->dtsv_data != NULL);
10097                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
10098                             svar->dtsv_size);
10099                 }
10100
10101                 kmem_free(svar, sizeof (dtrace_statvar_t));
10102                 svarp[id] = NULL;
10103         }
10104
10105         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10106         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10107         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10108         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10109
10110         kmem_free(dp, sizeof (dtrace_difo_t));
10111 }
10112
10113 static void
10114 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10115 {
10116 #if !defined(__APPLE__) /* Quiet compiler warnings */
10117         int i;
10118 #else
10119         uint_t i;
10120 #endif /* __APPLE__ */
10121
10122         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10123         ASSERT(dp->dtdo_refcnt != 0);
10124
10125         for (i = 0; i < dp->dtdo_varlen; i++) {
10126                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10127
10128                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10129                         continue;
10130
10131                 ASSERT(dtrace_vtime_references > 0);
10132                 if (--dtrace_vtime_references == 0)
10133                         dtrace_vtime_disable();
10134         }
10135
10136         if (--dp->dtdo_refcnt == 0)
10137                 dtrace_difo_destroy(dp, vstate);
10138 }
10139
10140 /*
10141  * DTrace Format Functions
10142  */
10143 static uint16_t
10144 dtrace_format_add(dtrace_state_t *state, char *str)
10145 {
10146         char *fmt, **new;
10147         uint16_t ndx, len = strlen(str) + 1;
10148
10149         fmt = kmem_zalloc(len, KM_SLEEP);
10150         bcopy(str, fmt, len);
10151
10152         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10153                 if (state->dts_formats[ndx] == NULL) {
10154                         state->dts_formats[ndx] = fmt;
10155                         return (ndx + 1);
10156                 }
10157         }
10158
10159         if (state->dts_nformats == USHRT_MAX) {
10160                 /*
10161                  * This is only likely if a denial-of-service attack is being
10162                  * attempted.  As such, it's okay to fail silently here.
10163                  */
10164                 kmem_free(fmt, len);
10165                 return (0);
10166         }
10167
10168         /*
10169          * For simplicity, we always resize the formats array to be exactly the
10170          * number of formats.
10171          */
10172         ndx = state->dts_nformats++;
10173         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10174
10175         if (state->dts_formats != NULL) {
10176                 ASSERT(ndx != 0);
10177                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10178                 kmem_free(state->dts_formats, ndx * sizeof (char *));
10179         }
10180
10181         state->dts_formats = new;
10182         state->dts_formats[ndx] = fmt;
10183
10184         return (ndx + 1);
10185 }
10186
10187 static void
10188 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10189 {
10190         char *fmt;
10191
10192         ASSERT(state->dts_formats != NULL);
10193         ASSERT(format <= state->dts_nformats);
10194         ASSERT(state->dts_formats[format - 1] != NULL);
10195
10196         fmt = state->dts_formats[format - 1];
10197         kmem_free(fmt, strlen(fmt) + 1);
10198         state->dts_formats[format - 1] = NULL;
10199 }
10200
10201 static void
10202 dtrace_format_destroy(dtrace_state_t *state)
10203 {
10204         int i;
10205
10206         if (state->dts_nformats == 0) {
10207                 ASSERT(state->dts_formats == NULL);
10208                 return;
10209         }
10210
10211         ASSERT(state->dts_formats != NULL);
10212
10213         for (i = 0; i < state->dts_nformats; i++) {
10214                 char *fmt = state->dts_formats[i];
10215
10216                 if (fmt == NULL)
10217                         continue;
10218
10219                 kmem_free(fmt, strlen(fmt) + 1);
10220         }
10221
10222         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10223         state->dts_nformats = 0;
10224         state->dts_formats = NULL;
10225 }
10226
10227 /*
10228  * DTrace Predicate Functions
10229  */
10230 static dtrace_predicate_t *
10231 dtrace_predicate_create(dtrace_difo_t *dp)
10232 {
10233         dtrace_predicate_t *pred;
10234
10235         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10236         ASSERT(dp->dtdo_refcnt != 0);
10237
10238         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10239         pred->dtp_difo = dp;
10240         pred->dtp_refcnt = 1;
10241
10242         if (!dtrace_difo_cacheable(dp))
10243                 return (pred);
10244
10245         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10246                 /*
10247                  * This is only theoretically possible -- we have had 2^32
10248                  * cacheable predicates on this machine.  We cannot allow any
10249                  * more predicates to become cacheable:  as unlikely as it is,
10250                  * there may be a thread caching a (now stale) predicate cache
10251                  * ID. (N.B.: the temptation is being successfully resisted to
10252                  * have this cmn_err() "Holy shit -- we executed this code!")
10253                  */
10254                 return (pred);
10255         }
10256
10257         pred->dtp_cacheid = dtrace_predcache_id++;
10258
10259         return (pred);
10260 }
10261
10262 static void
10263 dtrace_predicate_hold(dtrace_predicate_t *pred)
10264 {
10265         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10266         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10267         ASSERT(pred->dtp_refcnt > 0);
10268
10269         pred->dtp_refcnt++;
10270 }
10271
10272 static void
10273 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10274 {
10275         dtrace_difo_t *dp = pred->dtp_difo;
10276 #pragma unused(dp) /* __APPLE__ */
10277
10278         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10279         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10280         ASSERT(pred->dtp_refcnt > 0);
10281
10282         if (--pred->dtp_refcnt == 0) {
10283                 dtrace_difo_release(pred->dtp_difo, vstate);
10284                 kmem_free(pred, sizeof (dtrace_predicate_t));
10285         }
10286 }
10287
10288 /*
10289  * DTrace Action Description Functions
10290  */
10291 static dtrace_actdesc_t *
10292 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10293     uint64_t uarg, uint64_t arg)
10294 {
10295         dtrace_actdesc_t *act;
10296
10297         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10298             arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10299
10300         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10301         act->dtad_kind = kind;
10302         act->dtad_ntuple = ntuple;
10303         act->dtad_uarg = uarg;
10304         act->dtad_arg = arg;
10305         act->dtad_refcnt = 1;
10306
10307         return (act);
10308 }
10309
10310 static void
10311 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10312 {
10313         ASSERT(act->dtad_refcnt >= 1);
10314         act->dtad_refcnt++;
10315 }
10316
10317 static void
10318 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10319 {
10320         dtrace_actkind_t kind = act->dtad_kind;
10321         dtrace_difo_t *dp;
10322
10323         ASSERT(act->dtad_refcnt >= 1);
10324
10325         if (--act->dtad_refcnt != 0)
10326                 return;
10327
10328         if ((dp = act->dtad_difo) != NULL)
10329                 dtrace_difo_release(dp, vstate);
10330
10331         if (DTRACEACT_ISPRINTFLIKE(kind)) {
10332                 char *str = (char *)(uintptr_t)act->dtad_arg;
10333
10334                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10335                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10336
10337                 if (str != NULL)
10338                         kmem_free(str, strlen(str) + 1);
10339         }
10340
10341         kmem_free(act, sizeof (dtrace_actdesc_t));
10342 }
10343
10344 /*
10345  * DTrace ECB Functions
10346  */
10347 static dtrace_ecb_t *
10348 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10349 {
10350         dtrace_ecb_t *ecb;
10351         dtrace_epid_t epid;
10352
10353         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10354
10355         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10356         ecb->dte_predicate = NULL;
10357         ecb->dte_probe = probe;
10358
10359         /*
10360          * The default size is the size of the default action: recording
10361          * the epid.
10362          */
10363         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
10364         ecb->dte_alignment = sizeof (dtrace_epid_t);
10365
10366         epid = state->dts_epid++;
10367
10368 #if !defined(__APPLE__) /* Quiet compiler warnings */
10369         if (epid - 1 >= state->dts_necbs) {
10370 #else
10371         if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10372 #endif /* __APPLE__ */
10373                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10374                 int necbs = state->dts_necbs << 1;
10375
10376 #if !defined(__APPLE__) /* Quiet compiler warnings */
10377                 ASSERT(epid == state->dts_necbs + 1);
10378 #else
10379                 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10380 #endif /* __APPLE__ */
10381
10382                 if (necbs == 0) {
10383                         ASSERT(oecbs == NULL);
10384                         necbs = 1;
10385                 }
10386
10387                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10388
10389                 if (oecbs != NULL)
10390                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10391
10392                 dtrace_membar_producer();
10393                 state->dts_ecbs = ecbs;
10394
10395                 if (oecbs != NULL) {
10396                         /*
10397                          * If this state is active, we must dtrace_sync()
10398                          * before we can free the old dts_ecbs array:  we're
10399                          * coming in hot, and there may be active ring
10400                          * buffer processing (which indexes into the dts_ecbs
10401                          * array) on another CPU.
10402                          */
10403                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10404                                 dtrace_sync();
10405
10406                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10407                 }
10408
10409                 dtrace_membar_producer();
10410                 state->dts_necbs = necbs;
10411         }
10412
10413         ecb->dte_state = state;
10414
10415         ASSERT(state->dts_ecbs[epid - 1] == NULL);
10416         dtrace_membar_producer();
10417         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10418
10419         return (ecb);
10420 }
10421
10422 static int
10423 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10424 {
10425         dtrace_probe_t *probe = ecb->dte_probe;
10426
10427         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10428         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10429         ASSERT(ecb->dte_next == NULL);
10430
10431         if (probe == NULL) {
10432                 /*
10433                  * This is the NULL probe -- there's nothing to do.
10434                  */
10435             return(0);
10436         }
10437
10438         if (probe->dtpr_ecb == NULL) {
10439                 dtrace_provider_t *prov = probe->dtpr_provider;
10440
10441                 /*
10442                  * We're the first ECB on this probe.
10443                  */
10444                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10445
10446                 if (ecb->dte_predicate != NULL)
10447                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10448
10449                 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10450                     probe->dtpr_id, probe->dtpr_arg));
10451         } else {
10452                 /*
10453                  * This probe is already active.  Swing the last pointer to
10454                  * point to the new ECB, and issue a dtrace_sync() to assure
10455                  * that all CPUs have seen the change.
10456                  */
10457                 ASSERT(probe->dtpr_ecb_last != NULL);
10458                 probe->dtpr_ecb_last->dte_next = ecb;
10459                 probe->dtpr_ecb_last = ecb;
10460                 probe->dtpr_predcache = 0;
10461
10462                 dtrace_sync();
10463                 return(0);
10464         }
10465 }
10466
10467 static void
10468 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10469 {
10470         uint32_t maxalign = sizeof (dtrace_epid_t);
10471         uint32_t align = sizeof (uint8_t), offs, diff;
10472         dtrace_action_t *act;
10473         int wastuple = 0;
10474         uint32_t aggbase = UINT32_MAX;
10475         dtrace_state_t *state = ecb->dte_state;
10476
10477         /*
10478          * If we record anything, we always record the epid.  (And we always
10479          * record it first.)
10480          */
10481         offs = sizeof (dtrace_epid_t);
10482         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
10483
10484         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10485                 dtrace_recdesc_t *rec = &act->dta_rec;
10486
10487                 if ((align = rec->dtrd_alignment) > maxalign)
10488                         maxalign = align;
10489
10490                 if (!wastuple && act->dta_intuple) {
10491                         /*
10492                          * This is the first record in a tuple.  Align the
10493                          * offset to be at offset 4 in an 8-byte aligned
10494                          * block.
10495                          */
10496                         diff = offs + sizeof (dtrace_aggid_t);
10497
10498                         if ((diff = (diff & (sizeof (uint64_t) - 1))))
10499                                 offs += sizeof (uint64_t) - diff;
10500
10501                         aggbase = offs - sizeof (dtrace_aggid_t);
10502                         ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
10503                 }
10504
10505                 /*LINTED*/
10506                 if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
10507                         /*
10508                          * The current offset is not properly aligned; align it.
10509                          */
10510                         offs += align - diff;
10511                 }
10512
10513                 rec->dtrd_offset = offs;
10514
10515                 if (offs + rec->dtrd_size > ecb->dte_needed) {
10516                         ecb->dte_needed = offs + rec->dtrd_size;
10517
10518                         if (ecb->dte_needed > state->dts_needed)
10519                                 state->dts_needed = ecb->dte_needed;
10520                 }
10521
10522                 if (DTRACEACT_ISAGG(act->dta_kind)) {
10523                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10524                         dtrace_action_t *first = agg->dtag_first, *prev;
10525
10526                         ASSERT(rec->dtrd_size != 0 && first != NULL);
10527                         ASSERT(wastuple);
10528                         ASSERT(aggbase != UINT32_MAX);
10529
10530                         agg->dtag_base = aggbase;
10531
10532                         while ((prev = first->dta_prev) != NULL &&
10533                             DTRACEACT_ISAGG(prev->dta_kind)) {
10534                                 agg = (dtrace_aggregation_t *)prev;
10535                                 first = agg->dtag_first;
10536                         }
10537
10538                         if (prev != NULL) {
10539                                 offs = prev->dta_rec.dtrd_offset +
10540                                     prev->dta_rec.dtrd_size;
10541                         } else {
10542                                 offs = sizeof (dtrace_epid_t);
10543                         }
10544                         wastuple = 0;
10545                 } else {
10546                         if (!act->dta_intuple)
10547                                 ecb->dte_size = offs + rec->dtrd_size;
10548
10549                         offs += rec->dtrd_size;
10550                 }
10551
10552                 wastuple = act->dta_intuple;
10553         }
10554
10555         if ((act = ecb->dte_action) != NULL &&
10556             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10557             ecb->dte_size == sizeof (dtrace_epid_t)) {
10558                 /*
10559                  * If the size is still sizeof (dtrace_epid_t), then all
10560                  * actions store no data; set the size to 0.
10561                  */
10562                 ecb->dte_alignment = maxalign;
10563                 ecb->dte_size = 0;
10564
10565                 /*
10566                  * If the needed space is still sizeof (dtrace_epid_t), then
10567                  * all actions need no additional space; set the needed
10568                  * size to 0.
10569                  */
10570                 if (ecb->dte_needed == sizeof (dtrace_epid_t))
10571                         ecb->dte_needed = 0;
10572
10573                 return;
10574         }
10575
10576         /*
10577          * Set our alignment, and make sure that the dte_size and dte_needed
10578          * are aligned to the size of an EPID.
10579          */
10580         ecb->dte_alignment = maxalign;
10581         ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
10582             ~(sizeof (dtrace_epid_t) - 1);
10583         ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
10584             ~(sizeof (dtrace_epid_t) - 1);
10585         ASSERT(ecb->dte_size <= ecb->dte_needed);
10586 }
10587
10588 static dtrace_action_t *
10589 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10590 {
10591         dtrace_aggregation_t *agg;
10592         size_t size = sizeof (uint64_t);
10593         int ntuple = desc->dtad_ntuple;
10594         dtrace_action_t *act;
10595         dtrace_recdesc_t *frec;
10596         dtrace_aggid_t aggid;
10597         dtrace_state_t *state = ecb->dte_state;
10598
10599         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10600         agg->dtag_ecb = ecb;
10601
10602         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10603
10604         switch (desc->dtad_kind) {
10605         case DTRACEAGG_MIN:
10606                 agg->dtag_initial = INT64_MAX;
10607                 agg->dtag_aggregate = dtrace_aggregate_min;
10608                 break;
10609
10610         case DTRACEAGG_MAX:
10611                 agg->dtag_initial = INT64_MIN;
10612                 agg->dtag_aggregate = dtrace_aggregate_max;
10613                 break;
10614
10615         case DTRACEAGG_COUNT:
10616                 agg->dtag_aggregate = dtrace_aggregate_count;
10617                 break;
10618
10619         case DTRACEAGG_QUANTIZE:
10620                 agg->dtag_aggregate = dtrace_aggregate_quantize;
10621                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10622                     sizeof (uint64_t);
10623                 break;
10624
10625         case DTRACEAGG_LQUANTIZE: {
10626                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10627                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10628
10629                 agg->dtag_initial = desc->dtad_arg;
10630                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
10631
10632                 if (step == 0 || levels == 0)
10633                         goto err;
10634
10635                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10636                 break;
10637         }
10638
10639         case DTRACEAGG_AVG:
10640                 agg->dtag_aggregate = dtrace_aggregate_avg;
10641                 size = sizeof (uint64_t) * 2;
10642                 break;
10643
10644         case DTRACEAGG_STDDEV:
10645                 agg->dtag_aggregate = dtrace_aggregate_stddev;
10646                 size = sizeof (uint64_t) * 4;
10647                 break;
10648
10649         case DTRACEAGG_SUM:
10650                 agg->dtag_aggregate = dtrace_aggregate_sum;
10651                 break;
10652
10653         default:
10654                 goto err;
10655         }
10656
10657         agg->dtag_action.dta_rec.dtrd_size = size;
10658
10659         if (ntuple == 0)
10660                 goto err;
10661
10662         /*
10663          * We must make sure that we have enough actions for the n-tuple.
10664          */
10665         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10666                 if (DTRACEACT_ISAGG(act->dta_kind))
10667                         break;
10668
10669                 if (--ntuple == 0) {
10670                         /*
10671                          * This is the action with which our n-tuple begins.
10672                          */
10673                         agg->dtag_first = act;
10674                         goto success;
10675                 }
10676         }
10677
10678         /*
10679          * This n-tuple is short by ntuple elements.  Return failure.
10680          */
10681         ASSERT(ntuple != 0);
10682 err:
10683         kmem_free(agg, sizeof (dtrace_aggregation_t));
10684         return (NULL);
10685
10686 success:
10687         /*
10688          * If the last action in the tuple has a size of zero, it's actually
10689          * an expression argument for the aggregating action.
10690          */
10691         ASSERT(ecb->dte_action_last != NULL);
10692         act = ecb->dte_action_last;
10693
10694         if (act->dta_kind == DTRACEACT_DIFEXPR) {
10695                 ASSERT(act->dta_difo != NULL);
10696
10697                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10698                         agg->dtag_hasarg = 1;
10699         }
10700
10701         /*
10702          * We need to allocate an id for this aggregation.
10703          */
10704         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10705             VM_BESTFIT | VM_SLEEP);
10706
10707 #if !defined(__APPLE__) /* Quiet compiler warnings */
10708         if (aggid - 1 >= state->dts_naggregations) {
10709 #else
10710         if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
10711 #endif /* __APPLE__ */
10712                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
10713                 dtrace_aggregation_t **aggs;
10714                 int naggs = state->dts_naggregations << 1;
10715                 int onaggs = state->dts_naggregations;
10716
10717 #if !defined(__APPLE__) /* Quiet compiler warnings */
10718                 ASSERT(aggid == state->dts_naggregations + 1);
10719 #else
10720                 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
10721 #endif /* __APPLE */
10722
10723                 if (naggs == 0) {
10724                         ASSERT(oaggs == NULL);
10725                         naggs = 1;
10726                 }
10727
10728                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10729
10730                 if (oaggs != NULL) {
10731                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10732                         kmem_free(oaggs, onaggs * sizeof (*aggs));
10733                 }
10734
10735                 state->dts_aggregations = aggs;
10736                 state->dts_naggregations = naggs;
10737         }
10738
10739         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10740         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10741
10742         frec = &agg->dtag_first->dta_rec;
10743         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10744                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10745
10746         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10747                 ASSERT(!act->dta_intuple);
10748                 act->dta_intuple = 1;
10749         }
10750
10751         return (&agg->dtag_action);
10752 }
10753
10754 static void
10755 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10756 {
10757         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10758         dtrace_state_t *state = ecb->dte_state;
10759         dtrace_aggid_t aggid = agg->dtag_id;
10760
10761         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10762         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10763
10764         ASSERT(state->dts_aggregations[aggid - 1] == agg);
10765         state->dts_aggregations[aggid - 1] = NULL;
10766
10767         kmem_free(agg, sizeof (dtrace_aggregation_t));
10768 }
10769
10770 static int
10771 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10772 {
10773         dtrace_action_t *action, *last;
10774         dtrace_difo_t *dp = desc->dtad_difo;
10775         uint32_t size = 0, align = sizeof (uint8_t), mask;
10776         uint16_t format = 0;
10777         dtrace_recdesc_t *rec;
10778         dtrace_state_t *state = ecb->dte_state;
10779 #if !defined(__APPLE__) /* Quiet compiler warnings */
10780         dtrace_optval_t *opt = state->dts_options, nframes, strsize;
10781 #else
10782         dtrace_optval_t *opt = state->dts_options;
10783         dtrace_optval_t nframes=0, strsize;
10784 #endif /* __APPLE__ */
10785         uint64_t arg = desc->dtad_arg;
10786
10787         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10788         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10789
10790         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10791                 /*
10792                  * If this is an aggregating action, there must be neither
10793                  * a speculate nor a commit on the action chain.
10794                  */
10795                 dtrace_action_t *act;
10796
10797                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10798                         if (act->dta_kind == DTRACEACT_COMMIT)
10799                                 return (EINVAL);
10800
10801                         if (act->dta_kind == DTRACEACT_SPECULATE)
10802                                 return (EINVAL);
10803                 }
10804
10805                 action = dtrace_ecb_aggregation_create(ecb, desc);
10806
10807                 if (action == NULL)
10808                         return (EINVAL);
10809         } else {
10810                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10811                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10812                     dp != NULL && dp->dtdo_destructive)) {
10813                         state->dts_destructive = 1;
10814                 }
10815
10816                 switch (desc->dtad_kind) {
10817                 case DTRACEACT_PRINTF:
10818                 case DTRACEACT_PRINTA:
10819                 case DTRACEACT_SYSTEM:
10820                 case DTRACEACT_FREOPEN:
10821                         /*
10822                          * We know that our arg is a string -- turn it into a
10823                          * format.
10824                          */
10825                         if (arg == NULL) {
10826                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
10827                                 format = 0;
10828                         } else {
10829                                 ASSERT(arg != NULL);
10830                                 ASSERT(arg > KERNELBASE);
10831                                 format = dtrace_format_add(state,
10832                                     (char *)(uintptr_t)arg);
10833                         }
10834
10835                         /*FALLTHROUGH*/
10836                 case DTRACEACT_LIBACT:
10837                 case DTRACEACT_DIFEXPR:
10838 #if defined(__APPLE__)
10839                 case DTRACEACT_APPLEBINARY:
10840 #endif /* __APPLE__ */
10841                         if (dp == NULL)
10842                                 return (EINVAL);
10843
10844                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10845                                 break;
10846
10847                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10848                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10849                                         return (EINVAL);
10850
10851                                 size = opt[DTRACEOPT_STRSIZE];
10852                         }
10853
10854                         break;
10855
10856                 case DTRACEACT_STACK:
10857                         if ((nframes = arg) == 0) {
10858                                 nframes = opt[DTRACEOPT_STACKFRAMES];
10859                                 ASSERT(nframes > 0);
10860                                 arg = nframes;
10861                         }
10862
10863                         size = nframes * sizeof (pc_t);
10864                         break;
10865
10866                 case DTRACEACT_JSTACK:
10867                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10868                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10869
10870                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10871                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10872
10873                         arg = DTRACE_USTACK_ARG(nframes, strsize);
10874
10875                         /*FALLTHROUGH*/
10876                 case DTRACEACT_USTACK:
10877                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
10878                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10879                                 strsize = DTRACE_USTACK_STRSIZE(arg);
10880                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
10881                                 ASSERT(nframes > 0);
10882                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
10883                         }
10884
10885                         /*
10886                          * Save a slot for the pid.
10887                          */
10888                         size = (nframes + 1) * sizeof (uint64_t);
10889                         size += DTRACE_USTACK_STRSIZE(arg);
10890                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10891
10892                         break;
10893
10894                 case DTRACEACT_SYM:
10895                 case DTRACEACT_MOD:
10896                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10897                             sizeof (uint64_t)) ||
10898                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10899                                 return (EINVAL);
10900                         break;
10901
10902                 case DTRACEACT_USYM:
10903                 case DTRACEACT_UMOD:
10904                 case DTRACEACT_UADDR:
10905                         if (dp == NULL ||
10906                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10907                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10908                                 return (EINVAL);
10909
10910                         /*
10911                          * We have a slot for the pid, plus a slot for the
10912                          * argument.  To keep things simple (aligned with
10913                          * bitness-neutral sizing), we store each as a 64-bit
10914                          * quantity.
10915                          */
10916                         size = 2 * sizeof (uint64_t);
10917                         break;
10918
10919                 case DTRACEACT_STOP:
10920                 case DTRACEACT_BREAKPOINT:
10921                 case DTRACEACT_PANIC:
10922                         break;
10923
10924                 case DTRACEACT_CHILL:
10925                 case DTRACEACT_DISCARD:
10926                 case DTRACEACT_RAISE:
10927 #if defined(__APPLE__)
10928                 case DTRACEACT_PIDRESUME:
10929 #endif /* __APPLE__ */
10930                         if (dp == NULL)
10931                                 return (EINVAL);
10932                         break;
10933
10934                 case DTRACEACT_EXIT:
10935                         if (dp == NULL ||
10936                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10937                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10938                                 return (EINVAL);
10939                         break;
10940
10941                 case DTRACEACT_SPECULATE:
10942                         if (ecb->dte_size > sizeof (dtrace_epid_t))
10943                                 return (EINVAL);
10944
10945                         if (dp == NULL)
10946                                 return (EINVAL);
10947
10948                         state->dts_speculates = 1;
10949                         break;
10950
10951                 case DTRACEACT_COMMIT: {
10952                         dtrace_action_t *act = ecb->dte_action;
10953
10954                         for (; act != NULL; act = act->dta_next) {
10955                                 if (act->dta_kind == DTRACEACT_COMMIT)
10956                                         return (EINVAL);
10957                         }
10958
10959                         if (dp == NULL)
10960                                 return (EINVAL);
10961                         break;
10962                 }
10963
10964                 default:
10965                         return (EINVAL);
10966                 }
10967
10968                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10969                         /*
10970                          * If this is a data-storing action or a speculate,
10971                          * we must be sure that there isn't a commit on the
10972                          * action chain.
10973                          */
10974                         dtrace_action_t *act = ecb->dte_action;
10975
10976                         for (; act != NULL; act = act->dta_next) {
10977                                 if (act->dta_kind == DTRACEACT_COMMIT)
10978                                         return (EINVAL);
10979                         }
10980                 }
10981
10982                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10983                 action->dta_rec.dtrd_size = size;
10984         }
10985
10986         action->dta_refcnt = 1;
10987         rec = &action->dta_rec;
10988         size = rec->dtrd_size;
10989
10990         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10991                 if (!(size & mask)) {
10992                         align = mask + 1;
10993                         break;
10994                 }
10995         }
10996
10997         action->dta_kind = desc->dtad_kind;
10998
10999         if ((action->dta_difo = dp) != NULL)
11000                 dtrace_difo_hold(dp);
11001
11002         rec->dtrd_action = action->dta_kind;
11003         rec->dtrd_arg = arg;
11004         rec->dtrd_uarg = desc->dtad_uarg;
11005         rec->dtrd_alignment = (uint16_t)align;
11006         rec->dtrd_format = format;
11007
11008         if ((last = ecb->dte_action_last) != NULL) {
11009                 ASSERT(ecb->dte_action != NULL);
11010                 action->dta_prev = last;
11011                 last->dta_next = action;
11012         } else {
11013                 ASSERT(ecb->dte_action == NULL);
11014                 ecb->dte_action = action;
11015         }
11016
11017         ecb->dte_action_last = action;
11018
11019         return (0);
11020 }
11021
11022 static void
11023 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11024 {
11025         dtrace_action_t *act = ecb->dte_action, *next;
11026         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11027         dtrace_difo_t *dp;
11028         uint16_t format;
11029
11030         if (act != NULL && act->dta_refcnt > 1) {
11031                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11032                 act->dta_refcnt--;
11033         } else {
11034                 for (; act != NULL; act = next) {
11035                         next = act->dta_next;
11036                         ASSERT(next != NULL || act == ecb->dte_action_last);
11037                         ASSERT(act->dta_refcnt == 1);
11038
11039                         if ((format = act->dta_rec.dtrd_format) != 0)
11040                                 dtrace_format_remove(ecb->dte_state, format);
11041
11042                         if ((dp = act->dta_difo) != NULL)
11043                                 dtrace_difo_release(dp, vstate);
11044
11045                         if (DTRACEACT_ISAGG(act->dta_kind)) {
11046                                 dtrace_ecb_aggregation_destroy(ecb, act);
11047                         } else {
11048                                 kmem_free(act, sizeof (dtrace_action_t));
11049                         }
11050                 }
11051         }
11052
11053         ecb->dte_action = NULL;
11054         ecb->dte_action_last = NULL;
11055         ecb->dte_size = sizeof (dtrace_epid_t);
11056 }
11057
11058 static void
11059 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11060 {
11061         /*
11062          * We disable the ECB by removing it from its probe.
11063          */
11064         dtrace_ecb_t *pecb, *prev = NULL;
11065         dtrace_probe_t *probe = ecb->dte_probe;
11066
11067         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11068
11069         if (probe == NULL) {
11070                 /*
11071                  * This is the NULL probe; there is nothing to disable.
11072                  */
11073                 return;
11074         }
11075
11076         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11077                 if (pecb == ecb)
11078                         break;
11079                 prev = pecb;
11080         }
11081
11082         ASSERT(pecb != NULL);
11083
11084         if (prev == NULL) {
11085                 probe->dtpr_ecb = ecb->dte_next;
11086         } else {
11087                 prev->dte_next = ecb->dte_next;
11088         }
11089
11090         if (ecb == probe->dtpr_ecb_last) {
11091                 ASSERT(ecb->dte_next == NULL);
11092                 probe->dtpr_ecb_last = prev;
11093         }
11094
11095         /*
11096          * The ECB has been disconnected from the probe; now sync to assure
11097          * that all CPUs have seen the change before returning.
11098          */
11099         dtrace_sync();
11100
11101         if (probe->dtpr_ecb == NULL) {
11102                 /*
11103                  * That was the last ECB on the probe; clear the predicate
11104                  * cache ID for the probe, disable it and sync one more time
11105                  * to assure that we'll never hit it again.
11106                  */
11107                 dtrace_provider_t *prov = probe->dtpr_provider;
11108
11109                 ASSERT(ecb->dte_next == NULL);
11110                 ASSERT(probe->dtpr_ecb_last == NULL);
11111                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11112                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11113                     probe->dtpr_id, probe->dtpr_arg);
11114                 dtrace_sync();
11115         } else {
11116                 /*
11117                  * There is at least one ECB remaining on the probe.  If there
11118                  * is _exactly_ one, set the probe's predicate cache ID to be
11119                  * the predicate cache ID of the remaining ECB.
11120                  */
11121                 ASSERT(probe->dtpr_ecb_last != NULL);
11122                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11123
11124                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11125                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11126
11127                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
11128
11129                         if (p != NULL)
11130                                 probe->dtpr_predcache = p->dtp_cacheid;
11131                 }
11132
11133                 ecb->dte_next = NULL;
11134         }
11135 }
11136
11137 static void
11138 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11139 {
11140         dtrace_state_t *state = ecb->dte_state;
11141         dtrace_vstate_t *vstate = &state->dts_vstate;
11142         dtrace_predicate_t *pred;
11143         dtrace_epid_t epid = ecb->dte_epid;
11144
11145         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11146         ASSERT(ecb->dte_next == NULL);
11147         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11148
11149         if ((pred = ecb->dte_predicate) != NULL)
11150                 dtrace_predicate_release(pred, vstate);
11151
11152         dtrace_ecb_action_remove(ecb);
11153
11154         ASSERT(state->dts_ecbs[epid - 1] == ecb);
11155         state->dts_ecbs[epid - 1] = NULL;
11156
11157         kmem_free(ecb, sizeof (dtrace_ecb_t));
11158 }
11159
11160 static dtrace_ecb_t *
11161 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11162     dtrace_enabling_t *enab)
11163 {
11164         dtrace_ecb_t *ecb;
11165         dtrace_predicate_t *pred;
11166         dtrace_actdesc_t *act;
11167         dtrace_provider_t *prov;
11168         dtrace_ecbdesc_t *desc = enab->dten_current;
11169
11170         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11171         ASSERT(state != NULL);
11172
11173         ecb = dtrace_ecb_add(state, probe);
11174         ecb->dte_uarg = desc->dted_uarg;
11175
11176         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11177                 dtrace_predicate_hold(pred);
11178                 ecb->dte_predicate = pred;
11179         }
11180
11181         if (probe != NULL) {
11182                 /*
11183                  * If the provider shows more leg than the consumer is old
11184                  * enough to see, we need to enable the appropriate implicit
11185                  * predicate bits to prevent the ecb from activating at
11186                  * revealing times.
11187                  *
11188                  * Providers specifying DTRACE_PRIV_USER at register time
11189                  * are stating that they need the /proc-style privilege
11190                  * model to be enforced, and this is what DTRACE_COND_OWNER
11191                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
11192                  */
11193                 prov = probe->dtpr_provider;
11194                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11195                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11196                         ecb->dte_cond |= DTRACE_COND_OWNER;
11197
11198                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11199                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11200                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11201
11202                 /*
11203                  * If the provider shows us kernel innards and the user
11204                  * is lacking sufficient privilege, enable the
11205                  * DTRACE_COND_USERMODE implicit predicate.
11206                  */
11207                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11208                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11209                         ecb->dte_cond |= DTRACE_COND_USERMODE;
11210         }
11211
11212         if (dtrace_ecb_create_cache != NULL) {
11213                 /*
11214                  * If we have a cached ecb, we'll use its action list instead
11215                  * of creating our own (saving both time and space).
11216                  */
11217                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11218                 dtrace_action_t *act_if = cached->dte_action;
11219
11220                 if (act_if != NULL) {
11221                         ASSERT(act_if->dta_refcnt > 0);
11222                         act_if->dta_refcnt++;
11223                         ecb->dte_action = act_if;
11224                         ecb->dte_action_last = cached->dte_action_last;
11225                         ecb->dte_needed = cached->dte_needed;
11226                         ecb->dte_size = cached->dte_size;
11227                         ecb->dte_alignment = cached->dte_alignment;
11228                 }
11229
11230                 return (ecb);
11231         }
11232
11233         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11234                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11235                         dtrace_ecb_destroy(ecb);
11236                         return (NULL);
11237                 }
11238         }
11239
11240         dtrace_ecb_resize(ecb);
11241
11242         return (dtrace_ecb_create_cache = ecb);
11243 }
11244
11245 static int
11246 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11247 {
11248         dtrace_ecb_t *ecb;
11249         dtrace_enabling_t *enab = arg;
11250         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11251
11252         ASSERT(state != NULL);
11253
11254         if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11255                 /*
11256                  * This probe was created in a generation for which this
11257                  * enabling has previously created ECBs; we don't want to
11258                  * enable it again, so just kick out.
11259                  */
11260                 return (DTRACE_MATCH_NEXT);
11261         }
11262
11263         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11264                 return (DTRACE_MATCH_DONE);
11265
11266         if (dtrace_ecb_enable(ecb) < 0)
11267                return (DTRACE_MATCH_FAIL);
11268
11269         return (DTRACE_MATCH_NEXT);
11270 }
11271
11272 static dtrace_ecb_t *
11273 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11274 {
11275         dtrace_ecb_t *ecb;
11276 #pragma unused(ecb) /* __APPLE__ */
11277
11278         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11279
11280 #if !defined(__APPLE__) /* Quiet compiler warnings */
11281         if (id == 0 || id > state->dts_necbs)
11282 #else
11283             if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11284 #endif /* __APPLE__ */
11285                 return (NULL);
11286
11287         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11288         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11289
11290         return (state->dts_ecbs[id - 1]);
11291 }
11292
11293 static dtrace_aggregation_t *
11294 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11295 {
11296         dtrace_aggregation_t *agg;
11297 #pragma unused(agg) /* __APPLE__ */
11298
11299         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11300
11301 #if !defined(__APPLE__) /* Quiet compiler warnings */
11302         if (id == 0 || id > state->dts_naggregations)
11303 #else
11304         if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11305 #endif /* __APPLE__ */
11306                 return (NULL);
11307
11308         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11309         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11310             agg->dtag_id == id);
11311
11312         return (state->dts_aggregations[id - 1]);
11313 }
11314
11315 /*
11316  * DTrace Buffer Functions
11317  *
11318  * The following functions manipulate DTrace buffers.  Most of these functions
11319  * are called in the context of establishing or processing consumer state;
11320  * exceptions are explicitly noted.
11321  */
11322
11323 /*
11324  * Note:  called from cross call context.  This function switches the two
11325  * buffers on a given CPU.  The atomicity of this operation is assured by
11326  * disabling interrupts while the actual switch takes place; the disabling of
11327  * interrupts serializes the execution with any execution of dtrace_probe() on
11328  * the same CPU.
11329  */
11330 static void
11331 dtrace_buffer_switch(dtrace_buffer_t *buf)
11332 {
11333         caddr_t tomax = buf->dtb_tomax;
11334         caddr_t xamot = buf->dtb_xamot;
11335         dtrace_icookie_t cookie;
11336
11337         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11338         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11339
11340         cookie = dtrace_interrupt_disable();
11341         buf->dtb_tomax = xamot;
11342         buf->dtb_xamot = tomax;
11343         buf->dtb_xamot_drops = buf->dtb_drops;
11344         buf->dtb_xamot_offset = buf->dtb_offset;
11345         buf->dtb_xamot_errors = buf->dtb_errors;
11346         buf->dtb_xamot_flags = buf->dtb_flags;
11347         buf->dtb_offset = 0;
11348         buf->dtb_drops = 0;
11349         buf->dtb_errors = 0;
11350         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11351         dtrace_interrupt_enable(cookie);
11352 }
11353
11354 /*
11355  * Note:  called from cross call context.  This function activates a buffer
11356  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11357  * is guaranteed by the disabling of interrupts.
11358  */
11359 static void
11360 dtrace_buffer_activate(dtrace_state_t *state)
11361 {
11362         dtrace_buffer_t *buf;
11363         dtrace_icookie_t cookie = dtrace_interrupt_disable();
11364
11365         buf = &state->dts_buffer[CPU->cpu_id];
11366
11367         if (buf->dtb_tomax != NULL) {
11368                 /*
11369                  * We might like to assert that the buffer is marked inactive,
11370                  * but this isn't necessarily true:  the buffer for the CPU
11371                  * that processes the BEGIN probe has its buffer activated
11372                  * manually.  In this case, we take the (harmless) action
11373                  * re-clearing the bit INACTIVE bit.
11374                  */
11375                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11376         }
11377
11378         dtrace_interrupt_enable(cookie);
11379 }
11380
11381 static int
11382 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11383     processorid_t cpu)
11384 {
11385         dtrace_cpu_t *cp;
11386         dtrace_buffer_t *buf;
11387
11388         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11389         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11390
11391 #if !defined(__APPLE__) /* Quiet compiler warnings */
11392         if (size > dtrace_nonroot_maxsize &&
11393             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11394                 return (EFBIG);
11395 #else
11396         if (size > (size_t)dtrace_nonroot_maxsize &&
11397             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11398                 return (EFBIG);
11399 #endif /* __APPLE__ */
11400
11401
11402 #if defined(__APPLE__)
11403         if (size > (sane_size / 8) / (int)NCPU) /* As in kdbg_set_nkdbufs(), roughly. */
11404                 return (ENOMEM);
11405 #endif /* __APPLE__ */
11406
11407         cp = cpu_list;
11408
11409         do {
11410                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11411                         continue;
11412
11413                 buf = &bufs[cp->cpu_id];
11414
11415                 /*
11416                  * If there is already a buffer allocated for this CPU, it
11417                  * is only possible that this is a DR event.  In this case,
11418                  * the buffer size must match our specified size.
11419                  */
11420                 if (buf->dtb_tomax != NULL) {
11421                         ASSERT(buf->dtb_size == size);
11422                         continue;
11423                 }
11424
11425                 ASSERT(buf->dtb_xamot == NULL);
11426
11427                 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11428                         goto err;
11429
11430                 buf->dtb_size = size;
11431                 buf->dtb_flags = flags;
11432                 buf->dtb_offset = 0;
11433                 buf->dtb_drops = 0;
11434
11435                 if (flags & DTRACEBUF_NOSWITCH)
11436                         continue;
11437
11438                 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11439                         goto err;
11440         } while ((cp = cp->cpu_next) != cpu_list);
11441
11442         return (0);
11443
11444 err:
11445         cp = cpu_list;
11446
11447         do {
11448                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11449                         continue;
11450
11451                 buf = &bufs[cp->cpu_id];
11452
11453                 if (buf->dtb_xamot != NULL) {
11454                         ASSERT(buf->dtb_tomax != NULL);
11455                         ASSERT(buf->dtb_size == size);
11456                         kmem_free(buf->dtb_xamot, size);
11457                 }
11458
11459                 if (buf->dtb_tomax != NULL) {
11460                         ASSERT(buf->dtb_size == size);
11461                         kmem_free(buf->dtb_tomax, size);
11462                 }
11463
11464                 buf->dtb_tomax = NULL;
11465                 buf->dtb_xamot = NULL;
11466                 buf->dtb_size = 0;
11467         } while ((cp = cp->cpu_next) != cpu_list);
11468
11469         return (ENOMEM);
11470 }
11471
11472 /*
11473  * Note:  called from probe context.  This function just increments the drop
11474  * count on a buffer.  It has been made a function to allow for the
11475  * possibility of understanding the source of mysterious drop counts.  (A
11476  * problem for which one may be particularly disappointed that DTrace cannot
11477  * be used to understand DTrace.)
11478  */
11479 static void
11480 dtrace_buffer_drop(dtrace_buffer_t *buf)
11481 {
11482         buf->dtb_drops++;
11483 }
11484
11485 /*
11486  * Note:  called from probe context.  This function is called to reserve space
11487  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11488  * mstate.  Returns the new offset in the buffer, or a negative value if an
11489  * error has occurred.
       *
       * Parameters:
       *   buf    - the buffer to reserve from; its dtb_flags select which of the
       *            three paths below is taken (neither RING nor FILL, FILL, or
       *            RING).
       *   needed - number of bytes to reserve.
       *   align  - required alignment of the returned offset.  The masking with
       *            (align - 1) presumes a power of two -- TODO confirm against
       *            callers.
       *   state  - consumer state; read for dts_activity and dts_reserve (fill
       *            buffers) and for the dts_ecbs record sizes (ring buffers).
       *   mstate - if non-NULL, receives the scratch base, size and pointer.
       *
       * The offset is only computed and returned here:  buf->dtb_offset is not
       * advanced by this function (it is written only in the ring-buffer edge
       * case below in which the WRAPPED bit is cleared, where it is reset to 0).
11490  */
11491 static intptr_t
11492 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11493     dtrace_state_t *state, dtrace_mstate_t *mstate)
11494 {
11495         intptr_t offs = buf->dtb_offset, soffs;
11496         intptr_t woffs;
11497         caddr_t tomax;
11498         size_t total_off;
11499
              /* An inactive buffer refuses the request; no drop is counted. */
11500         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11501                 return (-1);
11502
              /* No memory behind the buffer:  count a drop and fail. */
11503         if ((tomax = buf->dtb_tomax) == NULL) {
11504                 dtrace_buffer_drop(buf);
11505                 return (-1);
11506         }
11507
              /*
               * Neither RING nor FILL:  pad up to the alignment with
               * DTRACE_EPIDNONE words, then check that the record fits.
               */
11508         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11509                 while (offs & (align - 1)) {
11510                         /*
11511                          * Assert that our alignment is off by a number which
11512                          * is itself sizeof (uint32_t) aligned.
11513                          */
11514                         ASSERT(!((align - (offs & (align - 1))) &
11515                             (sizeof (uint32_t) - 1)));
11516                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11517                         offs += sizeof (uint32_t);
11518                 }
11519
11520 #if !defined(__APPLE__) /* Quiet compiler warnings */
11521                 if ((soffs = offs + needed) > buf->dtb_size) {
11522 #else
11523                 if ((uint64_t)(soffs = offs + needed) > buf->dtb_size) {
11524 #endif /* __APPLE__ */
11525                         dtrace_buffer_drop(buf);
11526                         return (-1);
11527                 }
11528
11529                 if (mstate == NULL)
11530                         return (offs);
11531
                      /*
                       * Scratch space is whatever remains of this same buffer
                       * beyond the end (soffs) of the reserved record.
                       */
11532                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11533                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
11534                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11535
11536                 return (offs);
11537         }
11538
              /*
               * Fill buffer:  once marked FULL, reservations fail (without a
               * drop being counted) unless the state is in COOLDOWN.  Otherwise
               * skip the ring logic entirely; the dts_reserve check is made
               * after padding, at "out".
               */
11539         if (buf->dtb_flags & DTRACEBUF_FILL) {
11540                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11541                     (buf->dtb_flags & DTRACEBUF_FULL))
11542                         return (-1);
11543                 goto out;
11544         }
11545
              /*
               * Only ring buffers reach this point.  total_off is the space
               * needed plus the misalignment of offs, (offs & (align - 1)).
               */
11546         total_off = needed + (offs & (align - 1));
11547
11548         /*
11549          * For a ring buffer, life is quite a bit more complicated.  Before
11550          * we can store any padding, we need to adjust our wrapping offset.
11551          * (If we've never before wrapped or we're not about to, no adjustment
11552          * is required.)
11553          */
11554         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11555             offs + total_off > buf->dtb_size) {
11556                 woffs = buf->dtb_xamot_offset;
11557
11558                 if (offs + total_off > buf->dtb_size) {
11559                         /*
11560                          * We can't fit in the end of the buffer.  First, a
11561                          * sanity check that we can fit in the buffer at all.
11562                          */
11563                         if (total_off > buf->dtb_size) {
11564                                 dtrace_buffer_drop(buf);
11565                                 return (-1);
11566                         }
11567
11568                         /*
11569                          * We're going to be storing at the top of the buffer,
11570                          * so now we need to deal with the wrapped offset.  We
11571                          * only reset our wrapped offset to 0 if it is
11572                          * currently greater than the current offset.  If it
11573                          * is less than the current offset, it is because a
11574                          * previous allocation induced a wrap -- but the
11575                          * allocation didn't subsequently take the space due
11576                          * to an error or false predicate evaluation.  In this
11577                          * case, we'll just leave the wrapped offset alone: if
11578                          * the wrapped offset hasn't been advanced far enough
11579                          * for this allocation, it will be adjusted in the
11580                          * lower loop.
11581                          */
11582                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11583                                 if (woffs >= offs)
11584                                         woffs = 0;
11585                         } else {
11586                                 woffs = 0;
11587                         }
11588
11589                         /*
11590                          * Now we know that we're going to be storing to the
11591                          * top of the buffer and that there is room for us
11592                          * there.  We need to clear the buffer from the current
11593                          * offset to the end (there may be old gunk there).
11594                          */
11595 #if !defined(__APPLE__) /* Quiet compiler warnings */
11596                         while (offs < buf->dtb_size)
11597 #else
11598                         while ((uint64_t)offs < buf->dtb_size)
11599 #endif /* __APPLE__ */
11600                                 tomax[offs++] = 0;
11601
11602                         /*
11603                          * We need to set our offset to zero.  And because we
11604                          * are wrapping, we need to set the bit indicating as
11605                          * much.  We can also adjust our needed space back
11606                          * down to the space required by the ECB -- we know
11607                          * that the top of the buffer is aligned.
11608                          */
11609                         offs = 0;
11610                         total_off = needed;
11611                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
11612                 } else {
11613                         /*
11614                          * There is room for us in the buffer, so we simply
11615                          * need to check the wrapped offset.
11616                          */
11617                         if (woffs < offs) {
11618                                 /*
11619                                  * The wrapped offset is less than the offset.
11620                                  * This can happen if we allocated buffer space
11621                                  * that induced a wrap, but then we didn't
11622                                  * subsequently take the space due to an error
11623                                  * or false predicate evaluation.  This is
11624                                  * okay; we know that _this_ allocation isn't
11625                                  * going to induce a wrap.  We still can't
11626                                  * reset the wrapped offset to be zero,
11627                                  * however: the space may have been trashed in
11628                                  * the previous failed probe attempt.  But at
11629                                  * least the wrapped offset doesn't need to
11630                                  * be adjusted at all...
11631                                  */
11632                                 goto out;
11633                         }
11634                 }
11635
                      /*
                       * Advance the wrapped offset record by record until it
                       * reaches or passes offs + total_off, or until it hits the
                       * end of the buffer.  Each record is sized from the EPID
                       * stored at its start:  DTRACE_EPIDNONE denotes a single
                       * uint32_t of padding, any other EPID indexes dts_ecbs for
                       * that ECB's dte_size.
                       */
11636 #if !defined(__APPLE__) /* Quiet compiler warnings */
11637                 while (offs + total_off > woffs) {
11638 #else
11639                 while (offs + total_off > (size_t)woffs) {
11640 #endif /* __APPLE__ */
11641                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11642                         size_t size;
11643
11644                         if (epid == DTRACE_EPIDNONE) {
11645                                 size = sizeof (uint32_t);
11646                         } else {
11647 #if !defined(__APPLE__) /* Quiet compiler warnings */
11648                                 ASSERT(epid <= state->dts_necbs);
11649 #else
11650                                 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
11651 #endif /* __APPLE__ */
11652                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
11653
11654                                 size = state->dts_ecbs[epid - 1]->dte_size;
11655                         }
11656
11657                         ASSERT(woffs + size <= buf->dtb_size);
11658                         ASSERT(size != 0);
11659
11660                         if (woffs + size == buf->dtb_size) {
11661                                 /*
11662                                  * We've reached the end of the buffer; we want
11663                                  * to set the wrapped offset to 0 and break
11664                                  * out.  However, if the offs is 0, then we're
11665                                  * in a strange edge-condition:  the amount of
11666                                  * space that we want to reserve plus the size
11667                                  * of the record that we're overwriting is
11668                                  * greater than the size of the buffer.  This
11669                                  * is problematic because if we reserve the
11670                                  * space but subsequently don't consume it (due
11671                                  * to a failed predicate or error) the wrapped
11672                                  * offset will be 0 -- yet the EPID at offset 0
11673                                  * will not be committed.  This situation is
11674                                  * relatively easy to deal with:  if we're in
11675                                  * this case, the buffer is indistinguishable
11676                                  * from one that hasn't wrapped; we need only
11677                                  * finish the job by clearing the wrapped bit,
11678                                  * explicitly setting the offset to be 0, and
11679                                  * zero'ing out the old data in the buffer.
11680                                  */
11681                                 if (offs == 0) {
11682                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11683                                         buf->dtb_offset = 0;
11684                                         woffs = total_off;
11685
11686 #if !defined(__APPLE__) /* Quiet compiler warnings */
11687                                         while (woffs < buf->dtb_size)
11688 #else
11689                                         while ((uint64_t)woffs < buf->dtb_size)
11690 #endif /* __APPLE__ */
11691
11692                                                 tomax[woffs++] = 0;
11693                                 }
11694
11695                                 woffs = 0;
11696                                 break;
11697                         }
11698
11699                         woffs += size;
11700                 }
11701
11702                 /*
11703                  * We have a wrapped offset.  It may be that the wrapped offset
11704                  * has become zero -- that's okay.
11705                  */
11706                 buf->dtb_xamot_offset = woffs;
11707         }
11708
11709 out:
11710         /*
11711          * Now we can plow the buffer with any necessary padding.
11712          */
11713         while (offs & (align - 1)) {
11714                 /*
11715                  * Assert that our alignment is off by a number which
11716                  * is itself sizeof (uint32_t) aligned.
11717                  */
11718                 ASSERT(!((align - (offs & (align - 1))) &
11719                     (sizeof (uint32_t) - 1)));
11720                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11721                 offs += sizeof (uint32_t);
11722         }
11723
              /*
               * A fill buffer that cannot hold this record while still leaving
               * dts_reserve bytes free is marked FULL; no drop is counted.
               */
11724         if (buf->dtb_flags & DTRACEBUF_FILL) {
11725                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
11726                         buf->dtb_flags |= DTRACEBUF_FULL;
11727                         return (-1);
11728                 }
11729         }
11730
11731         if (mstate == NULL)
11732                 return (offs);
11733
11734         /*
11735          * For ring buffers and fill buffers, the scratch space is always
11736          * the inactive buffer.
11737          */
11738         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11739         mstate->dtms_scratch_size = buf->dtb_size;
11740         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11741
11742         return (offs);
11743 }
11744
11745 static void
11746 dtrace_buffer_polish(dtrace_buffer_t *buf)
11747 {
11748         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11749         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11750
11751         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11752                 return;
11753
11754         /*
11755          * We need to polish the ring buffer.  There are three cases:
11756          *
11757          * - The first (and presumably most common) is that there is no gap
11758          *   between the buffer offset and the wrapped offset.  In this case,
11759          *   there is nothing in the buffer that isn't valid data; we can
11760          *   mark the buffer as polished and return.
11761          *
11762          * - The second (less common than the first but still more common
11763          *   than the third) is that there is a gap between the buffer offset
11764          *   and the wrapped offset, and the wrapped offset is larger than the
11765          *   buffer offset.  This can happen because of an alignment issue, or
11766          *   can happen because of a call to dtrace_buffer_reserve() that
11767          *   didn't subsequently consume the buffer space.  In this case,
11768          *   we need to zero the data from the buffer offset to the wrapped
11769          *   offset.
11770          *
11771          * - The third (and least common) is that there is a gap between the
11772          *   buffer offset and the wrapped offset, but the wrapped offset is
11773          *   _less_ than the buffer offset.  This can only happen because a
11774          *   call to dtrace_buffer_reserve() induced a wrap, but the space
11775          *   was not subsequently consumed.  In this case, we need to zero the
11776          *   space from the offset to the end of the buffer _and_ from the
11777          *   top of the buffer to the wrapped offset.
11778          */
11779         if (buf->dtb_offset < buf->dtb_xamot_offset) {
11780                 bzero(buf->dtb_tomax + buf->dtb_offset,
11781                     buf->dtb_xamot_offset - buf->dtb_offset);
11782         }
11783
11784         if (buf->dtb_offset > buf->dtb_xamot_offset) {
11785                 bzero(buf->dtb_tomax + buf->dtb_offset,
11786                     buf->dtb_size - buf->dtb_offset);
11787                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11788         }
11789 }
11790
11791 static void
11792 dtrace_buffer_free(dtrace_buffer_t *bufs)
11793 {
11794         int i;
11795
11796         for (i = 0; i < (int)NCPU; i++) {
11797                 dtrace_buffer_t *buf = &bufs[i];
11798
11799                 if (buf->dtb_tomax == NULL) {
11800                         ASSERT(buf->dtb_xamot == NULL);
11801                         ASSERT(buf->dtb_size == 0);
11802                         continue;
11803                 }
11804
11805                 if (buf->dtb_xamot != NULL) {
11806                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11807                         kmem_free(buf->dtb_xamot, buf->dtb_size);
11808                 }
11809
11810                 kmem_free(buf->dtb_tomax, buf->dtb_size);
11811                 buf->dtb_size = 0;
11812                 buf->dtb_tomax = NULL;
11813                 buf->dtb_xamot = NULL;
11814         }
11815 }
11816
11817 /*
11818  * DTrace Enabling Functions
11819  */
11820 static dtrace_enabling_t *
11821 dtrace_enabling_create(dtrace_vstate_t *vstate)
11822 {
11823         dtrace_enabling_t *enab;
11824
11825         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11826         enab->dten_vstate = vstate;
11827
11828         return (enab);
11829 }
11830
11831 static void
11832 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11833 {
11834         dtrace_ecbdesc_t **ndesc;
11835         size_t osize, nsize;
11836
11837         /*
11838          * We can't add to enablings after we've enabled them, or after we've
11839          * retained them.
11840          */
11841         ASSERT(enab->dten_probegen == 0);
11842         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11843
11844 #if defined(__APPLE__)
11845         if (ecb == NULL) return; /* Note: protection against gcc 4.0 botch on x86 */
11846 #endif /* __APPLE__ */
11847
11848         if (enab->dten_ndesc < enab->dten_maxdesc) {
11849                 enab->dten_desc[enab->dten_ndesc++] = ecb;
11850                 return;
11851         }
11852
11853         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11854
11855         if (enab->dten_maxdesc == 0) {
11856                 enab->dten_maxdesc = 1;
11857         } else {
11858                 enab->dten_maxdesc <<= 1;
11859         }
11860
11861         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11862
11863         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11864         ndesc = kmem_zalloc(nsize, KM_SLEEP);
11865         bcopy(enab->dten_desc, ndesc, osize);
11866         kmem_free(enab->dten_desc, osize);
11867
11868         enab->dten_desc = ndesc;
11869         enab->dten_desc[enab->dten_ndesc++] = ecb;
11870 }
11871
11872 static void
11873 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11874     dtrace_probedesc_t *pd)
11875 {
11876         dtrace_ecbdesc_t *new;
11877         dtrace_predicate_t *pred;
11878         dtrace_actdesc_t *act;
11879
11880         /*
11881          * We're going to create a new ECB description that matches the
11882          * specified ECB in every way, but has the specified probe description.
11883          */
11884         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11885
11886         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11887                 dtrace_predicate_hold(pred);
11888
11889         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11890                 dtrace_actdesc_hold(act);
11891
11892         new->dted_action = ecb->dted_action;
11893         new->dted_pred = ecb->dted_pred;
11894         new->dted_probe = *pd;
11895         new->dted_uarg = ecb->dted_uarg;
11896
11897         dtrace_enabling_add(enab, new);
11898 }
11899
11900 static void
11901 dtrace_enabling_dump(dtrace_enabling_t *enab)
11902 {
11903         int i;
11904
11905         for (i = 0; i < enab->dten_ndesc; i++) {
11906                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11907
11908                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11909                     desc->dtpd_provider, desc->dtpd_mod,
11910                     desc->dtpd_func, desc->dtpd_name);
11911         }
11912 }
11913
11914 static void
11915 dtrace_enabling_destroy(dtrace_enabling_t *enab)
11916 {
11917         int i;
11918         dtrace_ecbdesc_t *ep;
11919         dtrace_vstate_t *vstate = enab->dten_vstate;
11920
11921         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11922
11923         for (i = 0; i < enab->dten_ndesc; i++) {
11924                 dtrace_actdesc_t *act, *next;
11925                 dtrace_predicate_t *pred;
11926
11927                 ep = enab->dten_desc[i];
11928
11929                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11930                         dtrace_predicate_release(pred, vstate);
11931
11932                 for (act = ep->dted_action; act != NULL; act = next) {
11933                         next = act->dtad_next;
11934                         dtrace_actdesc_release(act, vstate);
11935                 }
11936
11937                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11938         }
11939
11940         kmem_free(enab->dten_desc,
11941             enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
11942
11943         /*
11944          * If this was a retained enabling, decrement the dts_nretained count
11945          * and take it off of the dtrace_retained list.
11946          */
11947         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11948             dtrace_retained == enab) {
11949                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11950                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11951                 enab->dten_vstate->dtvs_state->dts_nretained--;
11952                 dtrace_retained_gen++;
11953         }
11954
11955         if (enab->dten_prev == NULL) {
11956                 if (dtrace_retained == enab) {
11957                         dtrace_retained = enab->dten_next;
11958
11959                         if (dtrace_retained != NULL)
11960                                 dtrace_retained->dten_prev = NULL;
11961                 }
11962         } else {
11963                 ASSERT(enab != dtrace_retained);
11964                 ASSERT(dtrace_retained != NULL);
11965                 enab->dten_prev->dten_next = enab->dten_next;
11966         }
11967
11968         if (enab->dten_next != NULL) {
11969                 ASSERT(dtrace_retained != NULL);
11970                 enab->dten_next->dten_prev = enab->dten_prev;
11971         }
11972
11973         kmem_free(enab, sizeof (dtrace_enabling_t));
11974 }
11975
11976 static int
11977 dtrace_enabling_retain(dtrace_enabling_t *enab)
11978 {
11979         dtrace_state_t *state;
11980
11981         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11982         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11983         ASSERT(enab->dten_vstate != NULL);
11984
11985         state = enab->dten_vstate->dtvs_state;
11986         ASSERT(state != NULL);
11987
11988         /*
11989          * We only allow each state to retain dtrace_retain_max enablings.
11990          */
11991         if (state->dts_nretained >= dtrace_retain_max)
11992                 return (ENOSPC);
11993
11994         state->dts_nretained++;
11995         dtrace_retained_gen++;
11996
11997         if (dtrace_retained == NULL) {
11998                 dtrace_retained = enab;
11999                 return (0);
12000         }
12001
12002         enab->dten_next = dtrace_retained;
12003         dtrace_retained->dten_prev = enab;
12004         dtrace_retained = enab;
12005
12006         return (0);
12007 }
12008
12009 static int
12010 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12011     dtrace_probedesc_t *create)
12012 {
12013         dtrace_enabling_t *new, *enab;
12014         int found = 0, err = ENOENT;
12015
12016         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12017         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12018         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12019         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12020         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12021
12022         new = dtrace_enabling_create(&state->dts_vstate);
12023
12024         /*
12025          * Iterate over all retained enablings, looking for enablings that
12026          * match the specified state.
12027          */
12028         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12029                 int i;
12030
12031                 /*
12032                  * dtvs_state can only be NULL for helper enablings -- and
12033                  * helper enablings can't be retained.
12034                  */
12035                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12036
12037                 if (enab->dten_vstate->dtvs_state != state)
12038                         continue;
12039
12040                 /*
12041                  * Now iterate over each probe description; we're looking for
12042                  * an exact match to the specified probe description.
12043                  */
12044                 for (i = 0; i < enab->dten_ndesc; i++) {
12045                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12046                         dtrace_probedesc_t *pd = &ep->dted_probe;
12047
12048 #if !defined(__APPLE__)
12049                         if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12050                                 continue;
12051
12052                         if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12053                                 continue;
12054
12055                         if (strcmp(pd->dtpd_func, match->dtpd_func))
12056                                 continue;
12057
12058                         if (strcmp(pd->dtpd_name, match->dtpd_name))
12059                                 continue;
12060 #else /* Employ size bounded string operation. */
12061                         if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12062                                 continue;
12063
12064                         if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12065                                 continue;
12066
12067                         if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12068                                 continue;
12069
12070                         if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12071                                 continue;
12072 #endif /* __APPLE__ */
12073
12074                         /*
12075                          * We have a winning probe!  Add it to our growing
12076                          * enabling.
12077                          */
12078                         found = 1;
12079                         dtrace_enabling_addlike(new, ep, create);
12080                 }
12081         }
12082
12083         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12084                 dtrace_enabling_destroy(new);
12085                 return (err);
12086         }
12087
12088         return (0);
12089 }
12090
12091 static void
12092 dtrace_enabling_retract(dtrace_state_t *state)
12093 {
12094         dtrace_enabling_t *enab, *next;
12095
12096         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12097
12098         /*
12099          * Iterate over all retained enablings, destroy the enablings retained
12100          * for the specified state.
12101          */
12102         for (enab = dtrace_retained; enab != NULL; enab = next) {
12103                 next = enab->dten_next;
12104
12105                 /*
12106                  * dtvs_state can only be NULL for helper enablings -- and
12107                  * helper enablings can't be retained.
12108                  */
12109                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12110
12111                 if (enab->dten_vstate->dtvs_state == state) {
12112                         ASSERT(state->dts_nretained > 0);
12113                         dtrace_enabling_destroy(enab);
12114                 }
12115         }
12116
12117         ASSERT(state->dts_nretained == 0);
12118 }
12119
12120 static int
12121 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12122 {
12123         int i = 0;
12124         int total_matched = 0, matched = 0;
12125
12126         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12127         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12128
12129         for (i = 0; i < enab->dten_ndesc; i++) {
12130                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12131
12132                 enab->dten_current = ep;
12133                 enab->dten_error = 0;
12134
12135                 /*
12136                  * If a provider failed to enable a probe then get out and
12137                  * let the consumer know we failed.
12138                  */
12139                 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0)
12140                         return (EBUSY);
12141
12142                 total_matched += matched;
12143
12144                 if (enab->dten_error != 0) {
12145                         /*
12146                          * If we get an error half-way through enabling the
12147                          * probes, we kick out -- perhaps with some number of
12148                          * them enabled.  Leaving enabled probes enabled may
12149                          * be slightly confusing for user-level, but we expect
12150                          * that no one will attempt to actually drive on in
12151                          * the face of such errors.  If this is an anonymous
12152                          * enabling (indicated with a NULL nmatched pointer),
12153                          * we cmn_err() a message.  We aren't expecting to
12154                          * get such an error -- such as it can exist at all,
12155                          * it would be a result of corrupted DOF in the driver
12156                          * properties.
12157                          */
12158                         if (nmatched == NULL) {
12159                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
12160                                     "error on %p: %d", (void *)ep,
12161                                     enab->dten_error);
12162                         }
12163
12164                         return (enab->dten_error);
12165                 }
12166         }
12167
12168         enab->dten_probegen = dtrace_probegen;
12169         if (nmatched != NULL)
12170                 *nmatched = total_matched;
12171
12172         return (0);
12173 }
12174
12175 static void
12176 dtrace_enabling_matchall(void)
12177 {
12178         dtrace_enabling_t *enab;
12179
12180         lck_mtx_lock(&cpu_lock);
12181         lck_mtx_lock(&dtrace_lock);
12182
12183         /*
12184          * Iterate over all retained enablings to see if any probes match
12185          * against them.  We only perform this operation on enablings for which
12186          * we have sufficient permissions by virtue of being in the global zone
12187          * or in the same zone as the DTrace client.  Because we can be called
12188          * after dtrace_detach() has been called, we cannot assert that there
12189          * are retained enablings.  We can safely load from dtrace_retained,
12190          * however:  the taskq_destroy() at the end of dtrace_detach() will
12191          * block pending our completion.
12192          */
12193         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12194 #if !defined(__APPLE__)
12195                 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12196
12197                 if (INGLOBALZONE(curproc) ||
12198                     cr != NULL && getzoneid() == crgetzoneid(cr))
12199                         (void) dtrace_enabling_match(enab, NULL);
12200 #else
12201                 (void) dtrace_enabling_match(enab, NULL); /* As if always in "global" zone." */
12202 #endif /* __APPLE__ */
12203         }
12204
12205         lck_mtx_unlock(&dtrace_lock);
12206         lck_mtx_unlock(&cpu_lock);
12207 }
12208
12209 /*
12210  * If an enabling is to be enabled without having matched probes (that is, if
12211  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12212  * enabling must be _primed_ by creating an ECB for every ECB description.
12213  * This must be done to assure that we know the number of speculations, the
12214  * number of aggregations, the minimum buffer size needed, etc. before we
12215  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12216  * enabling any probes, we create ECBs for every ECB decription, but with a
12217  * NULL probe -- which is exactly what this function does.
12218  */
12219 static void
12220 dtrace_enabling_prime(dtrace_state_t *state)
12221 {
12222         dtrace_enabling_t *enab;
12223         int i;
12224
12225         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12226                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12227
12228                 if (enab->dten_vstate->dtvs_state != state)
12229                         continue;
12230
12231                 /*
12232                  * We don't want to prime an enabling more than once, lest
12233                  * we allow a malicious user to induce resource exhaustion.
12234                  * (The ECBs that result from priming an enabling aren't
12235                  * leaked -- but they also aren't deallocated until the
12236                  * consumer state is destroyed.)
12237                  */
12238                 if (enab->dten_primed)
12239                         continue;
12240
12241                 for (i = 0; i < enab->dten_ndesc; i++) {
12242                         enab->dten_current = enab->dten_desc[i];
12243                         (void) dtrace_probe_enable(NULL, enab);
12244                 }
12245
12246                 enab->dten_primed = 1;
12247         }
12248 }
12249
12250 /*
12251  * Called to indicate that probes should be provided due to retained
12252  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12253  * must take an initial lap through the enabling calling the dtps_provide()
12254  * entry point explicitly to allow for autocreated probes.
12255  */
12256 static void
12257 dtrace_enabling_provide(dtrace_provider_t *prv)
12258 {
12259         int i, all = 0;
12260         dtrace_probedesc_t desc;
12261         dtrace_genid_t gen;
12262
12263         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12264         lck_mtx_assert(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12265
12266         if (prv == NULL) {
12267                 all = 1;
12268                 prv = dtrace_provider;
12269         }
12270
12271         do {
12272                 dtrace_enabling_t *enab;
12273                 void *parg = prv->dtpv_arg;
12274
12275 retry:
12276                 gen = dtrace_retained_gen;
12277                 for (enab = dtrace_retained; enab != NULL;
12278                     enab = enab->dten_next) {
12279                         for (i = 0; i < enab->dten_ndesc; i++) {
12280                                 desc = enab->dten_desc[i]->dted_probe;
12281                                 lck_mtx_unlock(&dtrace_lock);
12282                                 prv->dtpv_pops.dtps_provide(parg, &desc);
12283                                 lck_mtx_lock(&dtrace_lock);
12284                                 /*
12285                                  * Process the retained enablings again if
12286                                  * they have changed while we weren't holding
12287                                  * dtrace_lock.
12288                                  */
12289                                 if (gen != dtrace_retained_gen)
12290                                         goto retry;
12291                         }
12292                 }
12293         } while (all && (prv = prv->dtpv_next) != NULL);
12294
12295         lck_mtx_unlock(&dtrace_lock);
12296         dtrace_probe_provide(NULL, all ? NULL : prv);
12297         lck_mtx_lock(&dtrace_lock);
12298 }
12299
12300 /*
12301  * DTrace DOF Functions
12302  */
12303 /*ARGSUSED*/
12304 static void
12305 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12306 {
12307 #pragma unused(dof) /* __APPLE__ */
12308         if (dtrace_err_verbose)
12309                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12310
12311 #ifdef DTRACE_ERRDEBUG
12312         dtrace_errdebug(str);
12313 #endif
12314 }
12315
12316 /*
12317  * Create DOF out of a currently enabled state.  Right now, we only create
12318  * DOF containing the run-time options -- but this could be expanded to create
12319  * complete DOF representing the enabled state.
12320  */
12321 static dof_hdr_t *
12322 dtrace_dof_create(dtrace_state_t *state)
12323 {
12324         dof_hdr_t *dof;
12325         dof_sec_t *sec;
12326         dof_optdesc_t *opt;
12327         int i, len = sizeof (dof_hdr_t) +
12328             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12329             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12330
12331         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12332
12333 #if !defined(__APPLE__)
12334         dof = kmem_zalloc(len, KM_SLEEP);
12335 #else
12336         dof = dt_kmem_zalloc_aligned(len, 8, KM_SLEEP);
12337 #endif /* __APPLE__ */
12338         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12339         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12340         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12341         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12342
12343         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12344         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12345         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12346         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12347         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12348         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12349
12350         dof->dofh_flags = 0;
12351         dof->dofh_hdrsize = sizeof (dof_hdr_t);
12352         dof->dofh_secsize = sizeof (dof_sec_t);
12353         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
12354         dof->dofh_secoff = sizeof (dof_hdr_t);
12355         dof->dofh_loadsz = len;
12356         dof->dofh_filesz = len;
12357         dof->dofh_pad = 0;
12358
12359         /*
12360          * Fill in the option section header...
12361          */
12362         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12363         sec->dofs_type = DOF_SECT_OPTDESC;
12364         sec->dofs_align = sizeof (uint64_t);
12365         sec->dofs_flags = DOF_SECF_LOAD;
12366         sec->dofs_entsize = sizeof (dof_optdesc_t);
12367
12368         opt = (dof_optdesc_t *)((uintptr_t)sec +
12369             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12370
12371         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12372         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12373
12374         for (i = 0; i < DTRACEOPT_MAX; i++) {
12375                 opt[i].dofo_option = i;
12376                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12377                 opt[i].dofo_value = state->dts_options[i];
12378         }
12379
12380         return (dof);
12381 }
12382
12383 static dof_hdr_t *
12384 #if !defined(__APPLE__)
12385 dtrace_dof_copyin(uintptr_t uarg, int *errp)
12386 #else
12387 dtrace_dof_copyin(user_addr_t uarg, int *errp)
12388 #endif
12389 {
12390         dof_hdr_t hdr, *dof;
12391
12392         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12393
12394         /*
12395          * First, we're going to copyin() the sizeof (dof_hdr_t).
12396          */
12397 #if !defined(__APPLE__)
12398         if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12399 #else
12400         if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12401 #endif
12402                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12403                 *errp = EFAULT;
12404                 return (NULL);
12405         }
12406
12407         /*
12408          * Now we'll allocate the entire DOF and copy it in -- provided
12409          * that the length isn't outrageous.
12410          */
12411 #if !defined(__APPLE__) /* Quiet compiler warnings */
12412         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12413 #else
12414         if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12415 #endif /* __APPLE__ */
12416                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12417                 *errp = E2BIG;
12418                 return (NULL);
12419         }
12420
12421         if (hdr.dofh_loadsz < sizeof (hdr)) {
12422                 dtrace_dof_error(&hdr, "invalid load size");
12423                 *errp = EINVAL;
12424                 return (NULL);
12425         }
12426
12427 #if !defined(__APPLE__)
12428         dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12429
12430         if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12431           dof->dofh_loadsz != hdr.dofh_loadsz) {
12432             kmem_free(dof, hdr.dofh_loadsz);
12433             *errp = EFAULT;
12434             return (NULL);
12435         }
12436 #else
12437         dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12438
12439         if (copyin(uarg, dof, hdr.dofh_loadsz) != 0  ||
12440           dof->dofh_loadsz != hdr.dofh_loadsz) {
12441             dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
12442             *errp = EFAULT;
12443             return (NULL);
12444         }
12445 #endif
12446
12447         return (dof);
12448 }
12449
12450 #if defined(__APPLE__)
12451
12452 static dof_hdr_t *
12453 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12454 {
12455         dof_hdr_t hdr, *dof;
12456
12457         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12458
12459         /*
12460          * First, we're going to copyin() the sizeof (dof_hdr_t).
12461          */
12462         if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12463                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12464                 *errp = EFAULT;
12465                 return (NULL);
12466         }
12467
12468         /*
12469          * Now we'll allocate the entire DOF and copy it in -- provided
12470          * that the length isn't outrageous.
12471          */
12472         if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12473                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12474                 *errp = E2BIG;
12475                 return (NULL);
12476         }
12477
12478         if (hdr.dofh_loadsz < sizeof (hdr)) {
12479                 dtrace_dof_error(&hdr, "invalid load size");
12480                 *errp = EINVAL;
12481                 return (NULL);
12482         }
12483
12484         dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12485
12486         if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12487                 dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
12488                 *errp = EFAULT;
12489                 return (NULL);
12490         }
12491
12492         return (dof);
12493 }
12494
12495 #endif /* __APPLE__ */
12496
12497 static dof_hdr_t *
12498 dtrace_dof_property(const char *name)
12499 {
12500         uchar_t *buf;
12501         uint64_t loadsz;
12502         unsigned int len, i;
12503         dof_hdr_t *dof;
12504
12505         /*
12506          * Unfortunately, array of values in .conf files are always (and
12507          * only) interpreted to be integer arrays.  We must read our DOF
12508          * as an integer array, and then squeeze it into a byte array.
12509          */
12510 #if !defined(__APPLE__) /* Quiet compiler warnings */
12511         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12512             (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12513                 return (NULL);
12514 #else
12515         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
12516             name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
12517                 return (NULL);
12518 #endif /* __APPLE__ */
12519
12520         for (i = 0; i < len; i++)
12521                 buf[i] = (uchar_t)(((int *)buf)[i]);
12522
12523         if (len < sizeof (dof_hdr_t)) {
12524                 ddi_prop_free(buf);
12525                 dtrace_dof_error(NULL, "truncated header");
12526                 return (NULL);
12527         }
12528
12529         if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
12530                 ddi_prop_free(buf);
12531                 dtrace_dof_error(NULL, "truncated DOF");
12532                 return (NULL);
12533         }
12534
12535 #if !defined(__APPLE__) /* Quiet compiler warnings */
12536         if (loadsz >= dtrace_dof_maxsize) {
12537 #else
12538         if (loadsz >= (uint64_t)dtrace_dof_maxsize) {
12539 #endif /* __APPLE__ */
12540                 ddi_prop_free(buf);
12541                 dtrace_dof_error(NULL, "oversized DOF");
12542                 return (NULL);
12543         }
12544
12545 #if !defined(__APPLE__)
12546         dof = kmem_alloc(loadsz, KM_SLEEP);
12547 #else
12548         dof = dt_kmem_alloc_aligned(loadsz, 8, KM_SLEEP);
12549 #endif /* __APPLE__ */
12550         bcopy(buf, dof, loadsz);
12551         ddi_prop_free(buf);
12552
12553         return (dof);
12554 }
12555
12556 static void
12557 dtrace_dof_destroy(dof_hdr_t *dof)
12558 {
12559 #if !defined(__APPLE__)
12560         kmem_free(dof, dof->dofh_loadsz);
12561 #else
12562         dt_kmem_free_aligned(dof, dof->dofh_loadsz);
12563 #endif /* __APPLE__ */
12564 }
12565
12566 /*
12567  * Return the dof_sec_t pointer corresponding to a given section index.  If the
12568  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
12569  * a type other than DOF_SECT_NONE is specified, the header is checked against
12570  * this type and NULL is returned if the types do not match.
12571  */
12572 static dof_sec_t *
12573 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12574 {
12575         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12576             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12577
12578         if (i >= dof->dofh_secnum) {
12579                 dtrace_dof_error(dof, "referenced section index is invalid");
12580                 return (NULL);
12581         }
12582
12583         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12584                 dtrace_dof_error(dof, "referenced section is not loadable");
12585                 return (NULL);
12586         }
12587
12588         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12589                 dtrace_dof_error(dof, "referenced section is the wrong type");
12590                 return (NULL);
12591         }
12592
12593         return (sec);
12594 }
12595
12596 static dtrace_probedesc_t *
12597 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12598 {
12599         dof_probedesc_t *probe;
12600         dof_sec_t *strtab;
12601         uintptr_t daddr = (uintptr_t)dof;
12602         uintptr_t str;
12603         size_t size;
12604
12605         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12606                 dtrace_dof_error(dof, "invalid probe section");
12607                 return (NULL);
12608         }
12609
12610         if (sec->dofs_align != sizeof (dof_secidx_t)) {
12611                 dtrace_dof_error(dof, "bad alignment in probe description");
12612                 return (NULL);
12613         }
12614
12615         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12616                 dtrace_dof_error(dof, "truncated probe description");
12617                 return (NULL);
12618         }
12619
12620         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12621         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12622
12623         if (strtab == NULL)
12624                 return (NULL);
12625
12626         str = daddr + strtab->dofs_offset;
12627         size = strtab->dofs_size;
12628
12629         if (probe->dofp_provider >= strtab->dofs_size) {
12630                 dtrace_dof_error(dof, "corrupt probe provider");
12631                 return (NULL);
12632         }
12633
12634         (void) strncpy(desc->dtpd_provider,
12635             (char *)(str + probe->dofp_provider),
12636             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12637 #if defined(__APPLE__) /* Employ size bounded string operation. */
12638         desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
12639 #endif /* __APPLE__ */
12640
12641         if (probe->dofp_mod >= strtab->dofs_size) {
12642                 dtrace_dof_error(dof, "corrupt probe module");
12643                 return (NULL);
12644         }
12645
12646         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12647             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12648 #if defined(__APPLE__) /* Employ size bounded string operation. */
12649         desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
12650 #endif /* __APPLE__ */
12651
12652         if (probe->dofp_func >= strtab->dofs_size) {
12653                 dtrace_dof_error(dof, "corrupt probe function");
12654                 return (NULL);
12655         }
12656
12657         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12658             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12659 #if defined(__APPLE__) /* Employ size bounded string operation. */
12660         desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
12661 #endif /* __APPLE__ */
12662
12663         if (probe->dofp_name >= strtab->dofs_size) {
12664                 dtrace_dof_error(dof, "corrupt probe name");
12665                 return (NULL);
12666         }
12667
12668         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12669             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12670 #if defined(__APPLE__) /* Employ size bounded string operation. */
12671         desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
12672 #endif /* __APPLE__ */
12673
12674         return (desc);
12675 }
12676
12677 static dtrace_difo_t *
12678 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12679     cred_t *cr)
12680 {
12681         dtrace_difo_t *dp;
12682         size_t ttl = 0;
12683         dof_difohdr_t *dofd;
12684         uintptr_t daddr = (uintptr_t)dof;
12685         size_t max_size = dtrace_difo_maxsize;
12686 #if !defined(__APPLE__) /* Quiet compiler warnings */
12687         int i, l, n;
12688 #else
12689         uint_t i;
12690         int l, n;
12691 #endif /* __APPLE__ */
12692
12693
12694         static const struct {
12695                 int section;
12696                 int bufoffs;
12697                 int lenoffs;
12698                 int entsize;
12699                 int align;
12700                 const char *msg;
12701         } difo[] = {
12702                 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
12703                 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
12704                 sizeof (dif_instr_t), "multiple DIF sections" },
12705
12706                 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
12707                 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
12708                 sizeof (uint64_t), "multiple integer tables" },
12709
12710                 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
12711                 offsetof(dtrace_difo_t, dtdo_strlen), 0,
12712                 sizeof (char), "multiple string tables" },
12713
12714                 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
12715                 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
12716                 sizeof (uint_t), "multiple variable tables" },
12717
12718 #if !defined(__APPLE__)
12719                 { DOF_SECT_NONE, 0, 0, 0, NULL }
12720 #else
12721                 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
12722 #endif /* __APPLE__ */
12723         };
12724
12725         if (sec->dofs_type != DOF_SECT_DIFOHDR) {
12726                 dtrace_dof_error(dof, "invalid DIFO header section");
12727                 return (NULL);
12728         }
12729
12730         if (sec->dofs_align != sizeof (dof_secidx_t)) {
12731                 dtrace_dof_error(dof, "bad alignment in DIFO header");
12732                 return (NULL);
12733         }
12734
12735         if (sec->dofs_size < sizeof (dof_difohdr_t) ||
12736             sec->dofs_size % sizeof (dof_secidx_t)) {
12737                 dtrace_dof_error(dof, "bad size in DIFO header");
12738                 return (NULL);
12739         }
12740
12741         dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
12742         n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
12743
12744         dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
12745         dp->dtdo_rtype = dofd->dofd_rtype;
12746
12747         for (l = 0; l < n; l++) {
12748                 dof_sec_t *subsec;
12749                 void **bufp;
12750                 uint32_t *lenp;
12751
12752                 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
12753                     dofd->dofd_links[l])) == NULL)
12754                         goto err; /* invalid section link */
12755
12756                 if (ttl + subsec->dofs_size > max_size) {
12757                         dtrace_dof_error(dof, "exceeds maximum size");
12758                         goto err;
12759                 }
12760
12761                 ttl += subsec->dofs_size;
12762
12763                 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
12764
12765 #if !defined(__APPLE__) /* Quiet compiler warnings */
12766                         if (subsec->dofs_type != difo[i].section)
12767                                 continue;
12768 #else
12769                         if (subsec->dofs_type != (uint32_t)difo[i].section)
12770                                 continue;
12771 #endif /* __APPLE __ */
12772
12773                         if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
12774                                 dtrace_dof_error(dof, "section not loaded");
12775                                 goto err;
12776                         }
12777
12778 #if !defined(__APPLE__) /* Quiet compiler warnings */
12779                         if (subsec->dofs_align != difo[i].align) {
12780                                 dtrace_dof_error(dof, "bad alignment");
12781                                 goto err;
12782                         }
12783 #else
12784                         if (subsec->dofs_align != (uint32_t)difo[i].align) {
12785                                 dtrace_dof_error(dof, "bad alignment");
12786                                 goto err;
12787                         }
12788 #endif /* __APPLE__ */
12789
12790                         bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
12791                         lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
12792
12793                         if (*bufp != NULL) {
12794                                 dtrace_dof_error(dof, difo[i].msg);
12795                                 goto err;
12796                         }
12797
12798 #if !defined(__APPLE__) /* Quiet compiler warnings */
12799                         if (difo[i].entsize != subsec->dofs_entsize) {
12800                                 dtrace_dof_error(dof, "entry size mismatch");
12801                                 goto err;
12802                         }
12803 #else
12804                         if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
12805                                 dtrace_dof_error(dof, "entry size mismatch");
12806                                 goto err;
12807                         }
12808 #endif /* __APPLE__ */
12809
12810                         if (subsec->dofs_entsize != 0 &&
12811                             (subsec->dofs_size % subsec->dofs_entsize) != 0) {
12812                                 dtrace_dof_error(dof, "corrupt entry size");
12813                                 goto err;
12814                         }
12815
12816                         *lenp = subsec->dofs_size;
12817                         *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
12818                         bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
12819                             *bufp, subsec->dofs_size);
12820
12821                         if (subsec->dofs_entsize != 0)
12822                                 *lenp /= subsec->dofs_entsize;
12823
12824                         break;
12825                 }
12826
12827                 /*
12828                  * If we encounter a loadable DIFO sub-section that is not
12829                  * known to us, assume this is a broken program and fail.
12830                  */
12831                 if (difo[i].section == DOF_SECT_NONE &&
12832                     (subsec->dofs_flags & DOF_SECF_LOAD)) {
12833                         dtrace_dof_error(dof, "unrecognized DIFO subsection");
12834                         goto err;
12835                 }
12836         }
12837
12838         if (dp->dtdo_buf == NULL) {
12839                 /*
12840                  * We can't have a DIF object without DIF text.
12841                  */
12842                 dtrace_dof_error(dof, "missing DIF text");
12843                 goto err;
12844         }
12845
12846         /*
12847          * Before we validate the DIF object, run through the variable table
12848          * looking for the strings -- if any of their size are under, we'll set
12849          * their size to be the system-wide default string size.  Note that
12850          * this should _not_ happen if the "strsize" option has been set --
12851          * in this case, the compiler should have set the size to reflect the
12852          * setting of the option.
12853          */
12854         for (i = 0; i < dp->dtdo_varlen; i++) {
12855                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
12856                 dtrace_diftype_t *t = &v->dtdv_type;
12857
12858                 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
12859                         continue;
12860
12861                 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
12862                         t->dtdt_size = dtrace_strsize_default;
12863         }
12864
12865         if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
12866                 goto err;
12867
12868         dtrace_difo_init(dp, vstate);
12869         return (dp);
12870
12871 err:
12872         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
12873         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
12874         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
12875         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
12876
12877         kmem_free(dp, sizeof (dtrace_difo_t));
12878         return (NULL);
12879 }
12880
12881 static dtrace_predicate_t *
12882 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12883     cred_t *cr)
12884 {
12885         dtrace_difo_t *dp;
12886
12887         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12888                 return (NULL);
12889
12890         return (dtrace_predicate_create(dp));
12891 }
12892
12893 static dtrace_actdesc_t *
12894 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12895     cred_t *cr)
12896 {
12897         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12898         dof_actdesc_t *desc;
12899         dof_sec_t *difosec;
12900         size_t offs;
12901         uintptr_t daddr = (uintptr_t)dof;
12902         uint64_t arg;
12903         dtrace_actkind_t kind;
12904
12905         if (sec->dofs_type != DOF_SECT_ACTDESC) {
12906                 dtrace_dof_error(dof, "invalid action section");
12907                 return (NULL);
12908         }
12909
12910         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12911                 dtrace_dof_error(dof, "truncated action description");
12912                 return (NULL);
12913         }
12914
12915         if (sec->dofs_align != sizeof (uint64_t)) {
12916                 dtrace_dof_error(dof, "bad alignment in action description");
12917                 return (NULL);
12918         }
12919
12920         if (sec->dofs_size < sec->dofs_entsize) {
12921                 dtrace_dof_error(dof, "section entry size exceeds total size");
12922                 return (NULL);
12923         }
12924
12925         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12926                 dtrace_dof_error(dof, "bad entry size in action description");
12927                 return (NULL);
12928         }
12929
12930         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12931                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12932                 return (NULL);
12933         }
12934
12935         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12936                 desc = (dof_actdesc_t *)(daddr +
12937                     (uintptr_t)sec->dofs_offset + offs);
12938                 kind = (dtrace_actkind_t)desc->dofa_kind;
12939
12940                 if (DTRACEACT_ISPRINTFLIKE(kind) &&
12941                     (kind != DTRACEACT_PRINTA ||
12942                     desc->dofa_strtab != DOF_SECIDX_NONE)) {
12943                         dof_sec_t *strtab;
12944                         char *str, *fmt;
12945                         uint64_t i;
12946
12947                         /*
12948                          * printf()-like actions must have a format string.
12949                          */
12950                         if ((strtab = dtrace_dof_sect(dof,
12951                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12952                                 goto err;
12953
12954                         str = (char *)((uintptr_t)dof +
12955                             (uintptr_t)strtab->dofs_offset);
12956
12957                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12958                                 if (str[i] == '\0')
12959                                         break;
12960                         }
12961
12962                         if (i >= strtab->dofs_size) {
12963                                 dtrace_dof_error(dof, "bogus format string");
12964                                 goto err;
12965                         }
12966
12967                         if (i == desc->dofa_arg) {
12968                                 dtrace_dof_error(dof, "empty format string");
12969                                 goto err;
12970                         }
12971
12972                         i -= desc->dofa_arg;
12973                         fmt = kmem_alloc(i + 1, KM_SLEEP);
12974                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
12975                         arg = (uint64_t)(uintptr_t)fmt;
12976                 } else {
12977                         if (kind == DTRACEACT_PRINTA) {
12978                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12979                                 arg = 0;
12980                         } else {
12981                                 arg = desc->dofa_arg;
12982                         }
12983                 }
12984
12985                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12986                     desc->dofa_uarg, arg);
12987
12988                 if (last != NULL) {
12989                         last->dtad_next = act;
12990                 } else {
12991                         first = act;
12992                 }
12993
12994                 last = act;
12995
12996                 if (desc->dofa_difo == DOF_SECIDX_NONE)
12997                         continue;
12998
12999                 if ((difosec = dtrace_dof_sect(dof,
13000                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13001                         goto err;
13002
13003                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13004
13005                 if (act->dtad_difo == NULL)
13006                         goto err;
13007         }
13008
13009         ASSERT(first != NULL);
13010         return (first);
13011
13012 err:
13013         for (act = first; act != NULL; act = next) {
13014                 next = act->dtad_next;
13015                 dtrace_actdesc_release(act, vstate);
13016         }
13017
13018         return (NULL);
13019 }
13020
13021 static dtrace_ecbdesc_t *
13022 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13023     cred_t *cr)
13024 {
13025         dtrace_ecbdesc_t *ep;
13026         dof_ecbdesc_t *ecb;
13027         dtrace_probedesc_t *desc;
13028         dtrace_predicate_t *pred = NULL;
13029
13030         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13031                 dtrace_dof_error(dof, "truncated ECB description");
13032                 return (NULL);
13033         }
13034
13035         if (sec->dofs_align != sizeof (uint64_t)) {
13036                 dtrace_dof_error(dof, "bad alignment in ECB description");
13037                 return (NULL);
13038         }
13039
13040         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13041         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13042
13043         if (sec == NULL)
13044                 return (NULL);
13045
13046         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13047         ep->dted_uarg = ecb->dofe_uarg;
13048         desc = &ep->dted_probe;
13049
13050         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13051                 goto err;
13052
13053         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13054                 if ((sec = dtrace_dof_sect(dof,
13055                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13056                         goto err;
13057
13058                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13059                         goto err;
13060
13061                 ep->dted_pred.dtpdd_predicate = pred;
13062         }
13063
13064         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13065                 if ((sec = dtrace_dof_sect(dof,
13066                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13067                         goto err;
13068
13069                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13070
13071                 if (ep->dted_action == NULL)
13072                         goto err;
13073         }
13074
13075         return (ep);
13076
13077 err:
13078         if (pred != NULL)
13079                 dtrace_predicate_release(pred, vstate);
13080         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13081         return (NULL);
13082 }
13083
13084 #if !defined(__APPLE__) /* APPLE dyld has already done this for us */
13085 /*
13086  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13087  * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13088  * site of any user SETX relocations to account for load object base address.
13089  * In the future, if we need other relocations, this function can be extended.
13090  */
13091 static int
13092 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13093 {
13094         uintptr_t daddr = (uintptr_t)dof;
13095         dof_relohdr_t *dofr =
13096             (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13097         dof_sec_t *ss, *rs, *ts;
13098         dof_relodesc_t *r;
13099         uint_t i, n;
13100
13101         if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13102             sec->dofs_align != sizeof (dof_secidx_t)) {
13103                 dtrace_dof_error(dof, "invalid relocation header");
13104                 return (-1);
13105         }
13106
13107         ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13108         rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13109         ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13110
13111         if (ss == NULL || rs == NULL || ts == NULL)
13112                 return (-1); /* dtrace_dof_error() has been called already */
13113
13114         if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13115             rs->dofs_align != sizeof (uint64_t)) {
13116                 dtrace_dof_error(dof, "invalid relocation section");
13117                 return (-1);
13118         }
13119
13120         r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13121         n = rs->dofs_size / rs->dofs_entsize;
13122
13123         for (i = 0; i < n; i++) {
13124                 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13125
13126                 switch (r->dofr_type) {
13127                 case DOF_RELO_NONE:
13128                         break;
13129                 case DOF_RELO_SETX:
13130                         if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13131                             sizeof (uint64_t) > ts->dofs_size) {
13132                                 dtrace_dof_error(dof, "bad relocation offset");
13133                                 return (-1);
13134                         }
13135
13136                         if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13137                                 dtrace_dof_error(dof, "misaligned setx relo");
13138                                 return (-1);
13139                         }
13140
13141                         *(uint64_t *)taddr += ubase;
13142                         break;
13143                 default:
13144                         dtrace_dof_error(dof, "invalid relocation type");
13145                         return (-1);
13146                 }
13147
13148                 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13149         }
13150
13151         return (0);
13152 }
13153 #endif /* __APPLE__ */
13154
13155 /*
13156  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13157  * header:  it should be at the front of a memory region that is at least
13158  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13159  * size.  It need not be validated in any other way.
13160  */
13161 static int
13162 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13163     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13164 {
13165 #pragma unused(ubase) /* __APPLE__ */
13166         uint64_t len = dof->dofh_loadsz, seclen;
13167         uintptr_t daddr = (uintptr_t)dof;
13168         dtrace_ecbdesc_t *ep;
13169         dtrace_enabling_t *enab;
13170         uint_t i;
13171
13172         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13173         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13174
13175         /*
13176          * Check the DOF header identification bytes.  In addition to checking
13177          * valid settings, we also verify that unused bits/bytes are zeroed so
13178          * we can use them later without fear of regressing existing binaries.
13179          */
13180         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13181             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13182                 dtrace_dof_error(dof, "DOF magic string mismatch");
13183                 return (-1);
13184         }
13185
13186         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13187             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13188                 dtrace_dof_error(dof, "DOF has invalid data model");
13189                 return (-1);
13190         }
13191
13192         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13193                 dtrace_dof_error(dof, "DOF encoding mismatch");
13194                 return (-1);
13195         }
13196
13197 #if !defined(__APPLE__)
13198         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13199             dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13200                 dtrace_dof_error(dof, "DOF version mismatch");
13201                 return (-1);
13202         }
13203 #else
13204         /*
13205          * We only support DOF_VERSION_3 for now.
13206          */
13207         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13208                 dtrace_dof_error(dof, "DOF version mismatch");
13209                 return (-1);
13210         }
13211 #endif
13212
13213         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13214                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13215                 return (-1);
13216         }
13217
13218         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13219                 dtrace_dof_error(dof, "DOF uses too many integer registers");
13220                 return (-1);
13221         }
13222
13223         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13224                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13225                 return (-1);
13226         }
13227
13228         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13229                 if (dof->dofh_ident[i] != 0) {
13230                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
13231                         return (-1);
13232                 }
13233         }
13234
13235         if (dof->dofh_flags & ~DOF_FL_VALID) {
13236                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13237                 return (-1);
13238         }
13239
13240         if (dof->dofh_secsize == 0) {
13241                 dtrace_dof_error(dof, "zero section header size");
13242                 return (-1);
13243         }
13244
13245         /*
13246          * Check that the section headers don't exceed the amount of DOF
13247          * data.  Note that we cast the section size and number of sections
13248          * to uint64_t's to prevent possible overflow in the multiplication.
13249          */
13250         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13251
13252         if (dof->dofh_secoff > len || seclen > len ||
13253             dof->dofh_secoff + seclen > len) {
13254                 dtrace_dof_error(dof, "truncated section headers");
13255                 return (-1);
13256         }
13257
13258         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13259                 dtrace_dof_error(dof, "misaligned section headers");
13260                 return (-1);
13261         }
13262
13263         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13264                 dtrace_dof_error(dof, "misaligned section size");
13265                 return (-1);
13266         }
13267
13268         /*
13269          * Take an initial pass through the section headers to be sure that
13270          * the headers don't have stray offsets.  If the 'noprobes' flag is
13271          * set, do not permit sections relating to providers, probes, or args.
13272          */
13273         for (i = 0; i < dof->dofh_secnum; i++) {
13274                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13275                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13276
13277                 if (noprobes) {
13278                         switch (sec->dofs_type) {
13279                         case DOF_SECT_PROVIDER:
13280                         case DOF_SECT_PROBES:
13281                         case DOF_SECT_PRARGS:
13282                         case DOF_SECT_PROFFS:
13283                                 dtrace_dof_error(dof, "illegal sections "
13284                                     "for enabling");
13285                                 return (-1);
13286                         }
13287                 }
13288
13289                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13290                         continue; /* just ignore non-loadable sections */
13291
13292                 if (sec->dofs_align & (sec->dofs_align - 1)) {
13293                         dtrace_dof_error(dof, "bad section alignment");
13294                         return (-1);
13295                 }
13296
13297                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13298                         dtrace_dof_error(dof, "misaligned section");
13299                         return (-1);
13300                 }
13301
13302                 if (sec->dofs_offset > len || sec->dofs_size > len ||
13303                     sec->dofs_offset + sec->dofs_size > len) {
13304                         dtrace_dof_error(dof, "corrupt section header");
13305                         return (-1);
13306                 }
13307
13308                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13309                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13310                         dtrace_dof_error(dof, "non-terminating string table");
13311                         return (-1);
13312                 }
13313         }
13314
13315 #if !defined(__APPLE__)
13316         /*
13317          * Take a second pass through the sections and locate and perform any
13318          * relocations that are present.  We do this after the first pass to
13319          * be sure that all sections have had their headers validated.
13320          */
13321         for (i = 0; i < dof->dofh_secnum; i++) {
13322                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13323                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13324
13325                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13326                         continue; /* skip sections that are not loadable */
13327
13328                 switch (sec->dofs_type) {
13329                 case DOF_SECT_URELHDR:
13330                         if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13331                                 return (-1);
13332                         break;
13333                 }
13334         }
13335 #else
13336         /*
13337          * APPLE NOTE: We have no relocation to perform. All dof values are
13338          * relative offsets.
13339          */
13340 #endif /* __APPLE__ */
13341
13342         if ((enab = *enabp) == NULL)
13343                 enab = *enabp = dtrace_enabling_create(vstate);
13344
13345         for (i = 0; i < dof->dofh_secnum; i++) {
13346                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13347                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13348
13349                 if (sec->dofs_type != DOF_SECT_ECBDESC)
13350                         continue;
13351
13352 #if !defined(__APPLE__)
13353                 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13354                         dtrace_enabling_destroy(enab);
13355                         *enabp = NULL;
13356                         return (-1);
13357                 }
13358 #else
13359                 /* Note: Defend against gcc 4.0 botch on x86 (not all paths out of inlined dtrace_dof_ecbdesc
13360                    are checked for the NULL return value.) */
13361                 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13362                 if (ep == NULL) {
13363                         dtrace_enabling_destroy(enab);
13364                         *enabp = NULL;
13365                         return (-1);
13366                 }
13367 #endif /* __APPLE__ */
13368
13369                 dtrace_enabling_add(enab, ep);
13370         }
13371
13372         return (0);
13373 }
13374
13375 /*
13376  * Process DOF for any options.  This routine assumes that the DOF has been
13377  * at least processed by dtrace_dof_slurp().
13378  */
13379 static int
13380 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13381 {
13382 #if !defined(__APPLE__) /* Quiet compiler warnings */
13383         int i, rval;
13384 #else
13385         uint_t i;
13386         int rval;
13387 #endif /* __APPLE__ */
13388         uint32_t entsize;
13389         size_t offs;
13390         dof_optdesc_t *desc;
13391
13392         for (i = 0; i < dof->dofh_secnum; i++) {
13393                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13394                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13395
13396                 if (sec->dofs_type != DOF_SECT_OPTDESC)
13397                         continue;
13398
13399                 if (sec->dofs_align != sizeof (uint64_t)) {
13400                         dtrace_dof_error(dof, "bad alignment in "
13401                             "option description");
13402                         return (EINVAL);
13403                 }
13404
13405                 if ((entsize = sec->dofs_entsize) == 0) {
13406                         dtrace_dof_error(dof, "zeroed option entry size");
13407                         return (EINVAL);
13408                 }
13409
13410                 if (entsize < sizeof (dof_optdesc_t)) {
13411                         dtrace_dof_error(dof, "bad option entry size");
13412                         return (EINVAL);
13413                 }
13414
13415                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13416                         desc = (dof_optdesc_t *)((uintptr_t)dof +
13417                             (uintptr_t)sec->dofs_offset + offs);
13418
13419                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13420                                 dtrace_dof_error(dof, "non-zero option string");
13421                                 return (EINVAL);
13422                         }
13423
13424 #if !defined(__APPLE__) /* Quiet compiler warnings */
13425                         if (desc->dofo_value == DTRACEOPT_UNSET) {
13426 #else
13427                         if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13428 #endif /* __APPLE __ */
13429                                 dtrace_dof_error(dof, "unset option");
13430                                 return (EINVAL);
13431                         }
13432
13433                         if ((rval = dtrace_state_option(state,
13434                             desc->dofo_option, desc->dofo_value)) != 0) {
13435                                 dtrace_dof_error(dof, "rejected option");
13436                                 return (rval);
13437                         }
13438                 }
13439         }
13440
13441         return (0);
13442 }
13443
13444 /*
13445  * DTrace Consumer State Functions
13446  */
13447 #if defined(__APPLE__) /* Quiet compiler warning. */
13448 static
13449 #endif /* __APPLE__ */
13450 int
13451 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13452 {
13453         size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13454         void *base;
13455         uintptr_t limit;
13456         dtrace_dynvar_t *dvar, *next, *start;
13457 #if !defined(__APPLE__) /* Quiet compiler warning */
13458         int i;
13459 #else
13460         size_t i;
13461 #endif /* __APPLE__ */
13462
13463         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13464         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13465
13466         bzero(dstate, sizeof (dtrace_dstate_t));
13467
13468         if ((dstate->dtds_chunksize = chunksize) == 0)
13469                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13470
13471         if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13472                 size = min_size;
13473
13474         if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13475                 return (ENOMEM);
13476
13477         dstate->dtds_size = size;
13478         dstate->dtds_base = base;
13479         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13480         bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13481
13482         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13483
13484         if (hashsize != 1 && (hashsize & 1))
13485                 hashsize--;
13486
13487         dstate->dtds_hashsize = hashsize;
13488         dstate->dtds_hash = dstate->dtds_base;
13489
13490         /*
13491          * Set all of our hash buckets to point to the single sink, and (if
13492          * it hasn't already been set), set the sink's hash value to be the
13493          * sink sentinel value.  The sink is needed for dynamic variable
13494          * lookups to know that they have iterated over an entire, valid hash
13495          * chain.
13496          */
13497         for (i = 0; i < hashsize; i++)
13498                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13499
13500         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13501                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13502
13503         /*
13504          * Determine number of active CPUs.  Divide free list evenly among
13505          * active CPUs.
13506          */
13507         start = (dtrace_dynvar_t *)
13508             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13509         limit = (uintptr_t)base + size;
13510
13511         maxper = (limit - (uintptr_t)start) / (int)NCPU;
13512         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13513
13514         for (i = 0; i < NCPU; i++) {
13515                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13516
13517                 /*
13518                  * If we don't even have enough chunks to make it once through
13519                  * NCPUs, we're just going to allocate everything to the first
13520                  * CPU.  And if we're on the last CPU, we're going to allocate
13521                  * whatever is left over.  In either case, we set the limit to
13522                  * be the limit of the dynamic variable space.
13523                  */
13524                 if (maxper == 0 || i == NCPU - 1) {
13525                         limit = (uintptr_t)base + size;
13526                         start = NULL;
13527                 } else {
13528                         limit = (uintptr_t)start + maxper;
13529                         start = (dtrace_dynvar_t *)limit;
13530                 }
13531
13532                 ASSERT(limit <= (uintptr_t)base + size);
13533
13534                 for (;;) {
13535                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13536                             dstate->dtds_chunksize);
13537
13538                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13539                                 break;
13540
13541                         dvar->dtdv_next = next;
13542                         dvar = next;
13543                 }
13544
13545                 if (maxper == 0)
13546                         break;
13547         }
13548
13549         return (0);
13550 }
13551
13552 #if defined(__APPLE__) /* Quiet compiler warning. */
13553 static
13554 #endif /* __APPLE__ */
13555 void
13556 dtrace_dstate_fini(dtrace_dstate_t *dstate)
13557 {
13558         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13559
13560         if (dstate->dtds_base == NULL)
13561                 return;
13562
13563         kmem_free(dstate->dtds_base, dstate->dtds_size);
13564         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13565 }
13566
13567 static void
13568 dtrace_vstate_fini(dtrace_vstate_t *vstate)
13569 {
13570         /*
13571          * Logical XOR, where are you?
13572          */
13573         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13574
13575         if (vstate->dtvs_nglobals > 0) {
13576                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13577                     sizeof (dtrace_statvar_t *));
13578         }
13579
13580         if (vstate->dtvs_ntlocals > 0) {
13581                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13582                     sizeof (dtrace_difv_t));
13583         }
13584
13585         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13586
13587         if (vstate->dtvs_nlocals > 0) {
13588                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13589                     sizeof (dtrace_statvar_t *));
13590         }
13591 }
13592
13593 static void
13594 dtrace_state_clean(dtrace_state_t *state)
13595 {
13596         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13597                 return;
13598
13599         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13600         dtrace_speculation_clean(state);
13601 }
13602
13603 static void
13604 dtrace_state_deadman(dtrace_state_t *state)
13605 {
13606         hrtime_t now;
13607
13608         dtrace_sync();
13609
13610         now = dtrace_gethrtime();
13611
13612         if (state != dtrace_anon.dta_state &&
13613             now - state->dts_laststatus >= dtrace_deadman_user)
13614                 return;
13615
13616         /*
13617          * We must be sure that dts_alive never appears to be less than the
13618          * value upon entry to dtrace_state_deadman(), and because we lack a
13619          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13620          * store INT64_MAX to it, followed by a memory barrier, followed by
13621          * the new value.  This assures that dts_alive never appears to be
13622          * less than its true value, regardless of the order in which the
13623          * stores to the underlying storage are issued.
13624          */
13625         state->dts_alive = INT64_MAX;
13626         dtrace_membar_producer();
13627         state->dts_alive = now;
13628 }
13629
13630 #if !defined(__APPLE__)
13631 dtrace_state_t *
13632 dtrace_state_create(dev_t *devp, cred_t *cr)
13633 #else
13634 static int
13635 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13636 #endif /* __APPLE__ */
13637 {
13638         minor_t minor;
13639         major_t major;
13640         char c[30];
13641         dtrace_state_t *state;
13642         dtrace_optval_t *opt;
13643         int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13644
13645         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13646         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13647
13648 #if !defined(__APPLE__)
13649         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
13650             VM_BESTFIT | VM_SLEEP);
13651
13652         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
13653                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13654                 return (NULL);
13655         }
13656 #else
13657         /* Cause restart */
13658         *new_state = NULL;
13659
13660         /*
13661          * Darwin's DEVFS layer acquired the minor number for this "device" when it called
13662          * dtrace_devfs_clone_func(). At that time, dtrace_devfs_clone_func() proposed a minor number
13663          * (next unused according to vmem_alloc()) and then immediately put the number back in play
13664          * (by calling vmem_free()). Now that minor number is being used for an open, so committing it
13665          * to use. The following vmem_alloc() must deliver that same minor number. FIXME.
13666          */
13667
13668         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
13669             VM_BESTFIT | VM_SLEEP);
13670
13671         if (NULL != devp) {
13672         ASSERT(getminor(*devp) == minor);
13673                 if (getminor(*devp) != minor) {
13674                         printf("dtrace_open: couldn't re-acquire vended minor number %d. Instead got %d\n",
13675                                         getminor(*devp), minor);
13676                         vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13677                         return (ERESTART);      /* can't reacquire */
13678                 }
13679         } else {
13680         /* NULL==devp iff "Anonymous state" (see dtrace_anon_property),
13681                  * so just vend the minor device number here de novo since no "open" has occurred. */
13682         }
13683
13684         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
13685                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
13686                 return (EAGAIN);        /* temporary resource shortage */
13687         }
13688
13689 #endif /* __APPLE__ */
13690
13691         state = ddi_get_soft_state(dtrace_softstate, minor);
13692         state->dts_epid = DTRACE_EPIDNONE + 1;
13693
13694         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
13695         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13696             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
13697
13698         if (devp != NULL) {
13699                 major = getemajor(*devp);
13700         } else {
13701                 major = ddi_driver_major(dtrace_devi);
13702         }
13703
13704         state->dts_dev = makedevice(major, minor);
13705
13706         if (devp != NULL)
13707                 *devp = state->dts_dev;
13708
13709         /*
13710          * We allocate NCPU buffers.  On the one hand, this can be quite
13711          * a bit of memory per instance (nearly 36K on a Starcat).  On the
13712          * other hand, it saves an additional memory reference in the probe
13713          * path.
13714          */
13715         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13716         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13717         state->dts_cleaner = CYCLIC_NONE;
13718         state->dts_deadman = CYCLIC_NONE;
13719         state->dts_vstate.dtvs_state = state;
13720
13721         for (i = 0; i < DTRACEOPT_MAX; i++)
13722                 state->dts_options[i] = DTRACEOPT_UNSET;
13723
13724         /*
13725          * Set the default options.
13726          */
13727         opt = state->dts_options;
13728         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13729         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13730         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13731         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13732         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13733         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13734         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13735         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13736         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13737         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13738         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13739         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13740         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13741         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13742
13743         state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
13744
13745         /*
13746          * Depending on the user credentials, we set flag bits which alter probe
13747          * visibility or the amount of destructiveness allowed.  In the case of
13748          * actual anonymous tracing, or the possession of all privileges, all of
13749          * the normal checks are bypassed.
13750          */
13751         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13752                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13753                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13754         } else {
13755                 /*
13756                  * Set up the credentials for this instantiation.  We take a
13757                  * hold on the credential to prevent it from disappearing on
13758                  * us; this in turn prevents the zone_t referenced by this
13759                  * credential from disappearing.  This means that we can
13760                  * examine the credential and the zone from probe context.
13761                  */
13762                 crhold(cr);
13763                 state->dts_cred.dcr_cred = cr;
13764
13765                 /*
13766                  * CRA_PROC means "we have *some* privilege for dtrace" and
13767                  * unlocks the use of variables like pid, zonename, etc.
13768                  */
13769                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13770                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13771                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13772                 }
13773
13774                 /*
13775                  * dtrace_user allows use of syscall and profile providers.
13776                  * If the user also has proc_owner and/or proc_zone, we
13777                  * extend the scope to include additional visibility and
13778                  * destructive power.
13779                  */
13780                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13781                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13782                                 state->dts_cred.dcr_visible |=
13783                                     DTRACE_CRV_ALLPROC;
13784
13785                                 state->dts_cred.dcr_action |=
13786                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13787                         }
13788
13789                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13790                                 state->dts_cred.dcr_visible |=
13791                                     DTRACE_CRV_ALLZONE;
13792
13793                                 state->dts_cred.dcr_action |=
13794                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13795                         }
13796
13797                         /*
13798                          * If we have all privs in whatever zone this is,
13799                          * we can do destructive things to processes which
13800                          * have altered credentials.
13801                          */
13802 #if !defined(__APPLE__)
13803                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13804                             cr->cr_zone->zone_privset)) {
13805                                 state->dts_cred.dcr_action |=
13806                                         DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13807                         }
13808 #else
13809                         /* Darwin doesn't do zones. */
13810                         state->dts_cred.dcr_action |=
13811                                 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13812 #endif /* __APPLE__ */
13813                 }
13814
13815                 /*
13816                  * Holding the dtrace_kernel privilege also implies that
13817                  * the user has the dtrace_user privilege from a visibility
13818                  * perspective.  But without further privileges, some
13819                  * destructive actions are not available.
13820                  */
13821                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13822                         /*
13823                          * Make all probes in all zones visible.  However,
13824                          * this doesn't mean that all actions become available
13825                          * to all zones.
13826                          */
13827                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13828                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13829
13830                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13831                             DTRACE_CRA_PROC;
13832                         /*
13833                          * Holding proc_owner means that destructive actions
13834                          * for *this* zone are allowed.
13835                          */
13836                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13837                                 state->dts_cred.dcr_action |=
13838                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13839
13840                         /*
13841                          * Holding proc_zone means that destructive actions
13842                          * for this user/group ID in all zones is allowed.
13843                          */
13844                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13845                                 state->dts_cred.dcr_action |=
13846                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13847
13848                         /*
13849                          * If we have all privs in whatever zone this is,
13850                          * we can do destructive things to processes which
13851                          * have altered credentials.
13852                          */
13853 #if !defined(__APPLE__)
13854                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
13855                             cr->cr_zone->zone_privset)) {
13856                                 state->dts_cred.dcr_action |=
13857                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13858                         }
13859 #else
13860                         /* Darwin doesn't do zones. */
13861                         state->dts_cred.dcr_action |=
13862                                 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13863 #endif /* __APPLE__ */
13864                 }
13865
13866                 /*
13867                  * Holding the dtrace_proc privilege gives control over fasttrap
13868                  * and pid providers.  We need to grant wider destructive
13869                  * privileges in the event that the user has proc_owner and/or
13870                  * proc_zone.
13871                  */
13872                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13873                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13874                                 state->dts_cred.dcr_action |=
13875                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13876
13877                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13878                                 state->dts_cred.dcr_action |=
13879                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13880                 }
13881         }
13882
13883 #if !defined(__APPLE__)
13884         return (state);
13885 #else
13886         *new_state = state;
13887         return(0);  /* Success */
13888 #endif /* __APPLE__ */
13889 }
13890
13891 static int
13892 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13893 {
13894         dtrace_optval_t *opt = state->dts_options, size;
13895         processorid_t cpu = 0;
13896         int flags = 0, rval;
13897
13898         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13899         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13900         ASSERT(which < DTRACEOPT_MAX);
13901         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13902             (state == dtrace_anon.dta_state &&
13903             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13904
13905         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13906                 return (0);
13907
13908         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13909                 cpu = opt[DTRACEOPT_CPU];
13910
13911         if (which == DTRACEOPT_SPECSIZE)
13912                 flags |= DTRACEBUF_NOSWITCH;
13913
13914         if (which == DTRACEOPT_BUFSIZE) {
13915                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13916                         flags |= DTRACEBUF_RING;
13917
13918                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13919                         flags |= DTRACEBUF_FILL;
13920
13921                 if (state != dtrace_anon.dta_state ||
13922                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13923                         flags |= DTRACEBUF_INACTIVE;
13924         }
13925
13926 #if !defined(__APPLE__) /* Quiet compiler warning */
13927         for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
13928 #else
13929         for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
13930 #endif /* __APPLE__ */
13931                 /*
13932                  * The size must be 8-byte aligned.  If the size is not 8-byte
13933                  * aligned, drop it down by the difference.
13934                  */
13935                 if (size & (sizeof (uint64_t) - 1))
13936                         size -= size & (sizeof (uint64_t) - 1);
13937
13938                 if (size < state->dts_reserve) {
13939                         /*
13940                          * Buffers always must be large enough to accommodate
13941                          * their prereserved space.  We return E2BIG instead
13942                          * of ENOMEM in this case to allow for user-level
13943                          * software to differentiate the cases.
13944                          */
13945                         return (E2BIG);
13946                 }
13947
13948                 rval = dtrace_buffer_alloc(buf, size, flags, cpu);
13949
13950                 if (rval != ENOMEM) {
13951                         opt[which] = size;
13952                         return (rval);
13953                 }
13954
13955                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13956                         return (rval);
13957         }
13958
13959         return (ENOMEM);
13960 }
13961
13962 static int
13963 dtrace_state_buffers(dtrace_state_t *state)
13964 {
13965         dtrace_speculation_t *spec = state->dts_speculations;
13966         int rval, i;
13967
13968         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
13969             DTRACEOPT_BUFSIZE)) != 0)
13970                 return (rval);
13971
13972         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
13973             DTRACEOPT_AGGSIZE)) != 0)
13974                 return (rval);
13975
13976         for (i = 0; i < state->dts_nspeculations; i++) {
13977                 if ((rval = dtrace_state_buffer(state,
13978                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
13979                         return (rval);
13980         }
13981
13982         return (0);
13983 }
13984
13985 static void
13986 dtrace_state_prereserve(dtrace_state_t *state)
13987 {
13988         dtrace_ecb_t *ecb;
13989         dtrace_probe_t *probe;
13990
13991         state->dts_reserve = 0;
13992
13993         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
13994                 return;
13995
13996         /*
13997          * If our buffer policy is a "fill" buffer policy, we need to set the
13998          * prereserved space to be the space required by the END probes.
13999          */
14000         probe = dtrace_probes[dtrace_probeid_end - 1];
14001         ASSERT(probe != NULL);
14002
14003         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14004                 if (ecb->dte_state != state)
14005                         continue;
14006
14007                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14008         }
14009 }
14010
14011 static int
14012 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14013 {
14014         dtrace_optval_t *opt = state->dts_options, sz, nspec;
14015         dtrace_speculation_t *spec;
14016         dtrace_buffer_t *buf;
14017         cyc_handler_t hdlr;
14018         cyc_time_t when;
14019         int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14020         dtrace_icookie_t cookie;
14021
14022         lck_mtx_lock(&cpu_lock);
14023         lck_mtx_lock(&dtrace_lock);
14024
14025         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14026                 rval = EBUSY;
14027                 goto out;
14028         }
14029
14030         /*
14031          * Before we can perform any checks, we must prime all of the
14032          * retained enablings that correspond to this state.
14033          */
14034         dtrace_enabling_prime(state);
14035
14036         if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14037                 rval = EACCES;
14038                 goto out;
14039         }
14040
14041         dtrace_state_prereserve(state);
14042
14043         /*
14044          * Now we want to do is try to allocate our speculations.
14045          * We do not automatically resize the number of speculations; if
14046          * this fails, we will fail the operation.
14047          */
14048         nspec = opt[DTRACEOPT_NSPEC];
14049         ASSERT(nspec != DTRACEOPT_UNSET);
14050
14051         if (nspec > INT_MAX) {
14052                 rval = ENOMEM;
14053                 goto out;
14054         }
14055
14056         spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14057
14058         if (spec == NULL) {
14059                 rval = ENOMEM;
14060                 goto out;
14061         }
14062
14063         state->dts_speculations = spec;
14064         state->dts_nspeculations = (int)nspec;
14065
14066         for (i = 0; i < nspec; i++) {
14067                 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14068                         rval = ENOMEM;
14069                         goto err;
14070                 }
14071
14072                 spec[i].dtsp_buffer = buf;
14073         }
14074
14075         if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14076                 if (dtrace_anon.dta_state == NULL) {
14077                         rval = ENOENT;
14078                         goto out;
14079                 }
14080
14081                 if (state->dts_necbs != 0) {
14082                         rval = EALREADY;
14083                         goto out;
14084                 }
14085
14086                 state->dts_anon = dtrace_anon_grab();
14087                 ASSERT(state->dts_anon != NULL);
14088                 state = state->dts_anon;
14089
14090                 /*
14091                  * We want "grabanon" to be set in the grabbed state, so we'll
14092                  * copy that option value from the grabbing state into the
14093                  * grabbed state.
14094                  */
14095                 state->dts_options[DTRACEOPT_GRABANON] =
14096                     opt[DTRACEOPT_GRABANON];
14097
14098                 *cpu = dtrace_anon.dta_beganon;
14099
14100                 /*
14101                  * If the anonymous state is active (as it almost certainly
14102                  * is if the anonymous enabling ultimately matched anything),
14103                  * we don't allow any further option processing -- but we
14104                  * don't return failure.
14105                  */
14106                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14107                         goto out;
14108         }
14109
14110         if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14111             opt[DTRACEOPT_AGGSIZE] != 0) {
14112                 if (state->dts_aggregations == NULL) {
14113                         /*
14114                          * We're not going to create an aggregation buffer
14115                          * because we don't have any ECBs that contain
14116                          * aggregations -- set this option to 0.
14117                          */
14118                         opt[DTRACEOPT_AGGSIZE] = 0;
14119                 } else {
14120                         /*
14121                          * If we have an aggregation buffer, we must also have
14122                          * a buffer to use as scratch.
14123                          */
14124 #if !defined(__APPLE__) /* Quiet compiler warning */
14125                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14126                             opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14127                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14128                         }
14129 #else
14130                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14131                           (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14132                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14133                         }
14134 #endif /* __APPLE__ */
14135                 }
14136         }
14137
14138         if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14139             opt[DTRACEOPT_SPECSIZE] != 0) {
14140                 if (!state->dts_speculates) {
14141                         /*
14142                          * We're not going to create speculation buffers
14143                          * because we don't have any ECBs that actually
14144                          * speculate -- set the speculation size to 0.
14145                          */
14146                         opt[DTRACEOPT_SPECSIZE] = 0;
14147                 }
14148         }
14149
14150         /*
14151          * The bare minimum size for any buffer that we're actually going to
14152          * do anything to is sizeof (uint64_t).
14153          */
14154         sz = sizeof (uint64_t);
14155
14156         if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14157             (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14158             (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14159                 /*
14160                  * A buffer size has been explicitly set to 0 (or to a size
14161                  * that will be adjusted to 0) and we need the space -- we
14162                  * need to return failure.  We return ENOSPC to differentiate
14163                  * it from failing to allocate a buffer due to failure to meet
14164                  * the reserve (for which we return E2BIG).
14165                  */
14166                 rval = ENOSPC;
14167                 goto out;
14168         }
14169
14170         if ((rval = dtrace_state_buffers(state)) != 0)
14171                 goto err;
14172
14173         if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14174                 sz = dtrace_dstate_defsize;
14175
14176         do {
14177                 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14178
14179                 if (rval == 0)
14180                         break;
14181
14182                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14183                         goto err;
14184         } while (sz >>= 1);
14185
14186         opt[DTRACEOPT_DYNVARSIZE] = sz;
14187
14188         if (rval != 0)
14189                 goto err;
14190
14191         if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14192                 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14193
14194         if (opt[DTRACEOPT_CLEANRATE] == 0)
14195                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14196
14197         if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14198                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14199
14200         if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14201                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14202
14203         hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14204         hdlr.cyh_arg = state;
14205         hdlr.cyh_level = CY_LOW_LEVEL;
14206
14207         when.cyt_when = 0;
14208         when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14209
14210         state->dts_cleaner = cyclic_add(&hdlr, &when);
14211
14212         hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14213         hdlr.cyh_arg = state;
14214         hdlr.cyh_level = CY_LOW_LEVEL;
14215
14216         when.cyt_when = 0;
14217         when.cyt_interval = dtrace_deadman_interval;
14218
14219         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14220         state->dts_deadman = cyclic_add(&hdlr, &when);
14221
14222         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14223
14224         /*
14225          * Now it's time to actually fire the BEGIN probe.  We need to disable
14226          * interrupts here both to record the CPU on which we fired the BEGIN
14227          * probe (the data from this CPU will be processed first at user
14228          * level) and to manually activate the buffer for this CPU.
14229          */
14230         cookie = dtrace_interrupt_disable();
14231         *cpu = CPU->cpu_id;
14232         ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14233         state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14234
14235         dtrace_probe(dtrace_probeid_begin,
14236             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14237         dtrace_interrupt_enable(cookie);
14238         /*
14239          * We may have had an exit action from a BEGIN probe; only change our
14240          * state to ACTIVE if we're still in WARMUP.
14241          */
14242         ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14243             state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14244
14245         if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14246                 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14247
14248         /*
14249          * Regardless of whether or not now we're in ACTIVE or DRAINING, we
14250          * want each CPU to transition its principal buffer out of the
14251          * INACTIVE state.  Doing this assures that no CPU will suddenly begin
14252          * processing an ECB halfway down a probe's ECB chain; all CPUs will
14253          * atomically transition from processing none of a state's ECBs to
14254          * processing all of them.
14255          */
14256         dtrace_xcall(DTRACE_CPUALL,
14257             (dtrace_xcall_t)dtrace_buffer_activate, state);
14258         goto out;
14259
14260 err:
14261         dtrace_buffer_free(state->dts_buffer);
14262         dtrace_buffer_free(state->dts_aggbuffer);
14263
14264         if ((nspec = state->dts_nspeculations) == 0) {
14265                 ASSERT(state->dts_speculations == NULL);
14266                 goto out;
14267         }
14268
14269         spec = state->dts_speculations;
14270         ASSERT(spec != NULL);
14271
14272         for (i = 0; i < state->dts_nspeculations; i++) {
14273                 if ((buf = spec[i].dtsp_buffer) == NULL)
14274                         break;
14275
14276                 dtrace_buffer_free(buf);
14277                 kmem_free(buf, bufsize);
14278         }
14279
14280         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14281         state->dts_nspeculations = 0;
14282         state->dts_speculations = NULL;
14283
14284 out:
14285         lck_mtx_unlock(&dtrace_lock);
14286         lck_mtx_unlock(&cpu_lock);
14287
14288         return (rval);
14289 }
14290
14291 static int
14292 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14293 {
14294         dtrace_icookie_t cookie;
14295
14296         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14297
14298         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14299             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14300                 return (EINVAL);
14301
14302         /*
14303          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14304          * to be sure that every CPU has seen it.  See below for the details
14305          * on why this is done.
14306          */
14307         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14308         dtrace_sync();
14309
14310         /*
14311          * By this point, it is impossible for any CPU to be still processing
14312          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14313          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14314          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14315          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14316          * iff we're in the END probe.
14317          */
14318         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14319         dtrace_sync();
14320         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14321
14322         /*
14323          * Finally, we can release the reserve and call the END probe.  We
14324          * disable interrupts across calling the END probe to allow us to
14325          * return the CPU on which we actually called the END probe.  This
14326          * allows user-land to be sure that this CPU's principal buffer is
14327          * processed last.
14328          */
14329         state->dts_reserve = 0;
14330
14331         cookie = dtrace_interrupt_disable();
14332         *cpu = CPU->cpu_id;
14333         dtrace_probe(dtrace_probeid_end,
14334             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14335         dtrace_interrupt_enable(cookie);
14336
14337         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14338         dtrace_sync();
14339
14340         return (0);
14341 }
14342
14343 static int
14344 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14345     dtrace_optval_t val)
14346 {
14347         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14348
14349         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14350                 return (EBUSY);
14351
14352         if (option >= DTRACEOPT_MAX)
14353                 return (EINVAL);
14354
14355         if (option != DTRACEOPT_CPU && val < 0)
14356                 return (EINVAL);
14357
14358         switch (option) {
14359         case DTRACEOPT_DESTRUCTIVE:
14360                 if (dtrace_destructive_disallow)
14361                         return (EACCES);
14362
14363                 state->dts_cred.dcr_destructive = 1;
14364                 break;
14365
14366         case DTRACEOPT_BUFSIZE:
14367         case DTRACEOPT_DYNVARSIZE:
14368         case DTRACEOPT_AGGSIZE:
14369         case DTRACEOPT_SPECSIZE:
14370         case DTRACEOPT_STRSIZE:
14371                 if (val < 0)
14372                         return (EINVAL);
14373
14374                 if (val >= LONG_MAX) {
14375                         /*
14376                          * If this is an otherwise negative value, set it to
14377                          * the highest multiple of 128m less than LONG_MAX.
14378                          * Technically, we're adjusting the size without
14379                          * regard to the buffer resizing policy, but in fact,
14380                          * this has no effect -- if we set the buffer size to
14381                          * ~LONG_MAX and the buffer policy is ultimately set to
14382                          * be "manual", the buffer allocation is guaranteed to
14383                          * fail, if only because the allocation requires two
14384                          * buffers.  (We set the the size to the highest
14385                          * multiple of 128m because it ensures that the size
14386                          * will remain a multiple of a megabyte when
14387                          * repeatedly halved -- all the way down to 15m.)
14388                          */
14389                         val = LONG_MAX - (1 << 27) + 1;
14390                 }
14391         }
14392
14393         state->dts_options[option] = val;
14394
14395         return (0);
14396 }
14397
14398 static void
14399 dtrace_state_destroy(dtrace_state_t *state)
14400 {
14401         dtrace_ecb_t *ecb;
14402         dtrace_vstate_t *vstate = &state->dts_vstate;
14403         minor_t minor = getminor(state->dts_dev);
14404         int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14405         dtrace_speculation_t *spec = state->dts_speculations;
14406         int nspec = state->dts_nspeculations;
14407         uint32_t match;
14408
14409         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14410         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14411
14412         /*
14413          * First, retract any retained enablings for this state.
14414          */
14415         dtrace_enabling_retract(state);
14416         ASSERT(state->dts_nretained == 0);
14417
14418         if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14419             state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14420                 /*
14421                  * We have managed to come into dtrace_state_destroy() on a
14422                  * hot enabling -- almost certainly because of a disorderly
14423                  * shutdown of a consumer.  (That is, a consumer that is
14424                  * exiting without having called dtrace_stop().) In this case,
14425                  * we're going to set our activity to be KILLED, and then
14426                  * issue a sync to be sure that everyone is out of probe
14427                  * context before we start blowing away ECBs.
14428                  */
14429                 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14430                 dtrace_sync();
14431         }
14432
14433         /*
14434          * Release the credential hold we took in dtrace_state_create().
14435          */
14436         if (state->dts_cred.dcr_cred != NULL)
14437                 crfree(state->dts_cred.dcr_cred);
14438
14439         /*
14440          * Now we can safely disable and destroy any enabled probes.  Because
14441          * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14442          * (especially if they're all enabled), we take two passes through the
14443          * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14444          * in the second we disable whatever is left over.
14445          */
14446         for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14447                 for (i = 0; i < state->dts_necbs; i++) {
14448                         if ((ecb = state->dts_ecbs[i]) == NULL)
14449                                 continue;
14450
14451                         if (match && ecb->dte_probe != NULL) {
14452                                 dtrace_probe_t *probe = ecb->dte_probe;
14453                                 dtrace_provider_t *prov = probe->dtpr_provider;
14454
14455                                 if (!(prov->dtpv_priv.dtpp_flags & match))
14456                                         continue;
14457                         }
14458
14459                         dtrace_ecb_disable(ecb);
14460                         dtrace_ecb_destroy(ecb);
14461                 }
14462
14463                 if (!match)
14464                         break;
14465         }
14466
14467         /*
14468          * Before we free the buffers, perform one more sync to assure that
14469          * every CPU is out of probe context.
14470          */
14471         dtrace_sync();
14472
14473         dtrace_buffer_free(state->dts_buffer);
14474         dtrace_buffer_free(state->dts_aggbuffer);
14475
14476         for (i = 0; i < nspec; i++)
14477                 dtrace_buffer_free(spec[i].dtsp_buffer);
14478
14479         if (state->dts_cleaner != CYCLIC_NONE)
14480                 cyclic_remove(state->dts_cleaner);
14481
14482         if (state->dts_deadman != CYCLIC_NONE)
14483                 cyclic_remove(state->dts_deadman);
14484
14485         dtrace_dstate_fini(&vstate->dtvs_dynvars);
14486         dtrace_vstate_fini(vstate);
14487         kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14488
14489         if (state->dts_aggregations != NULL) {
14490 #if DEBUG
14491                 for (i = 0; i < state->dts_naggregations; i++)
14492                         ASSERT(state->dts_aggregations[i] == NULL);
14493 #endif
14494                 ASSERT(state->dts_naggregations > 0);
14495                 kmem_free(state->dts_aggregations,
14496                     state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14497         }
14498
14499         kmem_free(state->dts_buffer, bufsize);
14500         kmem_free(state->dts_aggbuffer, bufsize);
14501
14502         for (i = 0; i < nspec; i++)
14503                 kmem_free(spec[i].dtsp_buffer, bufsize);
14504
14505         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14506
14507         dtrace_format_destroy(state);
14508
14509         vmem_destroy(state->dts_aggid_arena);
14510         ddi_soft_state_free(dtrace_softstate, minor);
14511         vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14512 }
14513
14514 /*
14515  * DTrace Anonymous Enabling Functions
14516  */
14517 static dtrace_state_t *
14518 dtrace_anon_grab(void)
14519 {
14520         dtrace_state_t *state;
14521
14522         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14523
14524         if ((state = dtrace_anon.dta_state) == NULL) {
14525                 ASSERT(dtrace_anon.dta_enabling == NULL);
14526                 return (NULL);
14527         }
14528
14529         ASSERT(dtrace_anon.dta_enabling != NULL);
14530         ASSERT(dtrace_retained != NULL);
14531
14532         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14533         dtrace_anon.dta_enabling = NULL;
14534         dtrace_anon.dta_state = NULL;
14535
14536         return (state);
14537 }
14538
14539 static void
14540 dtrace_anon_property(void)
14541 {
14542         int i, rv;
14543         dtrace_state_t *state;
14544         dof_hdr_t *dof;
14545         char c[32];             /* enough for "dof-data-" + digits */
14546
14547         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14548         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14549
14550         for (i = 0; ; i++) {
14551                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14552
14553                 dtrace_err_verbose = 1;
14554
14555                 if ((dof = dtrace_dof_property(c)) == NULL) {
14556                         dtrace_err_verbose = 0;
14557                         break;
14558                 }
14559
14560                 /*
14561                  * We want to create anonymous state, so we need to transition
14562                  * the kernel debugger to indicate that DTrace is active.  If
14563                  * this fails (e.g. because the debugger has modified text in
14564                  * some way), we won't continue with the processing.
14565                  */
14566                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14567                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14568                             "enabling ignored.");
14569                         dtrace_dof_destroy(dof);
14570                         break;
14571                 }
14572
14573                 /*
14574                  * If we haven't allocated an anonymous state, we'll do so now.
14575                  */
14576                 if ((state = dtrace_anon.dta_state) == NULL) {
14577 #if !defined(__APPLE__)
14578                         state = dtrace_state_create(NULL, NULL);
14579                         dtrace_anon.dta_state = state;
14580                         if (state == NULL) {
14581 #else
14582                         rv = dtrace_state_create(NULL, NULL, &state);
14583                         dtrace_anon.dta_state = state;
14584                         if (rv != 0 || state == NULL) {
14585 #endif /* __APPLE__ */
14586                                 /*
14587                                  * This basically shouldn't happen:  the only
14588                                  * failure mode from dtrace_state_create() is a
14589                                  * failure of ddi_soft_state_zalloc() that
14590                                  * itself should never happen.  Still, the
14591                                  * interface allows for a failure mode, and
14592                                  * we want to fail as gracefully as possible:
14593                                  * we'll emit an error message and cease
14594                                  * processing anonymous state in this case.
14595                                  */
14596                                 cmn_err(CE_WARN, "failed to create "
14597                                     "anonymous state");
14598                                 dtrace_dof_destroy(dof);
14599                                 break;
14600                         }
14601                 }
14602
14603                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14604                     &dtrace_anon.dta_enabling, 0, B_TRUE);
14605
14606                 if (rv == 0)
14607                         rv = dtrace_dof_options(dof, state);
14608
14609                 dtrace_err_verbose = 0;
14610                 dtrace_dof_destroy(dof);
14611
14612                 if (rv != 0) {
14613                         /*
14614                          * This is malformed DOF; chuck any anonymous state
14615                          * that we created.
14616                          */
14617                         ASSERT(dtrace_anon.dta_enabling == NULL);
14618                         dtrace_state_destroy(state);
14619                         dtrace_anon.dta_state = NULL;
14620                         break;
14621                 }
14622
14623                 ASSERT(dtrace_anon.dta_enabling != NULL);
14624         }
14625
14626         if (dtrace_anon.dta_enabling != NULL) {
14627                 int rval;
14628
14629                 /*
14630                  * dtrace_enabling_retain() can only fail because we are
14631                  * trying to retain more enablings than are allowed -- but
14632                  * we only have one anonymous enabling, and we are guaranteed
14633                  * to be allowed at least one retained enabling; we assert
14634                  * that dtrace_enabling_retain() returns success.
14635                  */
14636                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14637                 ASSERT(rval == 0);
14638
14639                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14640         }
14641 }
14642
14643 /*
14644  * DTrace Helper Functions
14645  */
14646 static void
14647 dtrace_helper_trace(dtrace_helper_action_t *helper,
14648     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14649 {
14650 #if !defined(__APPLE__) /* Quiet compiler warning */
14651         uint32_t size, next, nnext, i;
14652 #else
14653         uint32_t size, next, nnext;
14654         int i;
14655 #endif /* __APPLE__ */
14656         dtrace_helptrace_t *ent;
14657         uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14658
14659         if (!dtrace_helptrace_enabled)
14660                 return;
14661
14662 #if !defined(__APPLE__) /* Quiet compiler warning */
14663         ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14664 #else
14665         ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14666 #endif /* __APPLE__ */
14667
14668         /*
14669          * What would a tracing framework be without its own tracing
14670          * framework?  (Well, a hell of a lot simpler, for starters...)
14671          */
14672         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14673             sizeof (uint64_t) - sizeof (uint64_t);
14674
14675         /*
14676          * Iterate until we can allocate a slot in the trace buffer.
14677          */
14678         do {
14679                 next = dtrace_helptrace_next;
14680
14681                 if (next + size < dtrace_helptrace_bufsize) {
14682                         nnext = next + size;
14683                 } else {
14684                         nnext = size;
14685                 }
14686         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14687
14688         /*
14689          * We have our slot; fill it in.
14690          */
14691         if (nnext == size)
14692                 next = 0;
14693
14694         ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14695         ent->dtht_helper = helper;
14696         ent->dtht_where = where;
14697         ent->dtht_nlocals = vstate->dtvs_nlocals;
14698
14699         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14700             mstate->dtms_fltoffs : -1;
14701         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14702         ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
14703
14704         for (i = 0; i < vstate->dtvs_nlocals; i++) {
14705                 dtrace_statvar_t *svar;
14706
14707                 if ((svar = vstate->dtvs_locals[i]) == NULL)
14708                         continue;
14709
14710                 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
14711                 ent->dtht_locals[i] =
14712                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
14713         }
14714 }
14715
14716 static uint64_t
14717 dtrace_helper(int which, dtrace_mstate_t *mstate,
14718     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14719 {
14720         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14721         uint64_t sarg0 = mstate->dtms_arg[0];
14722         uint64_t sarg1 = mstate->dtms_arg[1];
14723         uint64_t rval = 0;
14724         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14725         dtrace_helper_action_t *helper;
14726         dtrace_vstate_t *vstate;
14727         dtrace_difo_t *pred;
14728         int i, trace = dtrace_helptrace_enabled;
14729
14730         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14731
14732         if (helpers == NULL)
14733                 return (0);
14734
14735         if ((helper = helpers->dthps_actions[which]) == NULL)
14736                 return (0);
14737
14738         vstate = &helpers->dthps_vstate;
14739         mstate->dtms_arg[0] = arg0;
14740         mstate->dtms_arg[1] = arg1;
14741
14742         /*
14743          * Now iterate over each helper.  If its predicate evaluates to 'true',
14744          * we'll call the corresponding actions.  Note that the below calls
14745          * to dtrace_dif_emulate() may set faults in machine state.  This is
14746          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
14747          * the stored DIF offset with its own (which is the desired behavior).
14748          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14749          * from machine state; this is okay, too.
14750          */
14751         for (; helper != NULL; helper = helper->dtha_next) {
14752                 if ((pred = helper->dtha_predicate) != NULL) {
14753                         if (trace)
14754                                 dtrace_helper_trace(helper, mstate, vstate, 0);
14755
14756                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14757                                 goto next;
14758
14759                         if (*flags & CPU_DTRACE_FAULT)
14760                                 goto err;
14761                 }
14762
14763                 for (i = 0; i < helper->dtha_nactions; i++) {
14764                         if (trace)
14765                                 dtrace_helper_trace(helper,
14766                                     mstate, vstate, i + 1);
14767
14768                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
14769                             mstate, vstate, state);
14770
14771                         if (*flags & CPU_DTRACE_FAULT)
14772                                 goto err;
14773                 }
14774
14775 next:
14776                 if (trace)
14777                         dtrace_helper_trace(helper, mstate, vstate,
14778                             DTRACE_HELPTRACE_NEXT);
14779         }
14780
14781         if (trace)
14782                 dtrace_helper_trace(helper, mstate, vstate,
14783                     DTRACE_HELPTRACE_DONE);
14784
14785         /*
14786          * Restore the arg0 that we saved upon entry.
14787          */
14788         mstate->dtms_arg[0] = sarg0;
14789         mstate->dtms_arg[1] = sarg1;
14790
14791         return (rval);
14792
14793 err:
14794         if (trace)
14795                 dtrace_helper_trace(helper, mstate, vstate,
14796                     DTRACE_HELPTRACE_ERR);
14797
14798         /*
14799          * Restore the arg0 that we saved upon entry.
14800          */
14801         mstate->dtms_arg[0] = sarg0;
14802         mstate->dtms_arg[1] = sarg1;
14803
14804         return (NULL);
14805 }
14806
14807 static void
14808 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14809     dtrace_vstate_t *vstate)
14810 {
14811         int i;
14812
14813         if (helper->dtha_predicate != NULL)
14814                 dtrace_difo_release(helper->dtha_predicate, vstate);
14815
14816         for (i = 0; i < helper->dtha_nactions; i++) {
14817                 ASSERT(helper->dtha_actions[i] != NULL);
14818                 dtrace_difo_release(helper->dtha_actions[i], vstate);
14819         }
14820
14821         kmem_free(helper->dtha_actions,
14822             helper->dtha_nactions * sizeof (dtrace_difo_t *));
14823         kmem_free(helper, sizeof (dtrace_helper_action_t));
14824 }
14825
14826 #if !defined(__APPLE__)
14827 static int
14828 dtrace_helper_destroygen(int gen)
14829 {
14830         proc_t *p = curproc;
14831 #else
14832 static int
14833 dtrace_helper_destroygen(proc_t* p, int gen)
14834 {
14835 #endif
14836         dtrace_helpers_t *help = p->p_dtrace_helpers;
14837         dtrace_vstate_t *vstate;
14838 #if !defined(__APPLE__) /* Quiet compiler warning */
14839         int i;
14840 #else
14841         uint_t i;
14842 #endif /* __APPLE__ */
14843
14844         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14845
14846         if (help == NULL || gen > help->dthps_generation)
14847                 return (EINVAL);
14848
14849         vstate = &help->dthps_vstate;
14850
14851         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14852                 dtrace_helper_action_t *last = NULL, *h, *next;
14853
14854                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14855                         next = h->dtha_next;
14856
14857                         if (h->dtha_generation == gen) {
14858                                 if (last != NULL) {
14859                                         last->dtha_next = next;
14860                                 } else {
14861                                         help->dthps_actions[i] = next;
14862                                 }
14863
14864                                 dtrace_helper_action_destroy(h, vstate);
14865                         } else {
14866                                 last = h;
14867                         }
14868                 }
14869         }
14870
14871         /*
14872          * Interate until we've cleared out all helper providers with the
14873          * given generation number.
14874          */
14875         for (;;) {
14876                 dtrace_helper_provider_t *prov = NULL;
14877
14878                 /*
14879                  * Look for a helper provider with the right generation. We
14880                  * have to start back at the beginning of the list each time
14881                  * because we drop dtrace_lock. It's unlikely that we'll make
14882                  * more than two passes.
14883                  */
14884                 for (i = 0; i < help->dthps_nprovs; i++) {
14885                         prov = help->dthps_provs[i];
14886
14887                         if (prov->dthp_generation == gen)
14888                                 break;
14889                 }
14890
14891                 /*
14892                  * If there were no matches, we're done.
14893                  */
14894                 if (i == help->dthps_nprovs)
14895                         break;
14896
14897                 /*
14898                  * Move the last helper provider into this slot.
14899                  */
14900                 help->dthps_nprovs--;
14901                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14902                 help->dthps_provs[help->dthps_nprovs] = NULL;
14903
14904                 lck_mtx_unlock(&dtrace_lock);
14905
14906                 /*
14907                  * If we have a meta provider, remove this helper provider.
14908                  */
14909                 lck_mtx_lock(&dtrace_meta_lock);
14910                 if (dtrace_meta_pid != NULL) {
14911                         ASSERT(dtrace_deferred_pid == NULL);
14912                         dtrace_helper_provider_remove(&prov->dthp_prov,
14913                             p->p_pid);
14914                 }
14915                 lck_mtx_unlock(&dtrace_meta_lock);
14916
14917                 dtrace_helper_provider_destroy(prov);
14918
14919                 lck_mtx_lock(&dtrace_lock);
14920         }
14921
14922         return (0);
14923 }
14924
14925 static int
14926 dtrace_helper_validate(dtrace_helper_action_t *helper)
14927 {
14928         int err = 0, i;
14929         dtrace_difo_t *dp;
14930
14931         if ((dp = helper->dtha_predicate) != NULL)
14932                 err += dtrace_difo_validate_helper(dp);
14933
14934         for (i = 0; i < helper->dtha_nactions; i++)
14935                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14936
14937         return (err == 0);
14938 }
14939
14940 #if !defined(__APPLE__)
14941 static int
14942 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
14943 #else
14944 static int
14945 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
14946 #endif
14947 {
14948         dtrace_helpers_t *help;
14949         dtrace_helper_action_t *helper, *last;
14950         dtrace_actdesc_t *act;
14951         dtrace_vstate_t *vstate;
14952         dtrace_predicate_t *pred;
14953         int count = 0, nactions = 0, i;
14954
14955         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14956                 return (EINVAL);
14957
14958 #if !defined(__APPLE__)
14959         help = curproc->p_dtrace_helpers;
14960 #else
14961         help = p->p_dtrace_helpers;
14962 #endif
14963         last = help->dthps_actions[which];
14964         vstate = &help->dthps_vstate;
14965
14966         for (count = 0; last != NULL; last = last->dtha_next) {
14967                 count++;
14968                 if (last->dtha_next == NULL)
14969                         break;
14970         }
14971
14972         /*
14973          * If we already have dtrace_helper_actions_max helper actions for this
14974          * helper action type, we'll refuse to add a new one.
14975          */
14976         if (count >= dtrace_helper_actions_max)
14977                 return (ENOSPC);
14978
14979         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
14980         helper->dtha_generation = help->dthps_generation;
14981
14982         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
14983                 ASSERT(pred->dtp_difo != NULL);
14984                 dtrace_difo_hold(pred->dtp_difo);
14985                 helper->dtha_predicate = pred->dtp_difo;
14986         }
14987
14988         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
14989                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
14990                         goto err;
14991
14992                 if (act->dtad_difo == NULL)
14993                         goto err;
14994
14995                 nactions++;
14996         }
14997
14998         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
14999             (helper->dtha_nactions = nactions), KM_SLEEP);
15000
15001         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15002                 dtrace_difo_hold(act->dtad_difo);
15003                 helper->dtha_actions[i++] = act->dtad_difo;
15004         }
15005
15006         if (!dtrace_helper_validate(helper))
15007                 goto err;
15008
15009         if (last == NULL) {
15010                 help->dthps_actions[which] = helper;
15011         } else {
15012                 last->dtha_next = helper;
15013         }
15014
15015 #if !defined(__APPLE__) /* Quiet compiler warning */
15016         if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15017 #else
15018         if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15019 #endif /* __APPLE__ */
15020                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15021                 dtrace_helptrace_next = 0;
15022         }
15023
15024         return (0);
15025 err:
15026         dtrace_helper_action_destroy(helper, vstate);
15027         return (EINVAL);
15028 }
15029
15030 static void
15031 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15032     dof_helper_t *dofhp)
15033 {
15034         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15035
15036         lck_mtx_lock(&dtrace_meta_lock);
15037         lck_mtx_lock(&dtrace_lock);
15038
15039         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15040                 /*
15041                  * If the dtrace module is loaded but not attached, or if
15042                  * there aren't isn't a meta provider registered to deal with
15043                  * these provider descriptions, we need to postpone creating
15044                  * the actual providers until later.
15045                  */
15046
15047                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15048                     dtrace_deferred_pid != help) {
15049                         help->dthps_deferred = 1;
15050                         help->dthps_pid = p->p_pid;
15051                         help->dthps_next = dtrace_deferred_pid;
15052                         help->dthps_prev = NULL;
15053                         if (dtrace_deferred_pid != NULL)
15054                                 dtrace_deferred_pid->dthps_prev = help;
15055                         dtrace_deferred_pid = help;
15056                 }
15057
15058                 lck_mtx_unlock(&dtrace_lock);
15059
15060         } else if (dofhp != NULL) {
15061                 /*
15062                  * If the dtrace module is loaded and we have a particular
15063                  * helper provider description, pass that off to the
15064                  * meta provider.
15065                  */
15066
15067                 lck_mtx_unlock(&dtrace_lock);
15068
15069                 dtrace_helper_provide(dofhp, p->p_pid);
15070
15071         } else {
15072                 /*
15073                  * Otherwise, just pass all the helper provider descriptions
15074                  * off to the meta provider.
15075                  */
15076
15077 #if !defined(__APPLE__) /* Quiet compiler warning */
15078                 int i;
15079 #else
15080                 uint_t i;
15081 #endif /* __APPLE__ */
15082                 lck_mtx_unlock(&dtrace_lock);
15083
15084                 for (i = 0; i < help->dthps_nprovs; i++) {
15085                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15086                             p->p_pid);
15087                 }
15088         }
15089
15090         lck_mtx_unlock(&dtrace_meta_lock);
15091 }
15092
15093 #if !defined(__APPLE__)
15094 static int
15095 dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15096 #else
15097 static int
15098 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15099 #endif
15100 {
15101         dtrace_helpers_t *help;
15102         dtrace_helper_provider_t *hprov, **tmp_provs;
15103         uint_t tmp_maxprovs, i;
15104
15105         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15106
15107 #if !defined(__APPLE__)
15108         help = curproc->p_dtrace_helpers;
15109 #else
15110         help = p->p_dtrace_helpers;
15111 #endif
15112         ASSERT(help != NULL);
15113
15114         /*
15115          * If we already have dtrace_helper_providers_max helper providers,
15116          * we're refuse to add a new one.
15117          */
15118         if (help->dthps_nprovs >= dtrace_helper_providers_max)
15119                 return (ENOSPC);
15120
15121         /*
15122          * Check to make sure this isn't a duplicate.
15123          */
15124         for (i = 0; i < help->dthps_nprovs; i++) {
15125                 if (dofhp->dofhp_addr ==
15126                     help->dthps_provs[i]->dthp_prov.dofhp_addr)
15127                         return (EALREADY);
15128         }
15129
15130         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15131         hprov->dthp_prov = *dofhp;
15132         hprov->dthp_ref = 1;
15133         hprov->dthp_generation = gen;
15134
15135         /*
15136          * Allocate a bigger table for helper providers if it's already full.
15137          */
15138         if (help->dthps_maxprovs == help->dthps_nprovs) {
15139                 tmp_maxprovs = help->dthps_maxprovs;
15140                 tmp_provs = help->dthps_provs;
15141
15142                 if (help->dthps_maxprovs == 0)
15143                         help->dthps_maxprovs = 2;
15144                 else
15145                         help->dthps_maxprovs *= 2;
15146                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15147                         help->dthps_maxprovs = dtrace_helper_providers_max;
15148
15149                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15150
15151                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15152                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15153
15154                 if (tmp_provs != NULL) {
15155                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15156                             sizeof (dtrace_helper_provider_t *));
15157                         kmem_free(tmp_provs, tmp_maxprovs *
15158                             sizeof (dtrace_helper_provider_t *));
15159                 }
15160         }
15161
15162         help->dthps_provs[help->dthps_nprovs] = hprov;
15163         help->dthps_nprovs++;
15164
15165         return (0);
15166 }
15167
15168 static void
15169 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15170 {
15171         lck_mtx_lock(&dtrace_lock);
15172
15173         if (--hprov->dthp_ref == 0) {
15174                 dof_hdr_t *dof;
15175                 lck_mtx_unlock(&dtrace_lock);
15176                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15177                 dtrace_dof_destroy(dof);
15178                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15179         } else {
15180                 lck_mtx_unlock(&dtrace_lock);
15181         }
15182 }
15183
15184 static int
15185 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15186 {
15187         uintptr_t daddr = (uintptr_t)dof;
15188         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15189         dof_provider_t *provider;
15190         dof_probe_t *probe;
15191         uint8_t *arg;
15192         char *strtab, *typestr;
15193         dof_stridx_t typeidx;
15194         size_t typesz;
15195         uint_t nprobes, j, k;
15196
15197         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15198
15199         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15200                 dtrace_dof_error(dof, "misaligned section offset");
15201                 return (-1);
15202         }
15203
15204         /*
15205          * The section needs to be large enough to contain the DOF provider
15206          * structure appropriate for the given version.
15207          */
15208         if (sec->dofs_size <
15209             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15210             offsetof(dof_provider_t, dofpv_prenoffs) :
15211             sizeof (dof_provider_t))) {
15212                 dtrace_dof_error(dof, "provider section too small");
15213                 return (-1);
15214         }
15215
15216         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15217         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15218         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15219         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15220         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15221
15222         if (str_sec == NULL || prb_sec == NULL ||
15223             arg_sec == NULL || off_sec == NULL)
15224                 return (-1);
15225
15226         enoff_sec = NULL;
15227
15228         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15229             provider->dofpv_prenoffs != DOF_SECT_NONE &&
15230             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15231             provider->dofpv_prenoffs)) == NULL)
15232                 return (-1);
15233
15234         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15235
15236         if (provider->dofpv_name >= str_sec->dofs_size ||
15237             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15238                 dtrace_dof_error(dof, "invalid provider name");
15239                 return (-1);
15240         }
15241
15242         if (prb_sec->dofs_entsize == 0 ||
15243             prb_sec->dofs_entsize > prb_sec->dofs_size) {
15244                 dtrace_dof_error(dof, "invalid entry size");
15245                 return (-1);
15246         }
15247
15248         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15249                 dtrace_dof_error(dof, "misaligned entry size");
15250                 return (-1);
15251         }
15252
15253         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15254                 dtrace_dof_error(dof, "invalid entry size");
15255                 return (-1);
15256         }
15257
15258         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15259                 dtrace_dof_error(dof, "misaligned section offset");
15260                 return (-1);
15261         }
15262
15263         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15264                 dtrace_dof_error(dof, "invalid entry size");
15265                 return (-1);
15266         }
15267
15268         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15269
15270         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15271
15272         /*
15273          * Take a pass through the probes to check for errors.
15274          */
15275         for (j = 0; j < nprobes; j++) {
15276                 probe = (dof_probe_t *)(uintptr_t)(daddr +
15277                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15278
15279                 if (probe->dofpr_func >= str_sec->dofs_size) {
15280                         dtrace_dof_error(dof, "invalid function name");
15281                         return (-1);
15282                 }
15283
15284                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15285                         dtrace_dof_error(dof, "function name too long");
15286                         return (-1);
15287                 }
15288
15289                 if (probe->dofpr_name >= str_sec->dofs_size ||
15290                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15291                         dtrace_dof_error(dof, "invalid probe name");
15292                         return (-1);
15293                 }
15294
15295                 /*
15296                  * The offset count must not wrap the index, and the offsets
15297                  * must also not overflow the section's data.
15298                  */
15299                 if (probe->dofpr_offidx + probe->dofpr_noffs <
15300                     probe->dofpr_offidx ||
15301                     (probe->dofpr_offidx + probe->dofpr_noffs) *
15302                     off_sec->dofs_entsize > off_sec->dofs_size) {
15303                         dtrace_dof_error(dof, "invalid probe offset");
15304                         return (-1);
15305                 }
15306
15307                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15308                         /*
15309                          * If there's no is-enabled offset section, make sure
15310                          * there aren't any is-enabled offsets. Otherwise
15311                          * perform the same checks as for probe offsets
15312                          * (immediately above).
15313                          */
15314                         if (enoff_sec == NULL) {
15315                                 if (probe->dofpr_enoffidx != 0 ||
15316                                     probe->dofpr_nenoffs != 0) {
15317                                         dtrace_dof_error(dof, "is-enabled "
15318                                             "offsets with null section");
15319                                         return (-1);
15320                                 }
15321                         } else if (probe->dofpr_enoffidx +
15322                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15323                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15324                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15325                                 dtrace_dof_error(dof, "invalid is-enabled "
15326                                     "offset");
15327                                 return (-1);
15328                         }
15329
15330                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15331                                 dtrace_dof_error(dof, "zero probe and "
15332                                     "is-enabled offsets");
15333                                 return (-1);
15334                         }
15335                 } else if (probe->dofpr_noffs == 0) {
15336                         dtrace_dof_error(dof, "zero probe offsets");
15337                         return (-1);
15338                 }
15339
15340                 if (probe->dofpr_argidx + probe->dofpr_xargc <
15341                     probe->dofpr_argidx ||
15342                     (probe->dofpr_argidx + probe->dofpr_xargc) *
15343                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
15344                         dtrace_dof_error(dof, "invalid args");
15345                         return (-1);
15346                 }
15347
15348                 typeidx = probe->dofpr_nargv;
15349                 typestr = strtab + probe->dofpr_nargv;
15350                 for (k = 0; k < probe->dofpr_nargc; k++) {
15351                         if (typeidx >= str_sec->dofs_size) {
15352                                 dtrace_dof_error(dof, "bad "
15353                                     "native argument type");
15354                                 return (-1);
15355                         }
15356
15357                         typesz = strlen(typestr) + 1;
15358                         if (typesz > DTRACE_ARGTYPELEN) {
15359                                 dtrace_dof_error(dof, "native "
15360                                     "argument type too long");
15361                                 return (-1);
15362                         }
15363                         typeidx += typesz;
15364                         typestr += typesz;
15365                 }
15366
15367                 typeidx = probe->dofpr_xargv;
15368                 typestr = strtab + probe->dofpr_xargv;
15369                 for (k = 0; k < probe->dofpr_xargc; k++) {
15370                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15371                                 dtrace_dof_error(dof, "bad "
15372                                     "native argument index");
15373                                 return (-1);
15374                         }
15375
15376                         if (typeidx >= str_sec->dofs_size) {
15377                                 dtrace_dof_error(dof, "bad "
15378                                     "translated argument type");
15379                                 return (-1);
15380                         }
15381
15382                         typesz = strlen(typestr) + 1;
15383                         if (typesz > DTRACE_ARGTYPELEN) {
15384                                 dtrace_dof_error(dof, "translated argument "
15385                                     "type too long");
15386                                 return (-1);
15387                         }
15388
15389                         typeidx += typesz;
15390                         typestr += typesz;
15391                 }
15392         }
15393
15394         return (0);
15395 }
15396
/*
 * Load the helper actions, and optionally the helper providers, described
 * by a DOF image into a process's helper state.
 *
 *   p   - (Apple builds only) the process whose helpers are updated; the
 *         non-Apple variant operates on curproc instead
 *   dof - the DOF image to process
 *   dhp - helper description; may be NULL, in which case no provider
 *         sections are examined and 0 is passed as the address argument
 *         to dtrace_dof_slurp()
 *
 * The caller must hold dtrace_lock (asserted below).  The lock is dropped
 * and re-acquired around dtrace_helper_provider_register().
 *
 * Returns the helper generation used for this DOF (the value of
 * dthps_generation before it is incremented) on success.  On failure it
 * returns the non-zero result of dtrace_dof_slurp(), or -1 if provider
 * validation or adding a helper action fails.
 *
 * On every path that returns from this function, the DOF is either
 * destroyed here or, when dtrace_helper_provider_add() returns 0, left
 * referenced through dhp->dofhp_dof.
 */
#if !defined(__APPLE__)
static int
dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
#else
static int
dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
#endif
{
        dtrace_helpers_t *help;
        dtrace_vstate_t *vstate;
        dtrace_enabling_t *enab = NULL;
        int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
        uintptr_t daddr = (uintptr_t)dof;

        lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);

        /* Use the process's existing helper state, creating it on demand. */
#if !defined(__APPLE__)
        if ((help = curproc->p_dtrace_helpers) == NULL)
                help = dtrace_helpers_create(curproc);
#else
        if ((help = p->p_dtrace_helpers) == NULL)
                help = dtrace_helpers_create(p);
#endif

        vstate = &help->dthps_vstate;

        /* Process the DOF into an enabling; on failure the DOF is discarded. */
        if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
            dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
                dtrace_dof_destroy(dof);
                return (rv);
        }

        /*
         * Look for helper providers and validate their descriptions.
         */
        if (dhp != NULL) {
#if !defined(__APPLE__) /* Quiet compiler warning */
                for (i = 0; i < dof->dofh_secnum; i++) {
#else
                for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
#endif /* __APPLE__ */
                        dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
                            dof->dofh_secoff + i * dof->dofh_secsize);

                        if (sec->dofs_type != DOF_SECT_PROVIDER)
                                continue;

                        /* One bad provider section rejects the whole DOF. */
                        if (dtrace_helper_provider_validate(dof, sec) != 0) {
                                dtrace_enabling_destroy(enab);
                                dtrace_dof_destroy(dof);
                                return (-1);
                        }

                        nprovs++;
                }
        }

        /*
         * Now we need to walk through the ECB descriptions in the enabling.
         */
        for (i = 0; i < enab->dten_ndesc; i++) {
                dtrace_ecbdesc_t *ep = enab->dten_desc[i];
                dtrace_probedesc_t *desc = &ep->dted_probe;

                /*
                 * Only descriptions naming dtrace:helper:ustack become
                 * helper actions; anything else is skipped (and reported
                 * below as "unmatched helpers").
                 */
#if !defined(__APPLE__)
                if (strcmp(desc->dtpd_provider, "dtrace") != 0)
                        continue;

                if (strcmp(desc->dtpd_mod, "helper") != 0)
                        continue;

                if (strcmp(desc->dtpd_func, "ustack") != 0)
                        continue;
#else /* Employ size bounded string operation. */
                if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
                        continue;

                if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
                        continue;

                if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
                        continue;
#endif /* __APPLE__ */

#if !defined(__APPLE__)
                if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
                    ep)) != 0) {
#else
                if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
                    ep)) != 0) {
#endif
                        /*
                         * Adding this helper action failed -- we are now going
                         * to rip out the entire generation and return failure.
                         */
#if !defined(__APPLE__)
                        (void) dtrace_helper_destroygen(help->dthps_generation);
#else
                        (void) dtrace_helper_destroygen(p, help->dthps_generation);
#endif
                        dtrace_enabling_destroy(enab);
                        dtrace_dof_destroy(dof);
                        return (-1);
                }

                nhelpers++;
        }

        /* Skipped descriptions are reported but do not fail the load. */
        if (nhelpers < enab->dten_ndesc)
                dtrace_dof_error(dof, "unmatched helpers");

        /* Claim the current generation for this DOF and advance the counter. */
        gen = help->dthps_generation++;
        dtrace_enabling_destroy(enab);

        if (dhp != NULL && nprovs > 0) {
                /* Record the in-kernel DOF address in the helper description. */
                dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
#if !defined(__APPLE__)
                if (dtrace_helper_provider_add(dhp, gen) == 0) {
#else
                if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
#endif
                        /*
                         * Registration is performed with dtrace_lock dropped;
                         * the lock is re-taken before returning to the caller.
                         */
                        lck_mtx_unlock(&dtrace_lock);
#if !defined(__APPLE__)
                        dtrace_helper_provider_register(curproc, help, dhp);
#else
                        dtrace_helper_provider_register(p, help, dhp);
#endif
                        lck_mtx_lock(&dtrace_lock);

                        /* The DOF is still referenced via dhp; keep it. */
                        destroy = 0;
                }
        }

        if (destroy)
                dtrace_dof_destroy(dof);

        return (gen);
}
15535
15536 #if defined(__APPLE__)
15537
15538 /*
15539  * DTrace lazy dof
15540  *
15541  * DTrace user static probes (USDT probes) and helper actions are loaded
 * in a process by processing dof sections. The dof sections are passed
15543  * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15544  * expensive to process dof for a process that will never use it. There
15545  * is a memory cost (allocating the providers/probes), and a cpu cost
15546  * (creating the providers/probes).
15547  *
 * To reduce this cost, we use "lazy dof". The normal procedure for
15549  * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15551  * used, each process retains the dof_ioctl_data_t block, instead of
15552  * copying in the data it points to.
15553  *
15554  * The dof_ioctl_data_t blocks are managed as if they were the actual
15555  * processed dof; on fork the block is copied to the child, on exec and
15556  * exit the block is freed.
15557  *
15558  * If the process loads library(s) containing additional dof, the
15559  * new dof_ioctl_data_t is merged with the existing block.
15560  *
15561  * There are a few catches that make this slightly more difficult.
15562  * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15563  * identifier value for each dof in the block. In non-lazy dof terms,
15564  * this is the generation that dof was loaded in. If we hand back
15565  * a UID for a lazy dof, that same UID must be able to unload the
15566  * dof once it has become non-lazy. To meet this requirement, the
15567  * code that loads lazy dof requires that the UID's for dof(s) in
15568  * the lazy dof be sorted, and in ascending order. It is okay to skip
15569  * UID's, I.E., 1 -> 5 -> 6 is legal.
15570  *
15571  * Once a process has become non-lazy, it will stay non-lazy. All
15572  * future dof operations for that process will be non-lazy, even
15573  * if the dof mode transitions back to lazy.
15574  *
15575  * Always do lazy dof checks before non-lazy (I.E. In fork, exit, exec.).
15576  * That way if the lazy check fails due to transitioning to non-lazy, the
15577  * right thing is done with the newly faulted in dof.
15578  */
15579
15580 /*
15581  * This method is a bit squicky. It must handle:
15582  *
15583  * dof should not be lazy.
15584  * dof should have been handled lazily, but there was an error
15585  * dof was handled lazily, and needs to be freed.
15586  * dof was handled lazily, and must not be freed.
15587  *
15588  *
 * Returns EACCES if dof should be handled non-lazily.
15590  *
15591  * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15592  *
15593  * If the dofs data is claimed by this method, dofs_claimed will be set.
15594  * Callers should not free claimed dofs.
15595  */
15596 static int
15597 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15598 {
15599         ASSERT(p);
15600         ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15601
15602         int rval = 0;
15603         *dofs_claimed = 0;
15604
15605         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15606
15607         /*
15608          * If we have lazy dof, dof mode better be LAZY_ON.
15609          */
15610         ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
15611         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15612         ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15613
15614         /*
15615          * Any existing helpers force non-lazy behavior.
15616          */
15617         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15618                 lck_mtx_lock(&p->p_dtrace_sprlock);
15619
15620                 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15621                 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15622                 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15623
15624                 /*
15625                  * Range check...
15626                  */
15627                 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15628                         dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15629                         rval = EINVAL;
15630                         goto unlock;
15631                 }
15632
15633                 /*
15634                  * Each dof being added must be assigned a unique generation.
15635                  */
15636                 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15637                 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15638                         /*
15639                          * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15640                          */
15641                         ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15642                         incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15643                 }
15644
15645
15646                 if (existing_dofs) {
15647                         /*
15648                          * Merge the existing and incoming dofs
15649                          */
15650                         size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15651                         dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15652
15653                         bcopy(&existing_dofs->dofiod_helpers[0],
15654                               &merged_dofs->dofiod_helpers[0],
15655                               sizeof(dof_helper_t) * existing_dofs_count);
15656                         bcopy(&incoming_dofs->dofiod_helpers[0],
15657                               &merged_dofs->dofiod_helpers[existing_dofs_count],
15658                               sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15659
15660                         merged_dofs->dofiod_count = merged_dofs_count;
15661
15662                         kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15663
15664                         p->p_dtrace_lazy_dofs = merged_dofs;
15665                 } else {
15666                         /*
15667                          * Claim the incoming dofs
15668                          */
15669                         *dofs_claimed = 1;
15670                         p->p_dtrace_lazy_dofs = incoming_dofs;
15671                 }
15672
15673 #if DEBUG
15674                 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15675                 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15676                         ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15677                 }
15678 #endif /* DEBUG */
15679
15680 unlock:
15681                 lck_mtx_unlock(&p->p_dtrace_sprlock);
15682         } else {
15683                 rval = EACCES;
15684         }
15685
15686         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15687
15688         return rval;
15689 }
15690
15691 /*
15692  * Returns:
15693  *
15694  * EINVAL: lazy dof is enabled, but the requested generation was not found.
15695  * EACCES: This removal needs to be handled non-lazily.
15696  */
15697 static int
15698 dtrace_lazy_dofs_remove(proc_t *p, int generation)
15699 {
15700         int rval = EINVAL;
15701
15702         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15703
15704         /*
15705          * If we have lazy dof, dof mode better be LAZY_ON.
15706          */
15707         ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
15708         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15709         ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15710
15711         /*
15712          * Any existing helpers force non-lazy behavior.
15713          */
15714         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15715                 lck_mtx_lock(&p->p_dtrace_sprlock);
15716
15717                 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15718
15719                 if (existing_dofs) {
15720                         int index, existing_dofs_count = existing_dofs->dofiod_count;
15721                         for (index=0; index<existing_dofs_count; index++) {
15722                                 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15723                                         dof_ioctl_data_t* removed_dofs = NULL;
15724
15725                                         /*
15726                                          * If there is only 1 dof, we'll delete it and swap in NULL.
15727                                          */
15728                                         if (existing_dofs_count > 1) {
15729                                                 int removed_dofs_count = existing_dofs_count - 1;
15730                                                 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15731
15732                                                 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15733                                                 removed_dofs->dofiod_count = removed_dofs_count;
15734
15735                                                 /*
15736                                                  * copy the remaining data.
15737                                                  */
15738                                                 if (index > 0) {
15739                                                         bcopy(&existing_dofs->dofiod_helpers[0],
15740                                                               &removed_dofs->dofiod_helpers[0],
15741                                                               index * sizeof(dof_helper_t));
15742                                                 }
15743
15744                                                 if (index < existing_dofs_count-1) {
15745                                                         bcopy(&existing_dofs->dofiod_helpers[index+1],
15746                                                               &removed_dofs->dofiod_helpers[index],
15747                                                               (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
15748                                                 }
15749                                         }
15750
15751                                         kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15752
15753                                         p->p_dtrace_lazy_dofs = removed_dofs;
15754
15755                                         rval = KERN_SUCCESS;
15756
15757                                         break;
15758                                 }
15759                         }
15760
15761 #if DEBUG
15762                         dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15763                         if (all_dofs) {
15764                                 unsigned int i;
15765                                 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15766                                         ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15767                                 }
15768                         }
15769 #endif
15770
15771                 }
15772
15773                 lck_mtx_unlock(&p->p_dtrace_sprlock);
15774         } else {
15775                 rval = EACCES;
15776         }
15777
15778         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15779
15780         return rval;
15781 }
15782
15783 void
15784 dtrace_lazy_dofs_destroy(proc_t *p)
15785 {
15786         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15787         lck_mtx_lock(&p->p_dtrace_sprlock);
15788
15789         /*
15790          * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
15791          * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
15792          * kern_exit.c and kern_exec.c.
15793          */
15794         ASSERT(p->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON || p->p_lflag & P_LEXIT);
15795         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15796
15797         dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15798         p->p_dtrace_lazy_dofs = NULL;
15799
15800         lck_mtx_unlock(&p->p_dtrace_sprlock);
15801         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15802
15803         if (lazy_dofs) {
15804                 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15805         }
15806 }
15807
15808 void
15809 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
15810 {
15811         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15812         lck_mtx_assert(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15813         lck_mtx_assert(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15814
15815         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15816         lck_mtx_lock(&parent->p_dtrace_sprlock);
15817
15818         /*
15819          * If we have lazy dof, dof mode better be LAZY_ON, or we must be exiting.
15820          * We cannot assert against DTRACE_DOF_MODE_NEVER here, because we are called from
15821          * kern_fork.c
15822          */
15823         ASSERT(parent->p_dtrace_lazy_dofs == NULL || dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON);
15824         ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
15825         /*
15826          * In theory we should hold the child sprlock, but this is safe...
15827          */
15828         ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
15829
15830         dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
15831         dof_ioctl_data_t* child_dofs = NULL;
15832         if (parent_dofs) {
15833                 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
15834                 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
15835                 bcopy(parent_dofs, child_dofs, parent_dofs_size);
15836         }
15837
15838         lck_mtx_unlock(&parent->p_dtrace_sprlock);
15839
15840         if (child_dofs) {
15841                 lck_mtx_lock(&child->p_dtrace_sprlock);
15842                 child->p_dtrace_lazy_dofs = child_dofs;
15843                 lck_mtx_unlock(&child->p_dtrace_sprlock);
15844         }
15845
15846         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15847 }
15848
15849 static int
15850 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
15851 {
15852 #pragma unused(ignored)
15853         /*
15854          * Okay to NULL test without taking the sprlock.
15855          */
15856         return p->p_dtrace_lazy_dofs != NULL;
15857 }
15858
15859 static int
15860 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
15861 {
15862 #pragma unused(ignored)
15863         /*
15864          * It is possible this process may exit during our attempt to
15865          * fault in the dof. We could fix this by holding locks longer,
15866          * but the errors are benign.
15867          */
15868         lck_mtx_lock(&p->p_dtrace_sprlock);
15869
15870         /*
15871          * In this case only, it is okay to have lazy dof when dof mode is DTRACE_DOF_MODE_LAZY_OFF
15872          */
15873         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15874         ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
15875
15876
15877         dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15878         p->p_dtrace_lazy_dofs = NULL;
15879
15880         lck_mtx_unlock(&p->p_dtrace_sprlock);
15881
15882         /*
15883          * Process each dof_helper_t
15884          */
15885         if (lazy_dofs != NULL) {
15886                 unsigned int i;
15887                 int rval;
15888
15889                 for (i=0; i<lazy_dofs->dofiod_count; i++) {
15890                         /*
15891                          * When loading lazy dof, we depend on the generations being sorted in ascending order.
15892                          */
15893                         ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
15894
15895                         dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
15896
15897                         /*
15898                          * We stored the generation in dofhp_dof. Save it, and restore the original value.
15899                          */
15900                         int generation = dhp->dofhp_dof;
15901                         dhp->dofhp_dof = dhp->dofhp_addr;
15902
15903                         dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
15904
15905                         if (dof != NULL) {
15906                                 dtrace_helpers_t *help;
15907
15908                                 lck_mtx_lock(&dtrace_lock);
15909
15910                                 /*
15911                                  * This must be done with the dtrace_lock held
15912                                  */
15913                                 if ((help = p->p_dtrace_helpers) == NULL)
15914                                         help = dtrace_helpers_create(p);
15915
15916                                 /*
15917                                  * If the generation value has been bumped, someone snuck in
15918                                  * when we released the dtrace lock. We have to dump this generation,
15919                                  * there is no safe way to load it.
15920                                  */
15921                                 if (help->dthps_generation <= generation) {
15922                                         help->dthps_generation = generation;
15923
15924                                         /*
15925                                          * dtrace_helper_slurp() takes responsibility for the dof --
15926                                          * it may free it now or it may save it and free it later.
15927                                          */
15928                                         if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
15929                                                 dtrace_dof_error(NULL, "returned value did not match expected generation");
15930                                         }
15931                                 }
15932
15933                                 lck_mtx_unlock(&dtrace_lock);
15934                         }
15935                 }
15936
15937                 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15938         }
15939
15940         return PROC_RETURNED;
15941 }
15942
15943 #endif /* __APPLE__ */
15944
15945 static dtrace_helpers_t *
15946 dtrace_helpers_create(proc_t *p)
15947 {
15948         dtrace_helpers_t *help;
15949
15950         lck_mtx_assert(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15951         ASSERT(p->p_dtrace_helpers == NULL);
15952
15953         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15954         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15955             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15956
15957         p->p_dtrace_helpers = help;
15958         dtrace_helpers++;
15959
15960         return (help);
15961 }
15962
15963 #if !defined(__APPLE__)
15964 static void
15965 dtrace_helpers_destroy(void)
15966 {
15967         dtrace_helpers_t *help;
15968         dtrace_vstate_t *vstate;
15969         proc_t *p = curproc;
15970         int i;
15971 #else
15972 static void
15973 dtrace_helpers_destroy(proc_t* p)
15974 {
15975         dtrace_helpers_t *help;
15976         dtrace_vstate_t *vstate;
15977         uint_t i;
15978 #endif
15979
15980         lck_mtx_lock(&dtrace_lock);
15981
15982         ASSERT(p->p_dtrace_helpers != NULL);
15983         ASSERT(dtrace_helpers > 0);
15984
15985         help = p->p_dtrace_helpers;
15986         vstate = &help->dthps_vstate;
15987
15988         /*
15989          * We're now going to lose the help from this process.
15990          */
15991         p->p_dtrace_helpers = NULL;
15992         dtrace_sync();
15993
15994         /*
15995          * Destory the helper actions.
15996          */
15997         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15998                 dtrace_helper_action_t *h, *next;
15999
16000                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16001                         next = h->dtha_next;
16002                         dtrace_helper_action_destroy(h, vstate);
16003                         h = next;
16004                 }
16005         }
16006
16007         lck_mtx_unlock(&dtrace_lock);
16008
16009         /*
16010          * Destroy the helper providers.
16011          */
16012         if (help->dthps_maxprovs > 0) {
16013                 lck_mtx_lock(&dtrace_meta_lock);
16014                 if (dtrace_meta_pid != NULL) {
16015                         ASSERT(dtrace_deferred_pid == NULL);
16016
16017                         for (i = 0; i < help->dthps_nprovs; i++) {
16018                                 dtrace_helper_provider_remove(
16019                                     &help->dthps_provs[i]->dthp_prov, p->p_pid);
16020                         }
16021                 } else {
16022                         lck_mtx_lock(&dtrace_lock);
16023                         ASSERT(help->dthps_deferred == 0 ||
16024                             help->dthps_next != NULL ||
16025                             help->dthps_prev != NULL ||
16026                             help == dtrace_deferred_pid);
16027
16028                         /*
16029                          * Remove the helper from the deferred list.
16030                          */
16031                         if (help->dthps_next != NULL)
16032                                 help->dthps_next->dthps_prev = help->dthps_prev;
16033                         if (help->dthps_prev != NULL)
16034                                 help->dthps_prev->dthps_next = help->dthps_next;
16035                         if (dtrace_deferred_pid == help) {
16036                                 dtrace_deferred_pid = help->dthps_next;
16037                                 ASSERT(help->dthps_prev == NULL);
16038                         }
16039
16040                         lck_mtx_unlock(&dtrace_lock);
16041                 }
16042
16043                 lck_mtx_unlock(&dtrace_meta_lock);
16044
16045                 for (i = 0; i < help->dthps_nprovs; i++) {
16046                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
16047                 }
16048
16049                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16050                     sizeof (dtrace_helper_provider_t *));
16051         }
16052
16053         lck_mtx_lock(&dtrace_lock);
16054
16055         dtrace_vstate_fini(&help->dthps_vstate);
16056         kmem_free(help->dthps_actions,
16057             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16058         kmem_free(help, sizeof (dtrace_helpers_t));
16059
16060         --dtrace_helpers;
16061         lck_mtx_unlock(&dtrace_lock);
16062 }
16063
16064 static void
16065 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16066 {
16067         dtrace_helpers_t *help, *newhelp;
16068         dtrace_helper_action_t *helper, *new, *last;
16069         dtrace_difo_t *dp;
16070         dtrace_vstate_t *vstate;
16071 #if !defined(__APPLE__) /* Quiet compiler warning */
16072         int i, j, sz, hasprovs = 0;
16073 #else
16074         uint_t i;
16075         int j, sz, hasprovs = 0;
16076 #endif /* __APPLE__ */
16077
16078         lck_mtx_lock(&dtrace_lock);
16079         ASSERT(from->p_dtrace_helpers != NULL);
16080         ASSERT(dtrace_helpers > 0);
16081
16082         help = from->p_dtrace_helpers;
16083         newhelp = dtrace_helpers_create(to);
16084         ASSERT(to->p_dtrace_helpers != NULL);
16085
16086         newhelp->dthps_generation = help->dthps_generation;
16087         vstate = &newhelp->dthps_vstate;
16088
16089         /*
16090          * Duplicate the helper actions.
16091          */
16092         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16093                 if ((helper = help->dthps_actions[i]) == NULL)
16094                         continue;
16095
16096                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16097                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16098                             KM_SLEEP);
16099                         new->dtha_generation = helper->dtha_generation;
16100
16101                         if ((dp = helper->dtha_predicate) != NULL) {
16102                                 dp = dtrace_difo_duplicate(dp, vstate);
16103                                 new->dtha_predicate = dp;
16104                         }
16105
16106                         new->dtha_nactions = helper->dtha_nactions;
16107                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16108                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16109
16110 #if !defined(__APPLE__) /* Quiet compiler warning */
16111                         for (j = 0; j < new->dtha_nactions; j++) {
16112                                 dtrace_difo_t *dp = helper->dtha_actions[j];
16113
16114                                 ASSERT(dp != NULL);
16115                                 dp = dtrace_difo_duplicate(dp, vstate);
16116                                 new->dtha_actions[j] = dp;
16117                         }
16118 #else
16119                         for (j = 0; j < new->dtha_nactions; j++) {
16120                                 dtrace_difo_t *dpj = helper->dtha_actions[j];
16121
16122                                 ASSERT(dpj != NULL);
16123                                 dpj = dtrace_difo_duplicate(dpj, vstate);
16124                                 new->dtha_actions[j] = dpj;
16125                         }
16126 #endif /* __APPLE__ */
16127
16128                         if (last != NULL) {
16129                                 last->dtha_next = new;
16130                         } else {
16131                                 newhelp->dthps_actions[i] = new;
16132                         }
16133
16134                         last = new;
16135                 }
16136         }
16137
16138         /*
16139          * Duplicate the helper providers and register them with the
16140          * DTrace framework.
16141          */
16142         if (help->dthps_nprovs > 0) {
16143                 newhelp->dthps_nprovs = help->dthps_nprovs;
16144                 newhelp->dthps_maxprovs = help->dthps_nprovs;
16145                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16146                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16147                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16148                         newhelp->dthps_provs[i] = help->dthps_provs[i];
16149                         newhelp->dthps_provs[i]->dthp_ref++;
16150                 }
16151
16152                 hasprovs = 1;
16153         }
16154
16155         lck_mtx_unlock(&dtrace_lock);
16156
16157         if (hasprovs)
16158                 dtrace_helper_provider_register(to, newhelp, NULL);
16159 }
16160
16161 /*
16162  * DTrace Hook Functions
16163  */
16164
16165 #if defined(__APPLE__)
16166 /*
16167  * Routines to manipulate the modctl list within dtrace
16168  */
16169
/*
 * Head of DTrace's list of modctl structures, linked through mod_next.
 * The list routines below assert that mod_lock is held when they touch it.
 */
modctl_t *dtrace_modctl_list;
16171
16172 static void
16173 dtrace_modctl_add(struct modctl * newctl)
16174 {
16175         struct modctl *nextp, *prevp;
16176
16177         ASSERT(newctl != NULL);
16178         lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
16179
16180         // Insert new module at the front of the list,
16181
16182         newctl->mod_next = dtrace_modctl_list;
16183         dtrace_modctl_list = newctl;
16184
16185         /*
16186          * If a module exists with the same name, then that module
16187          * must have been unloaded with enabled probes. We will move
16188          * the unloaded module to the new module's stale chain and
16189          * then stop traversing the list.
16190          */
16191
16192         prevp = newctl;
16193         nextp = newctl->mod_next;
16194
16195         while (nextp != NULL) {
16196                 if (nextp->mod_loaded) {
16197                         /* This is a loaded module. Keep traversing. */
16198                         prevp = nextp;
16199                         nextp = nextp->mod_next;
16200                         continue;
16201                 }
16202                 else {
16203                         /* Found an unloaded module */
16204                         if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16205                                 /* Names don't match. Keep traversing. */
16206                                 prevp = nextp;
16207                                 nextp = nextp->mod_next;
16208                                 continue;
16209                         }
16210                         else {
16211                                 /* We found a stale entry, move it. We're done. */
16212                                 prevp->mod_next = nextp->mod_next;
16213                                 newctl->mod_stale = nextp;
16214                                 nextp->mod_next = NULL;
16215                                 break;
16216                         }
16217                 }
16218         }
16219 }
16220
16221 static modctl_t *
16222 dtrace_modctl_lookup(struct kmod_info * kmod)
16223 {
16224     lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
16225
16226     struct modctl * ctl;
16227
16228     for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16229         if (ctl->mod_id == kmod->id)
16230             return(ctl);
16231     }
16232     return (NULL);
16233 }
16234
16235 /*
16236  * This routine is called from dtrace_module_unloaded().
16237  * It removes a modctl structure and its stale chain
16238  * from the kext shadow list.
16239  */
16240 static void
16241 dtrace_modctl_remove(struct modctl * ctl)
16242 {
16243         ASSERT(ctl != NULL);
16244         lck_mtx_assert(&mod_lock, LCK_MTX_ASSERT_OWNED);
16245         modctl_t *prevp, *nextp, *curp;
16246
16247         // Remove stale chain first
16248         for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16249                 nextp = curp->mod_stale;
16250                 /* There should NEVER be user symbols allocated at this point */
16251                 ASSERT(curp->mod_user_symbols == NULL);
16252                 kmem_free(curp, sizeof(modctl_t));
16253         }
16254
16255         prevp = NULL;
16256         curp = dtrace_modctl_list;
16257
16258         while (curp != ctl) {
16259                 prevp = curp;
16260                 curp = curp->mod_next;
16261         }
16262
16263         if (prevp != NULL) {
16264                 prevp->mod_next = ctl->mod_next;
16265         }
16266         else {
16267                 dtrace_modctl_list = ctl->mod_next;
16268         }
16269
16270         /* There should NEVER be user symbols allocated at this point */
16271         ASSERT(ctl->mod_user_symbols == NULL);
16272
16273         kmem_free (ctl, sizeof(modctl_t));
16274 }
16275
16276 #endif /* __APPLE__ */
16277
16278 /*
16279  * APPLE NOTE: The kext loader will call dtrace_module_loaded
16280  * when the kext is loaded in memory, but before calling the
16281  * kext's start routine.
16282  *
16283  * Return 0 on success
16284  * Return -1 on failure
16285  */
16286
16287 #if !defined (__APPLE__)
16288 static void
16289 dtrace_module_loaded(struct modctl *ctl)
16290 #else
16291 static int
16292 dtrace_module_loaded(struct kmod_info *kmod)
16293 #endif /* __APPLE__ */
16294 {
16295         dtrace_provider_t *prv;
16296
16297 #if !defined(__APPLE__)
16298         mutex_enter(&dtrace_provider_lock);
16299         mutex_enter(&mod_lock);
16300
16301         ASSERT(ctl->mod_busy);
16302 #else
16303
16304         /*
16305          * If kernel symbols have been disabled, return immediately
16306          * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks
16307          */
16308         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16309                 return 0;
16310
16311         struct modctl *ctl = NULL;
16312         if (!kmod || kmod->address == 0 || kmod->size == 0)
16313                 return(-1);
16314
16315         lck_mtx_lock(&dtrace_provider_lock);
16316         lck_mtx_lock(&mod_lock);
16317
16318         /*
16319          * Have we seen this kext before?
16320          */
16321
16322         ctl = dtrace_modctl_lookup(kmod);
16323
16324         if (ctl != NULL) {
16325                 /* bail... we already have this kext in the modctl list */
16326                 lck_mtx_unlock(&mod_lock);
16327                 lck_mtx_unlock(&dtrace_provider_lock);
16328                 if (dtrace_err_verbose)
16329                         cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16330                 return(-1);
16331         }
16332         else {
16333                 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16334                 if (ctl == NULL) {
16335                         if (dtrace_err_verbose)
16336                                 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16337                         lck_mtx_unlock(&mod_lock);
16338                         lck_mtx_unlock(&dtrace_provider_lock);
16339                         return (-1);
16340                 }
16341                 ctl->mod_next = NULL;
16342                 ctl->mod_stale = NULL;
16343                 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16344                 ctl->mod_loadcnt = kmod->id;
16345                 ctl->mod_nenabled = 0;
16346                 ctl->mod_address  = kmod->address;
16347                 ctl->mod_size = kmod->size;
16348                 ctl->mod_id = kmod->id;
16349                 ctl->mod_loaded = 1;
16350                 ctl->mod_flags = 0;
16351                 ctl->mod_user_symbols = NULL;
16352
16353                 /*
16354                  * Find the UUID for this module, if it has one
16355                  */
16356                 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16357                 struct load_command* load_cmd = (struct load_command *)&header[1];
16358                 uint32_t i;
16359                 for (i = 0; i < header->ncmds; i++) {
16360                         if (load_cmd->cmd == LC_UUID) {
16361                                 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16362                                 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16363                                 ctl->mod_flags |= MODCTL_HAS_UUID;
16364                                 break;
16365                         }
16366                         load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16367                 }
16368
16369                 if (ctl->mod_address == g_kernel_kmod_info.address) {
16370                         ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16371                 }
16372         }
16373         dtrace_modctl_add(ctl);
16374
16375         /*
16376          * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
16377          */
16378         lck_mtx_lock(&dtrace_lock);
16379
16380         /*
16381          * If the module does not have a valid UUID, we will not be able to find symbols for it from
16382          * userspace. Go ahead and instrument it now.
16383          */
16384         if (MOD_HAS_UUID(ctl) && (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE)) {
16385                 lck_mtx_unlock(&dtrace_lock);
16386                 lck_mtx_unlock(&mod_lock);
16387                 lck_mtx_unlock(&dtrace_provider_lock);
16388                 return 0;
16389         }
16390
16391         ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16392
16393         lck_mtx_unlock(&dtrace_lock);
16394 #endif /* __APPLE__ */
16395
16396         /*
16397          * We're going to call each providers per-module provide operation
16398          * specifying only this module.
16399          */
16400         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16401                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16402
16403 #if defined(__APPLE__)
16404         /*
16405          * The contract with the kext loader is that once this function has completed,
16406          * it may delete kernel symbols at will. We must set this while still holding
16407          * the mod_lock.
16408          */
16409         ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16410 #endif
16411
16412         lck_mtx_unlock(&mod_lock);
16413         lck_mtx_unlock(&dtrace_provider_lock);
16414
16415         /*
16416          * If we have any retained enablings, we need to match against them.
16417          * Enabling probes requires that cpu_lock be held, and we cannot hold
16418          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16419          * module.  (In particular, this happens when loading scheduling
16420          * classes.)  So if we have any retained enablings, we need to dispatch
16421          * our task queue to do the match for us.
16422          */
16423         lck_mtx_lock(&dtrace_lock);
16424
16425         if (dtrace_retained == NULL) {
16426                 lck_mtx_unlock(&dtrace_lock);
16427 #if !defined(__APPLE__)
16428                 return;
16429 #else
16430                 return 0;
16431 #endif
16432         }
16433
16434 #if !defined(__APPLE__)
16435         (void) taskq_dispatch(dtrace_taskq,
16436                               (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16437
16438         mutex_exit(&dtrace_lock);
16439
16440         /*
16441          * And now, for a little heuristic sleaze:  in general, we want to
16442          * match modules as soon as they load.  However, we cannot guarantee
16443          * this, because it would lead us to the lock ordering violation
16444          * outlined above.  The common case, of course, is that cpu_lock is
16445          * _not_ held -- so we delay here for a clock tick, hoping that that's
16446          * long enough for the task queue to do its work.  If it's not, it's
16447          * not a serious problem -- it just means that the module that we
16448          * just loaded may not be immediately instrumentable.
16449          */
16450         delay(1);
16451 #else
16452         /* APPLE NOTE!
16453          *
16454          * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually
16455          * holds it for any reason. Thus the comment above is invalid, we can directly invoke
16456          * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16457          * the delay call as well.
16458          */
16459         lck_mtx_unlock(&dtrace_lock);
16460
16461         dtrace_enabling_matchall();
16462
16463         return 0;
16464 #endif /* __APPLE__ */
16465 }
16466
16467 #if !defined(__APPLE__)
16468 static void
16469 dtrace_module_unloaded(struct modctl *ctl)
16470 {
16471         dtrace_probe_t template, *probe, *first, *next;
16472         dtrace_provider_t *prov;
16473
16474         template.dtpr_mod = ctl->mod_modname;
16475
16476         mutex_enter(&dtrace_provider_lock);
16477         mutex_enter(&mod_lock);
16478         mutex_enter(&dtrace_lock);
16479
16480         if (dtrace_bymod == NULL) {
16481                 /*
16482                  * The DTrace module is loaded (obviously) but not attached;
16483                  * we don't have any work to do.
16484                  */
16485                 mutex_exit(&dtrace_provider_lock);
16486                 mutex_exit(&mod_lock);
16487                 mutex_exit(&dtrace_lock);
16488                 return;
16489         }
16490
16491         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16492             probe != NULL; probe = probe->dtpr_nextmod) {
16493                 if (probe->dtpr_ecb != NULL) {
16494                         mutex_exit(&dtrace_provider_lock);
16495                         mutex_exit(&mod_lock);
16496                         mutex_exit(&dtrace_lock);
16497
16498                         /*
16499                          * This shouldn't _actually_ be possible -- we're
16500                          * unloading a module that has an enabled probe in it.
16501                          * (It's normally up to the provider to make sure that
16502                          * this can't happen.)  However, because dtps_enable()
16503                          * doesn't have a failure mode, there can be an
16504                          * enable/unload race.  Upshot:  we don't want to
16505                          * assert, but we're not going to disable the
16506                          * probe, either.
16507                          */
16508                         if (dtrace_err_verbose) {
16509                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16510                                     "enabled probes", ctl->mod_modname);
16511                         }
16512
16513                         return;
16514                 }
16515         }
16516
16517         probe = first;
16518
16519         for (first = NULL; probe != NULL; probe = next) {
16520                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16521
16522                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16523
16524                 next = probe->dtpr_nextmod;
16525                 dtrace_hash_remove(dtrace_bymod, probe);
16526                 dtrace_hash_remove(dtrace_byfunc, probe);
16527                 dtrace_hash_remove(dtrace_byname, probe);
16528
16529                 if (first == NULL) {
16530                         first = probe;
16531                         probe->dtpr_nextmod = NULL;
16532                 } else {
16533                         probe->dtpr_nextmod = first;
16534                         first = probe;
16535                 }
16536         }
16537
16538         /*
16539          * We've removed all of the module's probes from the hash chains and
16540          * from the probe array.  Now issue a dtrace_sync() to be sure that
16541          * everyone has cleared out from any probe array processing.
16542          */
16543         dtrace_sync();
16544
16545         for (probe = first; probe != NULL; probe = first) {
16546                 first = probe->dtpr_nextmod;
16547                 prov = probe->dtpr_provider;
16548                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16549                     probe->dtpr_arg);
16550                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16551                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16552                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16553                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16554                 kmem_free(probe, sizeof (dtrace_probe_t));
16555         }
16556
16557         mutex_exit(&dtrace_lock);
16558         mutex_exit(&mod_lock);
16559         mutex_exit(&dtrace_provider_lock);
16560 }
16561 #else  /* __APPLE__ */
16562
16563 /*
16564  * Return 0 on success
16565  * Return -1 on failure
16566  */
16567 static int
16568 dtrace_module_unloaded(struct kmod_info *kmod)
16569 {
16570         dtrace_probe_t template, *probe, *first, *next;
16571         dtrace_provider_t *prov;
16572         struct modctl *ctl = NULL;
16573         struct modctl *syncctl = NULL;
16574         struct modctl *nextsyncctl = NULL;
16575         int syncmode = 0;
16576
16577         lck_mtx_lock(&dtrace_provider_lock);
16578         lck_mtx_lock(&mod_lock);
16579         lck_mtx_lock(&dtrace_lock);
16580
16581         if (kmod == NULL) {
16582             syncmode = 1;
16583         }
16584         else {
16585             ctl = dtrace_modctl_lookup(kmod);
16586             if (ctl == NULL)
16587             {
16588                 lck_mtx_unlock(&dtrace_lock);
16589                 lck_mtx_unlock(&mod_lock);
16590                 lck_mtx_unlock(&dtrace_provider_lock);
16591                 return (-1);
16592             }
16593             ctl->mod_loaded = 0;
16594             ctl->mod_address = 0;
16595             ctl->mod_size = 0;
16596         }
16597
16598         if (dtrace_bymod == NULL) {
16599                 /*
16600                  * The DTrace module is loaded (obviously) but not attached;
16601                  * we don't have any work to do.
16602                  */
16603                  if (ctl != NULL)
16604                          (void)dtrace_modctl_remove(ctl);
16605                  lck_mtx_unlock(&dtrace_provider_lock);
16606                  lck_mtx_unlock(&mod_lock);
16607                  lck_mtx_unlock(&dtrace_lock);
16608                  return(0);
16609         }
16610
16611         /* Syncmode set means we target and traverse entire modctl list. */
16612         if (syncmode)
16613             nextsyncctl = dtrace_modctl_list;
16614
16615 syncloop:
16616         if (syncmode)
16617         {
16618             /* find a stale modctl struct */
16619             for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16620                 if (syncctl->mod_address == 0)
16621                     break;
16622             }
16623             if (syncctl==NULL)
16624             {
16625                 /* We have no more work to do */
16626                 lck_mtx_unlock(&dtrace_provider_lock);
16627                 lck_mtx_unlock(&mod_lock);
16628                 lck_mtx_unlock(&dtrace_lock);
16629                 return(0);
16630             }
16631             else {
16632                 /* keep track of next syncctl in case this one is removed */
16633                 nextsyncctl = syncctl->mod_next;
16634                 ctl = syncctl;
16635             }
16636         }
16637
16638         template.dtpr_mod = ctl->mod_modname;
16639
16640         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16641             probe != NULL; probe = probe->dtpr_nextmod) {
16642                 if (probe->dtpr_ecb != NULL) {
16643                         /*
16644                          * This shouldn't _actually_ be possible -- we're
16645                          * unloading a module that has an enabled probe in it.
16646                          * (It's normally up to the provider to make sure that
16647                          * this can't happen.)  However, because dtps_enable()
16648                          * doesn't have a failure mode, there can be an
16649                          * enable/unload race.  Upshot:  we don't want to
16650                          * assert, but we're not going to disable the
16651                          * probe, either.
16652                          */
16653
16654
16655                         if (syncmode) {
16656                             /* We're syncing, let's look at next in list */
16657                             goto syncloop;
16658                         }
16659
16660                         lck_mtx_unlock(&dtrace_provider_lock);
16661                         lck_mtx_unlock(&mod_lock);
16662                         lck_mtx_unlock(&dtrace_lock);
16663
16664                         if (dtrace_err_verbose) {
16665                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16666                                     "enabled probes", ctl->mod_modname);
16667                         }
16668                         return(-1);
16669                 }
16670         }
16671
16672         probe = first;
16673
16674         for (first = NULL; probe != NULL; probe = next) {
16675                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16676
16677                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16678
16679                 next = probe->dtpr_nextmod;
16680                 dtrace_hash_remove(dtrace_bymod, probe);
16681                 dtrace_hash_remove(dtrace_byfunc, probe);
16682                 dtrace_hash_remove(dtrace_byname, probe);
16683
16684                 if (first == NULL) {
16685                         first = probe;
16686                         probe->dtpr_nextmod = NULL;
16687                 } else {
16688                         probe->dtpr_nextmod = first;
16689                         first = probe;
16690                 }
16691         }
16692
16693         /*
16694          * We've removed all of the module's probes from the hash chains and
16695          * from the probe array.  Now issue a dtrace_sync() to be sure that
16696          * everyone has cleared out from any probe array processing.
16697          */
16698         dtrace_sync();
16699
16700         for (probe = first; probe != NULL; probe = first) {
16701                 first = probe->dtpr_nextmod;
16702                 prov = probe->dtpr_provider;
16703                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16704                     probe->dtpr_arg);
16705                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16706                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16707                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16708                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16709
16710                 zfree(dtrace_probe_t_zone, probe);
16711         }
16712
16713         dtrace_modctl_remove(ctl);
16714
16715         if (syncmode)
16716             goto syncloop;
16717
16718         lck_mtx_unlock(&dtrace_lock);
16719         lck_mtx_unlock(&mod_lock);
16720         lck_mtx_unlock(&dtrace_provider_lock);
16721
16722         return(0);
16723 }
16724 #endif /* __APPLE__ */
16725
16726 void
16727 dtrace_suspend(void)
16728 {
16729         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16730 }
16731
16732 void
16733 dtrace_resume(void)
16734 {
16735         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16736 }
16737
16738 static int
16739 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16740 {
16741         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16742         lck_mtx_lock(&dtrace_lock);
16743
16744         switch (what) {
16745         case CPU_CONFIG: {
16746                 dtrace_state_t *state;
16747                 dtrace_optval_t *opt, rs, c;
16748
16749                 /*
16750                  * For now, we only allocate a new buffer for anonymous state.
16751                  */
16752                 if ((state = dtrace_anon.dta_state) == NULL)
16753                         break;
16754
16755                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16756                         break;
16757
16758                 opt = state->dts_options;
16759                 c = opt[DTRACEOPT_CPU];
16760
16761                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16762                         break;
16763
16764                 /*
16765                  * Regardless of what the actual policy is, we're going to
16766                  * temporarily set our resize policy to be manual.  We're
16767                  * also going to temporarily set our CPU option to denote
16768                  * the newly configured CPU.
16769                  */
16770                 rs = opt[DTRACEOPT_BUFRESIZE];
16771                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16772                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16773
16774                 (void) dtrace_state_buffers(state);
16775
16776                 opt[DTRACEOPT_BUFRESIZE] = rs;
16777                 opt[DTRACEOPT_CPU] = c;
16778
16779                 break;
16780         }
16781
16782         case CPU_UNCONFIG:
16783                 /*
16784                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
16785                  * buffer will be freed when the consumer exits.)
16786                  */
16787                 break;
16788
16789         default:
16790                 break;
16791         }
16792
16793         lck_mtx_unlock(&dtrace_lock);
16794         return (0);
16795 }
16796
16797 static void
16798 dtrace_cpu_setup_initial(processorid_t cpu)
16799 {
16800         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16801 }
16802
16803 static void
16804 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16805 {
16806         if (dtrace_toxranges >= dtrace_toxranges_max) {
16807                 int osize, nsize;
16808                 dtrace_toxrange_t *range;
16809
16810                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16811
16812                 if (osize == 0) {
16813                         ASSERT(dtrace_toxrange == NULL);
16814                         ASSERT(dtrace_toxranges_max == 0);
16815                         dtrace_toxranges_max = 1;
16816                 } else {
16817                         dtrace_toxranges_max <<= 1;
16818                 }
16819
16820                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16821                 range = kmem_zalloc(nsize, KM_SLEEP);
16822
16823                 if (dtrace_toxrange != NULL) {
16824                         ASSERT(osize != 0);
16825                         bcopy(dtrace_toxrange, range, osize);
16826                         kmem_free(dtrace_toxrange, osize);
16827                 }
16828
16829                 dtrace_toxrange = range;
16830         }
16831
16832         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == NULL);
16833         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == NULL);
16834
16835         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16836         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16837         dtrace_toxranges++;
16838 }
16839
16840 /*
16841  * DTrace Driver Cookbook Functions
16842  */
16843 /*ARGSUSED*/
16844 static int
16845 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16846 {
16847 #pragma unused(cmd) /* __APPLE__ */
16848         dtrace_provider_id_t id;
16849         dtrace_state_t *state = NULL;
16850         dtrace_enabling_t *enab;
16851
16852         lck_mtx_lock(&cpu_lock);
16853         lck_mtx_lock(&dtrace_provider_lock);
16854         lck_mtx_lock(&dtrace_lock);
16855
16856         if (ddi_soft_state_init(&dtrace_softstate,
16857             sizeof (dtrace_state_t), 0) != 0) {
16858                 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16859                 lck_mtx_unlock(&cpu_lock);
16860                 lck_mtx_unlock(&dtrace_provider_lock);
16861                 lck_mtx_unlock(&dtrace_lock);
16862                 return (DDI_FAILURE);
16863         }
16864
16865 #if !defined(__APPLE__)
16866         if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16867             DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16868             ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16869             DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16870                 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16871                 ddi_remove_minor_node(devi, NULL);
16872                 ddi_soft_state_fini(&dtrace_softstate);
16873                 lck_mtx_unlock(&cpu_lock);
16874                 lck_mtx_unlock(&dtrace_provider_lock);
16875                 lck_mtx_unlock(&dtrace_lock);
16876                 return (DDI_FAILURE);
16877         }
16878 #else
16879         /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */
16880 #endif /* __APPLE__ */
16881
16882         ddi_report_dev(devi);
16883         dtrace_devi = devi;
16884
16885         dtrace_modload = dtrace_module_loaded;
16886         dtrace_modunload = dtrace_module_unloaded;
16887         dtrace_cpu_init = dtrace_cpu_setup_initial;
16888         dtrace_helpers_cleanup = dtrace_helpers_destroy;
16889         dtrace_helpers_fork = dtrace_helpers_duplicate;
16890         dtrace_cpustart_init = dtrace_suspend;
16891         dtrace_cpustart_fini = dtrace_resume;
16892         dtrace_debugger_init = dtrace_suspend;
16893         dtrace_debugger_fini = dtrace_resume;
16894
16895         register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16896
16897         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16898
16899         dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16900             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16901         dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16902             UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16903             VM_SLEEP | VMC_IDENTIFIER);
16904         dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16905             1, INT_MAX, 0);
16906
16907         dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16908             sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
16909             NULL, NULL, NULL, NULL, NULL, 0);
16910
16911         lck_mtx_assert(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16912         dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16913             offsetof(dtrace_probe_t, dtpr_nextmod),
16914             offsetof(dtrace_probe_t, dtpr_prevmod));
16915
16916         dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16917             offsetof(dtrace_probe_t, dtpr_nextfunc),
16918             offsetof(dtrace_probe_t, dtpr_prevfunc));
16919
16920         dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16921             offsetof(dtrace_probe_t, dtpr_nextname),
16922             offsetof(dtrace_probe_t, dtpr_prevname));
16923
16924         if (dtrace_retain_max < 1) {
16925                 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16926                     "setting to 1", dtrace_retain_max);
16927                 dtrace_retain_max = 1;
16928         }
16929
16930         /*
16931          * Now discover our toxic ranges.
16932          */
16933         dtrace_toxic_ranges(dtrace_toxrange_add);
16934
16935         /*
16936          * Before we register ourselves as a provider to our own framework,
16937          * we would like to assert that dtrace_provider is NULL -- but that's
16938          * not true if we were loaded as a dependency of a DTrace provider.
16939          * Once we've registered, we can assert that dtrace_provider is our
16940          * pseudo provider.
16941          */
16942         (void) dtrace_register("dtrace", &dtrace_provider_attr,
16943             DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16944
16945         ASSERT(dtrace_provider != NULL);
16946         ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16947
16948 #if !defined(__APPLE__)
16949         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16950             dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16951         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16952             dtrace_provider, NULL, NULL, "END", 0, NULL);
16953         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16954             dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16955 #elif (defined(__i386__) || defined (__x86_64__))
16956         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16957             dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
16958         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16959             dtrace_provider, NULL, NULL, "END", 0, NULL);
16960         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16961             dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
16962 #else
16963 #error Unknown Architecture
16964 #endif /* __APPLE__ */
16965
16966         dtrace_anon_property();
16967         lck_mtx_unlock(&cpu_lock);
16968
16969         /*
16970          * If DTrace helper tracing is enabled, we need to allocate the
16971          * trace buffer and initialize the values.
16972          */
16973         if (dtrace_helptrace_enabled) {
16974                 ASSERT(dtrace_helptrace_buffer == NULL);
16975                 dtrace_helptrace_buffer =
16976                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16977                 dtrace_helptrace_next = 0;
16978         }
16979
16980         /*
16981          * If there are already providers, we must ask them to provide their
16982          * probes, and then match any anonymous enabling against them.  Note
16983          * that there should be no other retained enablings at this time:
16984          * the only retained enablings at this time should be the anonymous
16985          * enabling.
16986          */
16987         if (dtrace_anon.dta_enabling != NULL) {
16988                 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16989
16990 #if defined(__APPLE__)
16991                 /*
16992                  * If there is anonymous dof, we should switch symbol modes.
16993                  */
16994                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16995                         dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16996                 }
16997 #endif
16998
16999                 dtrace_enabling_provide(NULL);
17000                 state = dtrace_anon.dta_state;
17001
17002                 /*
17003                  * We couldn't hold cpu_lock across the above call to
17004                  * dtrace_enabling_provide(), but we must hold it to actually
17005                  * enable the probes.  We have to drop all of our locks, pick
17006                  * up cpu_lock, and regain our locks before matching the
17007                  * retained anonymous enabling.
17008                  */
17009                 lck_mtx_unlock(&dtrace_lock);
17010                 lck_mtx_unlock(&dtrace_provider_lock);
17011
17012                 lck_mtx_lock(&cpu_lock);
17013                 lck_mtx_lock(&dtrace_provider_lock);
17014                 lck_mtx_lock(&dtrace_lock);
17015
17016                 if ((enab = dtrace_anon.dta_enabling) != NULL)
17017                         (void) dtrace_enabling_match(enab, NULL);
17018
17019                 lck_mtx_unlock(&cpu_lock);
17020         }
17021
17022         lck_mtx_unlock(&dtrace_lock);
17023         lck_mtx_unlock(&dtrace_provider_lock);
17024
17025         if (state != NULL) {
17026                 /*
17027                  * If we created any anonymous state, set it going now.
17028                  */
17029                 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17030         }
17031
17032         return (DDI_SUCCESS);
17033 }
17034
17035 /*ARGSUSED*/
17036 static int
17037 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17038 {
17039 #pragma unused(flag, otyp)
17040         dtrace_state_t *state;
17041         uint32_t priv;
17042         uid_t uid;
17043         zoneid_t zoneid;
17044 #if defined (__APPLE__)
17045         int rv;
17046 #endif /* __APPLE__ */
17047
17048 #if !defined(__APPLE__)
17049         if (getminor(*devp) == DTRACEMNRN_HELPER)
17050                 return (0);
17051
17052         /*
17053          * If this wasn't an open with the "helper" minor, then it must be
17054          * the "dtrace" minor.
17055          */
17056         if (getminor(*devp) != DTRACEMNRN_DTRACE)
17057                 return (ENXIO);
17058 #else
17059         /* Darwin puts Helper on its own major device. */
17060 #endif /* __APPLE__ */
17061
17062         /*
17063          * If no DTRACE_PRIV_* bits are set in the credential, then the
17064          * caller lacks sufficient permission to do anything with DTrace.
17065          */
17066         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17067         if (priv == DTRACE_PRIV_NONE)
17068                 return (EACCES);
17069
17070 #if defined(__APPLE__)
17071         /*
17072          * We delay the initialization of fasttrap as late as possible.
17073          * It certainly can't be later than now!
17074          */
17075         fasttrap_init();
17076 #endif /* __APPLE__ */
17077
17078         /*
17079          * Ask all providers to provide all their probes.
17080          */
17081         lck_mtx_lock(&dtrace_provider_lock);
17082         dtrace_probe_provide(NULL, NULL);
17083         lck_mtx_unlock(&dtrace_provider_lock);
17084
17085         lck_mtx_lock(&cpu_lock);
17086         lck_mtx_lock(&dtrace_lock);
17087         dtrace_opens++;
17088         dtrace_membar_producer();
17089
17090         /*
17091          * If the kernel debugger is active (that is, if the kernel debugger
17092          * modified text in some way), we won't allow the open.
17093          */
17094         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17095                 dtrace_opens--;
17096                 lck_mtx_unlock(&cpu_lock);
17097                 lck_mtx_unlock(&dtrace_lock);
17098                 return (EBUSY);
17099         }
17100
17101 #if !defined(__APPLE__)
17102         state = dtrace_state_create(devp, cred_p);
17103         lck_mtx_unlock(&cpu_lock);
17104
17105         if (state == NULL) {
17106                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17107                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17108                 lck_mtx_unlock(&dtrace_lock);
17109                 return (EAGAIN);
17110         }
17111
17112         lck_mtx_unlock(&dtrace_lock);
17113 #else
17114         rv = dtrace_state_create(devp, cred_p, &state);
17115         lck_mtx_unlock(&cpu_lock);
17116
17117         if (rv != 0 || state == NULL) {
17118                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17119                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17120                 lck_mtx_unlock(&dtrace_lock);
17121                 /* propagate EAGAIN or ERESTART */
17122                 return (rv);
17123         }
17124
17125         lck_mtx_unlock(&dtrace_lock);
17126
17127         lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17128
17129         /*
17130          * If we are currently lazy, transition states.
17131          *
17132          * Unlike dtrace_close, we do not need to check the
17133          * value of dtrace_opens, as any positive value (and
17134          * we count as 1) means we transition states.
17135          */
17136         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17137                 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17138
17139                 /*
17140                  * Iterate all existing processes and load lazy dofs.
17141                  */
17142                 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17143                              dtrace_lazy_dofs_proc_iterate_doit,
17144                              NULL,
17145                              dtrace_lazy_dofs_proc_iterate_filter,
17146                              NULL);
17147         }
17148
17149         lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17150
17151         /*
17152          * Update kernel symbol state.
17153          *
17154          * We must own the provider and dtrace locks.
17155          *
17156          * NOTE! It may appear there is a race by setting this value so late
17157          * after dtrace_probe_provide. However, any kext loaded after the
17158          * call to probe provide and before we set LAZY_OFF will be marked as
17159          * eligible for symbols from userspace. The same dtrace that is currently
17160          * calling dtrace_open() (this call!) will get a list of kexts needing
17161          * symbols and fill them in, thus closing the race window.
17162          *
17163          * We want to set this value only after it certain it will succeed, as
17164          * this significantly reduces the complexity of error exits.
17165          */
17166         lck_mtx_lock(&dtrace_lock);
17167         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17168                 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17169         }
17170         lck_mtx_unlock(&dtrace_lock);
17171 #endif /* __APPLE__ */
17172
17173         return (0);
17174 }
17175
17176 /*ARGSUSED*/
17177 static int
17178 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17179 {
17180 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17181         minor_t minor = getminor(dev);
17182         dtrace_state_t *state;
17183
17184 #if !defined(__APPLE__)
17185         if (minor == DTRACEMNRN_HELPER)
17186                 return (0);
17187 #else
17188         /* Darwin puts Helper on its own major device. */
17189 #endif /* __APPLE__ */
17190
17191         state = ddi_get_soft_state(dtrace_softstate, minor);
17192
17193         lck_mtx_lock(&cpu_lock);
17194         lck_mtx_lock(&dtrace_lock);
17195
17196         if (state->dts_anon) {
17197                 /*
17198                  * There is anonymous state. Destroy that first.
17199                  */
17200                 ASSERT(dtrace_anon.dta_state == NULL);
17201                 dtrace_state_destroy(state->dts_anon);
17202         }
17203
17204         dtrace_state_destroy(state);
17205         ASSERT(dtrace_opens > 0);
17206
17207         /*
17208          * Only relinquish control of the kernel debugger interface when there
17209          * are no consumers and no anonymous enablings.
17210          */
17211         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
17212                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17213
17214         lck_mtx_unlock(&dtrace_lock);
17215         lck_mtx_unlock(&cpu_lock);
17216
17217 #if defined(__APPLE__)
17218         /*
17219          * Lock ordering requires the dof mode lock be taken before
17220          * the dtrace_lock.
17221          */
17222         lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17223         lck_mtx_lock(&dtrace_lock);
17224
17225         if (dtrace_opens == 0) {
17226                 /*
17227                  * If we are currently lazy-off, and this is the last close, transition to
17228                  * lazy state.
17229                  */
17230                 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17231                         dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17232                 }
17233
17234                 /*
17235                  * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17236                  */
17237                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17238                         dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17239                 }
17240         }
17241
17242         lck_mtx_unlock(&dtrace_lock);
17243         lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17244
17245         /*
17246          * Kext probes may be retained past the end of the kext's lifespan. The
17247          * probes are kept until the last reference to them has been removed.
17248          * Since closing an active dtrace context is likely to drop that last reference,
17249          * lets take a shot at cleaning out the orphaned probes now.
17250          */
17251         dtrace_module_unloaded(NULL);
17252 #endif /* __APPLE__ */
17253
17254         return (0);
17255 }
17256
17257 #if !defined(__APPLE__)
17258 /*ARGSUSED*/
17259 static int
17260 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
17261 {
17262         int rval;
17263         dof_helper_t help, *dhp = NULL;
17264
17265         switch (cmd) {
17266         case DTRACEHIOC_ADDDOF:
17267                 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
17268                         dtrace_dof_error(NULL, "failed to copyin DOF helper");
17269                         return (EFAULT);
17270                 }
17271
17272                 dhp = &help;
17273                 arg = (intptr_t)help.dofhp_dof;
17274                 /*FALLTHROUGH*/
17275
17276         case DTRACEHIOC_ADD: {
17277                 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
17278
17279                 if (dof == NULL)
17280                         return (rval);
17281
17282                 mutex_enter(&dtrace_lock);
17283
17284                 /*
17285                  * dtrace_helper_slurp() takes responsibility for the dof --
17286                  * it may free it now or it may save it and free it later.
17287                  */
17288                 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
17289                         *rv = rval;
17290                         rval = 0;
17291                 } else {
17292                         rval = EINVAL;
17293                 }
17294
17295                 mutex_exit(&dtrace_lock);
17296                 return (rval);
17297         }
17298
17299         case DTRACEHIOC_REMOVE: {
17300                 mutex_enter(&dtrace_lock);
17301                 rval = dtrace_helper_destroygen(arg);
17302                 mutex_exit(&dtrace_lock);
17303
17304                 return (rval);
17305         }
17306
17307         default:
17308                 break;
17309         }
17310
17311         return (ENOTTY);
17312 }
17313
17314 /*ARGSUSED*/
17315 static int
17316 dtrace_ioctl(dev_t dev, u_long cmd, intptr_t arg, int md, cred_t *cr, int *rv)
17317 {
17318         minor_t minor = getminor(dev);
17319         dtrace_state_t *state;
17320         int rval;
17321
17322         if (minor == DTRACEMNRN_HELPER)
17323                 return (dtrace_ioctl_helper(cmd, arg, rv));
17324
17325         state = ddi_get_soft_state(dtrace_softstate, minor);
17326
17327         if (state->dts_anon) {
17328                 ASSERT(dtrace_anon.dta_state == NULL);
17329                 state = state->dts_anon;
17330         }
17331
17332         switch (cmd) {
17333         case DTRACEIOC_PROVIDER: {
17334                 dtrace_providerdesc_t pvd;
17335                 dtrace_provider_t *pvp;
17336
17337                 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
17338                         return (EFAULT);
17339
17340                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17341                 lck_mtx_lock(&dtrace_provider_lock);
17342
17343                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17344                         if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
17345                                 break;
17346                 }
17347
17348                 lck_mtx_unlock(&dtrace_provider_lock);
17349
17350                 if (pvp == NULL)
17351                         return (ESRCH);
17352
17353                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17354                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17355                 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
17356                         return (EFAULT);
17357
17358                 return (0);
17359         }
17360
17361         case DTRACEIOC_EPROBE: {
17362                 dtrace_eprobedesc_t epdesc;
17363                 dtrace_ecb_t *ecb;
17364                 dtrace_action_t *act;
17365                 void *buf;
17366                 size_t size;
17367                 uintptr_t dest;
17368                 int nrecs;
17369
17370                 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
17371                         return (EFAULT);
17372
17373                 lck_mtx_lock(&dtrace_lock);
17374
17375                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17376                         lck_mtx_unlock(&dtrace_lock);
17377                         return (EINVAL);
17378                 }
17379
17380                 if (ecb->dte_probe == NULL) {
17381                         lck_mtx_unlock(&dtrace_lock);
17382                         return (EINVAL);
17383                 }
17384
17385                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17386                 epdesc.dtepd_uarg = ecb->dte_uarg;
17387                 epdesc.dtepd_size = ecb->dte_size;
17388
17389                 nrecs = epdesc.dtepd_nrecs;
17390                 epdesc.dtepd_nrecs = 0;
17391                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17392                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17393                                 continue;
17394
17395                         epdesc.dtepd_nrecs++;
17396                 }
17397
17398                 /*
17399                  * Now that we have the size, we need to allocate a temporary
17400                  * buffer in which to store the complete description.  We need
17401                  * the temporary buffer to be able to drop dtrace_lock()
17402                  * across the copyout(), below.
17403                  */
17404                 size = sizeof (dtrace_eprobedesc_t) +
17405                     (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17406
17407                 buf = kmem_alloc(size, KM_SLEEP);
17408                 dest = (uintptr_t)buf;
17409
17410                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17411                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17412
17413                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17414                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17415                                 continue;
17416
17417                         if (nrecs-- == 0)
17418                                 break;
17419
17420                         bcopy(&act->dta_rec, (void *)dest,
17421                             sizeof (dtrace_recdesc_t));
17422                         dest += sizeof (dtrace_recdesc_t);
17423                 }
17424
17425                 lck_mtx_unlock(&dtrace_lock);
17426
17427                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17428                         kmem_free(buf, size);
17429                         return (EFAULT);
17430                 }
17431
17432                 kmem_free(buf, size);
17433                 return (0);
17434         }
17435
17436         case DTRACEIOC_AGGDESC: {
17437                 dtrace_aggdesc_t aggdesc;
17438                 dtrace_action_t *act;
17439                 dtrace_aggregation_t *agg;
17440                 int nrecs;
17441                 uint32_t offs;
17442                 dtrace_recdesc_t *lrec;
17443                 void *buf;
17444                 size_t size;
17445                 uintptr_t dest;
17446
17447                 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17448                         return (EFAULT);
17449
17450                 lck_mtx_lock(&dtrace_lock);
17451
17452                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17453                         lck_mtx_unlock(&dtrace_lock);
17454                         return (EINVAL);
17455                 }
17456
17457                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17458
17459                 nrecs = aggdesc.dtagd_nrecs;
17460                 aggdesc.dtagd_nrecs = 0;
17461
17462                 offs = agg->dtag_base;
17463                 lrec = &agg->dtag_action.dta_rec;
17464                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17465
17466                 for (act = agg->dtag_first; ; act = act->dta_next) {
17467                         ASSERT(act->dta_intuple ||
17468                             DTRACEACT_ISAGG(act->dta_kind));
17469
17470                         /*
17471                          * If this action has a record size of zero, it
17472                          * denotes an argument to the aggregating action.
17473                          * Because the presence of this record doesn't (or
17474                          * shouldn't) affect the way the data is interpreted,
17475                          * we don't copy it out to save user-level the
17476                          * confusion of dealing with a zero-length record.
17477                          */
17478                         if (act->dta_rec.dtrd_size == 0) {
17479                                 ASSERT(agg->dtag_hasarg);
17480                                 continue;
17481                         }
17482
17483                         aggdesc.dtagd_nrecs++;
17484
17485                         if (act == &agg->dtag_action)
17486                                 break;
17487                 }
17488
17489                 /*
17490                  * Now that we have the size, we need to allocate a temporary
17491                  * buffer in which to store the complete description.  We need
17492                  * the temporary buffer to be able to drop dtrace_lock()
17493                  * across the copyout(), below.
17494                  */
17495                 size = sizeof (dtrace_aggdesc_t) +
17496                     (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17497
17498                 buf = kmem_alloc(size, KM_SLEEP);
17499                 dest = (uintptr_t)buf;
17500
17501                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17502                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17503
17504                 for (act = agg->dtag_first; ; act = act->dta_next) {
17505                         dtrace_recdesc_t rec = act->dta_rec;
17506
17507                         /*
17508                          * See the comment in the above loop for why we pass
17509                          * over zero-length records.
17510                          */
17511                         if (rec.dtrd_size == 0) {
17512                                 ASSERT(agg->dtag_hasarg);
17513                                 continue;
17514                         }
17515
17516                         if (nrecs-- == 0)
17517                                 break;
17518
17519                         rec.dtrd_offset -= offs;
17520                         bcopy(&rec, (void *)dest, sizeof (rec));
17521                         dest += sizeof (dtrace_recdesc_t);
17522
17523                         if (act == &agg->dtag_action)
17524                                 break;
17525                 }
17526
17527                 lck_mtx_unlock(&dtrace_lock);
17528
17529                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17530                         kmem_free(buf, size);
17531                         return (EFAULT);
17532                 }
17533
17534                 kmem_free(buf, size);
17535                 return (0);
17536         }
17537
17538         case DTRACEIOC_ENABLE: {
17539                 dof_hdr_t *dof;
17540                 dtrace_enabling_t *enab = NULL;
17541                 dtrace_vstate_t *vstate;
17542                 int err = 0;
17543
17544                 *rv = 0;
17545
17546                 /*
17547                  * If a NULL argument has been passed, we take this as our
17548                  * cue to reevaluate our enablings.
17549                  */
17550                 if (arg == NULL) {
17551                         dtrace_enabling_matchall();
17552
17553                         return (0);
17554                 }
17555
17556                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17557                         return (rval);
17558
17559                 lck_mtx_lock(&cpu_lock);
17560                 lck_mtx_lock(&dtrace_lock);
17561                 vstate = &state->dts_vstate;
17562
17563                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17564                         lck_mtx_unlock(&dtrace_lock);
17565                         lck_mtx_unlock(&cpu_lock);
17566                         dtrace_dof_destroy(dof);
17567                         return (EBUSY);
17568                 }
17569
17570                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17571                         lck_mtx_unlock(&dtrace_lock);
17572                         lck_mtx_unlock(&cpu_lock);
17573                         dtrace_dof_destroy(dof);
17574                         return (EINVAL);
17575                 }
17576
17577                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17578                         dtrace_enabling_destroy(enab);
17579                         lck_mtx_unlock(&dtrace_lock);
17580                         lck_mtx_unlock(&cpu_lock);
17581                         dtrace_dof_destroy(dof);
17582                         return (rval);
17583                 }
17584
17585                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17586                         err = dtrace_enabling_retain(enab);
17587                 } else {
17588                         dtrace_enabling_destroy(enab);
17589                 }
17590
17591                 lck_mtx_unlock(&cpu_lock);
17592                 lck_mtx_unlock(&dtrace_lock);
17593                 dtrace_dof_destroy(dof);
17594
17595                 return (err);
17596         }
17597
17598         case DTRACEIOC_REPLICATE: {
17599                 dtrace_repldesc_t desc;
17600                 dtrace_probedesc_t *match = &desc.dtrpd_match;
17601                 dtrace_probedesc_t *create = &desc.dtrpd_create;
17602                 int err;
17603
17604                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17605                         return (EFAULT);
17606
17607                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17608                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17609                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17610                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17611
17612                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17613                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17614                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17615                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17616
17617                 lck_mtx_lock(&dtrace_lock);
17618                 err = dtrace_enabling_replicate(state, match, create);
17619                 lck_mtx_unlock(&dtrace_lock);
17620
17621                 return (err);
17622         }
17623
17624         case DTRACEIOC_PROBEMATCH:
17625         case DTRACEIOC_PROBES: {
17626                 dtrace_probe_t *probe = NULL;
17627                 dtrace_probedesc_t desc;
17628                 dtrace_probekey_t pkey;
17629                 dtrace_id_t i;
17630                 int m = 0;
17631                 uint32_t priv;
17632                 uid_t uid;
17633                 zoneid_t zoneid;
17634
17635                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17636                         return (EFAULT);
17637
17638                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17639                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17640                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17641                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17642
17643                 /*
17644                  * Before we attempt to match this probe, we want to give
17645                  * all providers the opportunity to provide it.
17646                  */
17647                 if (desc.dtpd_id == DTRACE_IDNONE) {
17648                         lck_mtx_lock(&dtrace_provider_lock);
17649                         dtrace_probe_provide(&desc, NULL);
17650                         lck_mtx_unlock(&dtrace_provider_lock);
17651                         desc.dtpd_id++;
17652                 }
17653
17654                 if (cmd == DTRACEIOC_PROBEMATCH)  {
17655                         dtrace_probekey(&desc, &pkey);
17656                         pkey.dtpk_id = DTRACE_IDNONE;
17657                 }
17658
17659                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17660
17661                 lck_mtx_lock(&dtrace_lock);
17662
17663                 if (cmd == DTRACEIOC_PROBEMATCH) {
17664                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17665                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17666                                     (m = dtrace_match_probe(probe, &pkey,
17667                                     priv, uid, zoneid)) != 0)
17668                                         break;
17669                         }
17670
17671                         if (m < 0) {
17672                                 lck_mtx_unlock(&dtrace_lock);
17673                                 return (EINVAL);
17674                         }
17675
17676                 } else {
17677                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17678                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17679                                     dtrace_match_priv(probe, priv, uid, zoneid))
17680                                         break;
17681                         }
17682                 }
17683
17684                 if (probe == NULL) {
17685                         lck_mtx_unlock(&dtrace_lock);
17686                         return (ESRCH);
17687                 }
17688
17689                 dtrace_probe_description(probe, &desc);
17690                 lck_mtx_unlock(&dtrace_lock);
17691
17692                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17693                         return (EFAULT);
17694
17695                 return (0);
17696         }
17697
17698         case DTRACEIOC_PROBEARG: {
17699                 dtrace_argdesc_t desc;
17700                 dtrace_probe_t *probe;
17701                 dtrace_provider_t *prov;
17702
17703                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17704                         return (EFAULT);
17705
17706                 if (desc.dtargd_id == DTRACE_IDNONE)
17707                         return (EINVAL);
17708
17709                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17710                         return (EINVAL);
17711
17712                 lck_mtx_lock(&dtrace_provider_lock);
17713                 lck_mtx_lock(&mod_lock);
17714                 lck_mtx_lock(&dtrace_lock);
17715
17716                 if (desc.dtargd_id > dtrace_nprobes) {
17717                         lck_mtx_unlock(&dtrace_lock);
17718                         lck_mtx_unlock(&mod_lock);
17719                         lck_mtx_unlock(&dtrace_provider_lock);
17720                         return (EINVAL);
17721                 }
17722
17723                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17724                         lck_mtx_unlock(&dtrace_lock);
17725                         lck_mtx_unlock(&mod_lock);
17726                         lck_mtx_unlock(&dtrace_provider_lock);
17727                         return (EINVAL);
17728                 }
17729
17730                 lck_mtx_unlock(&dtrace_lock);
17731
17732                 prov = probe->dtpr_provider;
17733
17734                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17735                         /*
17736                          * There isn't any typed information for this probe.
17737                          * Set the argument number to DTRACE_ARGNONE.
17738                          */
17739                         desc.dtargd_ndx = DTRACE_ARGNONE;
17740                 } else {
17741                         desc.dtargd_native[0] = '\0';
17742                         desc.dtargd_xlate[0] = '\0';
17743                         desc.dtargd_mapping = desc.dtargd_ndx;
17744
17745                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17746                             probe->dtpr_id, probe->dtpr_arg, &desc);
17747                 }
17748
17749                 lck_mtx_unlock(&mod_lock);
17750                 lck_mtx_unlock(&dtrace_provider_lock);
17751
17752                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17753                         return (EFAULT);
17754
17755                 return (0);
17756         }
17757
17758         case DTRACEIOC_GO: {
17759                 processorid_t cpuid;
17760                 rval = dtrace_state_go(state, &cpuid);
17761
17762                 if (rval != 0)
17763                         return (rval);
17764
17765                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17766                         return (EFAULT);
17767
17768                 return (0);
17769         }
17770
17771         case DTRACEIOC_STOP: {
17772                 processorid_t cpuid;
17773
17774                 lck_mtx_lock(&dtrace_lock);
17775                 rval = dtrace_state_stop(state, &cpuid);
17776                 lck_mtx_unlock(&dtrace_lock);
17777
17778                 if (rval != 0)
17779                         return (rval);
17780
17781                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17782                         return (EFAULT);
17783
17784                 return (0);
17785         }
17786
17787         case DTRACEIOC_DOFGET: {
17788                 dof_hdr_t hdr, *dof;
17789                 uint64_t len;
17790
17791                 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17792                         return (EFAULT);
17793
17794                 lck_mtx_lock(&dtrace_lock);
17795                 dof = dtrace_dof_create(state);
17796                 lck_mtx_unlock(&dtrace_lock);
17797
17798                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17799                 rval = copyout(dof, (void *)arg, len);
17800                 dtrace_dof_destroy(dof);
17801
17802                 return (rval == 0 ? 0 : EFAULT);
17803         }
17804
17805         case DTRACEIOC_AGGSNAP:
17806         case DTRACEIOC_BUFSNAP: {
17807                 dtrace_bufdesc_t desc;
17808                 caddr_t cached;
17809                 dtrace_buffer_t *buf;
17810
17811                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17812                         return (EFAULT);
17813
17814                 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17815                         return (EINVAL);
17816
17817                 lck_mtx_lock(&dtrace_lock);
17818
17819                 if (cmd == DTRACEIOC_BUFSNAP) {
17820                         buf = &state->dts_buffer[desc.dtbd_cpu];
17821                 } else {
17822                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17823                 }
17824
17825                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17826                         size_t sz = buf->dtb_offset;
17827
17828                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17829                                 lck_mtx_unlock(&dtrace_lock);
17830                                 return (EBUSY);
17831                         }
17832
17833                         /*
17834                          * If this buffer has already been consumed, we're
17835                          * going to indicate that there's nothing left here
17836                          * to consume.
17837                          */
17838                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17839                                 lck_mtx_unlock(&dtrace_lock);
17840
17841                                 desc.dtbd_size = 0;
17842                                 desc.dtbd_drops = 0;
17843                                 desc.dtbd_errors = 0;
17844                                 desc.dtbd_oldest = 0;
17845                                 sz = sizeof (desc);
17846
17847                                 if (copyout(&desc, (void *)arg, sz) != 0)
17848                                         return (EFAULT);
17849
17850                                 return (0);
17851                         }
17852
17853                         /*
17854                          * If this is a ring buffer that has wrapped, we want
17855                          * to copy the whole thing out.
17856                          */
17857                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17858                                 dtrace_buffer_polish(buf);
17859                                 sz = buf->dtb_size;
17860                         }
17861
17862                         if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17863                                 lck_mtx_unlock(&dtrace_lock);
17864                                 return (EFAULT);
17865                         }
17866
17867                         desc.dtbd_size = sz;
17868                         desc.dtbd_drops = buf->dtb_drops;
17869                         desc.dtbd_errors = buf->dtb_errors;
17870                         desc.dtbd_oldest = buf->dtb_xamot_offset;
17871
17872                         lck_mtx_unlock(&dtrace_lock);
17873
17874                         if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17875                                 return (EFAULT);
17876
17877                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
17878
17879                         return (0);
17880                 }
17881
17882                 if (buf->dtb_tomax == NULL) {
17883                         ASSERT(buf->dtb_xamot == NULL);
17884                         lck_mtx_unlock(&dtrace_lock);
17885                         return (ENOENT);
17886                 }
17887
17888                 cached = buf->dtb_tomax;
17889                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17890
17891                 dtrace_xcall(desc.dtbd_cpu,
17892                     (dtrace_xcall_t)dtrace_buffer_switch, buf);
17893
17894                 state->dts_errors += buf->dtb_xamot_errors;
17895
17896                 /*
17897                  * If the buffers did not actually switch, then the cross call
17898                  * did not take place -- presumably because the given CPU is
17899                  * not in the ready set.  If this is the case, we'll return
17900                  * ENOENT.
17901                  */
17902                 if (buf->dtb_tomax == cached) {
17903                         ASSERT(buf->dtb_xamot != cached);
17904                         lck_mtx_unlock(&dtrace_lock);
17905                         return (ENOENT);
17906                 }
17907
17908                 ASSERT(cached == buf->dtb_xamot);
17909
17910                 /*
17911                  * We have our snapshot; now copy it out.
17912                  */
17913                 if (copyout(buf->dtb_xamot, desc.dtbd_data,
17914                     buf->dtb_xamot_offset) != 0) {
17915                         lck_mtx_unlock(&dtrace_lock);
17916                         return (EFAULT);
17917                 }
17918
17919                 desc.dtbd_size = buf->dtb_xamot_offset;
17920                 desc.dtbd_drops = buf->dtb_xamot_drops;
17921                 desc.dtbd_errors = buf->dtb_xamot_errors;
17922                 desc.dtbd_oldest = 0;
17923
17924                 lck_mtx_unlock(&dtrace_lock);
17925
17926                 /*
17927                  * Finally, copy out the buffer description.
17928                  */
17929                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17930                         return (EFAULT);
17931
17932                 return (0);
17933         }
17934
17935         case DTRACEIOC_CONF: {
17936                 dtrace_conf_t conf;
17937
17938                 bzero(&conf, sizeof (conf));
17939                 conf.dtc_difversion = DIF_VERSION;
17940                 conf.dtc_difintregs = DIF_DIR_NREGS;
17941                 conf.dtc_diftupregs = DIF_DTR_NREGS;
17942                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17943
17944                 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17945                         return (EFAULT);
17946
17947                 return (0);
17948         }
17949
17950         case DTRACEIOC_STATUS: {
17951                 dtrace_status_t stat;
17952                 dtrace_dstate_t *dstate;
17953                 int i, j;
17954                 uint64_t nerrs;
17955
17956                 /*
17957                  * See the comment in dtrace_state_deadman() for the reason
17958                  * for setting dts_laststatus to INT64_MAX before setting
17959                  * it to the correct value.
17960                  */
17961                 state->dts_laststatus = INT64_MAX;
17962                 dtrace_membar_producer();
17963                 state->dts_laststatus = dtrace_gethrtime();
17964
17965                 bzero(&stat, sizeof (stat));
17966
17967                 lck_mtx_lock(&dtrace_lock);
17968
17969                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17970                         lck_mtx_unlock(&dtrace_lock);
17971                         return (ENOENT);
17972                 }
17973
17974                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17975                         stat.dtst_exiting = 1;
17976
17977                 nerrs = state->dts_errors;
17978                 dstate = &state->dts_vstate.dtvs_dynvars;
17979
17980                 for (i = 0; i < NCPU; i++) {
17981                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17982
17983                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
17984                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17985                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17986
17987                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17988                                 stat.dtst_filled++;
17989
17990                         nerrs += state->dts_buffer[i].dtb_errors;
17991
17992                         for (j = 0; j < state->dts_nspeculations; j++) {
17993                                 dtrace_speculation_t *spec;
17994                                 dtrace_buffer_t *buf;
17995
17996                                 spec = &state->dts_speculations[j];
17997                                 buf = &spec->dtsp_buffer[i];
17998                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
17999                         }
18000                 }
18001
18002                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18003                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18004                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18005                 stat.dtst_dblerrors = state->dts_dblerrors;
18006                 stat.dtst_killed =
18007                     (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18008                 stat.dtst_errors = nerrs;
18009
18010                 lck_mtx_unlock(&dtrace_lock);
18011
18012                 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
18013                         return (EFAULT);
18014
18015                 return (0);
18016         }
18017
18018         case DTRACEIOC_FORMAT: {
18019                 dtrace_fmtdesc_t fmt;
18020                 char *str;
18021                 int len;
18022
18023                 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
18024                         return (EFAULT);
18025
18026                 lck_mtx_lock(&dtrace_lock);
18027
18028                 if (fmt.dtfd_format == 0 ||
18029                     fmt.dtfd_format > state->dts_nformats) {
18030                         lck_mtx_unlock(&dtrace_lock);
18031                         return (EINVAL);
18032                 }
18033
18034                 /*
18035                  * Format strings are allocated contiguously and they are
18036                  * never freed; if a format index is less than the number
18037                  * of formats, we can assert that the format map is non-NULL
18038                  * and that the format for the specified index is non-NULL.
18039                  */
18040                 ASSERT(state->dts_formats != NULL);
18041                 str = state->dts_formats[fmt.dtfd_format - 1];
18042                 ASSERT(str != NULL);
18043
18044                 len = strlen(str) + 1;
18045
18046                 if (len > fmt.dtfd_length) {
18047                         fmt.dtfd_length = len;
18048
18049                         if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
18050                                 lck_mtx_unlock(&dtrace_lock);
18051                                 return (EINVAL);
18052                         }
18053                 } else {
18054                         if (copyout(str, fmt.dtfd_string, len) != 0) {
18055                                 lck_mtx_unlock(&dtrace_lock);
18056                                 return (EINVAL);
18057                         }
18058                 }
18059
18060                 lck_mtx_unlock(&dtrace_lock);
18061                 return (0);
18062         }
18063
18064         default:
18065                 break;
18066         }
18067
18068         return (ENOTTY);
18069 }
18070 #else
18071 /*ARGSUSED*/
      /*
       * Service the DTrace helper ioctls (DTRACEHIOC_ADDDOF and
       * DTRACEHIOC_REMOVE) on behalf of the current process.
       *
       *   cmd  the ioctl command; anything other than the two commands
       *        above falls out of the switch and yields ENOTTY.
       *   arg  command payload.  ADDDOF reads it as a user_addr_t that
       *        names a user-space dof_ioctl_data_t; REMOVE reads it as an
       *        int helper generation.
       *   rv   unused.
       *
       * Returns KERN_SUCCESS without looking at cmd when dtrace_dof_mode is
       * DTRACE_DOF_MODE_NEVER.  Otherwise returns 0 or an errno-style value:
       * EFAULT, EINVAL and ENOMEM are produced directly here, and the
       * remaining codes are whatever the lazy / non-lazy helper routines
       * called below report.
       */
18072 static int
18073 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
18074 {
18075 #pragma unused(rv)
18076         /*
18077          * Safe to check this outside the dof mode lock
18078          */
              /* In NEVER mode the request is ignored but still reported as a success. */
18079         if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
18080                 return KERN_SUCCESS;
18081
18082         switch (cmd) {
18083                 case DTRACEHIOC_ADDDOF: {
18084                         dof_helper_t *dhp = NULL;
18085                         size_t dof_ioctl_data_size;
18086                         dof_ioctl_data_t* multi_dof;
18087                         unsigned int i;
18088                         int rval = 0;
                              /*
                               * arg is not the user pointer itself: it is
                               * dereferenced here to obtain the user address
                               * that the copyin()/copyout() calls below use.
                               */
18089                         user_addr_t user_address = *(user_addr_t*)arg;
18090                         uint64_t dof_count;
                              /* Handed to dtrace_lazy_dofs_add(); non-zero suppresses the free at cleanup. */
18091                         int multi_dof_claimed = 0;
18092                         proc_t* p = current_proc();
18093
18094                         /*
18095                          * Read the number of DOF sections being passed in.
18096                          */
18097                         if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
18098                                    &dof_count,
18099                                    sizeof(dof_count))) {
18100                                 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
18101                                 return (EFAULT);
18102                         }
18103
18104                         /*
18105                          * Range check the count.
18106                          */
                              /*
                               * The count determines the allocation size computed
                               * below, so zero and anything above 1024 are rejected
                               * before kmem_alloc() is reached.
                               */
18107                         if (dof_count == 0 || dof_count > 1024) {
18108                                 dtrace_dof_error(NULL, "dofiod_count is not valid");
18109                                 return (EINVAL);
18110                         }
18111
18112                         /*
18113                          * Allocate a correctly sized structure and copyin the data.
18114                          */
18115                         dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
18116                         if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
18117                                 return (ENOMEM);
18118
18119                         /* NOTE! We can no longer exit this method via return */
                              /*
                               * multi_dof is now allocated: every exit from this
                               * case goes through the cleanup label, which frees
                               * it unless multi_dof_claimed has been set.
                               */
18120                         if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
18121                                 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
18122                                 rval = EFAULT;
18123                                 goto cleanup;
18124                         }
18125
18126                         /*
18127                          * Check that the count didn't change between the first copyin and the second.
18128                          */
                              /*
                               * The buffer was sized from dof_count, but the loop
                               * below is bounded by multi_dof->dofiod_count; a
                               * mismatch is rejected so the two cannot disagree.
                               * (Presumably DOF_IOCTL_DATA_T_SIZE(n) covers n
                               * dofiod_helpers entries -- confirm at the macro.)
                               */
18129                         if (multi_dof->dofiod_count != dof_count) {
18130                                 rval = EINVAL;
18131                                 goto cleanup;
18132                         }
18133
18134                         /*
18135                          * Try to process lazily first.
18136                          */
18137                         rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
18138
18139                         /*
18140                          * If rval is EACCES, we must be non-lazy.
18141                          */
18142                         if (rval == EACCES) {
                                      /* EACCES only selects this path; it is not reported to the caller. */
18143                                 rval = 0;
18144                                 /*
18145                                  * Process each dof_helper_t
18146                                  */
                                      /*
                                       * dofiod_count was checked to be non-zero above,
                                       * so the do/while always has an entry to process.
                                       * The loop stops at the first helper that leaves
                                       * rval non-zero.
                                       */
18147                                 i = 0;
18148                                 do {
18149                                         dhp = &multi_dof->dofiod_helpers[i];
18150
                                              /*
                                               * dtrace_dof_copyin() is given &rval; a NULL
                                               * result is skipped here and the loop
                                               * condition decides, from rval, whether to
                                               * carry on.
                                               */
18151                                         dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
18152
18153                                         if (dof != NULL) {
18154                                                 lck_mtx_lock(&dtrace_lock);
18155
18156                                                 /*
18157                                                  * dtrace_helper_slurp() takes responsibility for the dof --
18158                                                  * it may free it now or it may save it and free it later.
18159                                                  */
                                                      /*
                                                       * The user-supplied DOF address in dofhp_dof is
                                                       * overwritten with dtrace_helper_slurp()'s return
                                                       * value; a value equal to -1ULL marks failure.
                                                       */
18160                                                 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
18161                                                         rval = EINVAL;
18162                                                 }
18163
18164                                                 lck_mtx_unlock(&dtrace_lock);
18165                                         }
18166                                 } while (++i < multi_dof->dofiod_count && rval == 0);
18167                         }
18168
18169                         /*
18170                          * We need to copyout the multi_dof struct, because it contains
18171                          * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
18172                          *
18173                          * This could certainly be better optimized.
18174                          */
                              /*
                               * This copyout runs on both the lazy and non-lazy
                               * paths, including when rval is already non-zero;
                               * only the two "goto cleanup" exits above skip it.
                               *
                               * NOTE(review): multi_dof is still read here after
                               * dtrace_lazy_dofs_add() may have claimed it --
                               * confirm the claimant cannot release it concurrently.
                               */
18175                         if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
18176                                 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
18177                                 /* Don't overwrite pre-existing error code */
18178                                 if (rval == 0) rval = EFAULT;
18179                         }
18180
18181                 cleanup:
18182                         /*
18183                          * If we had to allocate struct memory, free it.
18184                          */
                              /* A claimed multi_dof is left alone; it is not freed here. */
18185                         if (multi_dof != NULL && !multi_dof_claimed) {
18186                                 kmem_free(multi_dof, dof_ioctl_data_size);
18187                         }
18188
18189                         return rval;
18190                 }
18191
18192                 case DTRACEHIOC_REMOVE: {
                              /* For this command arg is read as an int generation to remove. */
18193                         int generation = *(int*)arg;
18194                         proc_t* p = current_proc();
18195
18196                         /*
18197                          * Try lazy first.
18198                          */
18199                         int rval = dtrace_lazy_dofs_remove(p, generation);
18200
18201                         /*
18202                          * EACCES means non-lazy
18203                          */
                              /* The non-lazy removal runs with dtrace_lock held and its result replaces rval. */
18204                         if (rval == EACCES) {
18205                                 lck_mtx_lock(&dtrace_lock);
18206                                 rval = dtrace_helper_destroygen(p, generation);
18207                                 lck_mtx_unlock(&dtrace_lock);
18208                         }
18209
18210                         return (rval);
18211                 }
18212
18213                 default:
18214                         break;
18215         }
18216
              /* Reached only for commands that are not handled above. */
18217         return ENOTTY;
18218 }
18219
18220 /*ARGSUSED*/
18221 static int
18222 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
18223 {
18224 #pragma unused(md)
18225         minor_t minor = getminor(dev);
18226         dtrace_state_t *state;
18227         int rval;
18228
18229         /* Darwin puts Helper on its own major device. */
18230
18231         state = ddi_get_soft_state(dtrace_softstate, minor);
18232
18233         if (state->dts_anon) {
18234            ASSERT(dtrace_anon.dta_state == NULL);
18235            state = state->dts_anon;
18236         }
18237
18238         switch (cmd) {
18239         case DTRACEIOC_PROVIDER: {
18240                 dtrace_providerdesc_t pvd;
18241                 dtrace_provider_t *pvp;
18242
18243                 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
18244                         return (EFAULT);
18245
18246                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
18247                 lck_mtx_lock(&dtrace_provider_lock);
18248
18249                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
18250                         if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
18251                                 break;
18252                 }
18253
18254                 lck_mtx_unlock(&dtrace_provider_lock);
18255
18256                 if (pvp == NULL)
18257                         return (ESRCH);
18258
18259                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
18260                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
18261                 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
18262                         return (EFAULT);
18263
18264                 return (0);
18265         }
18266
18267         case DTRACEIOC_EPROBE: {
18268                 dtrace_eprobedesc_t epdesc;
18269                 dtrace_ecb_t *ecb;
18270                 dtrace_action_t *act;
18271                 void *buf;
18272                 size_t size;
18273                 uintptr_t dest;
18274                 int nrecs;
18275
18276                 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
18277                         return (EFAULT);
18278
18279                 lck_mtx_lock(&dtrace_lock);
18280
18281                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
18282                         lck_mtx_unlock(&dtrace_lock);
18283                         return (EINVAL);
18284                 }
18285
18286                 if (ecb->dte_probe == NULL) {
18287                         lck_mtx_unlock(&dtrace_lock);
18288                         return (EINVAL);
18289                 }
18290
18291                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
18292                 epdesc.dtepd_uarg = ecb->dte_uarg;
18293                 epdesc.dtepd_size = ecb->dte_size;
18294
18295                 nrecs = epdesc.dtepd_nrecs;
18296                 epdesc.dtepd_nrecs = 0;
18297                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
18298                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
18299                                 continue;
18300
18301                         epdesc.dtepd_nrecs++;
18302                 }
18303
18304                 /*
18305                  * Now that we have the size, we need to allocate a temporary
18306                  * buffer in which to store the complete description.  We need
18307                  * the temporary buffer to be able to drop dtrace_lock()
18308                  * across the copyout(), below.
18309                  */
18310                 size = sizeof (dtrace_eprobedesc_t) +
18311                         (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
18312
18313                 buf = kmem_alloc(size, KM_SLEEP);
18314                 dest = (uintptr_t)buf;
18315
18316                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
18317                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
18318
18319                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
18320                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
18321                                 continue;
18322
18323                         if (nrecs-- == 0)
18324                                 break;
18325
18326                         bcopy(&act->dta_rec, (void *)dest,
18327                         sizeof (dtrace_recdesc_t));
18328                         dest += sizeof (dtrace_recdesc_t);
18329                 }
18330
18331                 lck_mtx_unlock(&dtrace_lock);
18332
18333                 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
18334                         kmem_free(buf, size);
18335                         return (EFAULT);
18336                 }
18337
18338                 kmem_free(buf, size);
18339                 return (0);
18340         }
18341
18342         case DTRACEIOC_AGGDESC: {
18343                 dtrace_aggdesc_t aggdesc;
18344                 dtrace_action_t *act;
18345                 dtrace_aggregation_t *agg;
18346                 int nrecs;
18347                 uint32_t offs;
18348                 dtrace_recdesc_t *lrec;
18349                 void *buf;
18350                 size_t size;
18351                 uintptr_t dest;
18352
18353                 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
18354                         return (EFAULT);
18355
18356                 lck_mtx_lock(&dtrace_lock);
18357
18358                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
18359                         lck_mtx_unlock(&dtrace_lock);
18360                         return (EINVAL);
18361                 }
18362
18363                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
18364
18365                 nrecs = aggdesc.dtagd_nrecs;
18366                 aggdesc.dtagd_nrecs = 0;
18367
18368                 offs = agg->dtag_base;
18369                 lrec = &agg->dtag_action.dta_rec;
18370                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
18371
18372                 for (act = agg->dtag_first; ; act = act->dta_next) {
18373                         ASSERT(act->dta_intuple ||
18374                         DTRACEACT_ISAGG(act->dta_kind));
18375
18376                         /*
18377                          * If this action has a record size of zero, it
18378                          * denotes an argument to the aggregating action.
18379                          * Because the presence of this record doesn't (or
18380                          * shouldn't) affect the way the data is interpreted,
18381                          * we don't copy it out to save user-level the
18382                          * confusion of dealing with a zero-length record.
18383                          */
18384                         if (act->dta_rec.dtrd_size == 0) {
18385                                 ASSERT(agg->dtag_hasarg);
18386                                 continue;
18387                         }
18388
18389                         aggdesc.dtagd_nrecs++;
18390
18391                         if (act == &agg->dtag_action)
18392                                 break;
18393                 }
18394
18395                 /*
18396                  * Now that we have the size, we need to allocate a temporary
18397                  * buffer in which to store the complete description.  We need
18398                  * the temporary buffer to be able to drop dtrace_lock()
18399                  * across the copyout(), below.
18400                  */
18401                 size = sizeof (dtrace_aggdesc_t) +
18402                         (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
18403
18404                 buf = kmem_alloc(size, KM_SLEEP);
18405                 dest = (uintptr_t)buf;
18406
18407                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
18408                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
18409
18410                 for (act = agg->dtag_first; ; act = act->dta_next) {
18411                         dtrace_recdesc_t rec = act->dta_rec;
18412
18413                         /*
18414                          * See the comment in the above loop for why we pass
18415                          * over zero-length records.
18416                          */
18417                         if (rec.dtrd_size == 0) {
18418                                 ASSERT(agg->dtag_hasarg);
18419                                 continue;
18420                         }
18421
18422                         if (nrecs-- == 0)
18423                                 break;
18424
18425                         rec.dtrd_offset -= offs;
18426                         bcopy(&rec, (void *)dest, sizeof (rec));
18427                         dest += sizeof (dtrace_recdesc_t);
18428
18429                         if (act == &agg->dtag_action)
18430                                 break;
18431                 }
18432
18433                 lck_mtx_unlock(&dtrace_lock);
18434
18435                 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
18436                         kmem_free(buf, size);
18437                         return (EFAULT);
18438                 }
18439
18440                 kmem_free(buf, size);
18441                 return (0);
18442         }
18443
18444         case DTRACEIOC_ENABLE: {
18445                 dof_hdr_t *dof;
18446                 dtrace_enabling_t *enab = NULL;
18447                 dtrace_vstate_t *vstate;
18448                 int err = 0;
18449
18450                 *rv = 0;
18451
18452                 /*
18453                  * If a NULL argument has been passed, we take this as our
18454                  * cue to reevaluate our enablings.
18455                  */
18456                 if (arg == NULL) {
18457                         dtrace_enabling_matchall();
18458
18459                         return (0);
18460                 }
18461
18462                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
18463                         return (rval);
18464
18465                 lck_mtx_lock(&cpu_lock);
18466                 lck_mtx_lock(&dtrace_lock);
18467                 vstate = &state->dts_vstate;
18468
18469                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
18470                         lck_mtx_unlock(&dtrace_lock);
18471                         lck_mtx_unlock(&cpu_lock);
18472                         dtrace_dof_destroy(dof);
18473                         return (EBUSY);
18474                 }
18475
18476                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
18477                         lck_mtx_unlock(&dtrace_lock);
18478                         lck_mtx_unlock(&cpu_lock);
18479                         dtrace_dof_destroy(dof);
18480                         return (EINVAL);
18481                 }
18482
18483                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
18484                         dtrace_enabling_destroy(enab);
18485                         lck_mtx_unlock(&dtrace_lock);
18486                         lck_mtx_unlock(&cpu_lock);
18487                         dtrace_dof_destroy(dof);
18488                         return (rval);
18489                 }
18490
18491                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
18492                         err = dtrace_enabling_retain(enab);
18493                 } else {
18494                         dtrace_enabling_destroy(enab);
18495                 }
18496
18497                 lck_mtx_unlock(&cpu_lock);
18498                 lck_mtx_unlock(&dtrace_lock);
18499                 dtrace_dof_destroy(dof);
18500
18501                 return (err);
18502         }
18503
18504         case DTRACEIOC_REPLICATE: {
18505                 dtrace_repldesc_t desc;
18506                 dtrace_probedesc_t *match = &desc.dtrpd_match;
18507                 dtrace_probedesc_t *create = &desc.dtrpd_create;
18508                 int err;
18509
18510                 if (copyin(arg, &desc, sizeof (desc)) != 0)
18511                         return (EFAULT);
18512
18513                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18514                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18515                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18516                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18517
18518                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18519                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18520                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18521                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18522
18523                 lck_mtx_lock(&dtrace_lock);
18524                 err = dtrace_enabling_replicate(state, match, create);
18525                 lck_mtx_unlock(&dtrace_lock);
18526
18527                 return (err);
18528         }
18529
18530         case DTRACEIOC_PROBEMATCH:
18531         case DTRACEIOC_PROBES: {
                      /*
                       * Find the first probe with an id at or after desc.dtpd_id
                       * and copy its description back out to the caller.
                       * DTRACEIOC_PROBEMATCH accepts a probe only if
                       * dtrace_match_probe() reports a match against the key
                       * built from desc; DTRACEIOC_PROBES accepts any non-NULL
                       * probe that passes dtrace_match_priv().
                       *
                       * Returns 0 on success, EFAULT if the copyin()/copyout()
                       * fails, EINVAL if dtrace_match_probe() returns a negative
                       * value, and ESRCH if no probe qualifies.
                       */
18532                 dtrace_probe_t *probe = NULL;
18533                 dtrace_probedesc_t desc;
18534                 dtrace_probekey_t pkey;
18535                 dtrace_id_t i;
18536                 int m = 0;
18537                 uint32_t priv;
18538                 uid_t uid;
18539                 zoneid_t zoneid;
18540
18541                 if (copyin(arg, &desc, sizeof (desc)) != 0)
18542                         return (EFAULT);
18543
                      /*
                       * The strings came from user space; force termination
                       * before anything treats them as C strings.
                       */
18544                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18545                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18546                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18547                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18548
18549                 /*
18550                  * Before we attempt to match this probe, we want to give
18551                  * all providers the opportunity to provide it.
18552                  */
18553                 if (desc.dtpd_id == DTRACE_IDNONE) {
18554                         lck_mtx_lock(&dtrace_provider_lock);
18555                         dtrace_probe_provide(&desc, NULL);
18556                         lck_mtx_unlock(&dtrace_provider_lock);
                              /* Step past DTRACE_IDNONE so the scan starts at a real id. */
18557                         desc.dtpd_id++;
18558                 }
18559
18560                 if (cmd == DTRACEIOC_PROBEMATCH)  {
18561                         dtrace_probekey(&desc, &pkey);
18562                         pkey.dtpk_id = DTRACE_IDNONE;
18563                 }
18564
18565                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18566
18567                 lck_mtx_lock(&dtrace_lock);
18568
18569                 if (cmd == DTRACEIOC_PROBEMATCH) {
18570                         /* Quiet compiler warning */
18571                         for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18572                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
18573                                         (m = dtrace_match_probe(probe, &pkey,
18574                                         priv, uid, zoneid)) != 0)
18575                                         break;
18576                         }
18577
18578                         if (m < 0) {
18579                                 lck_mtx_unlock(&dtrace_lock);
18580                                 return (EINVAL);
18581                         }
18582
18583                 } else {
18584                         /* Quiet compiler warning */
18585                         for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18586                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
18587                                         dtrace_match_priv(probe, priv, uid, zoneid))
18588                                         break;
18589                         }
18590                 }
18591
                      /*
                       * Both loops above leave probe pointing at the last slot
                       * they examined, so a non-NULL probe alone does not mean a
                       * match was found.  If the scan ran past dtrace_nprobes
                       * without breaking, nothing qualified and we must report
                       * ESRCH rather than describe a probe that was rejected.
                       */
18592                 if (probe == NULL || i > (dtrace_id_t)dtrace_nprobes) {
18593                         lck_mtx_unlock(&dtrace_lock);
18594                         return (ESRCH);
18595                 }
18596
18597                 dtrace_probe_description(probe, &desc);
18598                 lck_mtx_unlock(&dtrace_lock);
18599
18600                 if (copyout(&desc, arg, sizeof (desc)) != 0)
18601                         return (EFAULT);
18602
18603                 return (0);
18604         }
18605
18606         case DTRACEIOC_PROBEARG: {
18607                 dtrace_argdesc_t desc;
18608                 dtrace_probe_t *probe;
18609                 dtrace_provider_t *prov;
18610
18611                 if (copyin(arg, &desc, sizeof (desc)) != 0)
18612                         return (EFAULT);
18613
18614                 if (desc.dtargd_id == DTRACE_IDNONE)
18615                         return (EINVAL);
18616
18617                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18618                         return (EINVAL);
18619
18620                 lck_mtx_lock(&dtrace_provider_lock);
18621                 lck_mtx_lock(&mod_lock);
18622                 lck_mtx_lock(&dtrace_lock);
18623
18624                 /* Quiet compiler warning */
18625                 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18626                         lck_mtx_unlock(&dtrace_lock);
18627                         lck_mtx_unlock(&mod_lock);
18628                         lck_mtx_unlock(&dtrace_provider_lock);
18629                         return (EINVAL);
18630                 }
18631
18632                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18633                         lck_mtx_unlock(&dtrace_lock);
18634                         lck_mtx_unlock(&mod_lock);
18635                         lck_mtx_unlock(&dtrace_provider_lock);
18636                         return (EINVAL);
18637                 }
18638
18639                 lck_mtx_unlock(&dtrace_lock);
18640
18641                 prov = probe->dtpr_provider;
18642
18643                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18644                 /*
18645                  * There isn't any typed information for this probe.
18646                  * Set the argument number to DTRACE_ARGNONE.
18647                  */
18648                         desc.dtargd_ndx = DTRACE_ARGNONE;
18649                 } else {
18650                         desc.dtargd_native[0] = '\0';
18651                         desc.dtargd_xlate[0] = '\0';
18652                         desc.dtargd_mapping = desc.dtargd_ndx;
18653
18654                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18655                         probe->dtpr_id, probe->dtpr_arg, &desc);
18656                 }
18657
18658                 lck_mtx_unlock(&mod_lock);
18659                 lck_mtx_unlock(&dtrace_provider_lock);
18660
18661                 if (copyout(&desc, arg, sizeof (desc)) != 0)
18662                         return (EFAULT);
18663
18664                 return (0);
18665         }
18666
18667         case DTRACEIOC_GO: {
18668                 processorid_t cpuid;
18669                 rval = dtrace_state_go(state, &cpuid);
18670
18671                 if (rval != 0)
18672                         return (rval);
18673
18674                 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18675                         return (EFAULT);
18676
18677                 return (0);
18678         }
18679
18680         case DTRACEIOC_STOP: {
18681                 processorid_t cpuid;
18682
18683                 lck_mtx_lock(&dtrace_lock);
18684                 rval = dtrace_state_stop(state, &cpuid);
18685                 lck_mtx_unlock(&dtrace_lock);
18686
18687                 if (rval != 0)
18688                         return (rval);
18689
18690                 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18691                         return (EFAULT);
18692
18693                 return (0);
18694         }
18695
18696         case DTRACEIOC_DOFGET: {
18697                 dof_hdr_t hdr, *dof;
18698                 uint64_t len;
18699
18700                 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18701                         return (EFAULT);
18702
18703                 lck_mtx_lock(&dtrace_lock);
18704                 dof = dtrace_dof_create(state);
18705                 lck_mtx_unlock(&dtrace_lock);
18706
18707                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18708                 rval = copyout(dof, arg, len);
18709                 dtrace_dof_destroy(dof);
18710
18711                 return (rval == 0 ? 0 : EFAULT);
18712         }
18713
18714         case DTRACEIOC_AGGSNAP:
18715         case DTRACEIOC_BUFSNAP: {
18716                 dtrace_bufdesc_t desc;
18717                 caddr_t cached;
18718                 dtrace_buffer_t *buf;
18719
18720                 if (copyin(arg, &desc, sizeof (desc)) != 0)
18721                         return (EFAULT);
18722
18723                 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18724                         return (EINVAL);
18725
18726                 lck_mtx_lock(&dtrace_lock);
18727
18728                 if (cmd == DTRACEIOC_BUFSNAP) {
18729                         buf = &state->dts_buffer[desc.dtbd_cpu];
18730                 } else {
18731                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18732                 }
18733
18734                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18735                         size_t sz = buf->dtb_offset;
18736
18737                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18738                                 lck_mtx_unlock(&dtrace_lock);
18739                                 return (EBUSY);
18740                         }
18741
18742                         /*
18743                          * If this buffer has already been consumed, we're
18744                          * going to indicate that there's nothing left here
18745                          * to consume.
18746                          */
18747                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18748                                 lck_mtx_unlock(&dtrace_lock);
18749
18750                                 desc.dtbd_size = 0;
18751                                 desc.dtbd_drops = 0;
18752                                 desc.dtbd_errors = 0;
18753                                 desc.dtbd_oldest = 0;
18754                                 sz = sizeof (desc);
18755
18756                                 if (copyout(&desc, arg, sz) != 0)
18757                                         return (EFAULT);
18758
18759                                 return (0);
18760                         }
18761
18762                         /*
18763                          * If this is a ring buffer that has wrapped, we want
18764                          * to copy the whole thing out.
18765                          */
18766                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18767                                 dtrace_buffer_polish(buf);
18768                                 sz = buf->dtb_size;
18769                         }
18770
18771                         if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18772                                 lck_mtx_unlock(&dtrace_lock);
18773                                 return (EFAULT);
18774                         }
18775
18776                         desc.dtbd_size = sz;
18777                         desc.dtbd_drops = buf->dtb_drops;
18778                         desc.dtbd_errors = buf->dtb_errors;
18779                         desc.dtbd_oldest = buf->dtb_xamot_offset;
18780
18781                         lck_mtx_unlock(&dtrace_lock);
18782
18783                         if (copyout(&desc, arg, sizeof (desc)) != 0)
18784                                 return (EFAULT);
18785
18786                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
18787
18788                         return (0);
18789                 }
18790
18791                 if (buf->dtb_tomax == NULL) {
18792                         ASSERT(buf->dtb_xamot == NULL);
18793                         lck_mtx_unlock(&dtrace_lock);
18794                         return (ENOENT);
18795                 }
18796
18797                 cached = buf->dtb_tomax;
18798                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18799
18800                 dtrace_xcall(desc.dtbd_cpu,
18801                         (dtrace_xcall_t)dtrace_buffer_switch, buf);
18802
18803                 state->dts_errors += buf->dtb_xamot_errors;
18804
18805                 /*
18806                 * If the buffers did not actually switch, then the cross call
18807                 * did not take place -- presumably because the given CPU is
18808                 * not in the ready set.  If this is the case, we'll return
18809                 * ENOENT.
18810                 */
18811                 if (buf->dtb_tomax == cached) {
18812                         ASSERT(buf->dtb_xamot != cached);
18813                         lck_mtx_unlock(&dtrace_lock);
18814                         return (ENOENT);
18815                 }
18816
18817                 ASSERT(cached == buf->dtb_xamot);
18818
18819                 /*
18820                 * We have our snapshot; now copy it out.
18821                 */
18822                 if (copyout(buf->dtb_xamot, (user_addr_t)desc.dtbd_data,
18823                                         buf->dtb_xamot_offset) != 0) {
18824                         lck_mtx_unlock(&dtrace_lock);
18825                         return (EFAULT);
18826                 }
18827
18828                 desc.dtbd_size = buf->dtb_xamot_offset;
18829                 desc.dtbd_drops = buf->dtb_xamot_drops;
18830                 desc.dtbd_errors = buf->dtb_xamot_errors;
18831                 desc.dtbd_oldest = 0;
18832
18833                 lck_mtx_unlock(&dtrace_lock);
18834
18835                 /*
18836                  * Finally, copy out the buffer description.
18837                  */
18838                 if (copyout(&desc, arg, sizeof (desc)) != 0)
18839                         return (EFAULT);
18840
18841                 return (0);
18842         }
18843
18844         case DTRACEIOC_CONF: {
18845                 dtrace_conf_t conf;
18846
18847                 bzero(&conf, sizeof (conf));
18848                 conf.dtc_difversion = DIF_VERSION;
18849                 conf.dtc_difintregs = DIF_DIR_NREGS;
18850                 conf.dtc_diftupregs = DIF_DTR_NREGS;
18851                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18852
18853                 if (copyout(&conf, arg, sizeof (conf)) != 0)
18854                         return (EFAULT);
18855
18856                 return (0);
18857         }
18858
18859         case DTRACEIOC_STATUS: {
18860                 dtrace_status_t stat;
18861                 dtrace_dstate_t *dstate;
18862                 int i, j;
18863                 uint64_t nerrs;
18864
18865                 /*
18866                 * See the comment in dtrace_state_deadman() for the reason
18867                 * for setting dts_laststatus to INT64_MAX before setting
18868                 * it to the correct value.
18869                 */
18870                 state->dts_laststatus = INT64_MAX;
18871                 dtrace_membar_producer();
18872                 state->dts_laststatus = dtrace_gethrtime();
18873
18874                 bzero(&stat, sizeof (stat));
18875
18876                 lck_mtx_lock(&dtrace_lock);
18877
18878                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18879                         lck_mtx_unlock(&dtrace_lock);
18880                         return (ENOENT);
18881                 }
18882
18883                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18884                         stat.dtst_exiting = 1;
18885
18886                 nerrs = state->dts_errors;
18887                 dstate = &state->dts_vstate.dtvs_dynvars;
18888
18889                 for (i = 0; i < (int)NCPU; i++) {
18890                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
18891
18892                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
18893                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18894                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18895
18896                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18897                                 stat.dtst_filled++;
18898
18899                         nerrs += state->dts_buffer[i].dtb_errors;
18900
18901                         for (j = 0; j < state->dts_nspeculations; j++) {
18902                                 dtrace_speculation_t *spec;
18903                                 dtrace_buffer_t *buf;
18904
18905                                 spec = &state->dts_speculations[j];
18906                                 buf = &spec->dtsp_buffer[i];
18907                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
18908                         }
18909                 }
18910
18911                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18912                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18913                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18914                 stat.dtst_dblerrors = state->dts_dblerrors;
18915                 stat.dtst_killed =
18916                         (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18917                 stat.dtst_errors = nerrs;
18918
18919                 lck_mtx_unlock(&dtrace_lock);
18920
18921                 if (copyout(&stat, arg, sizeof (stat)) != 0)
18922                         return (EFAULT);
18923
18924                 return (0);
18925         }
18926
18927         case DTRACEIOC_FORMAT: {
18928                 dtrace_fmtdesc_t fmt;
18929                 char *str;
18930                 int len;
18931
18932                 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18933                         return (EFAULT);
18934
18935                 lck_mtx_lock(&dtrace_lock);
18936
18937                 if (fmt.dtfd_format == 0 ||
18938                         fmt.dtfd_format > state->dts_nformats) {
18939                         lck_mtx_unlock(&dtrace_lock);
18940                         return (EINVAL);
18941                 }
18942
18943                 /*
18944                  * Format strings are allocated contiguously and they are
18945                  * never freed; if a format index is less than the number
18946                  * of formats, we can assert that the format map is non-NULL
18947                  * and that the format for the specified index is non-NULL.
18948                  */
18949                 ASSERT(state->dts_formats != NULL);
18950                 str = state->dts_formats[fmt.dtfd_format - 1];
18951                 ASSERT(str != NULL);
18952
18953                 len = strlen(str) + 1;
18954
18955                 if (len > fmt.dtfd_length) {
18956                         fmt.dtfd_length = len;
18957
18958                         if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18959                                 lck_mtx_unlock(&dtrace_lock);
18960                                 return (EINVAL);
18961                         }
18962                 } else {
18963                         if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18964                                 lck_mtx_unlock(&dtrace_lock);
18965                                 return (EINVAL);
18966                         }
18967                 }
18968
18969                 lck_mtx_unlock(&dtrace_lock);
18970                 return (0);
18971         }
18972
18973         case DTRACEIOC_MODUUIDSLIST: {
18974                 size_t module_uuids_list_size;
18975                 dtrace_module_uuids_list_t* uuids_list;
18976                 uint64_t dtmul_count;
18977
18978                 /*
18979                  * Fail if the kernel symbol mode makes this operation illegal.
18980                  * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18981                  * for them without holding the dtrace_lock.
18982                  */
18983                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18984                     dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18985                         cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18986                         return (EPERM);
18987                 }
18988
18989                 /*
                 * Read the number of UUID entries (dtmul_count) being passed in.
18991                  */
18992                 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18993                            &dtmul_count,
18994                            sizeof(dtmul_count))) {
18995                         cmn_err(CE_WARN, "failed to copyin dtmul_count");
18996                         return (EFAULT);
18997                 }
18998
18999                 /*
19000                  * Range check the count. More than 2k kexts is probably an error.
19001                  */
19002                 if (dtmul_count > 2048) {
19003                         cmn_err(CE_WARN, "dtmul_count is not valid");
19004                         return (EINVAL);
19005                 }
19006
19007                 /*
19008                  * For all queries, we return EINVAL when the user specified
19009                  * count does not match the actual number of modules we find
19010                  * available.
19011                  *
19012                  * If the user specified count is zero, then this serves as a
19013                  * simple query to count the available modules in need of symbols.
19014                  */
19015
19016                 rval = 0;
19017
19018                 if (dtmul_count == 0)
19019                 {
19020                         lck_mtx_lock(&mod_lock);
19021                         struct modctl* ctl = dtrace_modctl_list;
19022                         while (ctl) {
19023                                 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
19024                                 if (!MOD_SYMBOLS_DONE(ctl)) {
19025                                         dtmul_count++;
19026                                         rval = EINVAL;
19027                                 }
19028                                 ctl = ctl->mod_next;
19029                         }
19030                         lck_mtx_unlock(&mod_lock);
19031
19032                         if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
19033                                 return (EFAULT);
19034                         else
19035                                 return (rval);
19036                 }
19037
19038                 /*
19039                  * If we reach this point, then we have a request for full list data.
19040                  * Allocate a correctly sized structure and copyin the data.
19041                  */
19042                 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
19043                 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
19044                         return (ENOMEM);
19045
19046                 /* NOTE! We can no longer exit this method via return */
19047                 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
19048                         cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
19049                         rval = EFAULT;
19050                         goto moduuidslist_cleanup;
19051                 }
19052
19053                 /*
19054                  * Check that the count didn't change between the first copyin and the second.
19055                  */
19056                 if (uuids_list->dtmul_count != dtmul_count) {
19057                         rval = EINVAL;
19058                         goto moduuidslist_cleanup;
19059                 }
19060
19061                 /*
19062                  * Build the list of UUID's that need symbols
19063                  */
19064                 lck_mtx_lock(&mod_lock);
19065
19066                 dtmul_count = 0;
19067
19068                 struct modctl* ctl = dtrace_modctl_list;
19069                 while (ctl) {
19070                         /*
19071                          * We assume that userspace symbols will be "better" than kernel level symbols,
19072                          * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
19073                          * are available, add user syms if the module might use them.
19074                          */
19075                         ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
19076                         if (!MOD_SYMBOLS_DONE(ctl)) {
19077                                 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
19078                                 if (dtmul_count++ < uuids_list->dtmul_count) {
19079                                         memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
19080                                 }
19081                         }
19082                         ctl = ctl->mod_next;
19083                 }
19084
19085                 lck_mtx_unlock(&mod_lock);
19086
19087                 if (uuids_list->dtmul_count < dtmul_count)
19088                         rval = EINVAL;
19089
19090                 uuids_list->dtmul_count = dtmul_count;
19091
19092                 /*
19093                  * Copyout the symbols list (or at least the count!)
19094                  */
19095                 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
19096                         cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
19097                         rval = EFAULT;
19098                 }
19099
19100         moduuidslist_cleanup:
19101                 /*
19102                  * If we had to allocate struct memory, free it.
19103                  */
19104                 if (uuids_list != NULL) {
19105                         kmem_free(uuids_list, module_uuids_list_size);
19106                 }
19107
19108                 return rval;
19109         }
19110
19111         case DTRACEIOC_PROVMODSYMS: {
19112                 size_t module_symbols_size;
19113                 dtrace_module_symbols_t* module_symbols;
19114                 uint64_t dtmodsyms_count;
19115
19116                 /*
19117                  * Fail if the kernel symbol mode makes this operation illegal.
19118                  * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
19119                  * for them without holding the dtrace_lock.
19120                  */
19121                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
19122                     dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
19123                         cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
19124                         return (EPERM);
19125                 }
19126
19127                 /*
19128                  * Read the number of module symbols structs being passed in.
19129                  */
19130                 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
19131                            &dtmodsyms_count,
19132                            sizeof(dtmodsyms_count))) {
19133                         cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
19134                         return (EFAULT);
19135                 }
19136
19137                 /*
19138                  * Range check the count. How much data can we pass around?
19139                  * FIX ME!
19140                  */
19141                 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
19142                         cmn_err(CE_WARN, "dtmodsyms_count is not valid");
19143                         return (EINVAL);
19144                 }
19145
19146                 /*
19147                  * Allocate a correctly sized structure and copyin the data.
19148                  */
19149                 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
19150                 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
19151                         return (ENOMEM);
19152
19153                 rval = 0;
19154
19155                 /* NOTE! We can no longer exit this method via return */
19156                 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
19157                         cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t, symbol count %llu", module_symbols->dtmodsyms_count);
19158                         rval = EFAULT;
19159                         goto module_symbols_cleanup;
19160                 }
19161
19162                 /*
19163                  * Check that the count didn't change between the first copyin and the second.
19164                  */
19165                 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
19166                         rval = EINVAL;
19167                         goto module_symbols_cleanup;
19168                 }
19169
19170                 /*
19171                  * Find the modctl to add symbols to.
19172                  */
19173                 lck_mtx_lock(&dtrace_provider_lock);
19174                 lck_mtx_lock(&mod_lock);
19175
19176                 struct modctl* ctl = dtrace_modctl_list;
19177                 while (ctl) {
19178                         ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
19179                         if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) {
19180                                 if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
19181                                         /* BINGO! */
19182                                         ctl->mod_user_symbols = module_symbols;
19183                                         break;
19184                                 }
19185                         }
19186                         ctl = ctl->mod_next;
19187                 }
19188
19189                 if (ctl) {
19190                         dtrace_provider_t *prv;
19191
19192                         /*
19193                          * We're going to call each providers per-module provide operation
19194                          * specifying only this module.
19195                          */
19196                         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
19197                                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
19198
19199                         /*
19200                          * We gave every provider a chance to provide with the user syms, go ahead and clear them
19201                          */
19202                         ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
19203                 }
19204
19205                 lck_mtx_unlock(&mod_lock);
19206                 lck_mtx_unlock(&dtrace_provider_lock);
19207
19208         module_symbols_cleanup:
19209                 /*
19210                  * If we had to allocate struct memory, free it.
19211                  */
19212                 if (module_symbols != NULL) {
19213                         kmem_free(module_symbols, module_symbols_size);
19214                 }
19215
19216                 return rval;
19217         }
19218
19219                 default:
19220                         break;
19221         }
19222
19223         return (ENOTTY);
19224 }
19225 #endif /* __APPLE__ */
19226
19227 #if !defined(__APPLE__)
19228 /*ARGSUSED*/
19229 static int
19230 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
19231 {
19232         dtrace_state_t *state;
19233
19234         switch (cmd) {
19235         case DDI_DETACH:
19236                 break;
19237
19238         case DDI_SUSPEND:
19239                 return (DDI_SUCCESS);
19240
19241         default:
19242                 return (DDI_FAILURE);
19243         }
19244
19245         lck_mtx_lock(&cpu_lock);
19246         lck_mtx_lock(&dtrace_provider_lock);
19247         lck_mtx_lock(&dtrace_lock);
19248
19249         ASSERT(dtrace_opens == 0);
19250
19251         if (dtrace_helpers > 0) {
19252                 lck_mtx_unlock(&dtrace_provider_lock);
19253                 lck_mtx_unlock(&dtrace_lock);
19254                 lck_mtx_unlock(&cpu_lock);
19255                 return (DDI_FAILURE);
19256         }
19257
19258         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
19259                 lck_mtx_unlock(&dtrace_provider_lock);
19260                 lck_mtx_unlock(&dtrace_lock);
19261                 lck_mtx_unlock(&cpu_lock);
19262                 return (DDI_FAILURE);
19263         }
19264
19265         dtrace_provider = NULL;
19266
19267         if ((state = dtrace_anon_grab()) != NULL) {
19268                 /*
19269                  * If there were ECBs on this state, the provider should
19270                  * have not been allowed to detach; assert that there is
19271                  * none.
19272                  */
19273                 ASSERT(state->dts_necbs == 0);
19274                 dtrace_state_destroy(state);
19275
19276                 /*
19277                  * If we're being detached with anonymous state, we need to
19278                  * indicate to the kernel debugger that DTrace is now inactive.
19279                  */
19280                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
19281         }
19282
19283         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
19284         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
19285         dtrace_cpu_init = NULL;
19286         dtrace_helpers_cleanup = NULL;
19287         dtrace_helpers_fork = NULL;
19288         dtrace_cpustart_init = NULL;
19289         dtrace_cpustart_fini = NULL;
19290         dtrace_debugger_init = NULL;
19291         dtrace_debugger_fini = NULL;
19292         dtrace_kreloc_init = NULL;
19293         dtrace_kreloc_fini = NULL;
19294         dtrace_modload = NULL;
19295         dtrace_modunload = NULL;
19296
19297         lck_mtx_unlock(&cpu_lock);
19298
19299         if (dtrace_helptrace_enabled) {
19300                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
19301                 dtrace_helptrace_buffer = NULL;
19302         }
19303
19304         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
19305         dtrace_probes = NULL;
19306         dtrace_nprobes = 0;
19307
19308         dtrace_hash_destroy(dtrace_bymod);
19309         dtrace_hash_destroy(dtrace_byfunc);
19310         dtrace_hash_destroy(dtrace_byname);
19311         dtrace_bymod = NULL;
19312         dtrace_byfunc = NULL;
19313         dtrace_byname = NULL;
19314
19315         kmem_cache_destroy(dtrace_state_cache);
19316         vmem_destroy(dtrace_minor);
19317         vmem_destroy(dtrace_arena);
19318
19319         if (dtrace_toxrange != NULL) {
19320                 kmem_free(dtrace_toxrange,
19321                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
19322                 dtrace_toxrange = NULL;
19323                 dtrace_toxranges = 0;
19324                 dtrace_toxranges_max = 0;
19325         }
19326
19327         ddi_remove_minor_node(dtrace_devi, NULL);
19328         dtrace_devi = NULL;
19329
19330         ddi_soft_state_fini(&dtrace_softstate);
19331
19332         ASSERT(dtrace_vtime_references == 0);
19333         ASSERT(dtrace_opens == 0);
19334         ASSERT(dtrace_retained == NULL);
19335
19336         lck_mtx_unlock(&dtrace_lock);
19337         lck_mtx_unlock(&dtrace_provider_lock);
19338
19339         /*
19340          * We don't destroy the task queue until after we have dropped our
19341          * locks (taskq_destroy() may block on running tasks).  To prevent
19342          * attempting to do work after we have effectively detached but before
19343          * the task queue has been destroyed, all tasks dispatched via the
19344          * task queue must check that DTrace is still attached before
19345          * performing any operation.
19346          */
19347         taskq_destroy(dtrace_taskq);
19348         dtrace_taskq = NULL;
19349
19350         return (DDI_SUCCESS);
19351 }
19352
19353 /*ARGSUSED*/
19354 static int
19355 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
19356 {
19357         int error;
19358
19359         switch (infocmd) {
19360         case DDI_INFO_DEVT2DEVINFO:
19361                 *result = (void *)dtrace_devi;
19362                 error = DDI_SUCCESS;
19363                 break;
19364         case DDI_INFO_DEVT2INSTANCE:
19365                 *result = (void *)0;
19366                 error = DDI_SUCCESS;
19367                 break;
19368         default:
19369                 error = DDI_FAILURE;
19370         }
19371         return (error);
19372 }
19373
/*
 * Character/block entry points for the DTrace pseudo driver (non-Apple
 * builds only).  Only open, close and ioctl are routed to DTrace code; the
 * remaining slots are filled with the generic nulldev/nodev/nochpoll/
 * ddi_prop_op entries.  The initializer is positional, so the order of the
 * entries must not be changed.
 */
static struct cb_ops dtrace_cb_ops = {
        dtrace_open,            /* open */
        dtrace_close,           /* close */
        nulldev,                /* strategy */
        nulldev,                /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        nodev,                  /* write */
        dtrace_ioctl,           /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        nochpoll,               /* poll */
        ddi_prop_op,            /* cb_prop_op */
        0,                      /* streamtab  */
        D_NEW | D_MP            /* Driver compatibility flag */
};
19391
/*
 * Device operations for the DTrace pseudo driver (non-Apple builds only).
 * Wires up dtrace_info, dtrace_attach and dtrace_detach, and points at
 * dtrace_cb_ops for the character/block entry points.  The initializer is
 * positional, so the order of the entries must not be changed.
 */
static struct dev_ops dtrace_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        dtrace_info,            /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        dtrace_attach,          /* attach */
        dtrace_detach,          /* detach */
        nodev,                  /* reset */
        &dtrace_cb_ops,         /* driver operations */
        NULL,                   /* bus operations */
        nodev                   /* dev power */
};
19405
/*
 * Loadable-module driver description (non-Apple builds only); referenced
 * from modlinkage below and pointing at dtrace_ops.
 */
static struct modldrv modldrv = {
        &mod_driverops,         /* module type (this is a pseudo driver) */
        "Dynamic Tracing",      /* name of module */
        &dtrace_ops,            /* driver ops */
};
19411
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini() (non-Apple builds only).
 */
static struct modlinkage modlinkage = {
        MODREV_1,               /* linkage revision */
        (void *)&modldrv,       /* the single driver linkage element */
        NULL                    /* terminator */
};
19417
19418 int
19419 _init(void)
19420 {
19421         return (mod_install(&modlinkage));
19422 }
19423
19424 int
19425 _info(struct modinfo *modinfop)
19426 {
19427         return (mod_info(&modlinkage, modinfop));
19428 }
19429
19430 int
19431 _fini(void)
19432 {
19433         return (mod_remove(&modlinkage));
19434 }
19435 #else /* Darwin BSD driver model. */
19436
/*
 * Forward declarations of the Darwin character-device entry points for the
 * dtrace device (_dtrace_*) and the helper device (helper_*).  They are
 * referenced from dtrace_cdevsw and helper_cdevsw below.
 */
d_open_t _dtrace_open, helper_open;
d_close_t _dtrace_close, helper_close;
d_ioctl_t _dtrace_ioctl, helper_ioctl;
19440
/*
 * _dtrace_open: Darwin open entry point for the dtrace device.
 *
 * Adapts the BSD signature to dtrace_open(), which takes a pointer to the
 * device number; a local copy of dev is passed so the callee may modify
 * it.  The calling process p is not used.  Returns dtrace_open()'s result.
 */
int
_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
        dev_t devcopy;

        devcopy = dev;

        return (dtrace_open(&devcopy, flags, devtype, CRED()));
}
19449
/*
 * helper_open: open entry point for the helper device.  Nothing needs to
 * be set up, so every argument is ignored and the open always succeeds.
 */
int
helper_open(dev_t dev, int flags, int devtype, struct proc *p)
{
        (void)dev;
        (void)flags;
        (void)devtype;
        (void)p;

        return (0);
}
19456
/*
 * _dtrace_close: Darwin close entry point for the dtrace device.  Forwards
 * to dtrace_close() with the current credentials; the calling process p is
 * not used.  Returns dtrace_close()'s result.
 */
int
_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
        int status = dtrace_close(dev, flags, devtype, CRED());

        return (status);
}
19463
/*
 * helper_close: close entry point for the helper device.  There is nothing
 * to tear down, so every argument is ignored and the close always succeeds.
 */
int
helper_close(dev_t dev, int flags, int devtype, struct proc *p)
{
        (void)dev;
        (void)flags;
        (void)devtype;
        (void)p;

        return (0);
}
19470
19471 int
19472 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19473 {
19474 #pragma unused(p)
19475         int err, rv = 0;
19476     user_addr_t uaddrp;
19477
19478     if (proc_is64bit(p))
19479                 uaddrp = *(user_addr_t *)data;
19480         else
19481                 uaddrp = (user_addr_t) *(uint32_t *)data;
19482
19483         err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
19484
19485         /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19486         if (err != 0) {
19487                 ASSERT( (err & 0xfffff000) == 0 );
19488                 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19489         } else if (rv != 0) {
19490                 ASSERT( (rv & 0xfff00000) == 0 );
19491                 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19492         } else
19493                 return 0;
19494 }
19495
19496 int
19497 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19498 {
19499 #pragma unused(dev,fflag,p)
19500         int err, rv = 0;
19501
19502         err = dtrace_ioctl_helper(cmd, data, &rv);
19503         /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19504         if (err != 0) {
19505                 ASSERT( (err & 0xfffff000) == 0 );
19506                 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19507         } else if (rv != 0) {
19508                 ASSERT( (rv & 0xfff00000) == 0 );
19509                 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19510         } else
19511                 return 0;
19512 }
19513
/*
 * Major number requested from cdevsw_add() in helper_init(); the negative
 * value lets the kernel pick the device number.
 */
#define HELPER_MAJOR  -24 /* let the kernel pick the device number */

/*
 * Character-device switch for the DTrace helper device: lists which
 * function is invoked for each device action.  Only open, close and ioctl
 * are routed to helper_* routines; the other slots hold the generic
 * eno_* / nulldev entries.  The initializer is positional, so the order of
 * the entries must not be changed.
 */
static struct cdevsw helper_cdevsw =
{
        helper_open,            /* open */
        helper_close,           /* close */
        eno_rdwrt,                      /* read */
        eno_rdwrt,                      /* write */
        helper_ioctl,           /* ioctl */
        (stop_fcn_t *)nulldev, /* stop */
        (reset_fcn_t *)nulldev, /* reset */
        NULL,                           /* tty's */
        eno_select,                     /* select */
        eno_mmap,                       /* mmap */
        eno_strat,                      /* strategy */
        eno_getc,                       /* getc */
        eno_putc,                       /* putc */
        0                                       /* type */
};
19537
/*
 * Major device number of the helper device.  Stays 0 until helper_init()
 * stores the result of cdevsw_add() here; helper_init() treats a positive
 * value as "already initialized".
 */
static int helper_majdevno = 0;

/*
 * Non-zero once DTrace has been initialized.  Checked by dtrace_init() to
 * run only once and by helper_init() to insist that dtrace_init() ran
 * first.  NOTE(review): the assignment to a non-zero value is outside this
 * excerpt -- presumably at the end of dtrace_init(); confirm.
 */
static int gDTraceInited = 0;
19541
19542 void
19543 helper_init( void )
19544 {
19545         /*
19546          * Once the "helper" is initialized, it can take ioctl calls that use locks
19547          * and zones initialized in dtrace_init. Make certain dtrace_init was called
19548          * before us.
19549          */
19550
19551         if (!gDTraceInited) {
19552                 panic("helper_init before dtrace_init\n");
19553         }
19554
19555         if (0 >= helper_majdevno)
19556         {
19557                 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19558
19559                 if (helper_majdevno < 0) {
19560                         printf("helper_init: failed to allocate a major number!\n");
19561                         return;
19562                 }
19563
19564                 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19565                                         DTRACEMNR_HELPER, 0 )) {
19566                         printf("dtrace_init: failed to devfs_make_node for helper!\n");
19567                         return;
19568                 }
19569         } else
19570                 panic("helper_init: called twice!\n");
19571 }
19572
19573 #undef HELPER_MAJOR
19574
19575 /*
19576  * Called with DEVFS_LOCK held, so vmem_alloc's underlying blist structures are protected.
19577  */
19578 static int
19579 dtrace_clone_func(dev_t dev, int action)
19580 {
19581 #pragma unused(dev)
19582
19583         if (action == DEVFS_CLONE_ALLOC) {
19584                 if (NULL == dtrace_minor) /* Arena not created yet!?! */
19585                         return 0;
19586                 else {
19587                         /*
19588                          * Propose a minor number, namely the next number that vmem_alloc() will return.
19589                          * Immediately put it back in play by calling vmem_free(). FIXME.
19590                          */
19591                         int ret = (int)(uintptr_t)vmem_alloc(dtrace_minor, 1, VM_BESTFIT | VM_SLEEP);
19592
19593                         vmem_free(dtrace_minor, (void *)(uintptr_t)ret, 1);
19594
19595                         return ret;
19596                 }
19597         }
19598         else if (action == DEVFS_CLONE_FREE) {
19599                 return 0;
19600         }
19601         else return -1;
19602 }
19603
/*
 * Major number requested from cdevsw_add() in dtrace_init(); the negative
 * value lets the kernel pick the device number.
 */
#define DTRACE_MAJOR  -24 /* let the kernel pick the device number */

/*
 * Character-device switch for the dtrace device.  Only open, close and
 * ioctl are routed to the _dtrace_* adapters; the other slots hold the
 * generic eno_* / nulldev entries.  The initializer is positional, so the
 * order of the entries must not be changed.
 */
static struct cdevsw dtrace_cdevsw =
{
        _dtrace_open,           /* open */
        _dtrace_close,          /* close */
        eno_rdwrt,                      /* read */
        eno_rdwrt,                      /* write */
        _dtrace_ioctl,          /* ioctl */
        (stop_fcn_t *)nulldev, /* stop */
        (reset_fcn_t *)nulldev, /* reset */
        NULL,                           /* tty's */
        eno_select,                     /* select */
        eno_mmap,                       /* mmap */
        eno_strat,                      /* strategy */
        eno_getc,                       /* getc */
        eno_putc,                       /* putc */
        0                                       /* type */
};
19623
/*
 * Lock attribute, lock-group attribute and lock group shared by the DTrace
 * locks.  They are allocated in dtrace_init() and passed to every
 * lck_mtx_init()/lck_rw_init() call made there.
 */
lck_attr_t* dtrace_lck_attr;
lck_grp_attr_t* dtrace_lck_grp_attr;
lck_grp_t* dtrace_lck_grp;

/* Major device number of the dtrace device, as returned by cdevsw_add() in dtrace_init(). */
static int gMajDevNo;
19629
19630 void
19631 dtrace_init( void )
19632 {
19633         if (0 == gDTraceInited) {
19634                 int i, ncpu = NCPU;
19635
19636                 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19637
19638                 if (gMajDevNo < 0) {
19639                         printf("dtrace_init: failed to allocate a major number!\n");
19640                         gDTraceInited = 0;
19641                         return;
19642                 }
19643
19644                 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19645                                         dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
19646                         printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19647                         gDTraceInited = 0;
19648                         return;
19649                 }
19650
19651 #if defined(DTRACE_MEMORY_ZONES)
19652                 /*
19653                  * Initialize the dtrace kalloc-emulation zones.
19654                  */
19655                 dtrace_alloc_init();
19656 #endif /* DTRACE_MEMORY_ZONES */
19657
19658                 /*
19659                  * Allocate the dtrace_probe_t zone
19660                  */
19661                 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
19662                                             1024 * sizeof(dtrace_probe_t),
19663                                             sizeof(dtrace_probe_t),
19664                                             "dtrace.dtrace_probe_t");
19665
19666                 /*
19667                  * Create the dtrace lock group and attrs.
19668                  */
19669                 dtrace_lck_attr = lck_attr_alloc_init();
19670                 dtrace_lck_grp_attr= lck_grp_attr_alloc_init();
19671                 dtrace_lck_grp = lck_grp_alloc_init("dtrace",  dtrace_lck_grp_attr);
19672
19673                 /*
19674                  * We have to initialize all locks explicitly
19675                  */
19676                 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
19677                 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
19678                 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
19679 #if DEBUG
19680                 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
19681 #endif
19682                 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
19683
19684                 /*
19685                  * The cpu_core structure consists of per-CPU state available in any context.
19686                  * On some architectures, this may mean that the page(s) containing the
19687                  * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19688                  * is up to the platform to assure that this is performed properly.  Note that
19689                  * the structure is sized to avoid false sharing.
19690                  */
19691                 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
19692                 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
19693
19694                 dtrace_modctl_list = NULL;
19695
19696                 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19697                 for (i = 0; i < ncpu; ++i) {
19698                         lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
19699                 }
19700
19701                 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19702                 for (i = 0; i < ncpu; ++i) {
19703                         cpu_list[i].cpu_id = (processorid_t)i;
19704                         cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19705                         lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
19706                 }
19707
19708                 lck_mtx_lock(&cpu_lock);
19709                 for (i = 0; i < ncpu; ++i)
19710                         /* FIXME: track CPU configuration a la CHUD Processor Pref Pane. */
19711                         dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19712                 lck_mtx_unlock(&cpu_lock);
19713
19714                 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19715
19716                 /*
19717                  * See dtrace_impl.h for a description of dof modes.
19718                  * The default is lazy dof.
19719                  *
19720                  * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19721                  * makes no sense...
19722                  */
19723                 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19724                         dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19725                 }
19726
19727                 /*
19728                  * Sanity check of dof mode value.
19729                  */
19730                 switch (dtrace_dof_mode) {
19731                         case DTRACE_DOF_MODE_NEVER:
19732                         case DTRACE_DOF_MODE_LAZY_ON:
19733                                 /* valid modes, but nothing else we need to do */
19734                                 break;
19735
19736                         case DTRACE_DOF_MODE_LAZY_OFF:
19737                         case DTRACE_DOF_MODE_NON_LAZY:
19738                                 /* Cannot wait for a dtrace_open to init fasttrap */
19739                                 fasttrap_init();
19740                                 break;
19741
19742                         default:
19743                                 /* Invalid, clamp to non lazy */
19744                                 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19745                                 fasttrap_init();
19746                                 break;
19747                 }
19748
19749                 /*
19750                  * See dtrace_impl.h for a description of kernel symbol modes.
19751                  * The default is to wait for symbols from userspace (lazy symbols).
19752                  */
19753                 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19754                         dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19755                 }
19756
19757                 gDTraceInited = 1;
19758
19759         } else
19760                 panic("dtrace_init: called twice!\n");
19761 }
19762
19763 void
19764 dtrace_postinit(void)
19765 {
19766         /*
19767          * Called from bsd_init after all provider's *_init() routines have been
19768          * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19769          * to go.
19770          */
19771         dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */
19772
19773         /*
19774          * Add the mach_kernel to the module list for lazy processing
19775          */
19776         struct kmod_info fake_kernel_kmod;
19777         memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19778
19779         strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19780         fake_kernel_kmod.id = 1;
19781         fake_kernel_kmod.address = g_kernel_kmod_info.address;
19782         fake_kernel_kmod.size = g_kernel_kmod_info.size;
19783
19784         if (dtrace_module_loaded(&fake_kernel_kmod) != 0) {
19785                 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19786         }
19787
19788         (void)OSKextRegisterKextsWithDTrace();
19789 }
19790 #undef DTRACE_MAJOR
19791
/*
 * Routines a client would use to register (or unregister) interest in CPUs
 * being added to or removed from the system.  Both are no-op stubs here.
 */
19796 void
19797 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19798 {
19799 #pragma unused(ignore1,ignore2)
19800 }
19801
19802 void
19803 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19804 {
19805 #pragma unused(ignore1,ignore2)
19806 }
19807 #endif /* __APPLE__ */