bsd/dev/dtrace/dtrace.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
  24  * Portions Copyright (c) 2013 by Delphix. All rights reserved.
  25  */
  26
  27 /*
  28  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  29  * Use is subject to license terms.
  30  */
  31
  32 /* #pragma ident        "@(#)dtrace.c   1.65    08/07/02 SMI" */
  33
  34 /*
  35  * DTrace - Dynamic Tracing for Solaris
  36  *
  37  * This is the implementation of the Solaris Dynamic Tracing framework
  38  * (DTrace).  The user-visible interface to DTrace is described at length in
  39  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
  40  * library, the in-kernel DTrace framework, and the DTrace providers are
  41  * described in the block comments in the <sys/dtrace.h> header file.  The
  42  * internal architecture of DTrace is described in the block comments in the
  43  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
  44  * implementation very much assume mastery of all of these sources; if one has
  45  * an unanswered question about the implementation, one should consult them
  46  * first.
  47  *
  48  * The functions here are ordered roughly as follows:
  49  *
  50  *   - Probe context functions
  51  *   - Probe hashing functions
  52  *   - Non-probe context utility functions
  53  *   - Matching functions
  54  *   - Provider-to-Framework API functions
  55  *   - Probe management functions
  56  *   - DIF object functions
  57  *   - Format functions
  58  *   - Predicate functions
  59  *   - ECB functions
  60  *   - Buffer functions
  61  *   - Enabling functions
  62  *   - DOF functions
  63  *   - Anonymous enabling functions
  64  *   - Process functions
  65  *   - Consumer state functions
  66  *   - Helper functions
  67  *   - Hook functions
  68  *   - Driver cookbook functions
  69  *
  70  * Each group of functions begins with a block comment labelled the "DTrace
  71  * [Group] Functions", allowing one to find each block by searching forward
  72  * on capital-f functions.
  73  */
  74 #include <sys/errno.h>
  75 #include <sys/types.h>
  76 #include <sys/stat.h>
  77 #include <sys/conf.h>
  78 #include <sys/systm.h>
  79 #include <sys/dtrace_impl.h>
  80 #include <sys/param.h>
  81 #include <sys/proc_internal.h>
  82 #include <sys/ioctl.h>
  83 #include <sys/fcntl.h>
  84 #include <miscfs/devfs/devfs.h>
  85 #include <sys/malloc.h>
  86 #include <sys/kernel_types.h>
  87 #include <sys/proc_internal.h>
  88 #include <sys/uio_internal.h>
  89 #include <sys/kauth.h>
  90 #include <vm/pmap.h>
  91 #include <sys/user.h>
  92 #include <mach/exception_types.h>
  93 #include <sys/signalvar.h>
  94 #include <mach/task.h>
  95 #include <kern/zalloc.h>
  96 #include <kern/ast.h>
  97 #include <kern/sched_prim.h>
  98 #include <kern/task.h>
  99 #include <netinet/in.h>
 100 #include <libkern/sysctl.h>
 101 #include <sys/kdebug.h>
 102
 103 #if MONOTONIC
 104 #include <kern/monotonic.h>
 105 #include <machine/monotonic.h>
 106 #endif /* MONOTONIC */
 107
 108 #include <IOKit/IOPlatformExpert.h>
 109
 110 #include <kern/cpu_data.h>
 111 extern uint32_t pmap_find_phys(void *, uint64_t);
 112 extern boolean_t pmap_valid_page(uint32_t);
 113 extern void OSKextRegisterKextsWithDTrace(void);
 114 extern kmod_info_t g_kernel_kmod_info;
 115
 116 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
 117 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
 118
 119 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
 120
 121 extern void dtrace_suspend(void);
 122 extern void dtrace_resume(void);
 123 extern void dtrace_early_init(void);
 124 extern int dtrace_keep_kernel_symbols(void);
 125 extern void dtrace_init(void);
 126 extern void helper_init(void);
 127 extern void fasttrap_init(void);
 128
 129 static int  dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
 130 extern void dtrace_lazy_dofs_destroy(proc_t *);
 131 extern void dtrace_postinit(void);
 132
 133 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
 134 extern void dtrace_proc_exec(proc_t*);
 135 extern void dtrace_proc_exit(proc_t*);
 136
 137 /*
 138  * DTrace Tunable Variables
 139  *
 140  * The following variables may be dynamically tuned by using sysctl(8), the
 141  * variables being stored in the kern.dtrace namespace.  For example:
  142  *      sysctl kern.dtrace.dof_maxsize=1048576          # 1M
 143  *
 144  * In general, the only variables that one should be tuning this way are those
 145  * that affect system-wide DTrace behavior, and for which the default behavior
 146  * is undesirable.  Most of these variables are tunable on a per-consumer
 147  * basis using DTrace options, and need not be tuned on a system-wide basis.
 148  * When tuning these variables, avoid pathological values; while some attempt
 149  * is made to verify the integrity of these variables, they are not considered
 150  * part of the supported interface to DTrace, and they are therefore not
 151  * checked comprehensively.
 152  */
 153 uint64_t        dtrace_buffer_memory_maxsize = 0;               /* initialized in dtrace_init */
 154 uint64_t        dtrace_buffer_memory_inuse = 0;
 155 int             dtrace_destructive_disallow = 0;
 156 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
 157 size_t          dtrace_difo_maxsize = (256 * 1024);
 158 dtrace_optval_t dtrace_dof_maxsize = (512 * 1024);
 159 dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024);
 160 dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024);
 161 size_t          dtrace_actions_max = (16 * 1024);
 162 size_t          dtrace_retain_max = 1024;
 163 dtrace_optval_t dtrace_helper_actions_max = 32;
 164 dtrace_optval_t dtrace_helper_providers_max = 64;
 165 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
 166 size_t          dtrace_strsize_default = 256;
 167 dtrace_optval_t dtrace_strsize_min = 8;
 168 dtrace_optval_t dtrace_strsize_max = 65536;
  169 dtrace_optval_t dtrace_cleanrate_default = 990099000;           /* 1.01 hz */
 170 dtrace_optval_t dtrace_cleanrate_min = 20000000;                        /* 50 hz */
 171 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
 172 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
 173 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
 174 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
 175 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
 176 dtrace_optval_t dtrace_nspec_default = 1;
 177 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
 178 dtrace_optval_t dtrace_stackframes_default = 20;
 179 dtrace_optval_t dtrace_ustackframes_default = 20;
 180 dtrace_optval_t dtrace_jstackframes_default = 50;
 181 dtrace_optval_t dtrace_jstackstrsize_default = 512;
 182 dtrace_optval_t dtrace_buflimit_default = 75;
 183 dtrace_optval_t dtrace_buflimit_min = 1;
 184 dtrace_optval_t dtrace_buflimit_max = 99;
 185 int             dtrace_msgdsize_max = 128;
 186 hrtime_t        dtrace_chill_max = 500 * (NANOSEC / MILLISEC);  /* 500 ms */
 187 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
 188 int             dtrace_devdepth_max = 32;
 189 int             dtrace_err_verbose;
 190 int             dtrace_provide_private_probes = 0;
 191 hrtime_t        dtrace_deadman_interval = NANOSEC;
 192 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
 193 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
 194
 195 /*
 196  * DTrace External Variables
 197  *
 198  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 199  * available to DTrace consumers via the backtick (`) syntax.  One of these,
 200  * dtrace_zero, is made deliberately so:  it is provided as a source of
 201  * well-known, zero-filled memory.  While this variable is not documented,
 202  * it is used by some translators as an implementation detail.
 203  */
 204 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
 205 unsigned int    dtrace_max_cpus = 0;            /* number of enabled cpus */
 206 /*
 207  * DTrace Internal Variables
 208  */
 209 static dev_info_t       *dtrace_devi;           /* device info */
 210 static vmem_t           *dtrace_arena;          /* probe ID arena */
 211 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
 212 static int              dtrace_nprobes;         /* number of probes */
 213 static dtrace_provider_t *dtrace_provider;      /* provider list */
 214 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
 215 static int              dtrace_opens;           /* number of opens */
 216 static int              dtrace_helpers;         /* number of helpers */
 217 static dtrace_hash_t    *dtrace_strings;
 218 static dtrace_hash_t    *dtrace_byprov;         /* probes hashed by provider */
 219 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
 220 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
 221 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
 222 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
 223 static int              dtrace_toxranges;       /* number of toxic ranges */
 224 static int              dtrace_toxranges_max;   /* size of toxic range array */
 225 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
 226 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
 227 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
 228 static kthread_t        *dtrace_panicked;       /* panicking thread */
 229 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
 230 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
 231 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
 232 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
 233 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
 234 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
 235
 236 static int              dtrace_dof_mode;        /* See dtrace_impl.h for a description of Darwin's dof modes. */
 237
 238                         /*
  239                          * This doesn't quite fit as an internal variable, as it must be accessed in
  240                          * fbt_provide and sdt_provide. It's clearly not a dtrace tunable variable either...
 241                          */
 242 int                     dtrace_kernel_symbol_mode;      /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
 243 static uint32_t         dtrace_wake_clients;
 244 static uint8_t      dtrace_kerneluuid[16];      /* the 128-bit uuid */
 245
 246 /*
 247  * To save memory, some common memory allocations are given a
 248  * unique zone. For example, dtrace_probe_t is 72 bytes in size,
 249  * which means it would fall into the kalloc.128 bucket. With
 250  * 20k elements allocated, the space saved is substantial.
 251  */
 252
 253 struct zone *dtrace_probe_t_zone;
 254
 255 static int dtrace_module_unloaded(struct kmod_info *kmod);
 256
 257 /*
 258  * DTrace Locking
 259  * DTrace is protected by three (relatively coarse-grained) locks:
 260  *
 261  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 262  *     including enabling state, probes, ECBs, consumer state, helper state,
 263  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 264  *     probe context is lock-free -- synchronization is handled via the
 265  *     dtrace_sync() cross call mechanism.
 266  *
 267  * (2) dtrace_provider_lock is required when manipulating provider state, or
 268  *     when provider state must be held constant.
 269  *
 270  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 271  *     when meta provider state must be held constant.
 272  *
 273  * The lock ordering between these three locks is dtrace_meta_lock before
 274  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 275  * several places where dtrace_provider_lock is held by the framework as it
 276  * calls into the providers -- which then call back into the framework,
 277  * grabbing dtrace_lock.)
 278  *
 279  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 280  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 281  * role as a coarse-grained lock; it is acquired before both of these locks.
 282  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 283  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 284  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 285  * acquired _between_ dtrace_provider_lock and dtrace_lock.
 286  */
 287
 288
 289 /*
 290  * APPLE NOTE:
 291  *
 292  * For porting purposes, all kmutex_t vars have been changed
 293  * to lck_mtx_t, which require explicit initialization.
 294  *
 295  * kmutex_t becomes lck_mtx_t
 296  * mutex_enter() becomes lck_mtx_lock()
 297  * mutex_exit() becomes lck_mtx_unlock()
 298  *
 299  * Lock asserts are changed like this:
 300  *
 301  * ASSERT(MUTEX_HELD(&cpu_lock));
 302  *      becomes:
 303  * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 304  *
 305  */
 306 static lck_mtx_t        dtrace_lock;            /* probe state lock */
 307 static lck_mtx_t        dtrace_provider_lock;   /* provider state lock */
 308 static lck_mtx_t        dtrace_meta_lock;       /* meta-provider state lock */
 309 static lck_rw_t         dtrace_dof_mode_lock;   /* dof mode lock */
 310
 311 /*
 312  * DTrace Provider Variables
 313  *
 314  * These are the variables relating to DTrace as a provider (that is, the
 315  * provider of the BEGIN, END, and ERROR probes).
 316  */
 317 static dtrace_pattr_t   dtrace_provider_attr = {
 318 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
 319 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 320 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 321 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
 322 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
 323 };
 324
 325 static void
 326 dtrace_nullop(void)
 327 {}
 328
 329 static int
 330 dtrace_enable_nullop(void)
 331 {
 332     return (0);
 333 }
 334
 335 static dtrace_pops_t dtrace_provider_ops = {
 336         .dtps_provide = (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
 337         .dtps_provide_module =  (void (*)(void *, struct modctl *))dtrace_nullop,
 338         .dtps_enable =  (int (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 339         .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 340         .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 341         .dtps_resume =  (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 342         .dtps_getargdesc =      NULL,
 343         .dtps_getargval =       NULL,
 344         .dtps_usermode =        NULL,
 345         .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 346 };
 347
 348 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
 349 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
 350 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
 351
 352 /*
 353  * DTrace Helper Tracing Variables
 354  */
 355 uint32_t dtrace_helptrace_next = 0;
 356 uint32_t dtrace_helptrace_nlocals;
 357 char    *dtrace_helptrace_buffer;
 358 size_t  dtrace_helptrace_bufsize = 512 * 1024;
 359
 360 #if DEBUG
 361 int     dtrace_helptrace_enabled = 1;
 362 #else
 363 int     dtrace_helptrace_enabled = 0;
 364 #endif
 365
 366 #if defined (__arm64__)
 367 /*
 368  * The ioctl for adding helper DOF is based on the
 369  * size of a user_addr_t.  We need to recognize both
 370  * U32 and U64 as the same action.
 371  */
 372 #define DTRACEHIOC_ADDDOF_U32       _IOW('h', 4, user32_addr_t)
 373 #define DTRACEHIOC_ADDDOF_U64       _IOW('h', 4, user64_addr_t)
 374 #endif  /* __arm64__ */
 375
 376 /*
 377  * DTrace Error Hashing
 378  *
  379  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
  380  * table.  This is very useful for checking coverage of tests that are
  381  * expected to induce DIF or DOF processing errors, and may be useful for
  382  * debugging problems in the DIF code generator or in DOF generation.  The
 383  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 384  */
 385 #if DEBUG
 386 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
 387 static const char *dtrace_errlast;
 388 static kthread_t *dtrace_errthread;
 389 static lck_mtx_t dtrace_errlock;
 390 #endif
 391
 392 /*
 393  * DTrace Macros and Constants
 394  *
 395  * These are various macros that are useful in various spots in the
 396  * implementation, along with a few random constants that have no meaning
 397  * outside of the implementation.  There is no real structure to this cpp
 398  * mishmash -- but is there ever?
 399  */
 400
 401 #define DTRACE_GETSTR(hash, elm)        \
 402         (hash->dth_getstr(elm, hash->dth_stroffs))
 403
 404 #define DTRACE_HASHSTR(hash, elm)       \
 405         dtrace_hash_str(DTRACE_GETSTR(hash, elm))
 406
 407 #define DTRACE_HASHNEXT(hash, elm)      \
 408         (void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)
 409
 410 #define DTRACE_HASHPREV(hash, elm)      \
 411         (void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)
 412
 413 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
 414         (strcmp(DTRACE_GETSTR(hash, lhs), \
 415             DTRACE_GETSTR(hash, rhs)) == 0)
 416
 417 #define DTRACE_AGGHASHSIZE_SLEW         17
 418
 419 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
 420
 421 /*
 422  * The key for a thread-local variable consists of the lower 61 bits of the
 423  * current_thread(), plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 424  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 425  * equal to a variable identifier.  This is necessary (but not sufficient) to
 426  * assure that global associative arrays never collide with thread-local
 427  * variables.  To guarantee that they cannot collide, we must also define the
 428  * order for keying dynamic variables.  That order is:
 429  *
 430  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 431  *
 432  * Because the variable-key and the tls-key are in orthogonal spaces, there is
 433  * no way for a global variable key signature to match a thread-local key
 434  * signature.
 435  */
 436 #if defined (__x86_64__)
 437 /* FIXME: two function calls!! */
 438 #define DTRACE_TLS_THRKEY(where) { \
 439         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
 440         uint64_t thr = (uintptr_t)current_thread(); \
 441         ASSERT(intr < (1 << 3)); \
 442         (where) = ((thr + DIF_VARIABLE_MAX) & \
 443             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 444 }
 445 #elif defined(__arm__)
 446 /* FIXME: three function calls!!! */
 447 #define DTRACE_TLS_THRKEY(where) { \
 448         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
 449         uint64_t thr = (uintptr_t)current_thread(); \
 450         uint_t pid = (uint_t)dtrace_proc_selfpid(); \
 451         ASSERT(intr < (1 << 3)); \
 452         (where) = (((thr << 32 | pid) + DIF_VARIABLE_MAX) & \
 453             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 454 }
 455 #elif defined (__arm64__)
 456 /* FIXME: two function calls!! */
 457 #define DTRACE_TLS_THRKEY(where) { \
 458         uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
 459         uint64_t thr = (uintptr_t)current_thread(); \
 460         ASSERT(intr < (1 << 3)); \
 461         (where) = ((thr + DIF_VARIABLE_MAX) & \
 462             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
 463 }
 464 #else
 465 #error Unknown architecture
 466 #endif
 467
 468 #define DT_BSWAP_8(x)   ((x) & 0xff)
 469 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
 470 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
 471 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
 472
 473 #define DT_MASK_LO 0x00000000FFFFFFFFULL
 474
 475 #define DTRACE_STORE(type, tomax, offset, what) \
 476         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
 477
 478
 479 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
 480         if (addr & (MIN(size,4) - 1)) {                                 \
 481                 *flags |= CPU_DTRACE_BADALIGN;                          \
 482                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 483                 return (0);                                             \
 484         }
 485
 486 #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)               \
 487 do {                                                                    \
 488         if ((remp) != NULL) {                                           \
 489                 *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr);    \
 490         }                                                               \
 491 } while (0)
 492
 493
 494 /*
 495  * Test whether a range of memory starting at testaddr of size testsz falls
 496  * within the range of memory described by addr, sz.  We take care to avoid
 497  * problems with overflow and underflow of the unsigned quantities, and
 498  * disallow all negative sizes.  Ranges of size 0 are allowed.
 499  */
 500 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
 501         ((testaddr) - (baseaddr) < (basesz) && \
 502         (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
 503         (testaddr) + (testsz) >= (testaddr))
 504
 505 /*
 506  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
 507  * alloc_sz on the righthand side of the comparison in order to avoid overflow
 508  * or underflow in the comparison with it.  This is simpler than the INRANGE
 509  * check above, because we know that the dtms_scratch_ptr is valid in the
 510  * range.  Allocations of size zero are allowed.
 511  */
 512 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
 513         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
 514         (mstate)->dtms_scratch_ptr >= (alloc_sz))
 515
 516 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
 517
 518 #if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
 519 #define DTRACE_LOADFUNC(bits)                                           \
 520 /*CSTYLED*/                                                             \
 521 uint##bits##_t dtrace_load##bits(uintptr_t addr);                       \
 522                                                                         \
 523 uint##bits##_t                                                          \
 524 dtrace_load##bits(uintptr_t addr)                                       \
 525 {                                                                       \
 526         size_t size = bits / NBBY;                                      \
 527         /*CSTYLED*/                                                     \
 528         uint##bits##_t rval = 0;                                        \
 529         int i;                                                          \
 530         volatile uint16_t *flags = (volatile uint16_t *)                \
 531             &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;                   \
 532                                                                         \
 533         DTRACE_ALIGNCHECK(addr, size, flags);                           \
 534                                                                         \
 535         for (i = 0; i < dtrace_toxranges; i++) {                        \
 536                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
 537                         continue;                                       \
 538                                                                         \
 539                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
 540                         continue;                                       \
 541                                                                         \
 542                 /*                                                      \
 543                  * This address falls within a toxic region; return 0.  \
 544                  */                                                     \
 545                 *flags |= CPU_DTRACE_BADADDR;                           \
 546                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 547                 return (0);                                             \
 548         }                                                               \
 549                                                                         \
 550         {                                                               \
 551         volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;          \
 552         *flags |= CPU_DTRACE_NOFAULT;                                   \
 553         recover = dtrace_set_thread_recover(current_thread(), recover); \
 554         /*CSTYLED*/                                                     \
 555         /*                                                              \
 556         * PR6394061 - avoid device memory that is unpredictably         \
 557         * mapped and unmapped                                           \
 558         */                                                              \
 559         if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))         \
 560             rval = *((volatile uint##bits##_t *)addr);                  \
 561         else {                                                          \
 562                 *flags |= CPU_DTRACE_BADADDR;                           \
 563                 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;        \
 564                 return (0);                                             \
 565         }                                                               \
 566                                                                         \
 567         RECOVER_LABEL(bits);                                            \
 568         (void)dtrace_set_thread_recover(current_thread(), recover);     \
 569         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
 570         }                                                               \
 571                                                                         \
 572         return (rval);                                                  \
 573 }
 574 #else /* all other architectures */
 575 #error Unknown Architecture
 576 #endif
 577
 578 #ifdef __LP64__
 579 #define dtrace_loadptr  dtrace_load64
 580 #else
 581 #define dtrace_loadptr  dtrace_load32
 582 #endif
 583
 584 #define DTRACE_DYNHASH_FREE     0
 585 #define DTRACE_DYNHASH_SINK     1
 586 #define DTRACE_DYNHASH_VALID    2
 587
 588 #define DTRACE_MATCH_FAIL       -1
 589 #define DTRACE_MATCH_NEXT       0
 590 #define DTRACE_MATCH_DONE       1
 591 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
 592 #define DTRACE_STATE_ALIGN      64
 593
 594 #define DTRACE_FLAGS2FLT(flags)                                         \
 595         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
 596         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
 597         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
 598         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
 599         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
 600         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
 601         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
 602         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
 603         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
 604         DTRACEFLT_UNKNOWN)
 605
 606 #define DTRACEACT_ISSTRING(act)                                         \
 607         ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
 608         (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
 609
 610
 611 static size_t dtrace_strlen(const char *, size_t);
 612 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
 613 static void dtrace_enabling_provide(dtrace_provider_t *);
 614 static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
 615 static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
 616 static void dtrace_enabling_matchall(void);
 617 static dtrace_state_t *dtrace_anon_grab(void);
 618 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
 619     dtrace_state_t *, uint64_t, uint64_t);
 620 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
 621 static void dtrace_buffer_drop(dtrace_buffer_t *);
 622 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
 623     dtrace_state_t *, dtrace_mstate_t *);
 624 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
 625     dtrace_optval_t);
 626 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
 627 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
 628 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
 629         dtrace_mstate_t *, dtrace_vstate_t *);
 630 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
 631         dtrace_mstate_t *, dtrace_vstate_t *);
 632
 633
 634 /*
 635  * DTrace sysctl handlers
 636  *
 637  * These declarations and functions are used for a deeper DTrace configuration.
  638  * Most of them do not apply on a per-consumer basis and may impact other DTrace
 639  * consumers.  Correctness may not be supported for all the variables, so you
 640  * should be careful about what values you are using.
 641  */
 642
 643 SYSCTL_DECL(_kern_dtrace);
 644 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
 645
 646 static int
 647 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
 648 {
 649 #pragma unused(oidp, arg2)
 650         int changed, error;
 651         int value = *(int *) arg1;
 652
 653         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 654         if (error || !changed)
 655                 return (error);
 656
 657         if (value != 0 && value != 1)
 658                 return (ERANGE);
 659
 660         lck_mtx_lock(&dtrace_lock);
 661                 dtrace_err_verbose = value;
 662         lck_mtx_unlock(&dtrace_lock);
 663
 664         return (0);
 665 }
 666
 667 /*
 668  * kern.dtrace.err_verbose
 669  *
 670  * Set DTrace verbosity when an error occurs (0 = disabled, 1 = enabled).
 671  * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 672  */
 673 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
 674         CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 675         &dtrace_err_verbose, 0,
 676         sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
 677
 678 static int
 679 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
 680 {
 681 #pragma unused(oidp, arg2, req)
 682         int changed, error;
 683         uint64_t value = *(uint64_t *) arg1;
 684
 685         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 686         if (error || !changed)
 687                 return (error);
 688
 689         if (value <= dtrace_buffer_memory_inuse)
 690                 return (ERANGE);
 691
 692         lck_mtx_lock(&dtrace_lock);
 693                 dtrace_buffer_memory_maxsize = value;
 694         lck_mtx_unlock(&dtrace_lock);
 695
 696         return (0);
 697 }
 698
 699 /*
 700  * kern.dtrace.buffer_memory_maxsize
 701  *
 702  * Set the maximum total size, in bytes, of all the consumers' state buffers.  By default
 703  * the limit is PHYS_MEM / 3 for *all* consumers.  Attempting to set zero or any other
 704  * value <= dtrace_buffer_memory_inuse fails with ERANGE.
 705  */
 706 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
 707         CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
 708         &dtrace_buffer_memory_maxsize, 0,
 709         sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
 710
 711 /*
 712  * kern.dtrace.buffer_memory_inuse
 713  *
 714  * State buffer memory, in bytes, currently used by all the DTrace consumers.
 715  * The OID is declared with CTLFLAG_RD, so this value is read-only.
 716  */
 717 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
 718         &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
 719
 720 static int
 721 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
 722 {
 723 #pragma unused(oidp, arg2, req)
 724         int changed, error;
 725         size_t value = *(size_t*) arg1;
 726
 727         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 728         if (error || !changed)
 729                 return (error);
 730
 731         if (value <= 0)
 732                 return (ERANGE);
 733
 734         lck_mtx_lock(&dtrace_lock);
 735                 dtrace_difo_maxsize = value;
 736         lck_mtx_unlock(&dtrace_lock);
 737
 738         return (0);
 739 }
 740
 741 /*
 742  * kern.dtrace.difo_maxsize
 743  *
 744  * Set the DIFO max size in bytes; see the definition of dtrace_difo_maxsize
 745  * for the default value.  Attempting to set a size of zero fails with
 746  * ERANGE.
 747  */
 748 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
 749         CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
 750         &dtrace_difo_maxsize, 0,
 751         sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
 752
 753 static int
 754 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
 755 {
 756 #pragma unused(oidp, arg2, req)
 757         int changed, error;
 758         dtrace_optval_t value = *(dtrace_optval_t *) arg1;
 759
 760         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 761         if (error || !changed)
 762                 return (error);
 763
 764         if (value <= 0)
 765                 return (ERANGE);
 766
 767         if (value >= dtrace_copy_maxsize())
 768                 return (ERANGE);
 769
 770         lck_mtx_lock(&dtrace_lock);
 771                 dtrace_dof_maxsize = value;
 772         lck_mtx_unlock(&dtrace_lock);
 773
 774         return (0);
 775 }
 776
 777 /*
 778  * kern.dtrace.dof_maxsize
 779  *
 780  * Set the DOF max size in bytes; see the definition of dtrace_dof_maxsize for
 781  * the default value.  Attempting to set a zero or negative size, or a size
 782  * that is not below dtrace_copy_maxsize(), fails with ERANGE.
 783  */
 784 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
 785         CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
 786         &dtrace_dof_maxsize, 0,
 787         sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
 788
 789 static int
 790 sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
 791 {
 792 #pragma unused(oidp, arg2, req)
 793         int changed, error;
 794         dtrace_optval_t value = *(dtrace_optval_t*) arg1;
 795
 796         error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
 797         if (error || !changed)
 798                 return (error);
 799
 800         if (value <= 0)
 801                 return (ERANGE);
 802         if (value > dtrace_statvar_maxsize_max)
 803                 return (ERANGE);
 804
 805         lck_mtx_lock(&dtrace_lock);
 806                 dtrace_statvar_maxsize = value;
 807         lck_mtx_unlock(&dtrace_lock);
 808
 809         return (0);
 810 }
 811
 812 /*
 813  * kern.dtrace.global_maxsize
 814  *
 815  * Set the static variable max size in bytes; see the definition of
 816  * dtrace_statvar_maxsize for the default value.  Attempting to set a zero or
 817  * negative size, or one above dtrace_statvar_maxsize_max, fails with ERANGE.
 818  */
 819 SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
 820         CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
 821         &dtrace_statvar_maxsize, 0,
 822         sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
 823
 824 static int
 825 sysctl_dtrace_provide_private_probes SYSCTL_HANDLER_ARGS
 826 {
 827 #pragma unused(oidp, arg2)
 828         int error;
 829         int value = *(int *) arg1;
 830
 831         error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
 832         if (error)
 833                 return (error);
 834
 835         if (req->newptr) {
 836                 if (value != 0 && value != 1)
 837                         return (ERANGE);
 838
 839                 /*
 840                  * We do not allow changing this back to zero, as private probes
 841                  * would still be left registered
 842                  */
 843                 if (value != 1)
 844                         return (EPERM);
 845
 846                 lck_mtx_lock(&dtrace_lock);
 847                 dtrace_provide_private_probes = value;
 848                 lck_mtx_unlock(&dtrace_lock);
 849         }
 850         return (0);
 851 }
 852
 853 /*
 854  * kern.dtrace.provide_private_probes
 855  *
 856  * Set whether the providers must provide the private probes (mainly used by
 857  * the FBT provider to request probes for the private/static symbols).  The
 858  * handler only accepts 1; writing 0 fails with EPERM, other values with ERANGE.
 859  */
 860 SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
 861         CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
 862         &dtrace_provide_private_probes, 0,
 863         sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");
 864
 865 /*
 866  * kern.dtrace.dof_mode
 867  *
 868  * Reports the current DOF mode (the value of dtrace_dof_mode).
 869  * The OID is declared with CTLFLAG_RD, so this value is read-only.
 870  */
 871 SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
 872         &dtrace_dof_mode, 0, "dtrace dof mode");
 873
 874 /*
 875  * DTrace Probe Context Functions
 876  *
 877  * These functions are called from probe context.  Because probe context is
 878  * any context in which C may be called, arbitrary locks may be held,
 879  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 880  * As a result, functions called from probe context may only call other DTrace
 881  * support functions -- they may not interact at all with the system at large.
 882  * (Note that the ASSERT macro is made probe-context safe by redefining it in
 883  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 884  * loads are to be performed from probe context, they _must_ be in terms of
 885  * the safe dtrace_load*() variants.
 886  *
 887  * Some functions in this block are not actually called from probe context;
 888  * for these functions, there will be a comment above the function reading
 889  * "Note:  not called from probe context."
 890  */
 891
 892 int
 893 dtrace_assfail(const char *a, const char *f, int l)
 894 {
 895         panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);
 896
 897         /*
 898          * We just need something here that even the most clever compiler
 899          * cannot optimize away.
 900          */
 901         return (a[(uintptr_t)f]);
 902 }
 903
 904 /*
 905  * Atomically increment a specified error counter from probe context.
 906  */
 907 static void
 908 dtrace_error(uint32_t *counter)
 909 {
 910         /*
 911          * Most counters stored to in probe context are per-CPU counters.
 912          * However, there are some error conditions that are sufficiently
 913          * arcane that they don't merit per-CPU storage.  If these counters
 914          * are incremented concurrently on different CPUs, scalability will be
 915          * adversely affected -- but we don't expect them to be white-hot in a
 916          * correctly constructed enabling...
 917          */
 918         uint32_t oval, nval;
 919
 920         do {
 921                 oval = *counter;
 922
 923                 if ((nval = oval + 1) == 0) {
 924                         /*
 925                          * If the counter would wrap, set it to 1 -- assuring
 926                          * that the counter is never zero when we have seen
 927                          * errors.  (The counter must be 32-bits because we
 928                          * aren't guaranteed a 64-bit compare&swap operation.)
 929                          * To save this code both the infamy of being fingered
 930                          * by a priggish news story and the indignity of being
 931                          * the target of a neo-puritan witch trial, we're
 932                          * carefully avoiding any colorful description of the
 933                          * likelihood of this condition -- but suffice it to
 934                          * say that it is only slightly more likely than the
 935                          * overflow of predicate cache IDs, as discussed in
 936                          * dtrace_predicate_create().
 937                          */
 938                         nval = 1;
 939                 }
 940         } while (dtrace_cas32(counter, oval, nval) != oval);
 941 }
 942
 943 /*
 944  * Instantiate DTRACE_LOADFUNC once per width to define the safe load routines
 945  * for uint8_t, uint16_t, uint32_t and uint64_t (presumably dtrace_load8..64).
 946  */
 947 DTRACE_LOADFUNC(8)
 948 DTRACE_LOADFUNC(16)
 949 DTRACE_LOADFUNC(32)
 950 DTRACE_LOADFUNC(64)
 951
 952 static int
 953 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
 954 {
 955         if (dest < mstate->dtms_scratch_base)
 956                 return (0);
 957
 958         if (dest + size < dest)
 959                 return (0);
 960
 961         if (dest + size > mstate->dtms_scratch_ptr)
 962                 return (0);
 963
 964         return (1);
 965 }
 966
 967 static int
 968 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
 969     dtrace_statvar_t **svars, int nsvars)
 970 {
 971         int i;
 972
 973         size_t maxglobalsize, maxlocalsize;
 974
 975         maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
 976         maxlocalsize = (maxglobalsize) * NCPU;
 977
 978         if (nsvars == 0)
 979                 return (0);
 980
 981         for (i = 0; i < nsvars; i++) {
 982                 dtrace_statvar_t *svar = svars[i];
 983                 uint8_t scope;
 984                 size_t size;
 985
 986                 if (svar == NULL || (size = svar->dtsv_size) == 0)
 987                         continue;
 988
 989                 scope = svar->dtsv_var.dtdv_scope;
 990
 991                 /**
 992                  * We verify that our size is valid in the spirit of providing
 993                  * defense in depth:  we want to prevent attackers from using
 994                  * DTrace to escalate an orthogonal kernel heap corruption bug
 995                  * into the ability to store to arbitrary locations in memory.
 996                  */
 997                 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
 998                         (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
 999
1000                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1001                         DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1002                                 svar->dtsv_size);
1003                         return (1);
1004                 }
1005         }
1006
1007         return (0);
1008 }
1009
1010 /*
1011  * Check to see if the address is within a memory region to which a store may
1012  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
1013  * region.  The caller of dtrace_canstore() is responsible for performing any
1014  * alignment checks that are needed before stores are actually executed.
1015  */
1016 static int
1017 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1018     dtrace_vstate_t *vstate)
1019 {
1020         return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
1021 }
1022 /*
1023  * Implementation of dtrace_canstore which communicates the upper bound of the
1024  * allowed memory region.
1025  */
1026 static int
1027 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1028         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1029 {
1030         /*
1031          * First, check to see if the address is in scratch space...
1032          */
1033         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1034             mstate->dtms_scratch_size)) {
1035                 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1036                         mstate->dtms_scratch_size);
1037                 return (1);
1038         }
1039         /*
1040          * Now check to see if it's a dynamic variable.  This check will pick
1041          * up both thread-local variables and any global dynamically-allocated
1042          * variables.
1043          */
1044         if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1045             vstate->dtvs_dynvars.dtds_size)) {
1046                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1047                 uintptr_t base = (uintptr_t)dstate->dtds_base +
1048                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1049                 uintptr_t chunkoffs;
1050                 dtrace_dynvar_t *dvar;
1051
1052                 /*
1053                  * Before we assume that we can store here, we need to make
1054                  * sure that it isn't in our metadata -- storing to our
1055                  * dynamic variable metadata would corrupt our state.  For
1056                  * the range to not include any dynamic variable metadata,
1057                  * it must:
1058                  *
1059                  *      (1) Start above the hash table that is at the base of
1060                  *      the dynamic variable space
1061                  *
1062                  *      (2) Have a starting chunk offset that is beyond the
1063                  *      dtrace_dynvar_t that is at the base of every chunk
1064                  *
1065                  *      (3) Not span a chunk boundary
1066                  *
1067                  *      (4) Not be in the tuple space of a dynamic variable
1068                  *
1069                  */
1070                 if (addr < base)
1071                         return (0);
1072
1073                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
1074
1075                 if (chunkoffs < sizeof (dtrace_dynvar_t))
1076                         return (0);
1077
1078                 if (chunkoffs + sz > dstate->dtds_chunksize)
1079                         return (0);
1080
1081                 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1082
1083                 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1084                         return (0);
1085
1086                 if (chunkoffs < sizeof (dtrace_dynvar_t) +
1087                         ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
1088                         return (0);
1089
1090                 return (1);
1091         }
1092
1093         /*
1094          * Finally, check the static local and global variables.  These checks
1095          * take the longest, so we perform them last.
1096          */
1097         if (dtrace_canstore_statvar(addr, sz, remain,
1098             vstate->dtvs_locals, vstate->dtvs_nlocals))
1099                 return (1);
1100
1101         if (dtrace_canstore_statvar(addr, sz, remain,
1102             vstate->dtvs_globals, vstate->dtvs_nglobals))
1103                 return (1);
1104
1105         return (0);
1106 }
1107
1108
1109 /*
1110  * Convenience routine to check to see if the address is within a memory
1111  * region in which a load may be issued given the user's privilege level;
1112  * if not, it sets the appropriate error flags and loads 'addr' into the
1113  * illegal value slot.
1114  *
1115  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
1116  * appropriate memory access protection.
1117  */
1118 int
1119 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
1120     dtrace_vstate_t *vstate)
1121 {
1122         return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
1123 }
1124
1125 /*
1126  * Implementation of dtrace_canload which communicates the upper bound of the
1127  * allowed memory region.
1128  */
1129 static int
1130 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1131         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1132 {
1133         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1134
1135         /*
1136          * If we hold the privilege to read from kernel memory, then
1137          * everything is readable.
1138          */
1139         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1140                 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1141                 return (1);
1142         }
1143
1144         /*
1145          * You can obviously read that which you can store.
1146          */
1147         if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1148                 return (1);
1149
1150         /*
1151          * We're allowed to read from our own string table.
1152          */
1153         if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1154             mstate->dtms_difo->dtdo_strlen)) {
1155                 DTRACE_RANGE_REMAIN(remain, addr,
1156                         mstate->dtms_difo->dtdo_strtab,
1157                         mstate->dtms_difo->dtdo_strlen);
1158                 return (1);
1159         }
1160
1161         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1162         *illval = addr;
1163         return (0);
1164 }
1165
1166 /*
1167  * Convenience routine to check to see if a given string is within a memory
1168  * region in which a load may be issued given the user's privilege level;
1169  * this exists so that we don't need to issue unnecessary dtrace_strlen()
1170  * calls in the event that the user has all privileges.
1171  */
1172 static int
1173 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1174         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1175 {
1176         size_t rsize;
1177
1178         /*
1179          * If we hold the privilege to read from kernel memory, then
1180          * everything is readable.
1181          */
1182         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1183                 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1184                 return (1);
1185         }
1186
1187         /*
1188          * Even if the caller is uninterested in querying the remaining valid
1189          * range, it is required to ensure that the access is allowed.
1190          */
1191         if (remain == NULL) {
1192                 remain = &rsize;
1193         }
1194         if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
1195                 size_t strsz;
1196                 /*
1197                  * Perform the strlen after determining the length of the
1198                  * memory region which is accessible.  This prevents timing
1199                  * information from being used to find NULs in memory which is
1200                  * not accessible to the caller.
1201                  */
1202                 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
1203                         MIN(sz, *remain));
1204                 if (strsz <= *remain) {
1205                         return (1);
1206                 }
1207         }
1208
1209         return (0);
1210 }
1211
1212 /*
1213  * Convenience routine to check to see if a given variable is within a memory
1214  * region in which a load may be issued given the user's privilege level.
1215  */
1216 static int
1217 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
1218         dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1219 {
1220         size_t sz;
1221         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1222
1223         /*
1224          * Calculate the max size before performing any checks since even
1225          * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1226          * return the max length via 'remain'.
1227          */
1228         if (type->dtdt_kind == DIF_TYPE_STRING) {
1229                 dtrace_state_t *state = vstate->dtvs_state;
1230
1231                 if (state != NULL) {
1232                         sz = state->dts_options[DTRACEOPT_STRSIZE];
1233                 } else {
1234                         /*
1235                          * In helper context, we have a NULL state; fall back
1236                          * to using the system-wide default for the string size
1237                          * in this case.
1238                          */
1239                         sz = dtrace_strsize_default;
1240                 }
1241         } else {
1242                 sz = type->dtdt_size;
1243         }
1244
1245         /*
1246          * If we hold the privilege to read from kernel memory, then
1247          * everything is readable.
1248          */
1249         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1250                 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1251                 return (1);
1252         }
1253
1254         if (type->dtdt_kind == DIF_TYPE_STRING) {
1255                 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1256                         vstate));
1257         }
1258         return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1259                 vstate));
1260 }
1261
1262 /*
1263  * Compare two strings using safe loads.
1264  */
1265 static int
1266 dtrace_strncmp(char *s1, char *s2, size_t limit)
1267 {
1268         uint8_t c1, c2;
1269         volatile uint16_t *flags;
1270
1271         if (s1 == s2 || limit == 0)
1272                 return (0);
1273
1274         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1275
1276         do {
1277                 if (s1 == NULL) {
1278                         c1 = '\0';
1279                 } else {
1280                         c1 = dtrace_load8((uintptr_t)s1++);
1281                 }
1282
1283                 if (s2 == NULL) {
1284                         c2 = '\0';
1285                 } else {
1286                         c2 = dtrace_load8((uintptr_t)s2++);
1287                 }
1288
1289                 if (c1 != c2)
1290                         return (c1 - c2);
1291         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1292
1293         return (0);
1294 }
1295
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter specifies a maximum length to ensure completion: at most lim
 * bytes are examined, and lim is returned if no NUL byte was found in them.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	/*
	 * The counter has the same type as the limit.  With a narrower
	 * counter, a limit above that counter's maximum could never compare
	 * equal to it, and the bound would not be enforced.
	 */
	size_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}
1312
1313 /*
1314  * Check if an address falls within a toxic region.
1315  */
1316 static int
1317 dtrace_istoxic(uintptr_t kaddr, size_t size)
1318 {
1319         uintptr_t taddr, tsize;
1320         int i;
1321
1322         for (i = 0; i < dtrace_toxranges; i++) {
1323                 taddr = dtrace_toxrange[i].dtt_base;
1324                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1325
1326                 if (kaddr - taddr < tsize) {
1327                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1328                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1329                         return (1);
1330                 }
1331
1332                 if (taddr - kaddr < size) {
1333                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1334                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1335                         return (1);
1336                 }
1337         }
1338
1339         return (0);
1340 }
1341
/*
 * Copy len bytes from src to dst, reading the source with safe loads.  The
 * src is assumed to be unsafe memory specified by the DIF program; the dst
 * is assumed to be DTrace-managed memory that can be stored to directly.
 * Overlapping copies are handled, as with the standard bcopy.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	uint8_t *out = dst;
	const uint8_t *in = src;

	if (len == 0) {
		return;
	}

	if (out <= in) {
		/* Destination is not above the source: copy forwards. */
		for (; len != 0; len--) {
			*out++ = dtrace_load8((uintptr_t)in++);
		}
		return;
	}

	/* Destination is above the source: copy backwards from the end. */
	in += len;
	out += len;

	for (; len != 0; len--) {
		*--out = dtrace_load8((uintptr_t)--in);
	}
}
1369
/*
 * Copy from src to dst using safe loads, stopping after len bytes or right
 * after a nul byte has been copied, whichever comes first.  The src is
 * assumed to be unsafe memory specified by the DIF program; the dst is
 * assumed to be DTrace-managed memory that can be stored to directly.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	uint8_t *out = dst;
	const uint8_t *in = src;
	uint8_t ch;

	while (len != 0) {
		ch = dtrace_load8((uintptr_t)in++);
		*out++ = ch;
		len--;

		/* The terminator is copied too, then the copy ends. */
		if (ch == '\0') {
			break;
		}
	}
}
1389
1390 /*
1391  * Copy src to dst, deriving the size and type from the specified (BYREF)
1392  * variable type.  The src is assumed to be unsafe memory specified by the DIF
1393  * program.  The dst is assumed to be DTrace variable memory that is of the
1394  * specified type; we assume that we can store to directly.
1395  */
1396 static void
1397 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1398 {
1399         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1400
1401         if (type->dtdt_kind == DIF_TYPE_STRING) {
1402                 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1403         } else {
1404                 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1405         }
1406 }
1407
1408 /*
1409  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1410  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1411  * safe memory that we can access directly because it is managed by DTrace.
1412  */
1413 static int
1414 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1415 {
1416         volatile uint16_t *flags;
1417
1418         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1419
1420         if (s1 == s2)
1421                 return (0);
1422
1423         if (s1 == NULL || s2 == NULL)
1424                 return (1);
1425
1426         if (s1 != s2 && len != 0) {
1427                 const uint8_t *ps1 = s1;
1428                 const uint8_t *ps2 = s2;
1429
1430                 do {
1431                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1432                                 return (1);
1433                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1434         }
1435         return (0);
1436 }
1437
/*
 * Zero len bytes starting at dst, one byte at a time.  Plain stores are
 * used, so this is only for safe DTrace-managed memory.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	unsigned char *cursor = dst;

	while (len != 0) {
		*cursor++ = 0;
		len--;
	}
}
1450
/*
 * Add two 128-bit values, each stored as two uint64_t words with the low
 * word at index 0, and write the result to sum.  Overflow out of the high
 * word is discarded.  Both words are computed before anything is written,
 * so sum may be the same array as either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	const uint64_t lo = addend1[0] + addend2[0];
	uint64_t hi = addend1[1] + addend2[1];

	/* A low word smaller than either input means the addition wrapped. */
	if (lo < addend1[0] || lo < addend2[0]) {
		hi += 1;
	}

	sum[0] = lo;
	sum[1] = hi;
}
1463
/*
 * Shift the 128-bit value in a by b bits, in place.  The value is stored as
 * two uint64_t words with the low word in a[0] and the high word in a[1].
 * If b is positive, shift left; if b is negative, shift right (logically).
 * A shift of 128 bits or more in either direction yields zero.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	unsigned int count;

	if (b == 0)
		return;

	/* Negate in unsigned arithmetic so that INT_MIN is well defined. */
	count = (b < 0) ? 0U - (unsigned int)b : (unsigned int)b;

	/* Every bit is shifted out; also keeps the shifts below in range. */
	if (count >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		if (count >= 64) {
			a[0] = a[1] >> (count - 64);
			a[1] = 0;
		} else {
			/*
			 * The low 'count' bits of the high word become the
			 * top bits of the low word.  No mask is needed: the
			 * left shift itself discards the other bits.
			 */
			a[0] = (a[0] >> count) | (a[1] << (64 - count));
			a[1] >>= count;
		}
	} else {
		if (count >= 64) {
			a[1] = a[0] << (count - 64);
			a[0] = 0;
		} else {
			/*
			 * The top 'count' bits of the low word become the
			 * low bits of the high word.
			 */
			a[1] = (a[1] << count) | (a[0] >> (64 - count));
			a[0] <<= count;
		}
	}
}
1500
1501 /*
1502  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1503  * use native multiplication on those, and then re-combine into the
1504  * resulting 128-bit value.
1505  *
1506  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1507  *     hi1 * hi2 << 64 +
1508  *     hi1 * lo2 << 32 +
1509  *     hi2 * lo1 << 32 +
1510  *     lo1 * lo2
1511  */
1512 static void
1513 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1514 {
1515         uint64_t hi1, hi2, lo1, lo2;
1516         uint64_t tmp[2];
1517
1518         hi1 = factor1 >> 32;
1519         hi2 = factor2 >> 32;
1520
1521         lo1 = factor1 & DT_MASK_LO;
1522         lo2 = factor2 & DT_MASK_LO;
1523
1524         product[0] = lo1 * lo2;
1525         product[1] = hi1 * hi2;
1526
1527         tmp[0] = hi1 * lo2;
1528         tmp[1] = 0;
1529         dtrace_shift_128(tmp, 32);
1530         dtrace_add_128(product, tmp, product);
1531
1532         tmp[0] = hi2 * lo1;
1533         tmp[1] = 0;
1534         dtrace_shift_128(tmp, 32);
1535         dtrace_add_128(product, tmp, product);
1536 }
1537
1538 /*
1539  * This privilege check should be used by actions and subroutines to
1540  * verify that the user credentials of the process that enabled the
1541  * invoking ECB match the target credentials
1542  */
1543 static int
1544 dtrace_priv_proc_common_user(dtrace_state_t *state)
1545 {
1546         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1547
1548         /*
1549          * We should always have a non-NULL state cred here, since if cred
1550          * is null (anonymous tracing), we fast-path bypass this routine.
1551          */
1552         ASSERT(s_cr != NULL);
1553
1554         if ((cr = dtrace_CRED()) != NULL &&
1555             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1556             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1557             posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1558             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1559             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1560             posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1561                 return (1);
1562
1563         return (0);
1564 }
1565
1566 /*
1567  * This privilege check should be used by actions and subroutines to
1568  * verify that the zone of the process that enabled the invoking ECB
1569  * matches the target credentials
1570  */
1571 static int
1572 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1573 {
1574         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1575 #pragma unused(cr, s_cr, state) /* __APPLE__ */
1576
1577         /*
1578          * We should always have a non-NULL state cred here, since if cred
1579          * is null (anonymous tracing), we fast-path bypass this routine.
1580          */
1581         ASSERT(s_cr != NULL);
1582
1583         return 1; /* APPLE NOTE: Darwin doesn't do zones. */
1584 }
1585
/*
 * Privilege check for actions and subroutines:  verify that the process
 * has not setuid or changed credentials.  This implementation always
 * returns 1.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
        /* Darwin omits the "No Core Dump" flag, so there is nothing to test. */
        return (1);
}
1595
1596 static int
1597 dtrace_priv_proc_destructive(dtrace_state_t *state)
1598 {
1599         int action = state->dts_cred.dcr_action;
1600
1601         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1602                 goto bad;
1603
1604         if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1605                 goto bad;
1606
1607         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1608             dtrace_priv_proc_common_zone(state) == 0)
1609                 goto bad;
1610
1611         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1612             dtrace_priv_proc_common_user(state) == 0)
1613                 goto bad;
1614
1615         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1616             dtrace_priv_proc_common_nocd() == 0)
1617                 goto bad;
1618
1619         return (1);
1620
1621 bad:
1622         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1623
1624         return (0);
1625 }
1626
1627 static int
1628 dtrace_priv_proc_control(dtrace_state_t *state)
1629 {
1630         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1631                 goto bad;
1632
1633         if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1634                 goto bad;
1635
1636         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1637                 return (1);
1638
1639         if (dtrace_priv_proc_common_zone(state) &&
1640             dtrace_priv_proc_common_user(state) &&
1641             dtrace_priv_proc_common_nocd())
1642                 return (1);
1643
1644 bad:
1645         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1646
1647         return (0);
1648 }
1649
1650 static int
1651 dtrace_priv_proc(dtrace_state_t *state)
1652 {
1653         if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1654                 goto bad;
1655
1656         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1657                 goto bad;
1658
1659         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1660                 return (1);
1661
1662 bad:
1663         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1664
1665         return (0);
1666 }
1667
1668 /*
1669  * The P_LNOATTACH check is an Apple specific check.
1670  * We need a version of dtrace_priv_proc() that omits
1671  * that check for PID and EXECNAME accesses
1672  */
1673 static int
1674 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1675 {
1676
1677         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1678                 return (1);
1679
1680         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1681
1682         return (0);
1683 }
1684
1685 static int
1686 dtrace_priv_kernel(dtrace_state_t *state)
1687 {
1688         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1689                 goto bad;
1690
1691         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1692                 return (1);
1693
1694 bad:
1695         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1696
1697         return (0);
1698 }
1699
1700 static int
1701 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1702 {
1703         if (dtrace_is_restricted())
1704                 goto bad;
1705
1706         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1707                 return (1);
1708
1709 bad:
1710         cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1711
1712         return (0);
1713 }
1714
1715 /*
1716  * Note:  not called from probe context.  This function is called
1717  * asynchronously (and at a regular interval) from outside of probe context to
1718  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1719  * cleaning is explained in detail in <sys/dtrace_impl.h>.
1720  */
1721 static void
1722 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1723 {
1724         dtrace_dynvar_t *dirty;
1725         dtrace_dstate_percpu_t *dcpu;
1726         int i, work = 0;
1727
1728         for (i = 0; i < (int)NCPU; i++) {
1729                 dcpu = &dstate->dtds_percpu[i];
1730
1731                 ASSERT(dcpu->dtdsc_rinsing == NULL);
1732
1733                 /*
1734                  * If the dirty list is NULL, there is no dirty work to do.
1735                  */
1736                 if (dcpu->dtdsc_dirty == NULL)
1737                         continue;
1738
1739                 /*
1740                  * If the clean list is non-NULL, then we're not going to do
1741                  * any work for this CPU -- it means that there has not been
1742                  * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1743                  * since the last time we cleaned house.
1744                  */
1745                 if (dcpu->dtdsc_clean != NULL)
1746                         continue;
1747
1748                 work = 1;
1749
1750                 /*
1751                  * Atomically move the dirty list aside.
1752                  */
1753                 do {
1754                         dirty = dcpu->dtdsc_dirty;
1755
1756                         /*
1757                          * Before we zap the dirty list, set the rinsing list.
1758                          * (This allows for a potential assertion in
1759                          * dtrace_dynvar():  if a free dynamic variable appears
1760                          * on a hash chain, either the dirty list or the
1761                          * rinsing list for some CPU must be non-NULL.)
1762                          */
1763                         dcpu->dtdsc_rinsing = dirty;
1764                         dtrace_membar_producer();
1765                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1766                     dirty, NULL) != dirty);
1767         }
1768
1769         if (!work) {
1770                 /*
1771                  * We have no work to do; we can simply return.
1772                  */
1773                 return;
1774         }
1775
1776         dtrace_sync();
1777
1778         for (i = 0; i < (int)NCPU; i++) {
1779                 dcpu = &dstate->dtds_percpu[i];
1780
1781                 if (dcpu->dtdsc_rinsing == NULL)
1782                         continue;
1783
1784                 /*
1785                  * We are now guaranteed that no hash chain contains a pointer
1786                  * into this dirty list; we can make it clean.
1787                  */
1788                 ASSERT(dcpu->dtdsc_clean == NULL);
1789                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1790                 dcpu->dtdsc_rinsing = NULL;
1791         }
1792
1793         /*
1794          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1795          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1796          * This prevents a race whereby a CPU incorrectly decides that
1797          * the state should be something other than DTRACE_DSTATE_CLEAN
1798          * after dtrace_dynvar_clean() has completed.
1799          */
1800         dtrace_sync();
1801
1802         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1803 }
1804
/*
 * Depending on the value of the op parameter, this function looks-up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated.  If NULL is returned, the appropriate counter
 * will be incremented.
 *
 * Parameters:
 *   dstate  - dynamic variable state:  supplies the hash table
 *             (dtds_hash, dtds_hashsize), the per-CPU free/clean/dirty/
 *             rinsing lists (dtds_percpu), the chunk size and dtds_state.
 *   nkeys   - number of entries in key; must be non-zero.
 *   key     - the tuple identifying the variable.  An entry with
 *             dttk_size == 0 is keyed by the value in dttk_value;
 *             otherwise dttk_value is the address of dttk_size bytes of
 *             key data.
 *   dsize   - size of the data to be stored with the variable; used here
 *             only to check that a new variable fits in one chunk.
 *   op      - DTRACE_DYNVAR_ALLOC, DTRACE_DYNVAR_NOALLOC or
 *             DTRACE_DYNVAR_DEALLOC.
 *   mstate,
 *   vstate  - passed through to dtrace_canload() when hashing
 *             by-reference keys.
 *
 * Returns the matching variable for ALLOC and NOALLOC when one exists, or
 * a newly allocated variable for ALLOC.  Returns NULL when NOALLOC finds
 * no match, always for DEALLOC, when CPU_DTRACE_FAULT is set after
 * hashing, and when an allocation cannot be satisfied.
 */
static dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
        uint64_t hashval = DTRACE_DYNHASH_VALID;
        dtrace_dynhash_t *hash = dstate->dtds_hash;
        dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
        processorid_t me = CPU->cpu_id, cpu = me;
        dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
        size_t bucket, ksize;
        size_t chunksize = dstate->dtds_chunksize;
        uintptr_t kdata, lock, nstate;
        uint_t i;

        ASSERT(nkeys != 0);

        /*
         * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
         * algorithm.  For the by-value portions, we perform the algorithm in
         * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
         * bit, and seems to have only a minute effect on distribution.  For
         * the by-reference data, we perform "One-at-a-time" iterating (safely)
         * over each referenced byte.  It's painful to do this, but it's much
         * better than pathological hash distribution.  The efficacy of the
         * hashing algorithm (and a comparison with other algorithms) may be
         * found by running the ::dtrace_dynstat MDB dcmd.
         */
        for (i = 0; i < nkeys; i++) {
                if (key[i].dttk_size == 0) {
                        uint64_t val = key[i].dttk_value;

                        hashval += (val >> 48) & 0xffff;
                        hashval += (hashval << 10);
                        hashval ^= (hashval >> 6);

                        hashval += (val >> 32) & 0xffff;
                        hashval += (hashval << 10);
                        hashval ^= (hashval >> 6);

                        hashval += (val >> 16) & 0xffff;
                        hashval += (hashval << 10);
                        hashval ^= (hashval >> 6);

                        hashval += val & 0xffff;
                        hashval += (hashval << 10);
                        hashval ^= (hashval >> 6);
                } else {
                        /*
                         * This is incredibly painful, but it beats the hell
                         * out of the alternative.
                         */
                        uint64_t j, size = key[i].dttk_size;
                        uintptr_t base = (uintptr_t)key[i].dttk_value;

                        /*
                         * Stop hashing if the key data may not be loaded.
                         * NOTE(review): presumably dtrace_canload() sets
                         * CPU_DTRACE_FAULT on failure, which the check
                         * after this loop then catches -- confirm.
                         */
                        if (!dtrace_canload(base, size, mstate, vstate))
                                break;

                        for (j = 0; j < size; j++) {
                                hashval += dtrace_load8(base + j);
                                hashval += (hashval << 10);
                                hashval ^= (hashval >> 6);
                        }
                }
        }

        /*
         * If a fault has been flagged on this CPU, give up without touching
         * the hash table.
         */
        if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
                return (NULL);

        /*
         * Final mixing steps of the hash.
         */
        hashval += (hashval << 3);
        hashval ^= (hashval >> 11);
        hashval += (hashval << 15);

        /*
         * There is a remote chance (ideally, 1 in 2^31) that our hashval
         * comes out to be one of our two sentinel hash values.  If this
         * actually happens, we set the hashval to be a value known to be a
         * non-sentinel value.
         */
        if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
                hashval = DTRACE_DYNHASH_VALID;

        /*
         * Yes, it's painful to do a divide here.  If the cycle count becomes
         * important here, tricks can be pulled to reduce it.  (However, it's
         * critical that hash collisions be kept to an absolute minimum;
         * they're much more painful than a divide.)  It's better to have a
         * solution that generates few collisions and still keeps things
         * relatively simple.
         */
        bucket = hashval % dstate->dtds_hashsize;

        if (op == DTRACE_DYNVAR_DEALLOC) {
                volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

                /*
                 * Deallocations serialize on the bucket's lock word:  spin
                 * while its low bit is set, then attempt to atomically
                 * advance it from the (even) value observed to the next
                 * (odd) value.  The lock is held once the compare-and-swap
                 * succeeds.
                 */
                for (;;) {
                        while ((lock = *lockp) & 1)
                                continue;

                        if (dtrace_casptr((void *)(uintptr_t)lockp,
                            (void *)lock, (void *)(lock + 1)) == (void *)lock)
                                break;
                }

                dtrace_membar_producer();
        }

top:
        /*
         * Snapshot the lock word before reading the chain head.  A lookup
         * that finds nothing compares against this snapshot afterwards to
         * detect that the lock word changed during the walk.
         */
        prev = NULL;
        lock = hash[bucket].dtdh_lock;

        dtrace_membar_consumer();

        start = hash[bucket].dtdh_chain;
        ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
            start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
            op != DTRACE_DYNVAR_DEALLOC));

        for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
                dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
                dtrace_key_t *dkey = &dtuple->dtt_key[0];

                if (dvar->dtdv_hashval != hashval) {
                        if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
                                /*
                                 * We've reached the sink, and therefore the
                                 * end of the hash chain; we can kick out of
                                 * the loop knowing that we have seen a valid
                                 * snapshot of state.
                                 */
                                ASSERT(dvar->dtdv_next == NULL);
                                ASSERT(dvar == &dtrace_dynhash_sink);
                                break;
                        }

                        if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
                                /*
                                 * We've gone off the rails:  somewhere along
                                 * the line, one of the members of this hash
                                 * chain was deleted.  Note that we could also
                                 * detect this by simply letting this loop run
                                 * to completion, as we would eventually hit
                                 * the end of the dirty list.  However, we
                                 * want to avoid running the length of the
                                 * dirty list unnecessarily (it might be quite
                                 * long), so we catch this as early as
                                 * possible by detecting the hash marker.  In
                                 * this case, we simply set dvar to NULL and
                                 * break; the conditional after the loop will
                                 * send us back to top.
                                 */
                                dvar = NULL;
                                break;
                        }

                        goto next;
                }

                /*
                 * The hash values agree; now confirm that the tuples are
                 * really the same, key by key.
                 */
                if (dtuple->dtt_nkeys != nkeys)
                        goto next;

                for (i = 0; i < nkeys; i++, dkey++) {
                        if (dkey->dttk_size != key[i].dttk_size)
                                goto next; /* size or type mismatch */

                        if (dkey->dttk_size != 0) {
                                /*
                                 * By-reference key:  a non-zero result from
                                 * dtrace_bcmp() is treated as a mismatch.
                                 */
                                if (dtrace_bcmp(
                                    (void *)(uintptr_t)key[i].dttk_value,
                                    (void *)(uintptr_t)dkey->dttk_value,
                                    dkey->dttk_size))
                                        goto next;
                        } else {
                                if (dkey->dttk_value != key[i].dttk_value)
                                        goto next;
                        }
                }

                /*
                 * We have a match.  Lookups and allocations simply return
                 * it; only a deallocation continues on to unlink it.
                 */
                if (op != DTRACE_DYNVAR_DEALLOC)
                        return (dvar);

                ASSERT(dvar->dtdv_next == NULL ||
                    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

                if (prev != NULL) {
                        ASSERT(hash[bucket].dtdh_chain != dvar);
                        ASSERT(start != dvar);
                        ASSERT(prev->dtdv_next == dvar);
                        prev->dtdv_next = dvar->dtdv_next;
                } else {
                        if (dtrace_casptr(&hash[bucket].dtdh_chain,
                            start, dvar->dtdv_next) != start) {
                                /*
                                 * We have failed to atomically swing the
                                 * hash table head pointer, presumably because
                                 * of a conflicting allocation on another CPU.
                                 * We need to reread the hash chain and try
                                 * again.
                                 */
                                goto top;
                        }
                }

                dtrace_membar_producer();

                /*
                 * Now set the hash value to indicate that it's free.
                 */
                ASSERT(hash[bucket].dtdh_chain != dvar);
                dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

                dtrace_membar_producer();

                /*
                 * Set the next pointer to point at the dirty list, and
                 * atomically swing the dirty pointer to the newly freed dvar.
                 */
                do {
                        next = dcpu->dtdsc_dirty;
                        dvar->dtdv_next = next;
                } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

                /*
                 * Finally, unlock this hash bucket.
                 */
                ASSERT(hash[bucket].dtdh_lock == lock);
                ASSERT(lock & 1);
                hash[bucket].dtdh_lock++;

                return (NULL);
next:
                prev = dvar;
                continue;
        }

        if (dvar == NULL) {
                /*
                 * If dvar is NULL, it is because we went off the rails:
                 * one of the elements that we traversed in the hash chain
                 * was deleted while we were traversing it.  In this case,
                 * we assert that we aren't doing a dealloc (deallocs lock
                 * the hash bucket to prevent themselves from racing with
                 * one another), and retry the hash chain traversal.
                 */
                ASSERT(op != DTRACE_DYNVAR_DEALLOC);
                goto top;
        }

        if (op != DTRACE_DYNVAR_ALLOC) {
                /*
                 * If we are not to allocate a new variable, we want to
                 * return NULL now.  Before we return, check that the value
                 * of the lock word hasn't changed.  If it has, we may have
                 * seen an inconsistent snapshot.
                 */
                if (op == DTRACE_DYNVAR_NOALLOC) {
                        if (hash[bucket].dtdh_lock != lock)
                                goto top;
                } else {
                        /*
                         * A deallocation that found nothing to free still
                         * holds the bucket lock taken above; release it.
                         */
                        ASSERT(op == DTRACE_DYNVAR_DEALLOC);
                        ASSERT(hash[bucket].dtdh_lock == lock);
                        ASSERT(lock & 1);
                        hash[bucket].dtdh_lock++;
                }

                return (NULL);
        }

        /*
         * We need to allocate a new dynamic variable.  The size we need is the
         * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
         * size of any auxiliary key data (rounded up to 8-byte alignment) plus
         * the size of any referred-to data (dsize).  We then round the final
         * size up to the chunksize for allocation.
         */
        for (ksize = 0, i = 0; i < nkeys; i++)
                ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

        /*
         * This should be pretty much impossible, but could happen if, say,
         * strange DIF specified the tuple.  Ideally, this should be an
         * assertion and not an error condition -- but that requires that the
         * chunksize calculation in dtrace_difo_chunksize() be absolutely
         * bullet-proof.  (That is, it must not be able to be fooled by
         * malicious DIF.)  Given the lack of backwards branches in DIF,
         * solving this would presumably not amount to solving the Halting
         * Problem -- but it still seems awfully hard.
         */
        if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
            ksize + dsize > chunksize) {
                dcpu->dtdsc_drops++;
                return (NULL);
        }

        /*
         * nstate is the state we will try to install in place of
         * DTRACE_DSTATE_CLEAN if a full lap of the CPUs finds no free
         * chunk; it is updated below as CPUs with dirty or rinsing lists
         * are passed over.
         */
        nstate = DTRACE_DSTATE_EMPTY;

        do {
retry:
                free = dcpu->dtdsc_free;

                if (free == NULL) {
                        dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
                        void *rval;

                        if (clean == NULL) {
                                /*
                                 * We're out of dynamic variable space on
                                 * this CPU.  Unless we have tried all CPUs,
                                 * we'll try to allocate from a different
                                 * CPU.
                                 */
                                switch (dstate->dtds_state) {
                                case DTRACE_DSTATE_CLEAN: {
                                        void *sp = &dstate->dtds_state;

                                        /*
                                         * Step to the next CPU, wrapping
                                         * back to CPU 0 after the last.
                                         */
                                        if (++cpu >= (int)NCPU)
                                                cpu = 0;

                                        /*
                                         * Before moving on, note whether
                                         * the CPU we're leaving has dirty
                                         * or rinsing variables.
                                         */
                                        if (dcpu->dtdsc_dirty != NULL &&
                                            nstate == DTRACE_DSTATE_EMPTY)
                                                nstate = DTRACE_DSTATE_DIRTY;

                                        if (dcpu->dtdsc_rinsing != NULL)
                                                nstate = DTRACE_DSTATE_RINSING;

                                        dcpu = &dstate->dtds_percpu[cpu];

                                        if (cpu != me)
                                                goto retry;

                                        /*
                                         * We're back where we started:
                                         * every CPU has been tried.
                                         */
                                        (void) dtrace_cas32(sp,
                                            DTRACE_DSTATE_CLEAN, nstate);

                                        /*
                                         * To increment the correct bean
                                         * counter, take another lap.
                                         */
                                        goto retry;
                                }

                                case DTRACE_DSTATE_DIRTY:
                                        dcpu->dtdsc_dirty_drops++;
                                        break;

                                case DTRACE_DSTATE_RINSING:
                                        dcpu->dtdsc_rinsing_drops++;
                                        break;

                                case DTRACE_DSTATE_EMPTY:
                                        dcpu->dtdsc_drops++;
                                        break;
                                }

                                DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
                                return (NULL);
                        }

                        /*
                         * The clean list appears to be non-empty.  We want to
                         * move the clean list to the free list; we start by
                         * moving the clean pointer aside.
                         */
                        if (dtrace_casptr(&dcpu->dtdsc_clean,
                            clean, NULL) != clean) {
                                /*
                                 * We are in one of two situations:
                                 *
                                 *  (a) The clean list was switched to the
                                 *      free list by another CPU.
                                 *
                                 *  (b) The clean list was added to by the
                                 *      cleansing cyclic.
                                 *
                                 * In either of these situations, we can
                                 * just reattempt the free list allocation.
                                 */
                                goto retry;
                        }

                        ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);

                        /*
                         * Now we'll move the clean list to the free list.
                         * It's impossible for this to fail:  the only way
                         * the free list can be updated is through this
                         * code path, and only one CPU can own the clean list.
                         * Thus, it would only be possible for this to fail if
                         * this code were racing with dtrace_dynvar_clean().
                         * (That is, if dtrace_dynvar_clean() updated the clean
                         * list, and we ended up racing to update the free
                         * list.)  This race is prevented by the dtrace_sync()
                         * in dtrace_dynvar_clean() -- which flushes the
                         * owners of the clean lists out before resetting
                         * the clean lists.
                         */
                        rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
                        ASSERT(rval == NULL);
                        goto retry;
                }

                /*
                 * Try to pop the head of the free list; if the list head
                 * has changed underneath us, go around again.
                 */
                dvar = free;
                new_free = dvar->dtdv_next;
        } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);

        /*
         * We have now allocated a new chunk.  We copy the tuple keys into the
         * tuple array and copy any referenced key data into the data space
         * following the tuple array.  As we do this, we relocate dttk_value
         * in the final tuple to point to the key data address in the chunk.
         */
        kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
        dvar->dtdv_data = (void *)(kdata + ksize);
        dvar->dtdv_tuple.dtt_nkeys = nkeys;

        for (i = 0; i < nkeys; i++) {
                dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
                size_t kesize = key[i].dttk_size;

                if (kesize != 0) {
                        dtrace_bcopy(
                            (const void *)(uintptr_t)key[i].dttk_value,
                            (void *)kdata, kesize);
                        dkey->dttk_value = kdata;
                        kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
                } else {
                        dkey->dttk_value = key[i].dttk_value;
                }

                dkey->dttk_size = kesize;
        }

        /*
         * Publish the new variable by linking it in at the head of the
         * hash chain we walked above.
         */
        ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
        dvar->dtdv_hashval = hashval;
        dvar->dtdv_next = start;

        if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
                return (dvar);

        /*
         * The cas has failed.  Either another CPU is adding an element to
         * this hash chain, or another CPU is deleting an element from this
         * hash chain.  The simplest way to deal with both of these cases
         * (though not necessarily the most efficient) is to free our
         * allocated block and tail-call ourselves.  Note that the free is
         * to the dirty list and _not_ to the free list.  This is to prevent
         * races with allocators, above.
         */
        dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

        dtrace_membar_producer();

        do {
                free = dcpu->dtdsc_dirty;
                dvar->dtdv_next = free;
        } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);

        return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
}
2270
/*
 * Aggregating function for a running minimum.  The stored value and the new
 * value are both compared as signed 64-bit integers; the stored value is
 * replaced only when the new value is strictly smaller.  The third argument
 * is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const int64_t stored = (int64_t)*oval;
        const int64_t incoming = (int64_t)nval;

        if (incoming >= stored)
                return;

        *oval = nval;
}
2279
/*
 * Aggregating function for a running maximum.  The stored value and the new
 * value are both compared as signed 64-bit integers; the stored value is
 * replaced only when the new value is strictly larger.  The third argument
 * is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const int64_t stored = (int64_t)*oval;
        const int64_t incoming = (int64_t)nval;

        if (incoming <= stored)
                return;

        *oval = nval;
}
2288
2289 static void
2290 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2291 {
2292         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2293         int64_t val = (int64_t)nval;
2294
2295         if (val < 0) {
2296                 for (i = 0; i < zero; i++) {
2297                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2298                                 quanta[i] += incr;
2299                                 return;
2300                         }
2301                 }
2302         } else {
2303                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2304                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2305                                 quanta[i - 1] += incr;
2306                                 return;
2307                         }
2308                 }
2309
2310                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2311                 return;
2312         }
2313
2314         ASSERT(0);
2315 }
2316
/*
 * Aggregating function for lquantize(): add incr to the linear bucket that
 * the value falls into.
 *
 * The first 64-bit word of the aggregation data holds the encoded base, step
 * and number of levels; the buckets follow it.  Bucket 0 counts underflows
 * (value below base), buckets 1 .. levels count the linear levels, and
 * bucket levels + 1 counts overflows.  The value is truncated to 32 bits
 * before it is classified.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int32_t val = (int32_t)nval;
        int64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * The distance from base can exceed INT32_MAX (a large positive
         * value with a negative base), so compute it in 64 bits.  A 32-bit
         * subtraction could overflow and yield a negative level, which
         * would index before the start of the bucket array.
         */
        level = ((int64_t)val - (int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
2349
/*
 * Compute the index of the log/linear bucket that value falls into.
 *
 *   factor  base of the logarithmic scale
 *   low     lowest order of magnitude covered
 *   high    highest order of magnitude covered
 *   nsteps  maximum number of linear steps per order of magnitude
 *   value   the (signed) value to classify
 *
 * Returns 0 when value is below factor^low, the index of the top bucket
 * when value is at or above factor^(high + 1), and otherwise the index of
 * the linear step within the order of magnitude that contains value.
 *
 * The parameters are declared as signed 16-bit integers, but the caller in
 * this file passes unsigned 16-bit fields.  They are therefore reinterpreted
 * as unsigned here so that settings of 32768 or more are not seen as
 * negative numbers (which would produce a negative bucket index).
 */
static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
                                   int16_t nsteps, int64_t value)
{
        const int ufactor = (uint16_t)factor;
        const int ulow = (uint16_t)low;
        const int uhigh = (uint16_t)high;
        const int unsteps = (uint16_t)nsteps;
        int64_t this = 1, last, next;
        int base = 1, order;

        for (order = 0; order < ulow; ++order)
                this *= ufactor;

        /*
         * If our value is less than our factor taken to the power of the
         * low order of magnitude, it goes into the zeroth bucket.
         */
        if (value < this)
                return 0;
        else
                last = this;

        for (this *= ufactor; order <= uhigh; ++order) {
                /*
                 * An order of magnitude cannot have more linear steps than
                 * it has values.
                 */
                int nbuckets = this > unsteps ? unsteps : (int)this;

                /*
                 * We should not generally get log/linear quantizations
                 * with a high magnitude that allows 64-bits to
                 * overflow, but we nonetheless protect against this
                 * by explicitly checking for overflow, and clamping
                 * our value accordingly.
                 */
                next = this * ufactor;
                if (next < this) {
                        value = this - 1;
                }

                /*
                 * If our value lies within this order of magnitude,
                 * determine its position by taking the offset within
                 * the order of magnitude, dividing by the bucket
                 * width, and adding to our (accumulated) base.
                 */
                if (value < this) {
                        return (base + (value - last) / (this / nbuckets));
                }

                base += nbuckets - (nbuckets / ufactor);
                last = this;
                this = next;
        }

        /*
         * Our value is greater than or equal to our factor taken to the
         * power of one plus the high magnitude -- return the top bucket.
         */
        return base;
}
2405
/*
 * Aggregating function for llquantize(): add incr to the log/linear bucket
 * that nval falls into.  The first 64-bit word of the aggregation data holds
 * the encoded factor, low and high magnitudes and step count; the buckets
 * follow it.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
        const uint64_t encoded = llquanta[0];
        uint64_t *buckets = llquanta + 1;
        const uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(encoded);
        const uint16_t low = DTRACE_LLQUANTIZE_LOW(encoded);
        const uint16_t high = DTRACE_LLQUANTIZE_HIGH(encoded);
        const uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(encoded);
        int slot;

        slot = dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps,
            nval);
        buckets[slot] += incr;
}
2417
/*
 * Aggregating function for avg(): data[0] counts the values seen and data[1]
 * accumulates their sum.  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        uint64_t *count = &data[0];
        uint64_t *total = &data[1];

        *count += 1;
        *total += nval;
}
2426
/*
 * Aggregating function for stddev(): data[0] counts the values seen, data[1]
 * accumulates their sum, and data[2] and data[3] together accumulate the sum
 * of their squares as a 128-bit quantity.  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        int64_t snval = (int64_t)nval;
        uint64_t magnitude;
        uint64_t tmp[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic.
         *
         * The absolute value is taken in unsigned arithmetic:  negating
         * INT64_MIN as a signed quantity is undefined, whereas the unsigned
         * subtraction yields the correct magnitude (2^63).
         */
        if (snval < 0)
                magnitude = 0 - (uint64_t)snval;
        else
                magnitude = (uint64_t)snval;

        dtrace_multiply_128(magnitude, magnitude, tmp);
        dtrace_add_128(data + 2, tmp, data + 2);
}
2452
/*
 * Aggregating function for count(): increment the stored value by one.  The
 * value and argument parameters are ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
        oval[0] += 1;
}
2460
/*
 * Aggregating function for sum(): add the new value to the stored value
 * (unsigned 64-bit arithmetic).  The third argument is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
        const uint64_t running = *oval;

        *oval = running + nval;
}
2468
2469 /*
2470  * Aggregate given the tuple in the principal data buffer, and the aggregating
2471  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2472  * buffer is specified as the buf parameter.  This routine does not return
2473  * failure; if there is no space in the aggregation buffer, the data will be
2474  * dropped, and a corresponding counter incremented.
2475  */
2476 static void
2477 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2478     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2479 {
2480 #pragma unused(arg)
2481         dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2482         uint32_t i, ndx, size, fsize;
2483         uint32_t align = sizeof (uint64_t) - 1;
2484         dtrace_aggbuffer_t *agb;
2485         dtrace_aggkey_t *key;
2486         uint32_t hashval = 0, limit, isstr;
2487         caddr_t tomax, data, kdata;
2488         dtrace_actkind_t action;
2489         dtrace_action_t *act;
2490         uintptr_t offs;
2491
2492         if (buf == NULL)
2493                 return;
2494
2495         if (!agg->dtag_hasarg) {
2496                 /*
2497                  * Currently, only quantize() and lquantize() take additional
2498                  * arguments, and they have the same semantics:  an increment
2499                  * value that defaults to 1 when not present.  If additional
2500                  * aggregating actions take arguments, the setting of the
2501                  * default argument value will presumably have to become more
2502                  * sophisticated...
2503                  */
2504                 arg = 1;
2505         }
2506
2507         action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2508         size = rec->dtrd_offset - agg->dtag_base;
2509         fsize = size + rec->dtrd_size;
2510
2511         ASSERT(dbuf->dtb_tomax != NULL);
2512         data = dbuf->dtb_tomax + offset + agg->dtag_base;
2513
2514         if ((tomax = buf->dtb_tomax) == NULL) {
2515                 dtrace_buffer_drop(buf);
2516                 return;
2517         }
2518
2519         /*
2520          * The metastructure is always at the bottom of the buffer.
2521          */
2522         agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2523             sizeof (dtrace_aggbuffer_t));
2524
2525         if (buf->dtb_offset == 0) {
2526                 /*
2527                  * We just kludge up approximately 1/8th of the size to be
2528                  * buckets.  If this guess ends up being routinely
2529                  * off-the-mark, we may need to dynamically readjust this
2530                  * based on past performance.
2531                  */
2532                 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2533
2534                 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2535                     (uintptr_t)tomax || hashsize == 0) {
2536                         /*
2537                          * We've been given a ludicrously small buffer;
2538                          * increment our drop count and leave.
2539                          */
2540                         dtrace_buffer_drop(buf);
2541                         return;
2542                 }
2543
2544                 /*
2545                  * And now, a pathetic attempt to try to get a an odd (or
2546                  * perchance, a prime) hash size for better hash distribution.
2547                  */
2548                 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2549                         hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2550
2551                 agb->dtagb_hashsize = hashsize;
2552                 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2553                     agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2554                 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2555
2556                 for (i = 0; i < agb->dtagb_hashsize; i++)
2557                         agb->dtagb_hash[i] = NULL;
2558         }
2559
2560         ASSERT(agg->dtag_first != NULL);
2561         ASSERT(agg->dtag_first->dta_intuple);
2562
2563         /*
2564          * Calculate the hash value based on the key.  Note that we _don't_
2565          * include the aggid in the hashing (but we will store it as part of
2566          * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2567          * algorithm: a simple, quick algorithm that has no known funnels, and
2568          * gets good distribution in practice.  The efficacy of the hashing
2569          * algorithm (and a comparison with other algorithms) may be found by
2570          * running the ::dtrace_aggstat MDB dcmd.
2571          */
2572         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2573                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2574                 limit = i + act->dta_rec.dtrd_size;
2575                 ASSERT(limit <= size);
2576                 isstr = DTRACEACT_ISSTRING(act);
2577
2578                 for (; i < limit; i++) {
2579                         hashval += data[i];
2580                         hashval += (hashval << 10);
2581                         hashval ^= (hashval >> 6);
2582
2583                         if (isstr && data[i] == '\0')
2584                                 break;
2585                 }
2586         }
2587
2588         hashval += (hashval << 3);
2589         hashval ^= (hashval >> 11);
2590         hashval += (hashval << 15);
2591
2592         /*
2593          * Yes, the divide here is expensive -- but it's generally the least
2594          * of the performance issues given the amount of data that we iterate
2595          * over to compute hash values, compare data, etc.
2596          */
2597         ndx = hashval % agb->dtagb_hashsize;
2598
2599         for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2600                 ASSERT((caddr_t)key >= tomax);
2601                 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2602
2603                 if (hashval != key->dtak_hashval || key->dtak_size != size)
2604                         continue;
2605
2606                 kdata = key->dtak_data;
2607                 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2608
2609                 for (act = agg->dtag_first; act->dta_intuple;
2610                     act = act->dta_next) {
2611                         i = act->dta_rec.dtrd_offset - agg->dtag_base;
2612                         limit = i + act->dta_rec.dtrd_size;
2613                         ASSERT(limit <= size);
2614                         isstr = DTRACEACT_ISSTRING(act);
2615
2616                         for (; i < limit; i++) {
2617                                 if (kdata[i] != data[i])
2618                                         goto next;
2619
2620                                 if (isstr && data[i] == '\0')
2621                                         break;
2622                         }
2623                 }
2624
2625                 if (action != key->dtak_action) {
2626                         /*
2627                          * We are aggregating on the same value in the same
2628                          * aggregation with two different aggregating actions.
2629                          * (This should have been picked up in the compiler,
2630                          * so we may be dealing with errant or devious DIF.)
2631                          * This is an error condition; we indicate as much,
2632                          * and return.
2633                          */
2634                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2635                         return;
2636                 }
2637
2638                 /*
2639                  * This is a hit:  we need to apply the aggregator to
2640                  * the value at this key.
2641                  */
2642                 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2643                 return;
2644 next:
2645                 continue;
2646         }
2647
2648         /*
2649          * We didn't find it.  We need to allocate some zero-filled space,
2650          * link it into the hash table appropriately, and apply the aggregator
2651          * to the (zero-filled) value.
2652          */
2653         offs = buf->dtb_offset;
2654         while (offs & (align - 1))
2655                 offs += sizeof (uint32_t);
2656
2657         /*
2658          * If we don't have enough room to both allocate a new key _and_
2659          * its associated data, increment the drop count and return.
2660          */
2661         if ((uintptr_t)tomax + offs + fsize >
2662             agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2663                 dtrace_buffer_drop(buf);
2664                 return;
2665         }
2666
2667         /*CONSTCOND*/
2668         ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2669         key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2670         agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2671
2672         key->dtak_data = kdata = tomax + offs;
2673         buf->dtb_offset = offs + fsize;
2674
2675         /*
2676          * Now copy the data across.
2677          */
2678         *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2679
2680         for (i = sizeof (dtrace_aggid_t); i < size; i++)
2681                 kdata[i] = data[i];
2682
2683         /*
2684          * Because strings are not zeroed out by default, we need to iterate
2685          * looking for actions that store strings, and we need to explicitly
2686          * pad these strings out with zeroes.
2687          */
2688         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2689                 int nul;
2690
2691                 if (!DTRACEACT_ISSTRING(act))
2692                         continue;
2693
2694                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2695                 limit = i + act->dta_rec.dtrd_size;
2696                 ASSERT(limit <= size);
2697
2698                 for (nul = 0; i < limit; i++) {
2699                         if (nul) {
2700                                 kdata[i] = '\0';
2701                                 continue;
2702                         }
2703
2704                         if (data[i] != '\0')
2705                                 continue;
2706
2707                         nul = 1;
2708                 }
2709         }
2710
2711         for (i = size; i < fsize; i++)
2712                 kdata[i] = 0;
2713
2714         key->dtak_hashval = hashval;
2715         key->dtak_size = size;
2716         key->dtak_action = action;
2717         key->dtak_next = agb->dtagb_hash[ndx];
2718         agb->dtagb_hash[ndx] = key;
2719
2720         /*
2721          * Finally, apply the aggregator.
2722          */
2723         *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2724         agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2725 }
2726
2727 /*
2728  * Given consumer state, this routine finds a speculation in the INACTIVE
2729  * state and transitions it into the ACTIVE state.  If there is no speculation
2730  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2731  * incremented -- it is up to the caller to take appropriate action.
2732  */
2733 static int
2734 dtrace_speculation(dtrace_state_t *state)
2735 {
2736         int i = 0;
2737         dtrace_speculation_state_t current;
2738         uint32_t *stat = &state->dts_speculations_unavail, count;
2739
2740         while (i < state->dts_nspeculations) {
2741                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2742
2743                 current = spec->dtsp_state;
2744
2745                 if (current != DTRACESPEC_INACTIVE) {
2746                         if (current == DTRACESPEC_COMMITTINGMANY ||
2747                             current == DTRACESPEC_COMMITTING ||
2748                             current == DTRACESPEC_DISCARDING)
2749                                 stat = &state->dts_speculations_busy;
2750                         i++;
2751                         continue;
2752                 }
2753
2754                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2755                     current, DTRACESPEC_ACTIVE) == current)
2756                         return (i + 1);
2757         }
2758
2759         /*
2760          * We couldn't find a speculation.  If we found as much as a single
2761          * busy speculation buffer, we'll attribute this failure as "busy"
2762          * instead of "unavail".
2763          */
2764         do {
2765                 count = *stat;
2766         } while (dtrace_cas32(stat, count, count + 1) != count);
2767
2768         return (0);
2769 }
2770
/*
 * This routine commits an active speculation.  If the specified speculation
 * is not in a valid state to perform a commit(), this routine will silently do
 * nothing.  The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>
 *
 *   state  consumer state owning the speculations and principal buffers
 *   cpu    index of the per-CPU buffers to operate on
 *   which  speculation identifier (1-based); 0 is ignored, and an identifier
 *          beyond the number of speculations sets CPU_DTRACE_ILLOP
 *
 * On a successful commit, the contents of the speculation's buffer for this
 * CPU are stamped with the current time and appended to the principal buffer
 * for this CPU.  If the principal buffer has no room, a drop is recorded
 * instead.  In both cases the speculative buffer is reset afterwards.
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
        dtrace_speculation_t *spec;
        dtrace_buffer_t *src, *dest;
        uintptr_t daddr, saddr, dlimit, slimit;
        dtrace_speculation_state_t current,  new = DTRACESPEC_INACTIVE;
        intptr_t offs;
        uint64_t timestamp;

        if (which == 0)
                return;

        if (which > (dtrace_specid_t)state->dts_nspeculations) {
                cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
                return;
        }

        spec = &state->dts_speculations[which - 1];
        src = &spec->dtsp_buffer[cpu];
        dest = &state->dts_buffer[cpu];

        /*
         * Move the speculation into a committing state with a
         * compare-and-swap; if the state changes between the read and the
         * swap, re-read it and decide again.
         */
        do {
                current = spec->dtsp_state;

                /*
                 * Already COMMITTINGMANY:  leave the state alone and go
                 * straight to committing this CPU's buffer.
                 */
                if (current == DTRACESPEC_COMMITTINGMANY)
                        break;

                switch (current) {
                case DTRACESPEC_INACTIVE:
                case DTRACESPEC_DISCARDING:
                        return;

                case DTRACESPEC_COMMITTING:
                        /*
                         * This is only possible if we are (a) commit()'ing
                         * without having done a prior speculate() on this CPU
                         * and (b) racing with another commit() on a different
                         * CPU.  There's nothing to do -- we just assert that
                         * our offset is 0.
                         */
                        ASSERT(src->dtb_offset == 0);
                        return;

                case DTRACESPEC_ACTIVE:
                        new = DTRACESPEC_COMMITTING;
                        break;

                case DTRACESPEC_ACTIVEONE:
                        /*
                         * This speculation is active on one CPU.  If our
                         * buffer offset is non-zero, we know that the one CPU
                         * must be us.  Otherwise, we are committing on a
                         * different CPU from the speculate(), and we must
                         * rely on being asynchronously cleaned.
                         */
                        if (src->dtb_offset != 0) {
                                new = DTRACESPEC_COMMITTING;
                                break;
                        }
                        /*FALLTHROUGH*/

                case DTRACESPEC_ACTIVEMANY:
                        new = DTRACESPEC_COMMITTINGMANY;
                        break;

                default:
                        ASSERT(0);
                }
        } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
            current, new) != current);

        /*
         * We have set the state to indicate that we are committing this
         * speculation.  Now reserve the necessary space in the destination
         * buffer.
         */
        if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
            sizeof (uint64_t), state, NULL)) < 0) {
                dtrace_buffer_drop(dest);
                goto out;
        }

        /*
         * We have sufficient space to copy the speculative buffer into the
         * primary buffer.  First, modify the speculative buffer, filling
         * in the timestamp of all entries with the current time.  The data
         * must have the commit() time rather than the time it was traced,
         * so that all entries in the primary buffer are in timestamp order.
         */
        timestamp = dtrace_gethrtime();
        saddr = (uintptr_t)src->dtb_tomax;
        slimit = saddr + src->dtb_offset;
        while (saddr < slimit) {
                size_t size;
                dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;

                /*
                 * An EPID of DTRACE_EPIDNONE is not a record; step over
                 * it one EPID-sized word at a time.
                 */
                if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
                        saddr += sizeof (dtrace_epid_t);
                        continue;
                }

                /*
                 * The record's size is that of the ECB which its EPID
                 * identifies.
                 */
                ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
                size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;

                ASSERT(saddr + size <= slimit);
                ASSERT(size >= sizeof(dtrace_rechdr_t));
                ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);

                DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);

                saddr += size;
        }

        /*
         * Copy the buffer across.  (Note that this is a
         * highly suboptimal bcopy(); in the unlikely event that this becomes
         * a serious performance issue, a high-performance DTrace-specific
         * bcopy() should obviously be invented.)
         */
        daddr = (uintptr_t)dest->dtb_tomax + offs;
        dlimit = daddr + src->dtb_offset;
        saddr = (uintptr_t)src->dtb_tomax;

        /*
         * First, the aligned portion.
         */
        while (dlimit - daddr >= sizeof (uint64_t)) {
                *((uint64_t *)daddr) = *((uint64_t *)saddr);

                daddr += sizeof (uint64_t);
                saddr += sizeof (uint64_t);
        }

        /*
         * Now any left-over bit...
         */
        while (dlimit - daddr)
                *((uint8_t *)daddr++) = *((uint8_t *)saddr++);

        /*
         * Finally, commit the reserved space in the destination buffer.
         */
        dest->dtb_offset = offs + src->dtb_offset;

out:
        /*
         * If we're lucky enough to be the only active CPU on this speculation
         * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
         */
        if (current == DTRACESPEC_ACTIVE ||
            (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
                uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
                    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
#pragma unused(rval) /* __APPLE__ */

                ASSERT(rval == DTRACESPEC_COMMITTING);
        }

        /*
         * Reset the speculative buffer for this CPU, folding its drop count
         * into dtb_xamot_drops.
         */
        src->dtb_offset = 0;
        src->dtb_xamot_drops += src->dtb_drops;
        src->dtb_drops = 0;
}
2941
2942 /*
2943  * This routine discards an active speculation.  If the specified speculation
2944  * is not in a valid state to perform a discard(), this routine will silently
2945  * do nothing.  The state of the specified speculation is transitioned
2946  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2947  */
2948 static void
2949 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2950     dtrace_specid_t which)
2951 {
2952         dtrace_speculation_t *spec;
2953         dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2954         dtrace_buffer_t *buf;
2955
2956         if (which == 0)
2957                 return;
2958
2959         if (which > (dtrace_specid_t)state->dts_nspeculations) {
2960                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2961                 return;
2962         }
2963
2964         spec = &state->dts_speculations[which - 1];
2965         buf = &spec->dtsp_buffer[cpu];
2966
2967         do {
2968                 current = spec->dtsp_state;
2969
2970                 switch (current) {
2971                 case DTRACESPEC_INACTIVE:
2972                 case DTRACESPEC_COMMITTINGMANY:
2973                 case DTRACESPEC_COMMITTING:
2974                 case DTRACESPEC_DISCARDING:
2975                         return;
2976
2977                 case DTRACESPEC_ACTIVE:
2978                 case DTRACESPEC_ACTIVEMANY:
2979                         new = DTRACESPEC_DISCARDING;
2980                         break;
2981
2982                 case DTRACESPEC_ACTIVEONE:
2983                         if (buf->dtb_offset != 0) {
2984                                 new = DTRACESPEC_INACTIVE;
2985                         } else {
2986                                 new = DTRACESPEC_DISCARDING;
2987                         }
2988                         break;
2989
2990                 default:
2991                         ASSERT(0);
2992                 }
2993         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2994             current, new) != current);
2995
2996         buf->dtb_offset = 0;
2997         buf->dtb_drops = 0;
2998 }
2999
3000 /*
3001  * Note:  not called from probe context.  This function is called
3002  * asynchronously from cross call context to clean any speculations that are
3003  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
3004  * transitioned back to the INACTIVE state until all CPUs have cleaned the
3005  * speculation.
3006  */
3007 static void
3008 dtrace_speculation_clean_here(dtrace_state_t *state)
3009 {
3010         dtrace_icookie_t cookie;
3011         processorid_t cpu = CPU->cpu_id;
3012         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3013         dtrace_specid_t i;
3014
3015         cookie = dtrace_interrupt_disable();
3016
3017         if (dest->dtb_tomax == NULL) {
3018                 dtrace_interrupt_enable(cookie);
3019                 return;
3020         }
3021
3022         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3023                 dtrace_speculation_t *spec = &state->dts_speculations[i];
3024                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3025
3026                 if (src->dtb_tomax == NULL)
3027                         continue;
3028
3029                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3030                         src->dtb_offset = 0;
3031                         continue;
3032                 }
3033
3034                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3035                         continue;
3036
3037                 if (src->dtb_offset == 0)
3038                         continue;
3039
3040                 dtrace_speculation_commit(state, cpu, i + 1);
3041         }
3042
3043         dtrace_interrupt_enable(cookie);
3044 }
3045
3046 /*
3047  * Note:  not called from probe context.  This function is called
3048  * asynchronously (and at a regular interval) to clean any speculations that
3049  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
3050  * is work to be done, it cross calls all CPUs to perform that work;
3051  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
3052  * INACTIVE state until they have been cleaned by all CPUs.
3053  */
3054 static void
3055 dtrace_speculation_clean(dtrace_state_t *state)
3056 {
3057         int work = 0;
3058         uint32_t rv;
3059         dtrace_specid_t i;
3060
3061         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3062                 dtrace_speculation_t *spec = &state->dts_speculations[i];
3063
3064                 ASSERT(!spec->dtsp_cleaning);
3065
3066                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3067                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3068                         continue;
3069
3070                 work++;
3071                 spec->dtsp_cleaning = 1;
3072         }
3073
3074         if (!work)
3075                 return;
3076
3077         dtrace_xcall(DTRACE_CPUALL,
3078             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3079
3080         /*
3081          * We now know that all CPUs have committed or discarded their
3082          * speculation buffers, as appropriate.  We can now set the state
3083          * to inactive.
3084          */
3085         for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3086                 dtrace_speculation_t *spec = &state->dts_speculations[i];
3087                 dtrace_speculation_state_t current, new;
3088
3089                 if (!spec->dtsp_cleaning)
3090                         continue;
3091
3092                 current = spec->dtsp_state;
3093                 ASSERT(current == DTRACESPEC_DISCARDING ||
3094                     current == DTRACESPEC_COMMITTINGMANY);
3095
3096                 new = DTRACESPEC_INACTIVE;
3097
3098                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3099                 ASSERT(rv == current);
3100                 spec->dtsp_cleaning = 0;
3101         }
3102 }
3103
3104 /*
3105  * Called as part of a speculate() to get the speculative buffer associated
3106  * with a given speculation.  Returns NULL if the specified speculation is not
3107  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
3108  * the active CPU is not the specified CPU -- the speculation will be
3109  * atomically transitioned into the ACTIVEMANY state.
3110  */
3111 static dtrace_buffer_t *
3112 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3113     dtrace_specid_t which)
3114 {
3115         dtrace_speculation_t *spec;
3116         dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3117         dtrace_buffer_t *buf;
3118
3119         if (which == 0)
3120                 return (NULL);
3121
3122         if (which > (dtrace_specid_t)state->dts_nspeculations) {
3123                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3124                 return (NULL);
3125         }
3126
3127         spec = &state->dts_speculations[which - 1];
3128         buf = &spec->dtsp_buffer[cpuid];
3129
3130         do {
3131                 current = spec->dtsp_state;
3132
3133                 switch (current) {
3134                 case DTRACESPEC_INACTIVE:
3135                 case DTRACESPEC_COMMITTINGMANY:
3136                 case DTRACESPEC_DISCARDING:
3137                         return (NULL);
3138
3139                 case DTRACESPEC_COMMITTING:
3140                         ASSERT(buf->dtb_offset == 0);
3141                         return (NULL);
3142
3143                 case DTRACESPEC_ACTIVEONE:
3144                         /*
3145                          * This speculation is currently active on one CPU.
3146                          * Check the offset in the buffer; if it's non-zero,
3147                          * that CPU must be us (and we leave the state alone).
3148                          * If it's zero, assume that we're starting on a new
3149                          * CPU -- and change the state to indicate that the
3150                          * speculation is active on more than one CPU.
3151                          */
3152                         if (buf->dtb_offset != 0)
3153                                 return (buf);
3154
3155                         new = DTRACESPEC_ACTIVEMANY;
3156                         break;
3157
3158                 case DTRACESPEC_ACTIVEMANY:
3159                         return (buf);
3160
3161                 case DTRACESPEC_ACTIVE:
3162                         new = DTRACESPEC_ACTIVEONE;
3163                         break;
3164
3165                 default:
3166                         ASSERT(0);
3167                 }
3168         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3169             current, new) != current);
3170
3171         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3172         return (buf);
3173 }
3174
3175 /*
3176  * Return a string.  In the event that the user lacks the privilege to access
3177  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3178  * don't fail access checking.
3179  *
3180  * dtrace_dif_variable() uses this routine as a helper for various
3181  * builtin values such as 'execname' and 'probefunc.'
3182  */
3183 static
3184 uintptr_t
3185 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3186     dtrace_mstate_t *mstate)
3187 {
3188         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3189         uintptr_t ret;
3190         size_t strsz;
3191
3192         /*
3193          * The easy case: this probe is allowed to read all of memory, so
3194          * we can just return this as a vanilla pointer.
3195          */
3196         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3197                 return (addr);
3198
3199         /*
3200          * This is the tougher case: we copy the string in question from
3201          * kernel memory into scratch memory and return it that way: this
3202          * ensures that we won't trip up when access checking tests the
3203          * BYREF return value.
3204          */
3205         strsz = dtrace_strlen((char *)addr, size) + 1;
3206
3207         if (mstate->dtms_scratch_ptr + strsz >
3208             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3209                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3210                 return (0);
3211         }
3212
3213         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3214             strsz);
3215         ret = mstate->dtms_scratch_ptr;
3216         mstate->dtms_scratch_ptr += strsz;
3217         return (ret);
3218 }
3219
3220 /*
3221  * This function implements the DIF emulator's variable lookups.  The emulator
3222  * passes a reserved variable identifier and optional built-in array index.
3223  */
3224 static uint64_t
3225 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3226     uint64_t ndx)
3227 {
3228         /*
3229          * If we're accessing one of the uncached arguments, we'll turn this
3230          * into a reference in the args array.
3231          */
3232         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3233                 ndx = v - DIF_VAR_ARG0;
3234                 v = DIF_VAR_ARGS;
3235         }
3236
3237         switch (v) {
3238         case DIF_VAR_ARGS:
3239                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3240                 if (ndx >= sizeof (mstate->dtms_arg) /
3241                     sizeof (mstate->dtms_arg[0])) {
3242                         /*
3243                          * APPLE NOTE: Account for introduction of __dtrace_probe()
3244                          */
3245                         int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3246                         dtrace_vstate_t *vstate = &state->dts_vstate;
3247                         dtrace_provider_t *pv;
3248                         uint64_t val;
3249
3250                         pv = mstate->dtms_probe->dtpr_provider;
3251                         if (pv->dtpv_pops.dtps_getargval != NULL)
3252                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3253                                     mstate->dtms_probe->dtpr_id,
3254                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
3255                         /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3256                         else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3257                                 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3258                         }
3259
3260                         else
3261                                 val = dtrace_getarg(ndx, aframes, mstate, vstate);
3262
3263                         /*
3264                          * This is regrettably required to keep the compiler
3265                          * from tail-optimizing the call to dtrace_getarg().
3266                          * The condition always evaluates to true, but the
3267                          * compiler has no way of figuring that out a priori.
3268                          * (None of this would be necessary if the compiler
3269                          * could be relied upon to _always_ tail-optimize
3270                          * the call to dtrace_getarg() -- but it can't.)
3271                          */
3272                         if (mstate->dtms_probe != NULL)
3273                                 return (val);
3274
3275                         ASSERT(0);
3276                 }
3277
3278                 return (mstate->dtms_arg[ndx]);
3279
3280         case DIF_VAR_UREGS: {
3281                 thread_t thread;
3282
3283                 if (!dtrace_priv_proc(state))
3284                         return (0);
3285
3286                 if ((thread = current_thread()) == NULL) {
3287                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3288                         cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3289                         return (0);
3290                 }
3291
3292                 return (dtrace_getreg(find_user_regs(thread), ndx));
3293         }
3294
3295
3296         case DIF_VAR_CURTHREAD:
3297                 if (!dtrace_priv_kernel(state))
3298                         return (0);
3299
3300                 return ((uint64_t)(uintptr_t)current_thread());
3301
3302         case DIF_VAR_TIMESTAMP:
3303                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3304                         mstate->dtms_timestamp = dtrace_gethrtime();
3305                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3306                 }
3307                 return (mstate->dtms_timestamp);
3308
3309         case DIF_VAR_VTIMESTAMP:
3310                 ASSERT(dtrace_vtime_references != 0);
3311                 return (dtrace_get_thread_vtime(current_thread()));
3312
3313         case DIF_VAR_WALLTIMESTAMP:
3314                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3315                         mstate->dtms_walltimestamp = dtrace_gethrestime();
3316                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3317                 }
3318                 return (mstate->dtms_walltimestamp);
3319
3320         case DIF_VAR_MACHTIMESTAMP:
3321                 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3322                         mstate->dtms_machtimestamp = mach_absolute_time();
3323                         mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3324                 }
3325                 return (mstate->dtms_machtimestamp);
3326
3327         case DIF_VAR_CPU:
3328                 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3329
3330         case DIF_VAR_IPL:
3331                 if (!dtrace_priv_kernel(state))
3332                         return (0);
3333                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3334                         mstate->dtms_ipl = dtrace_getipl();
3335                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
3336                 }
3337                 return (mstate->dtms_ipl);
3338
3339         case DIF_VAR_EPID:
3340                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3341                 return (mstate->dtms_epid);
3342
3343         case DIF_VAR_ID:
3344                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3345                 return (mstate->dtms_probe->dtpr_id);
3346
3347         case DIF_VAR_STACKDEPTH:
3348                 if (!dtrace_priv_kernel(state))
3349                         return (0);
3350                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3351                         /*
3352                          * APPLE NOTE: Account for introduction of __dtrace_probe()
3353                          */
3354                         int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3355
3356                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3357                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3358                 }
3359                 return (mstate->dtms_stackdepth);
3360
3361         case DIF_VAR_USTACKDEPTH:
3362                 if (!dtrace_priv_proc(state))
3363                         return (0);
3364                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3365                         /*
3366                          * See comment in DIF_VAR_PID.
3367                          */
3368                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3369                             CPU_ON_INTR(CPU)) {
3370                                 mstate->dtms_ustackdepth = 0;
3371                         } else {
3372                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3373                                 mstate->dtms_ustackdepth =
3374                                     dtrace_getustackdepth();
3375                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3376                         }
3377                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3378                 }
3379                 return (mstate->dtms_ustackdepth);
3380
3381         case DIF_VAR_CALLER:
3382                 if (!dtrace_priv_kernel(state))
3383                         return (0);
3384                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3385                         /*
3386                          * APPLE NOTE: Account for introduction of __dtrace_probe()
3387                          */
3388                         int aframes = mstate->dtms_probe->dtpr_aframes + 3;
3389
3390                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3391                                 /*
3392                                  * If this is an unanchored probe, we are
3393                                  * required to go through the slow path:
3394                                  * dtrace_caller() only guarantees correct
3395                                  * results for anchored probes.
3396                                  */
3397                                 pc_t caller[2];
3398
3399                                 dtrace_getpcstack(caller, 2, aframes,
3400                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3401                                 mstate->dtms_caller = caller[1];
3402                         } else if ((mstate->dtms_caller =
3403                                 dtrace_caller(aframes)) == (uintptr_t)-1) {
3404                                 /*
3405                                  * We have failed to do this the quick way;
3406                                  * we must resort to the slower approach of
3407                                  * calling dtrace_getpcstack().
3408                                  */
3409                                 pc_t caller;
3410
3411                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
3412                                 mstate->dtms_caller = caller;
3413                         }
3414
3415                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3416                 }
3417                 return (mstate->dtms_caller);
3418
3419         case DIF_VAR_UCALLER:
3420                 if (!dtrace_priv_proc(state))
3421                         return (0);
3422
3423                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3424                         uint64_t ustack[3];
3425
3426                         /*
3427                          * dtrace_getupcstack() fills in the first uint64_t
3428                          * with the current PID.  The second uint64_t will
3429                          * be the program counter at user-level.  The third
3430                          * uint64_t will contain the caller, which is what
3431                          * we're after.
3432                          */
3433                         ustack[2] = 0;
3434                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3435                         dtrace_getupcstack(ustack, 3);
3436                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3437                         mstate->dtms_ucaller = ustack[2];
3438                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3439                 }
3440
3441                 return (mstate->dtms_ucaller);
3442
3443         case DIF_VAR_PROBEPROV:
3444                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3445                 return (dtrace_dif_varstr(
3446                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3447                     state, mstate));
3448
3449         case DIF_VAR_PROBEMOD:
3450                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3451                 return (dtrace_dif_varstr(
3452                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3453                     state, mstate));
3454
3455         case DIF_VAR_PROBEFUNC:
3456                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3457                 return (dtrace_dif_varstr(
3458                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3459                     state, mstate));
3460
3461         case DIF_VAR_PROBENAME:
3462                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3463                 return (dtrace_dif_varstr(
3464                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3465                     state, mstate));
3466
3467         case DIF_VAR_PID:
3468                 if (!dtrace_priv_proc_relaxed(state))
3469                         return (0);
3470
3471                 /*
3472                  * Note that we are assuming that an unanchored probe is
3473                  * always due to a high-level interrupt.  (And we're assuming
3474                  * that there is only a single high level interrupt.)
3475                  */
3476                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3477                         /* Anchored probe that fires while on an interrupt accrues to process 0 */
3478                         return 0;
3479
3480                 return ((uint64_t)dtrace_proc_selfpid());
3481
3482         case DIF_VAR_PPID:
3483                 if (!dtrace_priv_proc_relaxed(state))
3484                         return (0);
3485
3486                 /*
3487                  * See comment in DIF_VAR_PID.
3488                  */
3489                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3490                         return (0);
3491
3492                 return ((uint64_t)dtrace_proc_selfppid());
3493
3494         case DIF_VAR_TID:
3495                 /* We do not need to check for null current_thread() */
3496                 return thread_tid(current_thread()); /* globally unique */
3497
3498         case DIF_VAR_PTHREAD_SELF:
3499                 if (!dtrace_priv_proc(state))
3500                         return (0);
3501
3502                 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3503                 return 0;
3504
3505         case DIF_VAR_DISPATCHQADDR:
3506                 if (!dtrace_priv_proc(state))
3507                         return (0);
3508
3509                 /* We do not need to check for null current_thread() */
3510                 return thread_dispatchqaddr(current_thread());
3511
3512         case DIF_VAR_EXECNAME:
3513         {
3514                 char *xname = (char *)mstate->dtms_scratch_ptr;
3515                 size_t scratch_size = MAXCOMLEN+1;
3516
3517                 /* The scratch allocation's lifetime is that of the clause. */
3518                 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3519                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3520                         return 0;
3521                 }
3522
3523                 if (!dtrace_priv_proc_relaxed(state))
3524                         return (0);
3525
3526                 mstate->dtms_scratch_ptr += scratch_size;
3527                 proc_selfname( xname, scratch_size );
3528
3529                 return ((uint64_t)(uintptr_t)xname);
3530         }
3531
3532
3533         case DIF_VAR_ZONENAME:
3534         {
3535                 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3536                 char *zname = (char *)mstate->dtms_scratch_ptr;
3537                 size_t scratch_size = 6 + 1;
3538
3539                 if (!dtrace_priv_proc(state))
3540                         return (0);
3541
3542                 /* The scratch allocation's lifetime is that of the clause. */
3543                 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3544                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3545                         return 0;
3546                 }
3547
3548                 mstate->dtms_scratch_ptr += scratch_size;
3549
3550                 /* The kernel does not provide zonename, it will always return 'global'. */
3551                 strlcpy(zname, "global", scratch_size);
3552
3553                 return ((uint64_t)(uintptr_t)zname);
3554         }
3555
3556 #if MONOTONIC
3557         case DIF_VAR_CPUINSTRS:
3558                 return mt_cur_cpu_instrs();
3559
3560         case DIF_VAR_CPUCYCLES:
3561                 return mt_cur_cpu_cycles();
3562
3563         case DIF_VAR_VINSTRS:
3564                 return mt_cur_thread_instrs();
3565
3566         case DIF_VAR_VCYCLES:
3567                 return mt_cur_thread_cycles();
3568 #else /* MONOTONIC */
3569         case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
3570         case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
3571         case DIF_VAR_VINSTRS: /* FALLTHROUGH */
3572         case DIF_VAR_VCYCLES: /* FALLTHROUGH */
3573                 return 0;
3574 #endif /* !MONOTONIC */
3575
3576         case DIF_VAR_UID:
3577                 if (!dtrace_priv_proc_relaxed(state))
3578                         return (0);
3579
3580                 /*
3581                  * See comment in DIF_VAR_PID.
3582                  */
3583                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3584                         return (0);
3585
3586                 return ((uint64_t) dtrace_proc_selfruid());
3587
3588         case DIF_VAR_GID:
3589                 if (!dtrace_priv_proc(state))
3590                         return (0);
3591
3592                 /*
3593                  * See comment in DIF_VAR_PID.
3594                  */
3595                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3596                         return (0);
3597
3598                 if (dtrace_CRED() != NULL)
3599                         /* Credential does not require lazy initialization. */
3600                         return ((uint64_t)kauth_getgid());
3601                 else {
3602                         /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3603                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3604                         return -1ULL;
3605                 }
3606
3607         case DIF_VAR_ERRNO: {
3608                 uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
3609                 if (!dtrace_priv_proc(state))
3610                         return (0);
3611
3612                 /*
3613                  * See comment in DIF_VAR_PID.
3614                  */
3615                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3616                         return (0);
3617
3618                 if (uthread)
3619                         return (uint64_t)uthread->t_dtrace_errno;
3620                 else {
3621                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3622                         return -1ULL;
3623                 }
3624         }
3625
3626         default:
3627                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3628                 return (0);
3629         }
3630 }
3631
3632 /*
3633  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
3634  * Notice that we don't bother validating the proper number of arguments or
3635  * their types in the tuple stack.  This isn't needed because all argument
3636  * interpretation is safe because of our load safety -- the worst that can
3637  * happen is that a bogus program can obtain bogus results.
3638  */
3639 static void
3640 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
3641     dtrace_key_t *tupregs, int nargs,
3642     dtrace_mstate_t *mstate, dtrace_state_t *state)
3643 {
3644         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
3645         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
3646         dtrace_vstate_t *vstate = &state->dts_vstate;
3647
3648 #if !defined(__APPLE__)
3649         union {
3650                 mutex_impl_t mi;
3651                 uint64_t mx;
3652         } m;
3653
3654         union {
3655                 krwlock_t ri;
3656                 uintptr_t rw;
3657         } r;
3658 #else
3659 /* FIXME: awaits lock/mutex work */
3660 #endif /* __APPLE__ */
3661
3662         switch (subr) {
3663         case DIF_SUBR_RAND:
3664                 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
3665                 break;
3666
3667 #if !defined(__APPLE__)
3668         case DIF_SUBR_MUTEX_OWNED:
3669                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3670                     mstate, vstate)) {
3671                         regs[rd] = 0;
3672                         break;
3673                 }
3674
3675                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3676                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
3677                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
3678                 else
3679                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
3680                 break;
3681
3682         case DIF_SUBR_MUTEX_OWNER:
3683                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3684                     mstate, vstate)) {
3685                         regs[rd] = 0;
3686                         break;
3687                 }
3688
3689                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3690                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
3691                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
3692                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
3693                 else
3694                         regs[rd] = 0;
3695                 break;
3696
3697         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
3698                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3699                     mstate, vstate)) {
3700                         regs[rd] = 0;
3701                         break;
3702                 }
3703
3704                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3705                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
3706                 break;
3707
3708         case DIF_SUBR_MUTEX_TYPE_SPIN:
3709                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
3710                     mstate, vstate)) {
3711                         regs[rd] = 0;
3712                         break;
3713                 }
3714
3715                 m.mx = dtrace_load64(tupregs[0].dttk_value);
3716                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
3717                 break;
3718
3719         case DIF_SUBR_RW_READ_HELD: {
3720                 uintptr_t tmp;
3721
3722                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
3723                     mstate, vstate)) {
3724                         regs[rd] = 0;
3725                         break;
3726                 }
3727
3728                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3729                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
3730                 break;
3731         }
3732
3733         case DIF_SUBR_RW_WRITE_HELD:
3734                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3735                     mstate, vstate)) {
3736                         regs[rd] = 0;
3737                         break;
3738                 }
3739
3740                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3741                 regs[rd] = _RW_WRITE_HELD(&r.ri);
3742                 break;
3743
3744         case DIF_SUBR_RW_ISWRITER:
3745                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
3746                     mstate, vstate)) {
3747                         regs[rd] = 0;
3748                         break;
3749                 }
3750
3751                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
3752                 regs[rd] = _RW_ISWRITER(&r.ri);
3753                 break;
3754 #else
3755 /* FIXME: awaits lock/mutex work */
3756 #endif /* __APPLE__ */
3757
3758         case DIF_SUBR_BCOPY: {
3759                 /*
3760                  * We need to be sure that the destination is in the scratch
3761                  * region -- no other region is allowed.
3762                  */
3763                 uintptr_t src = tupregs[0].dttk_value;
3764                 uintptr_t dest = tupregs[1].dttk_value;
3765                 size_t size = tupregs[2].dttk_value;
3766
3767                 if (!dtrace_inscratch(dest, size, mstate)) {
3768                         *flags |= CPU_DTRACE_BADADDR;
3769                         *illval = regs[rd];
3770                         break;
3771                 }
3772
3773                 if (!dtrace_canload(src, size, mstate, vstate)) {
3774                         regs[rd] = 0;
3775                         break;
3776                 }
3777
3778                 dtrace_bcopy((void *)src, (void *)dest, size);
3779                 break;
3780         }
3781
3782         case DIF_SUBR_ALLOCA:
3783         case DIF_SUBR_COPYIN: {
                     /*
                      * alloca() and copyin() share one arm: both reserve an
                      * 8-byte-aligned region of scratch and return its address
                      * in regs[rd].  The byte count comes from tupregs[0] for
                      * alloca() and from tupregs[1] for copyin(); copyin()
                      * additionally hands tupregs[0], dest and size to
                      * dtrace_copyin().  On any failed check regs[rd] is set
                      * to 0.
                      */
3784                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
3785                 uint64_t size =
3786                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
                     /* Alignment padding in front of dest plus the request. */
3787                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
3788
3789                 /*
3790                  * Check whether the user can access kernel memory
3791                  */
3792                 if (dtrace_priv_kernel(state) == 0) {
3793                         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
3794                         regs[rd] = 0;
3795                         break;
3796                 }
3797                 /*
3798                  * This action doesn't require any credential checks since
3799                  * probes will not activate in user contexts to which the
3800                  * enabling user does not have permissions.
3801                  */
3802
3803                 /*
3804                  * Rounding up the user allocation size could have overflowed
3805                  * a large, bogus allocation (like -1ULL) to 0.
3806                  */
3807                 if (scratch_size < size ||
3808                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
3809                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3810                         regs[rd] = 0;
3811                         break;
3812                 }
3813
                     /*
                      * For copyin() the copy is attempted only when
                      * dtrace_priv_proc() succeeds, with CPU_DTRACE_NOFAULT set
                      * around the call.  The scratch region is reserved and
                      * returned whether or not the copy was attempted.
                      */
3814                 if (subr == DIF_SUBR_COPYIN) {
3815                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3816                         if (dtrace_priv_proc(state))
3817                                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3818                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3819                 }
3820
                     /* Commit the reservation, alignment padding included. */
3821                 mstate->dtms_scratch_ptr += scratch_size;
3822                 regs[rd] = dest;
3823                 break;
3824         }
3825
3826         case DIF_SUBR_COPYINTO: {
                     /*
                      * copyinto(): call dtrace_copyin() with tupregs[0] as its
                      * first argument, tupregs[2] as the destination and
                      * tupregs[1] as the byte count.  The destination range
                      * must pass dtrace_inscratch(); otherwise
                      * CPU_DTRACE_BADADDR is raised and *illval is set to the
                      * current value of regs[rd].  This arm never writes
                      * regs[rd].
                      */
3827                 uint64_t size = tupregs[1].dttk_value;
3828                 uintptr_t dest = tupregs[2].dttk_value;
3829
3830                 /*
3831                  * This action doesn't require any credential checks since
3832                  * probes will not activate in user contexts to which the
3833                  * enabling user does not have permissions.
3834                  */
3835                 if (!dtrace_inscratch(dest, size, mstate)) {
3836                         *flags |= CPU_DTRACE_BADADDR;
3837                         *illval = regs[rd];
3838                         break;
3839                 }
3840
                     /* The copy is skipped when dtrace_priv_proc() fails. */
3841                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3842                 if (dtrace_priv_proc(state))
3843                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
3844                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3845                 break;
3846         }
3847
3848         case DIF_SUBR_COPYINSTR: {
                     /*
                      * copyinstr(): place a string at the current scratch
                      * pointer and return that address in regs[rd].  The
                      * buffer size defaults to the strsize option; an optional
                      * second argument smaller than strsize shrinks it to that
                      * value plus one.  regs[rd] is 0 when scratch space is
                      * insufficient.
                      */
3849                 uintptr_t dest = mstate->dtms_scratch_ptr;
3850                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3851
3852                 if (nargs > 1 && tupregs[1].dttk_value < size)
3853                         size = tupregs[1].dttk_value + 1;
3854
3855                 /*
3856                  * This action doesn't require any credential checks since
3857                  * probes will not activate in user contexts to which the
3858                  * enabling user does not have permissions.
3859                  */
3860                 if (!DTRACE_INSCRATCH(mstate, size)) {
3861                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3862                         regs[rd] = 0;
3863                         break;
3864                 }
3865
                     /* The copy is skipped when dtrace_priv_proc() fails. */
3866                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3867                 if (dtrace_priv_proc(state))
3868                         dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
3869                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3870
                     /*
                      * The last byte of the reservation is always forced to
                      * NUL, whether or not the copy above ran.
                      */
3871                 ((char *)dest)[size - 1] = '\0';
3872                 mstate->dtms_scratch_ptr += size;
3873                 regs[rd] = dest;
3874                 break;
3875         }
3876
3877         case DIF_SUBR_MSGSIZE:
3878         case DIF_SUBR_MSGDSIZE: {
3879                 /* Darwin does not implement SysV streams messages */
                     /* Both subroutines raise CPU_DTRACE_ILLOP and yield 0. */
3880                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3881                 regs[rd] = 0;
3882                 break;
3883         }
3884
3885         case DIF_SUBR_PROGENYOF: {
                     /*
                      * progenyof(pid): yield 1 if the process id in tupregs[0]
                      * matches the current process or any process reached by
                      * following p_pptr links from it, otherwise 0.  Every read
                      * of a proc field goes through dtrace_load32() or
                      * dtrace_loadptr(), and the walk is abandoned as soon as
                      * CPU_DTRACE_FAULT is seen in *flags.
                      */
3886                 pid_t pid = tupregs[0].dttk_value;
3887                 struct proc *p = current_proc();
                     /* lim caps the walk at nprocs iterations. */
3888                 int rval = 0, lim = nprocs;
3889
3890                 while(p && (lim-- > 0)) {
                             /* Despite its name, ppid holds p's own p_pid. */
3891                         pid_t ppid;
3892
3893                         ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
3894                         if (*flags & CPU_DTRACE_FAULT)
3895                                 break;
3896
3897                         if (ppid == pid) {
3898                                 rval = 1;
3899                                 break;
3900                         }
3901
3902                         if (ppid == 0)
3903                                 break; /* Can't climb process tree any further. */
3904
3905                         p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
3906                         if (*flags & CPU_DTRACE_FAULT)
3907                                 break;
3908                 }
3909
3910                 regs[rd] = rval;
3911                 break;
3912         }
3913
3914         case DIF_SUBR_SPECULATION:
                     /* speculation(): the result is dtrace_speculation(state). */
3915                 regs[rd] = dtrace_speculation(state);
3916                 break;
3917
3918
3919         case DIF_SUBR_COPYOUT: {
                     /*
                      * copyout(): pass tupregs[0] (kaddr), tupregs[1] (uaddr)
                      * and tupregs[2] (size) to dtrace_copyout().  The call is
                      * made only if destructive actions are not disallowed,
                      * dtrace_priv_proc_control() succeeds, the kernel range is
                      * not toxic and it passes dtrace_canload().  If any of
                      * those fails the arm does nothing here.  regs[rd] is
                      * never written.
                      */
3920                 uintptr_t kaddr = tupregs[0].dttk_value;
3921                 user_addr_t uaddr = tupregs[1].dttk_value;
3922                 uint64_t size = tupregs[2].dttk_value;
3923
3924                 if (!dtrace_destructive_disallow &&
3925                     dtrace_priv_proc_control(state) &&
3926                     !dtrace_istoxic(kaddr, size) &&
3927                     dtrace_canload(kaddr, size, mstate, vstate)) {
3928                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3929                         dtrace_copyout(kaddr, uaddr, size, flags);
3930                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3931                 }
3932                 break;
3933         }
3934
3935         case DIF_SUBR_COPYOUTSTR: {
                     /*
                      * copyoutstr(): string counterpart of copyout().  The same
                      * gating applies, except that the load check is
                      * dtrace_strcanload(), which also produces lim; lim (not
                      * the caller's size) is the length handed to
                      * dtrace_copyoutstr().  regs[rd] is never written.
                      */
3936                 uintptr_t kaddr = tupregs[0].dttk_value;
3937                 user_addr_t uaddr = tupregs[1].dttk_value;
3938                 uint64_t size = tupregs[2].dttk_value;
3939                 size_t lim;
3940
3941                 if (!dtrace_destructive_disallow &&
3942                     dtrace_priv_proc_control(state) &&
3943                     !dtrace_istoxic(kaddr, size) &&
3944                     dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
3945                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3946                         dtrace_copyoutstr(kaddr, uaddr, lim, flags);
3947                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3948                 }
3949                 break;
3950         }
3951
3952         case DIF_SUBR_STRLEN: {
                     /*
                      * strlen(): measure the string at tupregs[0].  The
                      * dtrace_strcanload() check is made with the strsize
                      * option as the size and supplies lim, the bound then
                      * given to dtrace_strlen().  A failed check yields 0.
                      */
3953                 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
3954                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
3955                 size_t lim;
3956
3957                 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
3958                         regs[rd] = 0;
3959                         break;
3960                 }
3961
3962                 regs[rd] = dtrace_strlen((char *)addr, lim);
3963
3964                 break;
3965         }
3966
3967         case DIF_SUBR_STRCHR:
3968         case DIF_SUBR_STRRCHR: {
3969                 /*
3970                  * We're going to iterate over the string looking for the
3971                  * specified character.  We will iterate until we have reached
3972                  * the string length or we have found the character.  If this
3973                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
3974                  * of the specified character instead of the first.
3975                  */
                     /*
                      * The result in regs[rd] is the address of the matching
                      * character, or 0 if there is no match or the string fails
                      * dtrace_strcanload().
                      */
3976                 uintptr_t addr = tupregs[0].dttk_value;
3977                 uintptr_t addr_limit;
3978                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3979                 size_t lim;
3980                 char c, target = (char)tupregs[1].dttk_value;
3981
3982                 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
3983                         regs[rd] = 0;
3984                         break;
3985                 }
                     /* Never scan beyond the lim bytes that were checked. */
3986                 addr_limit = addr + lim;
3987
3988                 for (regs[rd] = 0; addr < addr_limit; addr++) {
3989                         if ((c = dtrace_load8(addr)) == target) {
3990                                 regs[rd] = addr;
3991
                                     /* strrchr() keeps going to find a later match. */
3992                                 if (subr == DIF_SUBR_STRCHR)
3993                                         break;
3994                         }
3995
                             /*
                              * The terminator is tested after the match test, so
                              * a target of '\0' matches the terminating NUL.
                              */
3996                         if (c == '\0')
3997                                 break;
3998                 }
3999
4000                 break;
4001         }
4002
4003         case DIF_SUBR_STRSTR:
4004         case DIF_SUBR_INDEX:
4005         case DIF_SUBR_RINDEX: {
4006                 /*
4007                  * We're going to iterate over the string looking for the
4008                  * specified string.  We will iterate until we have reached
4009                  * the string length or we have found the string.  (Yes, this
4010                  * is done in the most naive way possible -- but considering
4011                  * that the string we're searching for is likely to be
4012                  * relatively short, the complexity of Rabin-Karp or similar
4013                  * hardly seems merited.)
4014                  */
                     /*
                      * Results: strstr() yields the address of the match, or 0
                      * if there is none; index() and rindex() yield the
                      * zero-based offset of the match, or -1 if there is none.
                      * If either string fails dtrace_canload(), the result is 0
                      * for all three.
                      */
4015                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4016                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4017                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4018                 size_t len = dtrace_strlen(addr, size);
4019                 size_t sublen = dtrace_strlen(substr, size);
4020                 char *limit = addr + len, *orig = addr;
4021                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4022                 int inc = 1;
4023
4024                 regs[rd] = notfound;
4025
4026                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4027                         regs[rd] = 0;
4028                         break;
4029                 }
4030
4031                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4032                     vstate)) {
4033                         regs[rd] = 0;
4034                         break;
4035                 }
4036
4037                 /*
4038                  * strstr() and index()/rindex() have similar semantics if
4039                  * both strings are the empty string: strstr() returns a
4040                  * pointer to the (empty) string, and index() and rindex()
4041                  * both return index 0 (regardless of any position argument).
4042                  */
4043                 if (sublen == 0 && len == 0) {
4044                         if (subr == DIF_SUBR_STRSTR)
4045                                 regs[rd] = (uintptr_t)addr;
4046                         else
4047                                 regs[rd] = 0;
4048                         break;
4049                 }
4050
4051                 if (subr != DIF_SUBR_STRSTR) {
                             /*
                              * rindex() scans backwards: it starts at the end of
                              * the string and stops once addr has stepped to one
                              * position before the first character.
                              */
4052                         if (subr == DIF_SUBR_RINDEX) {
4053                                 limit = orig - 1;
4054                                 addr += len;
4055                                 inc = -1;
4056                         }
4057
4058                         /*
4059                          * Both index() and rindex() take an optional position
4060                          * argument that denotes the starting position.
4061                          */
4062                         if (nargs == 3) {
4063                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
4064
4065                                 /*
4066                                  * If the position argument to index() is
4067                                  * negative, Perl implicitly clamps it at
4068                                  * zero.  This semantic is a little surprising
4069                                  * given the special meaning of negative
4070                                  * positions to similar Perl functions like
4071                                  * substr(), but it appears to reflect a
4072                                  * notion that index() can start from a
4073                                  * negative index and increment its way up to
4074                                  * the string.  Given this notion, Perl's
4075                                  * rindex() is at least self-consistent in
4076                                  * that it implicitly clamps positions greater
4077                                  * than the string length to be the string
4078                                  * length.  Where Perl completely loses
4079                                  * coherence, however, is when the specified
4080                                  * substring is the empty string ("").  In
4081                                  * this case, even if the position is
4082                                  * negative, rindex() returns 0 -- and even if
4083                                  * the position is greater than the length,
4084                                  * index() returns the string length.  These
4085                                  * semantics violate the notion that index()
4086                                  * should never return a value less than the
4087                                  * specified position and that rindex() should
4088                                  * never return a value greater than the
4089                                  * specified position.  (One assumes that
4090                                  * these semantics are artifacts of Perl's
4091                                  * implementation and not the results of
4092                                  * deliberate design -- it beggars belief that
4093                                  * even Larry Wall could desire such oddness.)
4094                                  * While in the abstract one would wish for
4095                                  * consistent position semantics across
4096                                  * substr(), index() and rindex() -- or at the
4097                                  * very least self-consistent position
4098                                  * semantics for index() and rindex() -- we
4099                                  * instead opt to keep with the extant Perl
4100                                  * semantics, in all their broken glory.  (Do
4101                                  * we have more desire to maintain Perl's
4102                                  * semantics than Perl does?  Probably.)
4103                                  */
4104                                 if (subr == DIF_SUBR_RINDEX) {
4105                                         if (pos < 0) {
4106                                                 if (sublen == 0)
4107                                                         regs[rd] = 0;
4108                                                 break;
4109                                         }
4110
4111                                         if ((size_t)pos > len)
4112                                                 pos = len;
4113                                 } else {
4114                                         if (pos < 0)
4115                                                 pos = 0;
4116
4117                                         if ((size_t)pos >= len) {
4118                                                 if (sublen == 0)
4119                                                         regs[rd] = len;
4120                                                 break;
4121                                         }
4122                                 }
4123
4124                                 addr = orig + pos;
4125                         }
4126                 }
4127
                     /*
                      * addr steps by inc (+1, or -1 for rindex()) until it
                      * reaches limit; the first position at which sublen bytes
                      * compare equal to substr is the result.
                      */
4128                 for (regs[rd] = notfound; addr != limit; addr += inc) {
4129                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
4130                                 if (subr != DIF_SUBR_STRSTR) {
4131                                         /*
4132                                          * As D index() and rindex() are
4133                                          * modeled on Perl (and not on awk),
4134                                          * we return a zero-based (and not a
4135                                          * one-based) index.  (For you Perl
4136                                          * weenies: no, we're not going to add
4137                                          * $[ -- and shouldn't you be at a con
4138                                          * or something?)
4139                                          */
4140                                         regs[rd] = (uintptr_t)(addr - orig);
4141                                         break;
4142                                 }
4143
4144                                 ASSERT(subr == DIF_SUBR_STRSTR);
4145                                 regs[rd] = (uintptr_t)addr;
4146                                 break;
4147                         }
4148                 }
4149
4150                 break;
4151         }
4152
4153         case DIF_SUBR_STRTOK: {
                     /*
                      * strtok(): tupregs[0] is the string to tokenize (0 means
                      * "continue from the position saved in mstate") and
                      * tupregs[1] is the string of delimiter characters.  The
                      * next token is copied into a strsize-byte scratch buffer
                      * whose address is returned in regs[rd]; 0 is returned
                      * when a load or scratch check fails or when the end of
                      * the string is reached while skipping delimiters.
                      */
4154                 uintptr_t addr = tupregs[0].dttk_value;
4155                 uintptr_t tokaddr = tupregs[1].dttk_value;
4156                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4157                 uintptr_t limit, toklimit;
4158                 size_t clim;
4159                 char *dest = (char *)mstate->dtms_scratch_ptr;
                     /* tokmap is a 256-bit set: one bit per possible byte value. */
4160                 uint8_t c='\0', tokmap[32];      /* 256 / 8 */
4161                 uint64_t i = 0;
4162
4163                 /*
4164                  * Check both the token buffer and (later) the input buffer,
4165                  * since both could be non-scratch addresses.
4166                  */
4167                 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4168                         regs[rd] = 0;
4169                         break;
4170                 }
4171                 toklimit = tokaddr + clim;
4172
4173                 if (!DTRACE_INSCRATCH(mstate, size)) {
4174                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4175                         regs[rd] = 0;
4176                         break;
4177                 }
4178
4179                 if (addr == 0) {
4180                         /*
4181                          * If the address specified is NULL, we use our saved
4182                          * strtok pointer from the mstate.  Note that this
4183                          * means that the saved strtok pointer is _only_
4184                          * valid within multiple enablings of the same probe --
4185                          * it behaves like an implicit clause-local variable.
4186                          */
4187                         addr = mstate->dtms_strtok;
4188                         limit = mstate->dtms_strtok_limit;
4189                 } else {
4190                         /*
4191                          * If the user-specified address is non-NULL we must
4192                          * access check it.  This is the only time we have
4193                          * a chance to do so, since this address may reside
4194                          * in the string table of this clause-- future calls
4195                          * (when we fetch addr from mstate->dtms_strtok)
4196                          * would fail this access check.
4197                          */
4198                         if (!dtrace_strcanload(addr, size, &clim, mstate,
4199                                 vstate)) {
4200                                 regs[rd] = 0;
4201                                 break;
4202                         }
4203                         limit = addr + clim;
4204                 }
4205
4206                 /*
4207                  * First, zero the token map, and then process the token
4208                  * string -- setting a bit in the map for every character
4209                  * found in the token string.
4210                  */
4211                 for (i = 0; i < (int)sizeof (tokmap); i++)
4212                         tokmap[i] = 0;
4213
4214                 for (; tokaddr < toklimit; tokaddr++) {
4215                         if ((c = dtrace_load8(tokaddr)) == '\0')
4216                                 break;
4217
4218                         ASSERT((c >> 3) < sizeof (tokmap));
4219                         tokmap[c >> 3] |= (1 << (c & 0x7));
4220                 }
4221
                     /* Skip any leading delimiter characters. */
4222                 for (; addr < limit; addr++) {
4223                         /*
4224                          * We're looking for a character that is _not_
4225                          * contained in the token string.
4226                          */
4227                         if ((c = dtrace_load8(addr)) == '\0')
4228                                 break;
4229
4230                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4231                                 break;
4232                 }
4233
                     /*
                      * NOTE(review): if the loop above ends because addr reached
                      * limit rather than on a NUL, c still holds whatever byte
                      * was loaded last (possibly a delimiter), so the test below
                      * does not take the NULL-return path and an empty token is
                      * produced instead -- confirm this is intended.
                      */
4234                 if (c == '\0') {
4235                         /*
4236                          * We reached the end of the string without finding
4237                          * any character that was not in the token string.
4238                          * We return NULL in this case, and we set the saved
4239                          * address to NULL as well.
4240                          */
4241                         regs[rd] = 0;
4242                         mstate->dtms_strtok = 0;
4243                         mstate->dtms_strtok_limit = 0;
4244                         break;
4245                 }
4246
4247                 /*
4248                  * From here on, we're copying into the destination string.
4249                  */
                     /* At most size - 1 bytes are copied, leaving room for the NUL. */
4250                 for (i = 0; addr < limit && i < size - 1; addr++) {
4251                         if ((c = dtrace_load8(addr)) == '\0')
4252                                 break;
4253
4254                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
4255                                 break;
4256
4257                         ASSERT(i < size);
4258                         dest[i++] = c;
4259                 }
4260
4261                 ASSERT(i < size);
4262                 dest[i] = '\0';
4263                 regs[rd] = (uintptr_t)dest;
                     /*
                      * The full size bytes are consumed from scratch, and the
                      * stopping position is saved for a later call with a
                      * NULL string argument.
                      */
4264                 mstate->dtms_scratch_ptr += size;
4265                 mstate->dtms_strtok = addr;
4266                 mstate->dtms_strtok_limit = limit;
4267                 break;
4268         }
4269
4270         case DIF_SUBR_SUBSTR: {
                     /*
                      * substr(): copy part of the string at tupregs[0] into a
                      * strsize-byte scratch buffer and return that buffer in
                      * regs[rd].  tupregs[1] is the starting index and the
                      * optional tupregs[2] the length; without a length the
                      * copy runs to the end of the string (bounded by strsize).
                      * A negative index counts back from the end of the string,
                      * and a negative length drops that many characters from
                      * the end.  The result is 0 if the source fails
                      * dtrace_canload() or scratch space is insufficient.
                      */
4271                 uintptr_t s = tupregs[0].dttk_value;
4272                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4273                 char *d = (char *)mstate->dtms_scratch_ptr;
4274                 int64_t index = (int64_t)tupregs[1].dttk_value;
4275                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4276                 size_t len = dtrace_strlen((char *)s, size);
4277                 int64_t i = 0;
4278
4279                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4280                         regs[rd] = 0;
4281                         break;
4282                 }
4283
4284                 if (!DTRACE_INSCRATCH(mstate, size)) {
4285                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4286                         regs[rd] = 0;
4287                         break;
4288                 }
4289
4290                 if (nargs <= 2)
4291                         remaining = (int64_t)size;
4292
                     /*
                      * Translate a negative index into an offset from the end.
                      * If it still points before the start but the requested
                      * span reaches into the string, trim the span and start
                      * at offset 0.
                      */
4293                 if (index < 0) {
4294                         index += len;
4295
4296                         if (index < 0 && index + remaining > 0) {
4297                                 remaining += index;
4298                                 index = 0;
4299                         }
4300                 }
4301
4302                 if ((size_t)index >= len || index < 0) {
4303                         remaining = 0;
4304                 } else if (remaining < 0) {
4305                         remaining += len - index;
4306                 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4307                         remaining = size - index;
4308                 }
4309
                     /*
                      * Copy at most size - 1 characters so that the terminating
                      * NUL written below always lands inside the size bytes
                      * reserved in scratch, even when remaining equals size and
                      * the source has no NUL within its first size bytes.
                      */
4310                 for (i = 0; i < remaining && (uint64_t)i < size - 1; i++) {
4311                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4312                                 break;
4313                 }
4314
4315                 d[i] = '\0';
4316
4317                 mstate->dtms_scratch_ptr += size;
4318                 regs[rd] = (uintptr_t)d;
4319                 break;
4320         }
4321
4322         case DIF_SUBR_GETMAJOR:
                     /* getmajor(): apply major() to tupregs[0] taken as a dev_t. */
4323                 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4324                 break;
4325
4326         case DIF_SUBR_GETMINOR:
                     /* getminor(): apply minor() to tupregs[0] taken as a dev_t. */
4327                 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4328                 break;
4329
4330         case DIF_SUBR_DDI_PATHNAME: {
4331                 /* APPLE NOTE: currently unsupported on Darwin */
                     /* Raises CPU_DTRACE_ILLOP and yields 0. */
4332                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4333                 regs[rd] = 0;
4334                 break;
4335         }
4336
4337         case DIF_SUBR_STRJOIN: {
                     /*
                      * strjoin(): concatenate the strings at tupregs[0] and
                      * tupregs[1] into scratch and return the scratch address
                      * in regs[rd].  The result, including its terminating NUL,
                      * must fit in strsize bytes; otherwise
                      * CPU_DTRACE_NOSCRATCH is raised and regs[rd] is 0.  A
                      * failed dtrace_strcanload() on either input also yields 0.
                      */
4338                 char *d = (char *)mstate->dtms_scratch_ptr;
4339                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4340                 uintptr_t s1 = tupregs[0].dttk_value;
4341                 uintptr_t s2 = tupregs[1].dttk_value;
                     /* i indexes the output; j counts bytes consumed from s2. */
4342                 uint64_t i = 0, j = 0;
4343                 size_t lim1, lim2;
4344                 char c;
4345
4346                 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4347                     !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4348                         regs[rd] = 0;
4349                         break;
4350                 }
4351
4352                 if (!DTRACE_INSCRATCH(mstate, size)) {
4353                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4354                         regs[rd] = 0;
4355                         break;
4356                 }
4357
                     /*
                      * Copy s1.  Reads stop after lim1 bytes, at which point a
                      * NUL is substituted.  On the NUL, i is stepped back so
                      * that s2 overwrites it.
                      */
4358                 for (;;) {
4359                         if (i >= size) {
4360                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4361                                 regs[rd] = 0;
4362                                 break;
4363                         }
4364                         c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4365                         if ((d[i++] = c) == '\0') {
4366                                 i--;
4367                                 break;
4368                         }
4369                 }
4370
                     /*
                      * Append s2, this time keeping the terminating NUL.  If the
                      * first loop already ran out of room, i >= size still holds
                      * and this loop exits immediately the same way.
                      */
4371                 for (;;) {
4372                         if (i >= size) {
4373                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4374                                 regs[rd] = 0;
4375                                 break;
4376                         }
4377                         c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4378                         if ((d[i++] = c) == '\0')
4379                                 break;
4380                 }
4381
                     /*
                      * On success only the i bytes actually written (the NUL
                      * included) are consumed from scratch.
                      */
4382                 if (i < size) {
4383                         mstate->dtms_scratch_ptr += i;
4384                         regs[rd] = (uintptr_t)d;
4385                 }
4386
4387                 break;
4388         }
4389
4390         case DIF_SUBR_LLTOSTR: {
                     /*
                      * lltostr(): format tupregs[0] as a string in a 65-byte
                      * scratch buffer.  The optional tupregs[1] is the base; it
                      * defaults to 10 and must be in the range 2..36, otherwise
                      * CPU_DTRACE_ILLOP is raised.  Characters are produced
                      * right to left starting from the end of the buffer, and
                      * regs[rd] receives the address of the first character.
                      * Only base 10 prints a sign; every other base formats the
                      * value's 64-bit pattern as unsigned.  regs[rd] is 0 when
                      * scratch space is insufficient.
                      */
4391                 int64_t i = (int64_t)tupregs[0].dttk_value;
4392                 uint64_t val, digit;
4393                 uint64_t size = 65;     /* enough room for 2^64 in binary */
4394                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4395                 int base = 10;
4396
4397                 if (nargs > 1) {
4398                         if ((base = tupregs[1].dttk_value) <= 1 ||
4399                              base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4400                                 *flags |= CPU_DTRACE_ILLOP;
4401                                 break;
4402                         }
4403                 }
4404
                     /*
                      * Take the magnitude in unsigned arithmetic: negating the
                      * signed value directly would overflow for INT64_MIN.
                      */
4405                 val = (base == 10 && i < 0) ? 0 - (uint64_t)i : (uint64_t)i;
4406
4407                 if (!DTRACE_INSCRATCH(mstate, size)) {
4408                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4409                         regs[rd] = 0;
4410                         break;
4411                 }
4412
                     /* Digits above 9 are written as lowercase letters from 'a'. */
4413                 for (*end-- = '\0'; val; val /= base) {
4414                         if ((digit = val % base) <= '9' - '0') {
4415                                 *end-- = '0' + digit;
4416                         } else {
4417                                 *end-- = 'a' + (digit - ('9' - '0') - 1);
4418                         }
4419                 }
4420
                     /*
                      * Prefixes, still written right to left: a zero value gets
                      * its single "0" digit (the loop above emitted nothing),
                      * base 16 gets "0x" and base 8 gets a leading "0".  Zero in
                      * base 16 therefore comes out as "0x0".
                      */
4421                 if (i == 0 && base == 16)
4422                         *end-- = '0';
4423
4424                 if (base == 16)
4425                         *end-- = 'x';
4426
4427                 if (i == 0 || base == 8 || base == 16)
4428                         *end-- = '0';
4429
4430                 if (i < 0 && base == 10)
4431                         *end-- = '-';
4432
4433                 regs[rd] = (uintptr_t)end + 1;
4434                 mstate->dtms_scratch_ptr += size;
4435                 break;
4436         }
4437
4438         case DIF_SUBR_HTONS:
4439         case DIF_SUBR_NTOHS:
                     /*
                      * htons()/ntohs(): tupregs[0] is truncated to 16 bits.  When
                      * _BIG_ENDIAN is defined it is returned as is; otherwise it
                      * is passed through DT_BSWAP_16().
                      */
4440 #ifdef _BIG_ENDIAN
4441                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4442 #else
4443                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
4444 #endif
4445                 break;
4446
4447
4448         case DIF_SUBR_HTONL:
4449         case DIF_SUBR_NTOHL:
                     /*
                      * htonl()/ntohl(): tupregs[0] is truncated to 32 bits.  When
                      * _BIG_ENDIAN is defined it is returned as is; otherwise it
                      * is passed through DT_BSWAP_32().
                      */
4450 #ifdef _BIG_ENDIAN
4451                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
4452 #else
4453                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
4454 #endif
4455                 break;
4456
4457
4458         case DIF_SUBR_HTONLL:
4459         case DIF_SUBR_NTOHLL:
                     /*
                      * htonll()/ntohll(): the full 64-bit tupregs[0] is used.
                      * When _BIG_ENDIAN is defined it is returned as is;
                      * otherwise it is passed through DT_BSWAP_64().
                      */
4460 #ifdef _BIG_ENDIAN
4461                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
4462 #else
4463                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
4464 #endif
4465                 break;
4466
4467
4468         case DIF_SUBR_DIRNAME:
4469         case DIF_SUBR_BASENAME: {
4470                 char *dest = (char *)mstate->dtms_scratch_ptr;
4471                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4472                 uintptr_t src = tupregs[0].dttk_value;
4473                 int i, j, len = dtrace_strlen((char *)src, size);
4474                 int lastbase = -1, firstbase = -1, lastdir = -1;
4475                 int start, end;
4476
4477                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4478                         regs[rd] = 0;
4479                         break;
4480                 }
4481
4482                 if (!DTRACE_INSCRATCH(mstate, size)) {
4483                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4484                         regs[rd] = 0;
4485                         break;
4486                 }
4487
4488                 /*
4489                  * The basename and dirname for a zero-length string is
4490                  * defined to be "."
4491                  */
4492                 if (len == 0) {
4493                         len = 1;
4494                         src = (uintptr_t)".";
4495                 }
4496
4497                 /*
4498                  * Start from the back of the string, moving back toward the
4499                  * front until we see a character that isn't a slash.  That
4500                  * character is the last character in the basename.
4501                  */
4502                 for (i = len - 1; i >= 0; i--) {
4503                         if (dtrace_load8(src + i) != '/')
4504                                 break;
4505                 }
4506
4507                 if (i >= 0)
4508                         lastbase = i;
4509
4510                 /*
4511                  * Starting from the last character in the basename, move
4512                  * towards the front until we find a slash.  The character
4513                  * that we processed immediately before that is the first
4514                  * character in the basename.
4515                  */
4516                 for (; i >= 0; i--) {
4517                         if (dtrace_load8(src + i) == '/')
4518                                 break;
4519                 }
4520
4521                 if (i >= 0)
4522                         firstbase = i + 1;
4523
4524                 /*
4525                  * Now keep going until we find a non-slash character.  That
4526                  * character is the last character in the dirname.
4527                  */
4528                 for (; i >= 0; i--) {
4529                         if (dtrace_load8(src + i) != '/')
4530                                 break;
4531                 }
4532
4533                 if (i >= 0)
4534                         lastdir = i;
4535
4536                 ASSERT(!(lastbase == -1 && firstbase != -1));
4537                 ASSERT(!(firstbase == -1 && lastdir != -1));
4538
4539                 if (lastbase == -1) {
4540                         /*
4541                          * We didn't find a non-slash character.  We know that
4542                          * the length is non-zero, so the whole string must be
4543                          * slashes.  In either the dirname or the basename
4544                          * case, we return '/'.
4545                          */
4546                         ASSERT(firstbase == -1);
4547                         firstbase = lastbase = lastdir = 0;
4548                 }
4549
4550                 if (firstbase == -1) {
4551                         /*
4552                          * The entire string consists only of a basename
4553                          * component.  If we're looking for dirname, we need
4554                          * to change our string to be just "."; if we're
4555                          * looking for a basename, we'll just set the first
4556                          * character of the basename to be 0.
4557                          */
4558                         if (subr == DIF_SUBR_DIRNAME) {
4559                                 ASSERT(lastdir == -1);
4560                                 src = (uintptr_t)".";
4561                                 lastdir = 0;
4562                         } else {
4563                                 firstbase = 0;
4564                         }
4565                 }
4566
4567                 if (subr == DIF_SUBR_DIRNAME) {
4568                         if (lastdir == -1) {
4569                                 /*
4570                                  * We know that we have a slash in the name --
4571                                  * or lastdir would be set to 0, above.  And
4572                                  * because lastdir is -1, we know that this
4573                                  * slash must be the first character.  (That
4574                                  * is, the full string must be of the form
4575                                  * "/basename".)  In this case, the last
4576                                  * character of the directory name is 0.
4577                                  */
4578                                 lastdir = 0;
4579                         }
4580
4581                         start = 0;
4582                         end = lastdir;
4583                 } else {
4584                         ASSERT(subr == DIF_SUBR_BASENAME);
4585                         ASSERT(firstbase != -1 && lastbase != -1);
4586                         start = firstbase;
4587                         end = lastbase;
4588                 }
4589
4590                 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
4591                         dest[j] = dtrace_load8(src + i);
4592
4593                 dest[j] = '\0';
4594                 regs[rd] = (uintptr_t)dest;
4595                 mstate->dtms_scratch_ptr += size;
4596                 break;
4597         }
4598
4599         case DIF_SUBR_CLEANPATH: {
4600                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4601                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4602                 uintptr_t src = tupregs[0].dttk_value;
4603                 size_t lim;
4604                 size_t i = 0, j = 0;
4605
4606                 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
4607                         regs[rd] = 0;
4608                         break;
4609                 }
4610
4611                 if (!DTRACE_INSCRATCH(mstate, size)) {
4612                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4613                         regs[rd] = 0;
4614                         break;
4615                 }
4616
4617                 /*
4618                  * Move forward, loading each character.
4619                  */
4620                 do {
4621                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4622 next:
4623                         if ((uint64_t)(j + 5) >= size)  /* 5 = strlen("/..c\0") */
4624                                 break;
4625
4626                         if (c != '/') {
4627                                 dest[j++] = c;
4628                                 continue;
4629                         }
4630
4631                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4632
4633                         if (c == '/') {
4634                                 /*
4635                                  * We have two slashes -- we can just advance
4636                                  * to the next character.
4637                                  */
4638                                 goto next;
4639                         }
4640
4641                         if (c != '.') {
4642                                 /*
4643                                  * This is not "." and it's not ".." -- we can
4644                                  * just store the "/" and this character and
4645                                  * drive on.
4646                                  */
4647                                 dest[j++] = '/';
4648                                 dest[j++] = c;
4649                                 continue;
4650                         }
4651
4652                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4653
4654                         if (c == '/') {
4655                                 /*
4656                                  * This is a "/./" component.  We're not going
4657                                  * to store anything in the destination buffer;
4658                                  * we're just going to go to the next component.
4659                                  */
4660                                 goto next;
4661                         }
4662
4663                         if (c != '.') {
4664                                 /*
4665                                  * This is not ".." -- we can just store the
4666                                  * "/." and this character and continue
4667                                  * processing.
4668                                  */
4669                                 dest[j++] = '/';
4670                                 dest[j++] = '.';
4671                                 dest[j++] = c;
4672                                 continue;
4673                         }
4674
4675                         c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
4676
4677                         if (c != '/' && c != '\0') {
4678                                 /*
4679                                  * This is not ".." -- it's "..[mumble]".
4680                                  * We'll store the "/.." and this character
4681                                  * and continue processing.
4682                                  */
4683                                 dest[j++] = '/';
4684                                 dest[j++] = '.';
4685                                 dest[j++] = '.';
4686                                 dest[j++] = c;
4687                                 continue;
4688                         }
4689
4690                         /*
4691                          * This is "/../" or "/..\0".  We need to back up
4692                          * our destination pointer until we find a "/".
4693                          */
4694                         i--;
4695                         while (j != 0 && dest[--j] != '/')
4696                                 continue;
4697
4698                         if (c == '\0')
4699                                 dest[++j] = '/';
4700                 } while (c != '\0');
4701
4702                 dest[j] = '\0';
4703                 regs[rd] = (uintptr_t)dest;
4704                 mstate->dtms_scratch_ptr += size;
4705                 break;
4706         }
4707
4708         case DIF_SUBR_INET_NTOA:
4709         case DIF_SUBR_INET_NTOA6:
4710         case DIF_SUBR_INET_NTOP: {
4711                 size_t size;
4712                 int af, argi, i;
4713                 char *base, *end;
4714
4715                 if (subr == DIF_SUBR_INET_NTOP) {
4716                         af = (int)tupregs[0].dttk_value;
4717                         argi = 1;
4718                 } else {
4719                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
4720                         argi = 0;
4721                 }
4722
4723                 if (af == AF_INET) {
4724 #if !defined(__APPLE__)
4725                         ipaddr_t ip4;
4726 #else
4727                         uint32_t ip4;
4728 #endif /* __APPLE__ */
4729                         uint8_t *ptr8, val;
4730
4731                         /*
4732                          * Safely load the IPv4 address.
4733                          */
4734 #if !defined(__APPLE__)
4735                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
4736 #else
4737                         if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
4738                                 mstate, vstate)) {
4739                                 regs[rd] = 0;
4740                                 break;
4741                         }
4742
4743                         dtrace_bcopy(
4744                             (void *)(uintptr_t)tupregs[argi].dttk_value,
4745                             (void *)(uintptr_t)&ip4, sizeof (ip4));
4746 #endif /* __APPLE__ */
4747                         /*
4748                          * Check an IPv4 string will fit in scratch.
4749                          */
4750 #if !defined(__APPLE__)
4751                         size = INET_ADDRSTRLEN;
4752 #else
4753                         size = MAX_IPv4_STR_LEN;
4754 #endif /* __APPLE__ */
4755                         if (!DTRACE_INSCRATCH(mstate, size)) {
4756                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4757                                 regs[rd] = 0;
4758                                 break;
4759                         }
4760                         base = (char *)mstate->dtms_scratch_ptr;
4761                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4762
4763                         /*
4764                          * Stringify as a dotted decimal quad.
4765                          */
4766                         *end-- = '\0';
4767                         ptr8 = (uint8_t *)&ip4;
4768                         for (i = 3; i >= 0; i--) {
4769                                 val = ptr8[i];
4770
4771                                 if (val == 0) {
4772                                         *end-- = '0';
4773                                 } else {
4774                                         for (; val; val /= 10) {
4775                                                 *end-- = '0' + (val % 10);
4776                                         }
4777                                 }
4778
4779                                 if (i > 0)
4780                                         *end-- = '.';
4781                         }
4782                         ASSERT(end + 1 >= base);
4783
4784                 } else if (af == AF_INET6) {
4785 #if defined(__APPLE__)
4786 #define _S6_un __u6_addr
4787 #define _S6_u8 __u6_addr8
4788 #endif /* __APPLE__ */
4789                         struct in6_addr ip6;
4790                         int firstzero, tryzero, numzero, v6end;
4791                         uint16_t val;
4792                         const char digits[] = "0123456789abcdef";
4793
4794                         /*
4795                          * Stringify using RFC 1884 convention 2 - 16 bit
4796                          * hexadecimal values with a zero-run compression.
4797                          * Lower case hexadecimal digits are used.
4798                          *      eg, fe80::214:4fff:fe0b:76c8.
4799                          * The IPv4 embedded form is returned for inet_ntop,
4800                          * just the IPv4 string is returned for inet_ntoa6.
4801                          */
4802
4803                         if (!dtrace_canload(tupregs[argi].dttk_value,
4804                                 sizeof(struct in6_addr), mstate, vstate)) {
4805                                 regs[rd] = 0;
4806                                 break;
4807                         }
4808
4809                         /*
4810                          * Safely load the IPv6 address.
4811                          */
4812                         dtrace_bcopy(
4813                             (void *)(uintptr_t)tupregs[argi].dttk_value,
4814                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
4815
4816                         /*
4817                          * Check an IPv6 string will fit in scratch.
4818                          */
4819                         size = INET6_ADDRSTRLEN;
4820                         if (!DTRACE_INSCRATCH(mstate, size)) {
4821                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4822                                 regs[rd] = 0;
4823                                 break;
4824                         }
4825                         base = (char *)mstate->dtms_scratch_ptr;
4826                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
4827                         *end-- = '\0';
4828
4829                         /*
4830                          * Find the longest run of 16 bit zero values
4831                          * for the single allowed zero compression - "::".
4832                          */
4833                         firstzero = -1;
4834                         tryzero = -1;
4835                         numzero = 1;
4836                         for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
4837                                 if (ip6._S6_un._S6_u8[i] == 0 &&
4838                                     tryzero == -1 && i % 2 == 0) {
4839                                         tryzero = i;
4840                                         continue;
4841                                 }
4842
4843                                 if (tryzero != -1 &&
4844                                     (ip6._S6_un._S6_u8[i] != 0 ||
4845                                     i == sizeof (struct in6_addr) - 1)) {
4846
4847                                         if (i - tryzero <= numzero) {
4848                                                 tryzero = -1;
4849                                                 continue;
4850                                         }
4851
4852                                         firstzero = tryzero;
4853                                         numzero = i - i % 2 - tryzero;
4854                                         tryzero = -1;
4855
4856                                         if (ip6._S6_un._S6_u8[i] == 0 &&
4857                                             i == sizeof (struct in6_addr) - 1)
4858                                                 numzero += 2;
4859                                 }
4860                         }
4861                         ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
4862
4863                         /*
4864                          * Check for an IPv4 embedded address.
4865                          */
4866                         v6end = sizeof (struct in6_addr) - 2;
4867                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
4868                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
4869                                 for (i = sizeof (struct in6_addr) - 1;
4870                                      i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
4871                                         ASSERT(end >= base);
4872
4873                                         val = ip6._S6_un._S6_u8[i];
4874
4875                                         if (val == 0) {
4876                                                 *end-- = '0';
4877                                         } else {
4878                                                 for (; val; val /= 10) {
4879                                                         *end-- = '0' + val % 10;
4880                                                 }
4881                                         }
4882
4883                                         if (i > (int)DTRACE_V4MAPPED_OFFSET)
4884                                                 *end-- = '.';
4885                                 }
4886
4887                                 if (subr == DIF_SUBR_INET_NTOA6)
4888                                         goto inetout;
4889
4890                                 /*
4891                                  * Set v6end to skip the IPv4 address that
4892                                  * we have already stringified.
4893                                  */
4894                                 v6end = 10;
4895                         }
4896
4897                         /*
4898                          * Build the IPv6 string by working through the
4899                          * address in reverse.
4900                          */
4901                         for (i = v6end; i >= 0; i -= 2) {
4902                                 ASSERT(end >= base);
4903
4904                                 if (i == firstzero + numzero - 2) {
4905                                         *end-- = ':';
4906                                         *end-- = ':';
4907                                         i -= numzero - 2;
4908                                         continue;
4909                                 }
4910
4911                                 if (i < 14 && i != firstzero - 2)
4912                                         *end-- = ':';
4913
4914                                 val = (ip6._S6_un._S6_u8[i] << 8) +
4915                                     ip6._S6_un._S6_u8[i + 1];
4916
4917                                 if (val == 0) {
4918                                         *end-- = '0';
4919                                 } else {
4920                                         for (; val; val /= 16) {
4921                                                 *end-- = digits[val % 16];
4922                                         }
4923                                 }
4924                         }
4925                         ASSERT(end + 1 >= base);
4926
4927 #if defined(__APPLE__)
4928 #undef _S6_un
4929 #undef _S6_u8
4930 #endif /* __APPLE__ */
4931                 } else {
4932                         /*
4933                          * The user didn't use AF_INET or AF_INET6.
4934                          */
4935                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4936                         regs[rd] = 0;
4937                         break;
4938                 }
4939
4940 inetout:        regs[rd] = (uintptr_t)end + 1;
4941                 mstate->dtms_scratch_ptr += size;
4942                 break;
4943         }
4944
4945         case DIF_SUBR_TOUPPER:
4946         case DIF_SUBR_TOLOWER: {
4947                 uintptr_t src = tupregs[0].dttk_value;
4948                 char *dest = (char *)mstate->dtms_scratch_ptr;
4949                 char lower, upper, base, c;
4950                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4951                 size_t len = dtrace_strlen((char*) src, size);
4952                 size_t i = 0;
4953
4954                 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
4955                 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
4956                 base  = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
4957
4958                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
4959                         regs[rd] = 0;
4960                         break;
4961                 }
4962
4963                 if (!DTRACE_INSCRATCH(mstate, size)) {
4964                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4965                         regs[rd] = 0;
4966                         break;
4967                 }
4968
4969                 for (i = 0; i < size - 1; ++i) {
4970                         if ((c = dtrace_load8(src + i)) == '\0')
4971                                 break;
4972                         if (c >= lower && c <= upper)
4973                                 c = base + (c - lower);
4974                         dest[i] = c;
4975                 }
4976
4977                 ASSERT(i < size);
4978
4979                 dest[i] = '\0';
4980                 regs[rd] = (uintptr_t) dest;
4981                 mstate->dtms_scratch_ptr += size;
4982
4983                 break;
4984         }
4985
4986 #if defined(__APPLE__)
4987         case DIF_SUBR_VM_KERNEL_ADDRPERM: {
4988                 if (!dtrace_priv_kernel(state)) {
4989                         regs[rd] = 0;
4990                 } else {
4991                         regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
4992                 }
4993
4994                 break;
4995         }
4996
4997         case DIF_SUBR_KDEBUG_TRACE: {
4998                 uint32_t debugid;
4999                 uintptr_t args[4] = {0};
5000                 int i;
5001
5002                 if (nargs < 2 || nargs > 5) {
5003                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5004                         break;
5005                 }
5006
5007                 if (dtrace_destructive_disallow)
5008                         return;
5009
5010                 debugid = tupregs[0].dttk_value;
5011                 for (i = 0; i < nargs - 1; i++)
5012                         args[i] = tupregs[i + 1].dttk_value;
5013
5014                 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5015
5016                 break;
5017         }
5018
5019         case DIF_SUBR_KDEBUG_TRACE_STRING: {
5020                 if (nargs != 3) {
5021                         break;
5022                 }
5023
5024                 if (dtrace_destructive_disallow)
5025                         return;
5026
5027                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5028                 uint32_t debugid = tupregs[0].dttk_value;
5029                 uint64_t str_id = tupregs[1].dttk_value;
5030                 uintptr_t src = tupregs[2].dttk_value;
5031                 size_t lim;
5032                 char buf[size];
5033                 char* str = NULL;
5034
5035                 if (src != (uintptr_t)0) {
5036                         str = buf;
5037                         if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5038                                 break;
5039                         }
5040                         dtrace_strcpy((void*)src, buf, size);
5041                 }
5042
5043                 (void)kernel_debug_string(debugid, &str_id, str);
5044                 regs[rd] = str_id;
5045
5046                 break;
5047         }
5048 #endif
5049
5050         }
5051 }
5052
5053 /*
5054  * Emulate the execution of DTrace IR instructions specified by the given
5055  * DIF object.  This function is deliberately void of assertions as all of
5056  * the necessary checks are handled by a call to dtrace_difo_validate().
5057  */
5058 static uint64_t
5059 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5060     dtrace_vstate_t *vstate, dtrace_state_t *state)
5061 {
5062         const dif_instr_t *text = difo->dtdo_buf;
5063         const uint_t textlen = difo->dtdo_len;
5064         const char *strtab = difo->dtdo_strtab;
5065         const uint64_t *inttab = difo->dtdo_inttab;
5066
5067         uint64_t rval = 0;
5068         dtrace_statvar_t *svar;
5069         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5070         dtrace_difv_t *v;
5071         volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5072         volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5073
5074         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5075         uint64_t regs[DIF_DIR_NREGS];
5076         uint64_t *tmp;
5077
5078         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5079         int64_t cc_r;
5080         uint_t pc = 0, id, opc = 0;
5081         uint8_t ttop = 0;
5082         dif_instr_t instr;
5083         uint_t r1, r2, rd;
5084
5085         /*
5086          * We stash the current DIF object into the machine state: we need it
5087          * for subsequent access checking.
5088          */
5089         mstate->dtms_difo = difo;
5090
5091         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
5092
5093         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5094                 opc = pc;
5095
5096                 instr = text[pc++];
5097                 r1 = DIF_INSTR_R1(instr);
5098                 r2 = DIF_INSTR_R2(instr);
5099                 rd = DIF_INSTR_RD(instr);
5100
5101                 switch (DIF_INSTR_OP(instr)) {
5102                 case DIF_OP_OR:
5103                         regs[rd] = regs[r1] | regs[r2];
5104                         break;
5105                 case DIF_OP_XOR:
5106                         regs[rd] = regs[r1] ^ regs[r2];
5107                         break;
5108                 case DIF_OP_AND:
5109                         regs[rd] = regs[r1] & regs[r2];
5110                         break;
5111                 case DIF_OP_SLL:
5112                         regs[rd] = regs[r1] << regs[r2];
5113                         break;
5114                 case DIF_OP_SRL:
5115                         regs[rd] = regs[r1] >> regs[r2];
5116                         break;
5117                 case DIF_OP_SUB:
5118                         regs[rd] = regs[r1] - regs[r2];
5119                         break;
5120                 case DIF_OP_ADD:
5121                         regs[rd] = regs[r1] + regs[r2];
5122                         break;
5123                 case DIF_OP_MUL:
5124                         regs[rd] = regs[r1] * regs[r2];
5125                         break;
5126                 case DIF_OP_SDIV:
5127                         if (regs[r2] == 0) {
5128                                 regs[rd] = 0;
5129                                 *flags |= CPU_DTRACE_DIVZERO;
5130                         } else {
5131                                 regs[rd] = (int64_t)regs[r1] /
5132                                     (int64_t)regs[r2];
5133                         }
5134                         break;
5135
5136                 case DIF_OP_UDIV:
5137                         if (regs[r2] == 0) {
5138                                 regs[rd] = 0;
5139                                 *flags |= CPU_DTRACE_DIVZERO;
5140                         } else {
5141                                 regs[rd] = regs[r1] / regs[r2];
5142                         }
5143                         break;
5144
5145                 case DIF_OP_SREM:
5146                         if (regs[r2] == 0) {
5147                                 regs[rd] = 0;
5148                                 *flags |= CPU_DTRACE_DIVZERO;
5149                         } else {
5150                                 regs[rd] = (int64_t)regs[r1] %
5151                                     (int64_t)regs[r2];
5152                         }
5153                         break;
5154
5155                 case DIF_OP_UREM:
5156                         if (regs[r2] == 0) {
5157                                 regs[rd] = 0;
5158                                 *flags |= CPU_DTRACE_DIVZERO;
5159                         } else {
5160                                 regs[rd] = regs[r1] % regs[r2];
5161                         }
5162                         break;
5163
5164                 case DIF_OP_NOT:
5165                         regs[rd] = ~regs[r1];
5166                         break;
5167                 case DIF_OP_MOV:
5168                         regs[rd] = regs[r1];
5169                         break;
5170                 case DIF_OP_CMP:
5171                         cc_r = regs[r1] - regs[r2];
5172                         cc_n = cc_r < 0;
5173                         cc_z = cc_r == 0;
5174                         cc_v = 0;
5175                         cc_c = regs[r1] < regs[r2];
5176                         break;
5177                 case DIF_OP_TST:
5178                         cc_n = cc_v = cc_c = 0;
5179                         cc_z = regs[r1] == 0;
5180                         break;
5181                 case DIF_OP_BA:
5182                         pc = DIF_INSTR_LABEL(instr);
5183                         break;
5184                 case DIF_OP_BE:
5185                         if (cc_z)
5186                                 pc = DIF_INSTR_LABEL(instr);
5187                         break;
5188                 case DIF_OP_BNE:
5189                         if (cc_z == 0)
5190                                 pc = DIF_INSTR_LABEL(instr);
5191                         break;
5192                 case DIF_OP_BG:
5193                         if ((cc_z | (cc_n ^ cc_v)) == 0)
5194                                 pc = DIF_INSTR_LABEL(instr);
5195                         break;
5196                 case DIF_OP_BGU:
5197                         if ((cc_c | cc_z) == 0)
5198                                 pc = DIF_INSTR_LABEL(instr);
5199                         break;
5200                 case DIF_OP_BGE:
5201                         if ((cc_n ^ cc_v) == 0)
5202                                 pc = DIF_INSTR_LABEL(instr);
5203                         break;
5204                 case DIF_OP_BGEU:
5205                         if (cc_c == 0)
5206                                 pc = DIF_INSTR_LABEL(instr);
5207                         break;
5208                 case DIF_OP_BL:
5209                         if (cc_n ^ cc_v)
5210                                 pc = DIF_INSTR_LABEL(instr);
5211                         break;
5212                 case DIF_OP_BLU:
5213                         if (cc_c)
5214                                 pc = DIF_INSTR_LABEL(instr);
5215                         break;
5216                 case DIF_OP_BLE:
5217                         if (cc_z | (cc_n ^ cc_v))
5218                                 pc = DIF_INSTR_LABEL(instr);
5219                         break;
5220                 case DIF_OP_BLEU:
5221                         if (cc_c | cc_z)
5222                                 pc = DIF_INSTR_LABEL(instr);
5223                         break;
5224                 case DIF_OP_RLDSB:
5225                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5226                                 *flags |= CPU_DTRACE_KPRIV;
5227                                 *illval = regs[r1];
5228                                 break;
5229                         }
5230                         /*FALLTHROUGH*/
5231                 case DIF_OP_LDSB:
5232                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5233                         break;
5234                 case DIF_OP_RLDSH:
5235                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5236                                 *flags |= CPU_DTRACE_KPRIV;
5237                                 *illval = regs[r1];
5238                                 break;
5239                         }
5240                         /*FALLTHROUGH*/
5241                 case DIF_OP_LDSH:
5242                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5243                         break;
5244                 case DIF_OP_RLDSW:
5245                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5246                                 *flags |= CPU_DTRACE_KPRIV;
5247                                 *illval = regs[r1];
5248                                 break;
5249                         }
5250                         /*FALLTHROUGH*/
5251                 case DIF_OP_LDSW:
5252                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5253                         break;
5254                 case DIF_OP_RLDUB:
5255                         if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5256                                 *flags |= CPU_DTRACE_KPRIV;
5257                                 *illval = regs[r1];
5258                                 break;
5259                         }
5260                         /*FALLTHROUGH*/
5261                 case DIF_OP_LDUB:
5262                         regs[rd] = dtrace_load8(regs[r1]);
5263                         break;
5264                 case DIF_OP_RLDUH:
5265                         if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5266                                 *flags |= CPU_DTRACE_KPRIV;
5267                                 *illval = regs[r1];
5268                                 break;
5269                         }
5270                         /*FALLTHROUGH*/
5271                 case DIF_OP_LDUH:
5272                         regs[rd] = dtrace_load16(regs[r1]);
5273                         break;
5274                 case DIF_OP_RLDUW:
5275                         if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5276                                 *flags |= CPU_DTRACE_KPRIV;
5277                                 *illval = regs[r1];
5278                                 break;
5279                         }
5280                         /*FALLTHROUGH*/
5281                 case DIF_OP_LDUW:
5282                         regs[rd] = dtrace_load32(regs[r1]);
5283                         break;
5284                 case DIF_OP_RLDX:
5285                         if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5286                                 *flags |= CPU_DTRACE_KPRIV;
5287                                 *illval = regs[r1];
5288                                 break;
5289                         }
5290                         /*FALLTHROUGH*/
5291                 case DIF_OP_LDX:
5292                         regs[rd] = dtrace_load64(regs[r1]);
5293                         break;
5294 /*
5295  * Darwin 32-bit kernel may fetch from 64-bit user.
5296  * Do not cast regs to uintptr_t
5297  * DIF_OP_ULDSB,DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB
5298  * DIF_OP_ULDUH, DIF_OP_ULDUW, DIF_OP_ULDX
5299  */
5300                 case DIF_OP_ULDSB:
5301                         regs[rd] = (int8_t)
5302                             dtrace_fuword8(regs[r1]);
5303                         break;
5304                 case DIF_OP_ULDSH:
5305                         regs[rd] = (int16_t)
5306                             dtrace_fuword16(regs[r1]);
5307                         break;
5308                 case DIF_OP_ULDSW:
5309                         regs[rd] = (int32_t)
5310                             dtrace_fuword32(regs[r1]);
5311                         break;
5312                 case DIF_OP_ULDUB:
5313                         regs[rd] =
5314                             dtrace_fuword8(regs[r1]);
5315                         break;
5316                 case DIF_OP_ULDUH:
5317                         regs[rd] =
5318                             dtrace_fuword16(regs[r1]);
5319                         break;
5320                 case DIF_OP_ULDUW:
5321                         regs[rd] =
5322                             dtrace_fuword32(regs[r1]);
5323                         break;
5324                 case DIF_OP_ULDX:
5325                         regs[rd] =
5326                             dtrace_fuword64(regs[r1]);
5327                         break;
5328                 case DIF_OP_RET:
5329                         rval = regs[rd];
5330                         pc = textlen;
5331                         break;
5332                 case DIF_OP_NOP:
5333                         break;
5334                 case DIF_OP_SETX:
5335                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
5336                         break;
5337                 case DIF_OP_SETS:
5338                         regs[rd] = (uint64_t)(uintptr_t)
5339                             (strtab + DIF_INSTR_STRING(instr));
5340                         break;
5341                 case DIF_OP_SCMP: {
5342                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
5343                         uintptr_t s1 = regs[r1];
5344                         uintptr_t s2 = regs[r2];
5345                         size_t lim1 = sz, lim2 = sz;
5346
5347                         if (s1 != 0 &&
5348                             !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
5349                                 break;
5350                         if (s2 != 0 &&
5351                             !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
5352                                 break;
5353
5354                         cc_r = dtrace_strncmp((char *)s1, (char *)s2,
5355                                 MIN(lim1, lim2));
5356
5357                         cc_n = cc_r < 0;
5358                         cc_z = cc_r == 0;
5359                         cc_v = cc_c = 0;
5360                         break;
5361                 }
5362                 case DIF_OP_LDGA:
5363                         regs[rd] = dtrace_dif_variable(mstate, state,
5364                             r1, regs[r2]);
5365                         break;
5366                 case DIF_OP_LDGS:
5367                         id = DIF_INSTR_VAR(instr);
5368
5369                         if (id >= DIF_VAR_OTHER_UBASE) {
5370                                 uintptr_t a;
5371
5372                                 id -= DIF_VAR_OTHER_UBASE;
5373                                 svar = vstate->dtvs_globals[id];
5374                                 ASSERT(svar != NULL);
5375                                 v = &svar->dtsv_var;
5376
5377                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
5378                                         regs[rd] = svar->dtsv_data;
5379                                         break;
5380                                 }
5381
5382                                 a = (uintptr_t)svar->dtsv_data;
5383
5384                                 if (*(uint8_t *)a == UINT8_MAX) {
5385                                         /*
5386                                          * If the 0th byte is set to UINT8_MAX
5387                                          * then this is to be treated as a
5388                                          * reference to a NULL variable.
5389                                          */
5390                                         regs[rd] = 0;
5391                                 } else {
5392                                         regs[rd] = a + sizeof (uint64_t);
5393                                 }
5394
5395                                 break;
5396                         }
5397
5398                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
5399                         break;
5400
5401                 case DIF_OP_STGS:
5402                         id = DIF_INSTR_VAR(instr);
5403
5404                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5405                         id -= DIF_VAR_OTHER_UBASE;
5406
5407                         VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5408                         svar = vstate->dtvs_globals[id];
5409                         ASSERT(svar != NULL);
5410                         v = &svar->dtsv_var;
5411
5412                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5413                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5414                                 size_t lim;
5415
5416                                 ASSERT(a != 0);
5417                                 ASSERT(svar->dtsv_size != 0);
5418
5419                                 if (regs[rd] == 0) {
5420                                         *(uint8_t *)a = UINT8_MAX;
5421                                         break;
5422                                 } else {
5423                                         *(uint8_t *)a = 0;
5424                                         a += sizeof (uint64_t);
5425                                 }
5426                                 if (!dtrace_vcanload(
5427                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5428                                         &lim, mstate, vstate))
5429                                         break;
5430
5431                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5432                                     (void *)a, &v->dtdv_type, lim);
5433                                 break;
5434                         }
5435
5436                         svar->dtsv_data = regs[rd];
5437                         break;
5438
5439                 case DIF_OP_LDTA:
5440                         /*
5441                          * There are no DTrace built-in thread-local arrays at
5442                          * present.  This opcode is saved for future work.
5443                          */
5444                         *flags |= CPU_DTRACE_ILLOP;
5445                         regs[rd] = 0;
5446                         break;
5447
5448                 case DIF_OP_LDLS:
5449                         id = DIF_INSTR_VAR(instr);
5450
5451                         if (id < DIF_VAR_OTHER_UBASE) {
5452                                 /*
5453                                  * For now, this has no meaning.
5454                                  */
5455                                 regs[rd] = 0;
5456                                 break;
5457                         }
5458
5459                         id -= DIF_VAR_OTHER_UBASE;
5460
5461                         ASSERT(id < (uint_t)vstate->dtvs_nlocals);
5462                         ASSERT(vstate->dtvs_locals != NULL);
5463                         svar = vstate->dtvs_locals[id];
5464                         ASSERT(svar != NULL);
5465                         v = &svar->dtsv_var;
5466
5467                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5468                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5469                                 size_t sz = v->dtdv_type.dtdt_size;
5470
5471                                 sz += sizeof (uint64_t);
5472                                 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5473                                 a += CPU->cpu_id * sz;
5474
5475                                 if (*(uint8_t *)a == UINT8_MAX) {
5476                                         /*
5477                                          * If the 0th byte is set to UINT8_MAX
5478                                          * then this is to be treated as a
5479                                          * reference to a NULL variable.
5480                                          */
5481                                         regs[rd] = 0;
5482                                 } else {
5483                                         regs[rd] = a + sizeof (uint64_t);
5484                                 }
5485
5486                                 break;
5487                         }
5488
5489                         ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5490                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5491                         regs[rd] = tmp[CPU->cpu_id];
5492                         break;
5493
5494                 case DIF_OP_STLS:
5495                         id = DIF_INSTR_VAR(instr);
5496
5497                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5498                         id -= DIF_VAR_OTHER_UBASE;
5499                         VERIFY(id < (uint_t)vstate->dtvs_nlocals);
5500                         ASSERT(vstate->dtvs_locals != NULL);
5501                         svar = vstate->dtvs_locals[id];
5502                         ASSERT(svar != NULL);
5503                         v = &svar->dtsv_var;
5504
5505                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5506                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
5507                                 size_t sz = v->dtdv_type.dtdt_size;
5508                                 size_t lim;
5509
5510                                 sz += sizeof (uint64_t);
5511                                 ASSERT(svar->dtsv_size == (int)NCPU * sz);
5512                                 a += CPU->cpu_id * sz;
5513
5514                                 if (regs[rd] == 0) {
5515                                         *(uint8_t *)a = UINT8_MAX;
5516                                         break;
5517                                 } else {
5518                                         *(uint8_t *)a = 0;
5519                                         a += sizeof (uint64_t);
5520                                 }
5521
5522                                 if (!dtrace_vcanload(
5523                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5524                                     &lim, mstate, vstate))
5525                                         break;
5526
5527                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5528                                     (void *)a, &v->dtdv_type, lim);
5529                                 break;
5530                         }
5531
5532                         ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
5533                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
5534                         tmp[CPU->cpu_id] = regs[rd];
5535                         break;
5536
5537                 case DIF_OP_LDTS: {
5538                         dtrace_dynvar_t *dvar;
5539                         dtrace_key_t *key;
5540
5541                         id = DIF_INSTR_VAR(instr);
5542                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5543                         id -= DIF_VAR_OTHER_UBASE;
5544                         v = &vstate->dtvs_tlocals[id];
5545
5546                         key = &tupregs[DIF_DTR_NREGS];
5547                         key[0].dttk_value = (uint64_t)id;
5548                         key[0].dttk_size = 0;
5549                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5550                         key[1].dttk_size = 0;
5551
5552                         dvar = dtrace_dynvar(dstate, 2, key,
5553                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
5554                             mstate, vstate);
5555
5556                         if (dvar == NULL) {
5557                                 regs[rd] = 0;
5558                                 break;
5559                         }
5560
5561                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5562                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5563                         } else {
5564                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5565                         }
5566
5567                         break;
5568                 }
5569
5570                 case DIF_OP_STTS: {
5571                         dtrace_dynvar_t *dvar;
5572                         dtrace_key_t *key;
5573
5574                         id = DIF_INSTR_VAR(instr);
5575                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5576                         id -= DIF_VAR_OTHER_UBASE;
5577                         VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5578
5579                         key = &tupregs[DIF_DTR_NREGS];
5580                         key[0].dttk_value = (uint64_t)id;
5581                         key[0].dttk_size = 0;
5582                         DTRACE_TLS_THRKEY(key[1].dttk_value);
5583                         key[1].dttk_size = 0;
5584                         v = &vstate->dtvs_tlocals[id];
5585
5586                         dvar = dtrace_dynvar(dstate, 2, key,
5587                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5588                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5589                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5590                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5591
5592                         /*
5593                          * Given that we're storing to thread-local data,
5594                          * we need to flush our predicate cache.
5595                          */
5596                         dtrace_set_thread_predcache(current_thread(), 0);
5597
5598                         if (dvar == NULL)
5599                                 break;
5600
5601                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5602                                 size_t lim;
5603
5604                                 if (!dtrace_vcanload(
5605                                     (void *)(uintptr_t)regs[rd],
5606                                     &v->dtdv_type, &lim, mstate, vstate))
5607                                         break;
5608
5609                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5610                                     dvar->dtdv_data, &v->dtdv_type, lim);
5611                         } else {
5612                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5613                         }
5614
5615                         break;
5616                 }
5617
5618                 case DIF_OP_SRA:
5619                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
5620                         break;
5621
5622                 case DIF_OP_CALL:
5623                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
5624                             regs, tupregs, ttop, mstate, state);
5625                         break;
5626
5627                 case DIF_OP_PUSHTR:
5628                         if (ttop == DIF_DTR_NREGS) {
5629                                 *flags |= CPU_DTRACE_TUPOFLOW;
5630                                 break;
5631                         }
5632
5633                         if (r1 == DIF_TYPE_STRING) {
5634                                 /*
5635                                  * If this is a string type and the size is 0,
5636                                  * we'll use the system-wide default string
5637                                  * size.  Note that we are _not_ looking at
5638                                  * the value of the DTRACEOPT_STRSIZE option;
5639                                  * had this been set, we would expect to have
5640                                  * a non-zero size value in the "pushtr".
5641                                  */
5642                                 tupregs[ttop].dttk_size =
5643                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
5644                                     regs[r2] ? regs[r2] :
5645                                     dtrace_strsize_default) + 1;
5646                         } else {
5647                                 if (regs[r2] > LONG_MAX) {
5648                                         *flags |= CPU_DTRACE_ILLOP;
5649                                         break;
5650                                 }
5651                                 tupregs[ttop].dttk_size = regs[r2];
5652                         }
5653
5654                         tupregs[ttop++].dttk_value = regs[rd];
5655                         break;
5656
5657                 case DIF_OP_PUSHTV:
5658                         if (ttop == DIF_DTR_NREGS) {
5659                                 *flags |= CPU_DTRACE_TUPOFLOW;
5660                                 break;
5661                         }
5662
5663                         tupregs[ttop].dttk_value = regs[rd];
5664                         tupregs[ttop++].dttk_size = 0;
5665                         break;
5666
5667                 case DIF_OP_POPTS:
5668                         if (ttop != 0)
5669                                 ttop--;
5670                         break;
5671
5672                 case DIF_OP_FLUSHTS:
5673                         ttop = 0;
5674                         break;
5675
5676                 case DIF_OP_LDGAA:
5677                 case DIF_OP_LDTAA: {
5678                         dtrace_dynvar_t *dvar;
5679                         dtrace_key_t *key = tupregs;
5680                         uint_t nkeys = ttop;
5681
5682                         id = DIF_INSTR_VAR(instr);
5683                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5684                         id -= DIF_VAR_OTHER_UBASE;
5685
5686                         key[nkeys].dttk_value = (uint64_t)id;
5687                         key[nkeys++].dttk_size = 0;
5688
5689                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
5690                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5691                                 key[nkeys++].dttk_size = 0;
5692                                 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5693                                 v = &vstate->dtvs_tlocals[id];
5694                         } else {
5695                                 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5696                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5697                         }
5698
5699                         dvar = dtrace_dynvar(dstate, nkeys, key,
5700                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5701                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5702                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
5703
5704                         if (dvar == NULL) {
5705                                 regs[rd] = 0;
5706                                 break;
5707                         }
5708
5709                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5710                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
5711                         } else {
5712                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
5713                         }
5714
5715                         break;
5716                 }
5717
5718                 case DIF_OP_STGAA:
5719                 case DIF_OP_STTAA: {
5720                         dtrace_dynvar_t *dvar;
5721                         dtrace_key_t *key = tupregs;
5722                         uint_t nkeys = ttop;
5723
5724                         id = DIF_INSTR_VAR(instr);
5725                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
5726                         id -= DIF_VAR_OTHER_UBASE;
5727
5728                         key[nkeys].dttk_value = (uint64_t)id;
5729                         key[nkeys++].dttk_size = 0;
5730
5731                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
5732                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
5733                                 key[nkeys++].dttk_size = 0;
5734                                 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
5735                                 v = &vstate->dtvs_tlocals[id];
5736                         } else {
5737                                 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
5738                                 v = &vstate->dtvs_globals[id]->dtsv_var;
5739                         }
5740
5741                         dvar = dtrace_dynvar(dstate, nkeys, key,
5742                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
5743                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
5744                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
5745                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
5746
5747                         if (dvar == NULL)
5748                                 break;
5749
5750                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
5751                                 size_t lim;
5752
5753                                 if (!dtrace_vcanload(
5754                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
5755                                     &lim, mstate, vstate))
5756                                         break;
5757
5758                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
5759                                     dvar->dtdv_data, &v->dtdv_type, lim);
5760                         } else {
5761                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
5762                         }
5763
5764                         break;
5765                 }
5766
5767                 case DIF_OP_ALLOCS: {
5768                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
5769                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
5770
5771                         /*
5772                          * Rounding up the user allocation size could have
5773                          * overflowed large, bogus allocations (like -1ULL) to
5774                          * 0.
5775                          */
5776                         if (size < regs[r1] ||
5777                             !DTRACE_INSCRATCH(mstate, size)) {
5778                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5779                                 regs[rd] = 0;
5780                                 break;
5781                         }
5782
5783                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
5784                                 mstate->dtms_scratch_ptr += size;
5785                                 regs[rd] = ptr;
5786                         break;
5787                 }
5788
5789                 case DIF_OP_COPYS:
5790                         if (!dtrace_canstore(regs[rd], regs[r2],
5791                             mstate, vstate)) {
5792                                 *flags |= CPU_DTRACE_BADADDR;
5793                                 *illval = regs[rd];
5794                                 break;
5795                         }
5796
5797                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
5798                                 break;
5799
5800                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
5801                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
5802                         break;
5803
5804                 case DIF_OP_STB:
5805                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
5806                                 *flags |= CPU_DTRACE_BADADDR;
5807                                 *illval = regs[rd];
5808                                 break;
5809                         }
5810                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
5811                         break;
5812
5813                 case DIF_OP_STH:
5814                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
5815                                 *flags |= CPU_DTRACE_BADADDR;
5816                                 *illval = regs[rd];
5817                                 break;
5818                         }
5819                         if (regs[rd] & 1) {
5820                                 *flags |= CPU_DTRACE_BADALIGN;
5821                                 *illval = regs[rd];
5822                                 break;
5823                         }
5824                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
5825                         break;
5826
5827                 case DIF_OP_STW:
5828                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
5829                                 *flags |= CPU_DTRACE_BADADDR;
5830                                 *illval = regs[rd];
5831                                 break;
5832                         }
5833                         if (regs[rd] & 3) {
5834                                 *flags |= CPU_DTRACE_BADALIGN;
5835                                 *illval = regs[rd];
5836                                 break;
5837                         }
5838                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
5839                         break;
5840
5841                 case DIF_OP_STX:
5842                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
5843                                 *flags |= CPU_DTRACE_BADADDR;
5844                                 *illval = regs[rd];
5845                                 break;
5846                         }
5847
5848                         /*
5849                         * Darwin kmem_zalloc() called from
5850                         * dtrace_difo_init() is 4-byte aligned.
5851                         */
5852                         if (regs[rd] & 3) {
5853                                 *flags |= CPU_DTRACE_BADALIGN;
5854                                 *illval = regs[rd];
5855                                 break;
5856                         }
5857                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
5858                         break;
5859                 }
5860         }
5861
5862         if (!(*flags & CPU_DTRACE_FAULT))
5863                 return (rval);
5864
5865         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
5866         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
5867
5868         return (0);
5869 }
5870
5871 static void
5872 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
5873 {
5874         dtrace_probe_t *probe = ecb->dte_probe;
5875         dtrace_provider_t *prov = probe->dtpr_provider;
5876         char c[DTRACE_FULLNAMELEN + 80], *str;
5877         const char *msg = "dtrace: breakpoint action at probe ";
5878         const char *ecbmsg = " (ecb ";
5879         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
5880         uintptr_t val = (uintptr_t)ecb;
5881         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
5882
5883         if (dtrace_destructive_disallow)
5884                 return;
5885
5886         /*
5887          * It's impossible to be taking action on the NULL probe.
5888          */
5889         ASSERT(probe != NULL);
5890
5891         /*
5892          * This is a poor man's (destitute man's?) sprintf():  we want to
5893          * print the provider name, module name, function name and name of
5894          * the probe, along with the hex address of the ECB with the breakpoint
5895          * action -- all of which we must place in the character buffer by
5896          * hand.
5897          */
5898         while (*msg != '\0')
5899                 c[i++] = *msg++;
5900
5901         for (str = prov->dtpv_name; *str != '\0'; str++)
5902                 c[i++] = *str;
5903         c[i++] = ':';
5904
5905         for (str = probe->dtpr_mod; *str != '\0'; str++)
5906                 c[i++] = *str;
5907         c[i++] = ':';
5908
5909         for (str = probe->dtpr_func; *str != '\0'; str++)
5910                 c[i++] = *str;
5911         c[i++] = ':';
5912
5913         for (str = probe->dtpr_name; *str != '\0'; str++)
5914                 c[i++] = *str;
5915
5916         while (*ecbmsg != '\0')
5917                 c[i++] = *ecbmsg++;
5918
5919         while (shift >= 0) {
5920                 mask = (uintptr_t)0xf << shift;
5921
5922                 if (val >= ((uintptr_t)1 << shift))
5923                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
5924                 shift -= 4;
5925         }
5926
5927         c[i++] = ')';
5928         c[i] = '\0';
5929
5930         debug_enter(c);
5931 }
5932
5933 static void
5934 dtrace_action_panic(dtrace_ecb_t *ecb)
5935 {
5936         dtrace_probe_t *probe = ecb->dte_probe;
5937
5938         /*
5939          * It's impossible to be taking action on the NULL probe.
5940          */
5941         ASSERT(probe != NULL);
5942
5943         if (dtrace_destructive_disallow)
5944                 return;
5945
5946         if (dtrace_panicked != NULL)
5947                 return;
5948
5949         if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
5950                 return;
5951
5952         /*
5953          * We won the right to panic.  (We want to be sure that only one
5954          * thread calls panic() from dtrace_probe(), and that panic() is
5955          * called exactly once.)
5956          */
5957         panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
5958             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
5959             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
5960
5961         /*
5962          * APPLE NOTE: this was for an old Mac OS X debug feature
5963          * allowing a return from panic().  Revisit someday.
5964          */
5965         dtrace_panicked = NULL;
5966 }
5967
5968 static void
5969 dtrace_action_raise(uint64_t sig)
5970 {
5971         if (dtrace_destructive_disallow)
5972                 return;
5973
5974         if (sig >= NSIG) {
5975                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5976                 return;
5977         }
5978
5979         /*
5980          * raise() has a queue depth of 1 -- we ignore all subsequent
5981          * invocations of the raise() action.
5982          */
5983
5984         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5985
5986         if (uthread && uthread->t_dtrace_sig == 0) {
5987                 uthread->t_dtrace_sig = sig;
5988                 act_set_astbsd(current_thread());
5989         }
5990 }
5991
5992 static void
5993 dtrace_action_stop(void)
5994 {
5995         if (dtrace_destructive_disallow)
5996                 return;
5997
5998         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
5999         if (uthread) {
6000                 /*
6001                  * The currently running process will be set to task_suspend
6002                  * when it next leaves the kernel.
6003                 */
6004                 uthread->t_dtrace_stop = 1;
6005                 act_set_astbsd(current_thread());
6006         }
6007 }
6008
6009
6010 /*
6011  * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6012  * Both activate only when the currently running process next leaves the
6013  * kernel.
6014  */
6015 static void
6016 dtrace_action_pidresume(uint64_t pid)
6017 {
6018         if (dtrace_destructive_disallow)
6019                 return;
6020
6021         if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6022                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6023                 return;
6024         }
6025         uthread_t uthread = (uthread_t)get_bsdthread_info(current_thread());
6026
6027         /*
6028          * When the currently running process leaves the kernel, it attempts to
6029          * task_resume the process (denoted by pid), if that pid appears to have
6030          * been stopped by dtrace_action_stop().
6031          * The currently running process has a pidresume() queue depth of 1 --
6032          * subsequent invocations of the pidresume() action are ignored.
6033          */
6034
6035         if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6036                 uthread->t_dtrace_resumepid = pid;
6037                 act_set_astbsd(current_thread());
6038         }
6039 }
6040
6041 static void
6042 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6043 {
6044         hrtime_t now;
6045         volatile uint16_t *flags;
6046         dtrace_cpu_t *cpu = CPU;
6047
6048         if (dtrace_destructive_disallow)
6049                 return;
6050
6051         flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6052
6053         now = dtrace_gethrtime();
6054
6055         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6056                 /*
6057                  * We need to advance the mark to the current time.
6058                  */
6059                 cpu->cpu_dtrace_chillmark = now;
6060                 cpu->cpu_dtrace_chilled = 0;
6061         }
6062
6063         /*
6064          * Now check to see if the requested chill time would take us over
6065          * the maximum amount of time allowed in the chill interval.  (Or
6066          * worse, if the calculation itself induces overflow.)
6067          */
6068         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6069             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6070                 *flags |= CPU_DTRACE_ILLOP;
6071                 return;
6072         }
6073
6074         while (dtrace_gethrtime() - now < val)
6075                 continue;
6076
6077         /*
6078          * Normally, we assure that the value of the variable "timestamp" does
6079          * not change within an ECB.  The presence of chill() represents an
6080          * exception to this rule, however.
6081          */
6082         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6083         cpu->cpu_dtrace_chilled += val;
6084 }
6085
6086 static void
6087 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6088     uint64_t *buf, uint64_t arg)
6089 {
6090         int nframes = DTRACE_USTACK_NFRAMES(arg);
6091         int strsize = DTRACE_USTACK_STRSIZE(arg);
6092         uint64_t *pcs = &buf[1], *fps;
6093         char *str = (char *)&pcs[nframes];
6094         int size, offs = 0, i, j;
6095         uintptr_t old = mstate->dtms_scratch_ptr, saved;
6096         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6097         char *sym;
6098
6099         /*
6100          * Should be taking a faster path if string space has not been
6101          * allocated.
6102          */
6103         ASSERT(strsize != 0);
6104
6105         /*
6106          * We will first allocate some temporary space for the frame pointers.
6107          */
6108         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6109         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6110             (nframes * sizeof (uint64_t));
6111
6112         if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6113                 /*
6114                  * Not enough room for our frame pointers -- need to indicate
6115                  * that we ran out of scratch space.
6116                  */
6117                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6118                 return;
6119         }
6120
6121         mstate->dtms_scratch_ptr += size;
6122         saved = mstate->dtms_scratch_ptr;
6123
6124         /*
6125          * Now get a stack with both program counters and frame pointers.
6126          */
6127         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6128         dtrace_getufpstack(buf, fps, nframes + 1);
6129         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6130
6131         /*
6132          * If that faulted, we're cooked.
6133          */
6134         if (*flags & CPU_DTRACE_FAULT)
6135                 goto out;
6136
6137         /*
6138          * Now we want to walk up the stack, calling the USTACK helper.  For
6139          * each iteration, we restore the scratch pointer.
6140          */
6141         for (i = 0; i < nframes; i++) {
6142                 mstate->dtms_scratch_ptr = saved;
6143
6144                 if (offs >= strsize)
6145                         break;
6146
6147                 sym = (char *)(uintptr_t)dtrace_helper(
6148                     DTRACE_HELPER_ACTION_USTACK,
6149                     mstate, state, pcs[i], fps[i]);
6150
6151                 /*
6152                  * If we faulted while running the helper, we're going to
6153                  * clear the fault and null out the corresponding string.
6154                  */
6155                 if (*flags & CPU_DTRACE_FAULT) {
6156                         *flags &= ~CPU_DTRACE_FAULT;
6157                         str[offs++] = '\0';
6158                         continue;
6159                 }
6160
6161                 if (sym == NULL) {
6162                         str[offs++] = '\0';
6163                         continue;
6164                 }
6165
6166                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6167
6168                 /*
6169                  * Now copy in the string that the helper returned to us.
6170                  */
6171                 for (j = 0; offs + j < strsize; j++) {
6172                         if ((str[offs + j] = sym[j]) == '\0')
6173                                 break;
6174                 }
6175
6176                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6177
6178                 offs += j + 1;
6179         }
6180
6181         if (offs >= strsize) {
6182                 /*
6183                  * If we didn't have room for all of the strings, we don't
6184                  * abort processing -- this needn't be a fatal error -- but we
6185                  * still want to increment a counter (dts_stkstroverflows) to
6186                  * allow this condition to be warned about.  (If this is from
6187                  * a jstack() action, it is easily tuned via jstackstrsize.)
6188                  */
6189                 dtrace_error(&state->dts_stkstroverflows);
6190         }
6191
6192         while (offs < strsize)
6193                 str[offs++] = '\0';
6194
6195 out:
6196         mstate->dtms_scratch_ptr = old;
6197 }
6198
6199 static void
6200 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6201     size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6202 {
6203         volatile uint16_t *flags;
6204         uint64_t val = *valp;
6205         size_t valoffs = *valoffsp;
6206
6207         flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6208         ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6209
6210         /*
6211          * If this is a string, we're going to only load until we find the zero
6212          * byte -- after which we'll store zero bytes.
6213          */
6214         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6215                 char c = '\0' + 1;
6216                 size_t s;
6217
6218                 for (s = 0; s < size; s++) {
6219                         if (c != '\0' && dtkind == DIF_TF_BYREF) {
6220                                 c = dtrace_load8(val++);
6221                         } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6222                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6223                                 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6224                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6225                                 if (*flags & CPU_DTRACE_FAULT)
6226                                         break;
6227                         }
6228
6229                         DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6230
6231                         if (c == '\0' && intuple)
6232                                 break;
6233                 }
6234         } else {
6235                 uint8_t c;
6236                 while (valoffs < end) {
6237                         if (dtkind == DIF_TF_BYREF) {
6238                                 c = dtrace_load8(val++);
6239                         } else if (dtkind == DIF_TF_BYUREF) {
6240                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6241                                 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6242                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6243                                 if (*flags & CPU_DTRACE_FAULT)
6244                                         break;
6245                         }
6246
6247                         DTRACE_STORE(uint8_t, tomax,
6248                             valoffs++, c);
6249                 }
6250         }
6251
6252         *valp = val;
6253         *valoffsp = valoffs;
6254 }
6255
6256 /*
6257  * If you're looking for the epicenter of DTrace, you just found it.  This
6258  * is the function called by the provider to fire a probe -- from which all
6259  * subsequent probe-context DTrace activity emanates.
6260  */
6261 static void
6262 __dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6263     uint64_t arg2, uint64_t arg3, uint64_t arg4)
6264 {
6265         processorid_t cpuid;
6266         dtrace_icookie_t cookie;
6267         dtrace_probe_t *probe;
6268         dtrace_mstate_t mstate;
6269         dtrace_ecb_t *ecb;
6270         dtrace_action_t *act;
6271         intptr_t offs;
6272         size_t size;
6273         int vtime, onintr;
6274         volatile uint16_t *flags;
6275         hrtime_t now;
6276
6277         cookie = dtrace_interrupt_disable();
6278         probe = dtrace_probes[id - 1];
6279         cpuid = CPU->cpu_id;
6280         onintr = CPU_ON_INTR(CPU);
6281
6282         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
6283             probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
6284                 /*
6285                  * We have hit in the predicate cache; we know that
6286                  * this predicate would evaluate to be false.
6287                  */
6288                 dtrace_interrupt_enable(cookie);
6289                 return;
6290         }
6291
6292         if (panic_quiesce) {
6293                 /*
6294                  * We don't trace anything if we're panicking.
6295                  */
6296                 dtrace_interrupt_enable(cookie);
6297                 return;
6298         }
6299
6300 #if !defined(__APPLE__)
6301         now = dtrace_gethrtime();
6302         vtime = dtrace_vtime_references != 0;
6303
6304         if (vtime && curthread->t_dtrace_start)
6305                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
6306 #else
6307         /*
6308          * APPLE NOTE:  The time spent entering DTrace and arriving
6309          * to this point, is attributed to the current thread.
6310          * Instead it should accrue to DTrace.  FIXME
6311          */
6312         vtime = dtrace_vtime_references != 0;
6313
6314         if (vtime)
6315         {
6316                 int64_t dtrace_accum_time, recent_vtime;
6317                 thread_t thread = current_thread();
6318
6319                 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
6320
6321                 if (dtrace_accum_time >= 0) {
6322                         recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
6323
6324                         recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
6325
6326                         dtrace_set_thread_vtime(thread, recent_vtime);
6327                 }
6328         }
6329
6330         now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
6331 #endif /* __APPLE__ */
6332
6333         /*
6334          * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
6335          * dtrace_probe() in some circumstances.   See, e.g. fasttrap_isa.c.
6336          * However the provider has no access to ECB context, so passes
6337          * 0 through "arg0" and the probe_id of the overridden probe as arg1.
6338          * Detect that here and cons up a viable state (from the probe_id).
6339          */
6340         if (dtrace_probeid_error == id && 0 == arg0) {
6341                 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
6342                 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
6343                 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
6344
6345                 if (NULL != ftp_ecb) {
6346                         dtrace_state_t *ftp_state = ftp_ecb->dte_state;
6347
6348                         arg0 = (uint64_t)(uintptr_t)ftp_state;
6349                         arg1 = ftp_ecb->dte_epid;
6350                         /*
6351                          * args[2-4] established by caller.
6352                          */
6353                         ftp_state->dts_arg_error_illval = -1; /* arg5 */
6354                 }
6355         }
6356
6357         mstate.dtms_difo = NULL;
6358         mstate.dtms_probe = probe;
6359         mstate.dtms_strtok = 0;
6360         mstate.dtms_arg[0] = arg0;
6361         mstate.dtms_arg[1] = arg1;
6362         mstate.dtms_arg[2] = arg2;
6363         mstate.dtms_arg[3] = arg3;
6364         mstate.dtms_arg[4] = arg4;
6365
6366         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
6367
6368         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
6369                 dtrace_predicate_t *pred = ecb->dte_predicate;
6370                 dtrace_state_t *state = ecb->dte_state;
6371                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
6372                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
6373                 dtrace_vstate_t *vstate = &state->dts_vstate;
6374                 dtrace_provider_t *prov = probe->dtpr_provider;
6375                 uint64_t tracememsize = 0;
6376                 int committed = 0;
6377                 caddr_t tomax;
6378
6379                 /*
6380                  * A little subtlety with the following (seemingly innocuous)
6381                  * declaration of the automatic 'val':  by looking at the
6382                  * code, you might think that it could be declared in the
6383                  * action processing loop, below.  (That is, it's only used in
6384                  * the action processing loop.)  However, it must be declared
6385                  * out of that scope because in the case of DIF expression
6386                  * arguments to aggregating actions, one iteration of the
6387                  * action loop will use the last iteration's value.
6388                  */
6389 #ifdef lint
6390                 uint64_t val = 0;
6391 #else
6392                 uint64_t val = 0;
6393 #endif
6394
6395                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
6396                 *flags &= ~CPU_DTRACE_ERROR;
6397
6398                 if (prov == dtrace_provider) {
6399                         /*
6400                          * If dtrace itself is the provider of this probe,
6401                          * we're only going to continue processing the ECB if
6402                          * arg0 (the dtrace_state_t) is equal to the ECB's
6403                          * creating state.  (This prevents disjoint consumers
6404                          * from seeing one another's metaprobes.)
6405                          */
6406                         if (arg0 != (uint64_t)(uintptr_t)state)
6407                                 continue;
6408                 }
6409
6410                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
6411                         /*
6412                          * We're not currently active.  If our provider isn't
6413                          * the dtrace pseudo provider, we're not interested.
6414                          */
6415                         if (prov != dtrace_provider)
6416                                 continue;
6417
6418                         /*
6419                          * Now we must further check if we are in the BEGIN
6420                          * probe.  If we are, we will only continue processing
6421                          * if we're still in WARMUP -- if one BEGIN enabling
6422                          * has invoked the exit() action, we don't want to
6423                          * evaluate subsequent BEGIN enablings.
6424                          */
6425                         if (probe->dtpr_id == dtrace_probeid_begin &&
6426                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
6427                                 ASSERT(state->dts_activity ==
6428                                     DTRACE_ACTIVITY_DRAINING);
6429                                 continue;
6430                         }
6431                 }
6432
6433                 if (ecb->dte_cond) {
6434                         /*
6435                          * If the dte_cond bits indicate that this
6436                          * consumer is only allowed to see user-mode firings
6437                          * of this probe, call the provider's dtps_usermode()
6438                          * entry point to check that the probe was fired
6439                          * while in a user context. Skip this ECB if that's
6440                          * not the case.
6441                          */
6442                         if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
6443                             prov->dtpv_pops.dtps_usermode &&
6444                             prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
6445                             probe->dtpr_id, probe->dtpr_arg) == 0)
6446                                 continue;
6447
6448                         /*
6449                          * This is more subtle than it looks. We have to be
6450                          * absolutely certain that CRED() isn't going to
6451                          * change out from under us so it's only legit to
6452                          * examine that structure if we're in constrained
6453                          * situations. Currently, the only times we'll do this
6454                          * check is if a non-super-user has enabled the
6455                          * profile or syscall providers -- providers that
6456                          * allow visibility of all processes. For the
6457                          * profile case, the check above will ensure that
6458                          * we're examining a user context.
6459                          */
6460                         if (ecb->dte_cond & DTRACE_COND_OWNER) {
6461                                 cred_t *cr;
6462                                 cred_t *s_cr =
6463                                     ecb->dte_state->dts_cred.dcr_cred;
6464                                 proc_t *proc;
6465 #pragma unused(proc) /* __APPLE__ */
6466
6467                                 ASSERT(s_cr != NULL);
6468
6469                         /*
6470                          * XXX this is hackish, but so is setting a variable
6471                          * XXX in a McCarthy OR...
6472                          */
6473                                 if ((cr = dtrace_CRED()) == NULL ||
6474                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
6475                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
6476                                     posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
6477                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
6478                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
6479                                     posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
6480 #if !defined(__APPLE__)
6481                                     (proc = ttoproc(curthread)) == NULL ||
6482                                     (proc->p_flag & SNOCD))
6483 #else
6484                                         1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
6485 #endif /* __APPLE__ */
6486                                         continue;
6487                         }
6488
6489                         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
6490                                 cred_t *cr;
6491                                 cred_t *s_cr =
6492                                     ecb->dte_state->dts_cred.dcr_cred;
6493 #pragma unused(cr, s_cr) /* __APPLE__ */
6494
6495                                 ASSERT(s_cr != NULL);
6496
6497 #if !defined(__APPLE__)
6498                                 if ((cr = CRED()) == NULL ||
6499                                     s_cr->cr_zone->zone_id !=
6500                                     cr->cr_zone->zone_id)
6501                                         continue;
6502 #else
6503                                 /* APPLE NOTE: Darwin doesn't do zones. */
6504 #endif /* __APPLE__ */
6505                         }
6506                 }
6507
6508                 if (now - state->dts_alive > dtrace_deadman_timeout) {
6509                         /*
6510                          * We seem to be dead.  Unless we (a) have kernel
6511                          * destructive permissions (b) have explicitly enabled
6512                          * destructive actions and (c) destructive actions have
6513                          * not been disabled, we're going to transition into
6514                          * the KILLED state, from which no further processing
6515                          * on this state will be performed.
6516                          */
6517                         if (!dtrace_priv_kernel_destructive(state) ||
6518                             !state->dts_cred.dcr_destructive ||
6519                             dtrace_destructive_disallow) {
6520                                 void *activity = &state->dts_activity;
6521                                 dtrace_activity_t current;
6522
6523                                 do {
6524                                         current = state->dts_activity;
6525                                 } while (dtrace_cas32(activity, current,
6526                                     DTRACE_ACTIVITY_KILLED) != current);
6527
6528                                 continue;
6529                         }
6530                 }
6531
6532                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
6533                     ecb->dte_alignment, state, &mstate)) < 0)
6534                         continue;
6535
6536                 tomax = buf->dtb_tomax;
6537                 ASSERT(tomax != NULL);
6538
6539                 /*
6540                  * Build and store the record header corresponding to the ECB.
6541                  */
6542                 if (ecb->dte_size != 0) {
6543                         dtrace_rechdr_t dtrh;
6544
6545                         if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
6546                                 mstate.dtms_timestamp = dtrace_gethrtime();
6547                                 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
6548                         }
6549
6550                         ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
6551
6552                         dtrh.dtrh_epid = ecb->dte_epid;
6553                         DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
6554                         DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
6555                 }
6556
6557                 mstate.dtms_epid = ecb->dte_epid;
6558                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
6559
6560                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
6561                         mstate.dtms_access = DTRACE_ACCESS_KERNEL;
6562                 else
6563                         mstate.dtms_access = 0;
6564
6565                 if (pred != NULL) {
6566                         dtrace_difo_t *dp = pred->dtp_difo;
6567                         uint64_t rval;
6568
6569                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
6570
6571                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
6572                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
6573
6574                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
6575                                         /*
6576                                          * Update the predicate cache...
6577                                          */
6578                                         ASSERT(cid == pred->dtp_cacheid);
6579
6580                                         dtrace_set_thread_predcache(current_thread(), cid);
6581                                 }
6582
6583                                 continue;
6584                         }
6585                 }
6586
6587                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
6588                     act != NULL; act = act->dta_next) {
6589                         size_t valoffs;
6590                         dtrace_difo_t *dp;
6591                         dtrace_recdesc_t *rec = &act->dta_rec;
6592
6593                         size = rec->dtrd_size;
6594                         valoffs = offs + rec->dtrd_offset;
6595
6596                         if (DTRACEACT_ISAGG(act->dta_kind)) {
6597                                 uint64_t v = 0xbad;
6598                                 dtrace_aggregation_t *agg;
6599
6600                                 agg = (dtrace_aggregation_t *)act;
6601
6602                                 if ((dp = act->dta_difo) != NULL)
6603                                         v = dtrace_dif_emulate(dp,
6604                                             &mstate, vstate, state);
6605
6606                                 if (*flags & CPU_DTRACE_ERROR)
6607                                         continue;
6608
6609                                 /*
6610                                  * Note that we always pass the expression
6611                                  * value from the previous iteration of the
6612                                  * action loop.  This value will only be used
6613                                  * if there is an expression argument to the
6614                                  * aggregating action, denoted by the
6615                                  * dtag_hasarg field.
6616                                  */
6617                                 dtrace_aggregate(agg, buf,
6618                                     offs, aggbuf, v, val);
6619                                 continue;
6620                         }
6621
6622                         switch (act->dta_kind) {
6623                         case DTRACEACT_STOP:
6624                                 if (dtrace_priv_proc_destructive(state))
6625                                         dtrace_action_stop();
6626                                 continue;
6627
6628                         case DTRACEACT_BREAKPOINT:
6629                                 if (dtrace_priv_kernel_destructive(state))
6630                                         dtrace_action_breakpoint(ecb);
6631                                 continue;
6632
6633                         case DTRACEACT_PANIC:
6634                                 if (dtrace_priv_kernel_destructive(state))
6635                                         dtrace_action_panic(ecb);
6636                                 continue;
6637
6638                         case DTRACEACT_STACK:
6639                                 if (!dtrace_priv_kernel(state))
6640                                         continue;
6641
6642                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
6643                                     size / sizeof (pc_t), probe->dtpr_aframes,
6644                                     DTRACE_ANCHORED(probe) ? NULL :
6645                                   (uint32_t *)(uintptr_t)arg0);
6646                                 continue;
6647
6648                         case DTRACEACT_JSTACK:
6649                         case DTRACEACT_USTACK:
6650                                 if (!dtrace_priv_proc(state))
6651                                         continue;
6652
6653                                 /*
6654                                  * See comment in DIF_VAR_PID.
6655                                  */
6656                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
6657                                     CPU_ON_INTR(CPU)) {
6658                                         int depth = DTRACE_USTACK_NFRAMES(
6659                                             rec->dtrd_arg) + 1;
6660
6661                                         dtrace_bzero((void *)(tomax + valoffs),
6662                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
6663                                             + depth * sizeof (uint64_t));
6664
6665                                         continue;
6666                                 }
6667
6668                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
6669                                     curproc->p_dtrace_helpers != NULL) {
6670                                         /*
6671                                          * This is the slow path -- we have
6672                                          * allocated string space, and we're
6673                                          * getting the stack of a process that
6674                                          * has helpers.  Call into a separate
6675                                          * routine to perform this processing.
6676                                          */
6677                                         dtrace_action_ustack(&mstate, state,
6678                                             (uint64_t *)(tomax + valoffs),
6679                                             rec->dtrd_arg);
6680                                         continue;
6681                                 }
6682
6683                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6684                                 dtrace_getupcstack((uint64_t *)
6685                                     (tomax + valoffs),
6686                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
6687                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6688                                 continue;
6689
6690                         default:
6691                                 break;
6692                         }
6693
6694                         dp = act->dta_difo;
6695                         ASSERT(dp != NULL);
6696
6697                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
6698
6699                         if (*flags & CPU_DTRACE_ERROR)
6700                                 continue;
6701
6702                         switch (act->dta_kind) {
6703                         case DTRACEACT_SPECULATE: {
6704                                 dtrace_rechdr_t *dtrh = NULL;
6705
6706                                 ASSERT(buf == &state->dts_buffer[cpuid]);
6707                                 buf = dtrace_speculation_buffer(state,
6708                                     cpuid, val);
6709
6710                                 if (buf == NULL) {
6711                                         *flags |= CPU_DTRACE_DROP;
6712                                         continue;
6713                                 }
6714
6715                                 offs = dtrace_buffer_reserve(buf,
6716                                     ecb->dte_needed, ecb->dte_alignment,
6717                                     state, NULL);
6718
6719                                 if (offs < 0) {
6720                                         *flags |= CPU_DTRACE_DROP;
6721                                         continue;
6722                                 }
6723
6724                                 tomax = buf->dtb_tomax;
6725                                 ASSERT(tomax != NULL);
6726
6727                                 if (ecb->dte_size == 0)
6728                                         continue;
6729
6730                                 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
6731                                 dtrh = ((void *)(tomax + offs));
6732                                 dtrh->dtrh_epid = ecb->dte_epid;
6733
6734                                 /*
6735                                  * When the speculation is committed, all of
6736                                  * the records in the speculative buffer will
6737                                  * have their timestamps set to the commit
6738                                  * time.  Until then, it is set to a sentinel
6739                                  * value, for debugability.
6740                                  */
6741                                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
6742
6743                                 continue;
6744                         }
6745
6746                         case DTRACEACT_CHILL:
6747                                 if (dtrace_priv_kernel_destructive(state))
6748                                         dtrace_action_chill(&mstate, val);
6749                                 continue;
6750
6751                         case DTRACEACT_RAISE:
6752                                 if (dtrace_priv_proc_destructive(state))
6753                                         dtrace_action_raise(val);
6754                                 continue;
6755
6756                         case DTRACEACT_PIDRESUME:   /* __APPLE__ */
6757                                 if (dtrace_priv_proc_destructive(state))
6758                                         dtrace_action_pidresume(val);
6759                                 continue;
6760
6761                         case DTRACEACT_COMMIT:
6762                                 ASSERT(!committed);
6763
6764                                 /*
6765                                  * We need to commit our buffer state.
6766                                  */
6767                                 if (ecb->dte_size)
6768                                         buf->dtb_offset = offs + ecb->dte_size;
6769                                 buf = &state->dts_buffer[cpuid];
6770                                 dtrace_speculation_commit(state, cpuid, val);
6771                                 committed = 1;
6772                                 continue;
6773
6774                         case DTRACEACT_DISCARD:
6775                                 dtrace_speculation_discard(state, cpuid, val);
6776                                 continue;
6777
6778                         case DTRACEACT_DIFEXPR:
6779                         case DTRACEACT_LIBACT:
6780                         case DTRACEACT_PRINTF:
6781                         case DTRACEACT_PRINTA:
6782                         case DTRACEACT_SYSTEM:
6783                         case DTRACEACT_FREOPEN:
6784                         case DTRACEACT_APPLEBINARY:   /* __APPLE__ */
6785                         case DTRACEACT_TRACEMEM:
6786                                 break;
6787
6788                         case DTRACEACT_TRACEMEM_DYNSIZE:
6789                                 tracememsize = val;
6790                                 break;
6791
6792                         case DTRACEACT_SYM:
6793                         case DTRACEACT_MOD:
6794                                 if (!dtrace_priv_kernel(state))
6795                                         continue;
6796                                 break;
6797
6798                         case DTRACEACT_USYM:
6799                         case DTRACEACT_UMOD:
6800                         case DTRACEACT_UADDR: {
6801                                 if (!dtrace_priv_proc(state))
6802                                         continue;
6803
6804                                 DTRACE_STORE(uint64_t, tomax,
6805                                     valoffs, (uint64_t)dtrace_proc_selfpid());
6806                                 DTRACE_STORE(uint64_t, tomax,
6807                                     valoffs + sizeof (uint64_t), val);
6808
6809                                 continue;
6810                         }
6811
6812                         case DTRACEACT_EXIT: {
6813                                 /*
6814                                  * For the exit action, we are going to attempt
6815                                  * to atomically set our activity to be
6816                                  * draining.  If this fails (either because
6817                                  * another CPU has beat us to the exit action,
6818                                  * or because our current activity is something
6819                                  * other than ACTIVE or WARMUP), we will
6820                                  * continue.  This assures that the exit action
6821                                  * can be successfully recorded at most once
6822                                  * when we're in the ACTIVE state.  If we're
6823                                  * encountering the exit() action while in
6824                                  * COOLDOWN, however, we want to honor the new
6825                                  * status code.  (We know that we're the only
6826                                  * thread in COOLDOWN, so there is no race.)
6827                                  */
6828                                 void *activity = &state->dts_activity;
6829                                 dtrace_activity_t current = state->dts_activity;
6830
6831                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
6832                                         break;
6833
6834                                 if (current != DTRACE_ACTIVITY_WARMUP)
6835                                         current = DTRACE_ACTIVITY_ACTIVE;
6836
6837                                 if (dtrace_cas32(activity, current,
6838                                     DTRACE_ACTIVITY_DRAINING) != current) {
6839                                         *flags |= CPU_DTRACE_DROP;
6840                                         continue;
6841                                 }
6842
6843                                 break;
6844                         }
6845
6846                         default:
6847                                 ASSERT(0);
6848                         }
6849
6850                         if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
6851                                 uintptr_t end = valoffs + size;
6852
6853                                 if (tracememsize != 0 &&
6854                                     valoffs + tracememsize < end)
6855                                 {
6856                                         end = valoffs + tracememsize;
6857                                         tracememsize = 0;
6858                                 }
6859
6860                                 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
6861                                     !dtrace_vcanload((void *)(uintptr_t)val,
6862                                     &dp->dtdo_rtype, NULL, &mstate, vstate))
6863                                 {
6864                                         continue;
6865                                 }
6866
6867                                 dtrace_store_by_ref(dp, tomax, size, &valoffs,
6868                                     &val, end, act->dta_intuple,
6869                                     dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
6870                                     DIF_TF_BYREF: DIF_TF_BYUREF);
6871
6872                                 continue;
6873                         }
6874
6875                         switch (size) {
6876                         case 0:
6877                                 break;
6878
6879                         case sizeof (uint8_t):
6880                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
6881                                 break;
6882                         case sizeof (uint16_t):
6883                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
6884                                 break;
6885                         case sizeof (uint32_t):
6886                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
6887                                 break;
6888                         case sizeof (uint64_t):
6889                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
6890                                 break;
6891                         default:
6892                                 /*
6893                                  * Any other size should have been returned by
6894                                  * reference, not by value.
6895                                  */
6896                                 ASSERT(0);
6897                                 break;
6898                         }
6899                 }
6900
6901                 if (*flags & CPU_DTRACE_DROP)
6902                         continue;
6903
6904                 if (*flags & CPU_DTRACE_FAULT) {
6905                         int ndx;
6906                         dtrace_action_t *err;
6907
6908                         buf->dtb_errors++;
6909
6910                         if (probe->dtpr_id == dtrace_probeid_error) {
6911                                 /*
6912                                  * There's nothing we can do -- we had an
6913                                  * error on the error probe.  We bump an
6914                                  * error counter to at least indicate that
6915                                  * this condition happened.
6916                                  */
6917                                 dtrace_error(&state->dts_dblerrors);
6918                                 continue;
6919                         }
6920
6921                         if (vtime) {
6922                                 /*
6923                                  * Before recursing on dtrace_probe(), we
6924                                  * need to explicitly clear out our start
6925                                  * time to prevent it from being accumulated
6926                                  * into t_dtrace_vtime.
6927                                  */
6928
6929                                 /*
6930                                  * Darwin sets the sign bit on t_dtrace_tracing
6931                                  * to suspend accumulation to it.
6932                                  */
6933                                 dtrace_set_thread_tracing(current_thread(),
6934                                     (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
6935
6936                         }
6937
6938                         /*
6939                          * Iterate over the actions to figure out which action
6940                          * we were processing when we experienced the error.
6941                          * Note that act points _past_ the faulting action; if
6942                          * act is ecb->dte_action, the fault was in the
6943                          * predicate, if it's ecb->dte_action->dta_next it's
6944                          * in action #1, and so on.
6945                          */
6946                         for (err = ecb->dte_action, ndx = 0;
6947                             err != act; err = err->dta_next, ndx++)
6948                                 continue;
6949
6950                         dtrace_probe_error(state, ecb->dte_epid, ndx,
6951                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
6952                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
6953                             cpu_core[cpuid].cpuc_dtrace_illval);
6954
6955                         continue;
6956                 }
6957
6958                 if (!committed)
6959                         buf->dtb_offset = offs + ecb->dte_size;
6960         }
6961
6962         /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti is attributed
6963            to the current thread. Instead it should accrue to DTrace. */
6964         if (vtime) {
6965                 thread_t thread = current_thread();
6966                 int64_t t = dtrace_get_thread_tracing(thread);
6967
6968                 if (t >= 0) {
6969                         /* Usual case, accumulate time spent here into t_dtrace_tracing */
6970                         dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
6971                 } else {
6972                         /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
6973                         dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
6974                 }
6975         }
6976
6977         dtrace_interrupt_enable(cookie);
6978 }
6979
6980 /*
6981  * APPLE NOTE:  Don't allow a thread to re-enter dtrace_probe().
6982  * This could occur if a probe is encountered on some function in the
6983  * transitive closure of the call to dtrace_probe().
6984  * Solaris has some strong guarantees that this won't happen.
6985  * The Darwin implementation is not so mature as to make those guarantees.
6986  * Hence, the introduction of __dtrace_probe() on xnu.
6987  */
6988
6989 void
6990 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
6991     uint64_t arg2, uint64_t arg3, uint64_t arg4)
6992 {
6993         thread_t thread = current_thread();
6994         disable_preemption();
6995         if (id == dtrace_probeid_error) {
6996                 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
6997                 dtrace_getipl(); /* Defeat tail-call optimization of __dtrace_probe() */
6998         } else if (!dtrace_get_thread_reentering(thread)) {
6999                 dtrace_set_thread_reentering(thread, TRUE);
7000                 __dtrace_probe(id, arg0, arg1, arg2, arg3, arg4);
7001                 dtrace_set_thread_reentering(thread, FALSE);
7002         }
7003 #if DEBUG
7004         else __dtrace_probe(dtrace_probeid_error, 0, id, 1, -1, DTRACEFLT_UNKNOWN);
7005 #endif
7006         enable_preemption();
7007 }
7008
7009 /*
7010  * DTrace Probe Hashing Functions
7011  *
7012  * The functions in this section (and indeed, the functions in remaining
7013  * sections) are not _called_ from probe context.  (Any exceptions to this are
7014  * marked with a "Note:".)  Rather, they are called from elsewhere in the
7015  * DTrace framework to look-up probes in, add probes to and remove probes from
7016  * the DTrace probe hashes.  (Each probe is hashed by each element of the
7017  * probe tuple -- allowing for fast lookups, regardless of what was
7018  * specified.)
7019  */
7020 static uint_t
7021 dtrace_hash_str(const char *p)
7022 {
7023         unsigned int g;
7024         uint_t hval = 0;
7025
7026         while (*p) {
7027                 hval = (hval << 4) + *p++;
7028                 if ((g = (hval & 0xf0000000)) != 0)
7029                         hval ^= g >> 24;
7030                 hval &= ~g;
7031         }
7032         return (hval);
7033 }
7034
7035 static const char*
7036 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7037 {
7038 #pragma unused(offs)
7039         dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7040         return probe->dtpr_provider->dtpv_name;
7041 }
7042
/*
 * String-key accessor: the key is stored inline in the element, offs bytes
 * from its start.
 */
static const char*
dtrace_strkey_offset(void *elm, uintptr_t offs)
{
        uintptr_t addr = (uintptr_t)elm + offs;

        return ((char *)addr);
}
7048
/*
 * String-key accessor: the element holds a pointer to the key, offs bytes
 * from its start; load that pointer and return it.
 */
static const char*
dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
{
        char **slot = (char **)((uintptr_t)elm + offs);

        return (*slot);
}
7054
7055 static dtrace_hash_t *
7056 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7057 {
7058         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7059
7060         hash->dth_getstr = func;
7061         hash->dth_stroffs = arg;
7062         hash->dth_nextoffs = nextoffs;
7063         hash->dth_prevoffs = prevoffs;
7064
7065         hash->dth_size = 1;
7066         hash->dth_mask = hash->dth_size - 1;
7067
7068         hash->dth_tab = kmem_zalloc(hash->dth_size *
7069             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7070
7071         return (hash);
7072 }
7073
7074 /*
7075  * APPLE NOTE: dtrace_hash_destroy is not used.
7076  * It is called by dtrace_detach which is not
7077  * currently implemented.  Revisit someday.
7078  */
7079 #if !defined(__APPLE__)
7080 static void
7081 dtrace_hash_destroy(dtrace_hash_t *hash)
7082 {
7083 #if DEBUG
7084         int i;
7085
7086         for (i = 0; i < hash->dth_size; i++)
7087                 ASSERT(hash->dth_tab[i] == NULL);
7088 #endif
7089
7090         kmem_free(hash->dth_tab,
7091             hash->dth_size * sizeof (dtrace_hashbucket_t *));
7092         kmem_free(hash, sizeof (dtrace_hash_t));
7093 }
7094 #endif /* __APPLE__ */
7095
7096 static void
7097 dtrace_hash_resize(dtrace_hash_t *hash)
7098 {
7099         int size = hash->dth_size, i, ndx;
7100         int new_size = hash->dth_size << 1;
7101         int new_mask = new_size - 1;
7102         dtrace_hashbucket_t **new_tab, *bucket, *next;
7103
7104         ASSERT((new_size & new_mask) == 0);
7105
7106         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7107
7108         for (i = 0; i < size; i++) {
7109                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7110                         void *elm = bucket->dthb_chain;
7111
7112                         ASSERT(elm != NULL);
7113                         ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7114
7115                         next = bucket->dthb_next;
7116                         bucket->dthb_next = new_tab[ndx];
7117                         new_tab[ndx] = bucket;
7118                 }
7119         }
7120
7121         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7122         hash->dth_tab = new_tab;
7123         hash->dth_size = new_size;
7124         hash->dth_mask = new_mask;
7125 }
7126
7127 static void
7128 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7129 {
7130         int hashval = DTRACE_HASHSTR(hash, new);
7131         int ndx = hashval & hash->dth_mask;
7132         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7133         void **nextp, **prevp;
7134
7135         for (; bucket != NULL; bucket = bucket->dthb_next) {
7136                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7137                         goto add;
7138         }
7139
7140         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7141                 dtrace_hash_resize(hash);
7142                 dtrace_hash_add(hash, new);
7143                 return;
7144         }
7145
7146         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7147         bucket->dthb_next = hash->dth_tab[ndx];
7148         hash->dth_tab[ndx] = bucket;
7149         hash->dth_nbuckets++;
7150
7151 add:
7152         nextp = DTRACE_HASHNEXT(hash, new);
7153         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7154         *nextp = bucket->dthb_chain;
7155
7156         if (bucket->dthb_chain != NULL) {
7157                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7158                 ASSERT(*prevp == NULL);
7159                 *prevp = new;
7160         }
7161
7162         bucket->dthb_chain = new;
7163         bucket->dthb_len++;
7164 }
7165
7166 static void *
7167 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7168 {
7169         int hashval = dtrace_hash_str(str);
7170         int ndx = hashval & hash->dth_mask;
7171         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7172
7173         for (; bucket != NULL; bucket = bucket->dthb_next) {
7174                 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7175                         return (bucket->dthb_chain);
7176         }
7177
7178         return (NULL);
7179 }
7180
7181 static dtrace_probe_t *
7182 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7183 {
7184         return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7185 }
7186
7187 static int
7188 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7189 {
7190         int hashval = DTRACE_HASHSTR(hash, template);
7191         int ndx = hashval & hash->dth_mask;
7192         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7193
7194         for (; bucket != NULL; bucket = bucket->dthb_next) {
7195                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7196                         return (bucket->dthb_len);
7197         }
7198
7199         return (0);
7200 }
7201
7202 static void
7203 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7204 {
7205         int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7206         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7207
7208         void **prevp = DTRACE_HASHPREV(hash, elm);
7209         void **nextp = DTRACE_HASHNEXT(hash, elm);
7210
7211         /*
7212          * Find the bucket that we're removing this elm from.
7213          */
7214         for (; bucket != NULL; bucket = bucket->dthb_next) {
7215                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7216                         break;
7217         }
7218
7219         ASSERT(bucket != NULL);
7220
7221         if (*prevp == NULL) {
7222                 if (*nextp == NULL) {
7223                         /*
7224                          * The removed element was the only element on this
7225                          * bucket; we need to remove the bucket.
7226                          */
7227                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7228
7229                         ASSERT(bucket->dthb_chain == elm);
7230                         ASSERT(b != NULL);
7231
7232                         if (b == bucket) {
7233                                 hash->dth_tab[ndx] = bucket->dthb_next;
7234                         } else {
7235                                 while (b->dthb_next != bucket)
7236                                         b = b->dthb_next;
7237                                 b->dthb_next = bucket->dthb_next;
7238                         }
7239
7240                         ASSERT(hash->dth_nbuckets > 0);
7241                         hash->dth_nbuckets--;
7242                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7243                         return;
7244                 }
7245
7246                 bucket->dthb_chain = *nextp;
7247         } else {
7248                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7249         }
7250
7251         if (*nextp != NULL)
7252                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7253 }
7254
7255 /*
7256  * DTrace Utility Functions
7257  *
7258  * These are random utility functions that are _not_ called from probe context.
7259  */
7260 static int
7261 dtrace_badattr(const dtrace_attribute_t *a)
7262 {
7263         return (a->dtat_name > DTRACE_STABILITY_MAX ||
7264             a->dtat_data > DTRACE_STABILITY_MAX ||
7265             a->dtat_class > DTRACE_CLASS_MAX);
7266 }
7267
7268 /*
7269  * Returns a dtrace-managed copy of a string, and will
7270  * deduplicate copies of the same string.
7271  * If the specified string is NULL, returns an empty string
7272  */
7273 static char *
7274 dtrace_strref(const char *str)
7275 {
7276         dtrace_string_t *s = NULL;
7277         size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
7278
7279         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7280
7281         if (str == NULL)
7282                 str = "";
7283
7284         for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7285              s = *(DTRACE_HASHNEXT(dtrace_strings, s)))  {
7286                 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7287                         continue;
7288                 }
7289                 ASSERT(s->dtst_refcount != UINT32_MAX);
7290                 s->dtst_refcount++;
7291                 return s->dtst_str;
7292         }
7293
7294         s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
7295         s->dtst_refcount = 1;
7296         (void) strlcpy(s->dtst_str, str, bufsize);
7297
7298         dtrace_hash_add(dtrace_strings, s);
7299
7300         return s->dtst_str;
7301 }
7302
7303 static void
7304 dtrace_strunref(const char *str)
7305 {
7306         ASSERT(str != NULL);
7307         dtrace_string_t *s = NULL;
7308         size_t bufsize = strlen(str) + 1;
7309
7310         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7311
7312         for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
7313              s = *(DTRACE_HASHNEXT(dtrace_strings, s)))  {
7314                 if (strncmp(str, s->dtst_str, bufsize) != 0) {
7315                         continue;
7316                 }
7317                 ASSERT(s->dtst_refcount != 0);
7318                 s->dtst_refcount--;
7319                 if (s->dtst_refcount == 0) {
7320                         dtrace_hash_remove(dtrace_strings, s);
7321                         kmem_free(s, sizeof(dtrace_string_t) + bufsize);
7322                 }
7323                 return;
7324         }
7325         panic("attempt to unref non-existent string %s", str);
7326 }
7327
#define DTRACE_ISALPHA(c)       \
        (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Check a name for illegal characters.  Returns 1 if the name is bad and 0
 * if it is acceptable.
 *
 * A NULL or empty name is accepted.  Otherwise the first character must be
 * an ASCII letter, '-', '_' or '.'; each later character may additionally
 * be a decimal digit or a backquote.
 */
static int
dtrace_badname(const char *s)
{
        const char *cur;

        if (s == NULL || *s == '\0')
                return (0);

        /* Leading character: letters and the three punctuation marks only. */
        switch (*s) {
        case '-':
        case '_':
        case '.':
                break;
        default:
                if (!DTRACE_ISALPHA(*s))
                        return (1);
                break;
        }

        /* Remaining characters: digits and '`' are permitted as well. */
        for (cur = s + 1; *cur != '\0'; cur++) {
                char c = *cur;

                if (DTRACE_ISALPHA(c) || (c >= '0' && c <= '9'))
                        continue;

                if (c == '-' || c == '_' || c == '.' || c == '`')
                        continue;

                return (1);
        }

        return (0);
}
7350
7351 static void
7352 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
7353 {
7354         uint32_t priv;
7355
7356         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
7357                 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
7358                         priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
7359                 }
7360                 else {
7361                         priv = DTRACE_PRIV_ALL;
7362                 }
7363                 *uidp = 0;
7364                 *zoneidp = 0;
7365         } else {
7366                 *uidp = crgetuid(cr);
7367                 *zoneidp = crgetzoneid(cr);
7368
7369                 priv = 0;
7370                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
7371                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
7372                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
7373                         priv |= DTRACE_PRIV_USER;
7374                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
7375                         priv |= DTRACE_PRIV_PROC;
7376                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
7377                         priv |= DTRACE_PRIV_OWNER;
7378                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
7379                         priv |= DTRACE_PRIV_ZONEOWNER;
7380         }
7381
7382         *privp = priv;
7383 }
7384
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of the error message str.  The message and calling
 * thread are remembered as the most recent error, and a per-message counter
 * is kept in dtrace_errhash, an open-addressed table with linear probing
 * that is keyed by the message's pointer value (the string's hash only
 * picks the starting slot).  Panics if every slot is taken by some other
 * message.  All of this happens under dtrace_errlock.
 */
static void
dtrace_errdebug(const char *str)
{
        int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
        int tries;

        lck_mtx_lock(&dtrace_errlock);
        dtrace_errlast = str;
        dtrace_errthread = (kthread_t *)current_thread();

        for (tries = 0; tries < DTRACE_ERRHASHSZ; tries++) {
                if (dtrace_errhash[hval].dter_msg == str) {
                        /* Seen before: bump its counter. */
                        dtrace_errhash[hval].dter_count++;
                        break;
                }

                if (dtrace_errhash[hval].dter_msg == NULL) {
                        /* Free slot: claim it for this message. */
                        dtrace_errhash[hval].dter_msg = str;
                        dtrace_errhash[hval].dter_count = 1;
                        break;
                }

                /* Slot belongs to another message; try the next one. */
                hval = (hval + 1) % DTRACE_ERRHASHSZ;
        }

        if (tries >= DTRACE_ERRHASHSZ)
                panic("dtrace: undersized error hash");

        lck_mtx_unlock(&dtrace_errlock);
}
#endif
7417
7418 /*
7419  * DTrace Matching Functions
7420  *
7421  * These functions are used to match groups of probes, given some elements of
7422  * a probe tuple, or some globbed expressions for elements of a probe tuple.
7423  */
7424 static int
7425 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
7426     zoneid_t zoneid)
7427 {
7428         if (priv != DTRACE_PRIV_ALL) {
7429                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
7430                 uint32_t match = priv & ppriv;
7431
7432                 /*
7433                  * No PRIV_DTRACE_* privileges...
7434                  */
7435                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
7436                     DTRACE_PRIV_KERNEL)) == 0)
7437                         return (0);
7438
7439                 /*
7440                  * No matching bits, but there were bits to match...
7441                  */
7442                 if (match == 0 && ppriv != 0)
7443                         return (0);
7444
7445                 /*
7446                  * Need to have permissions to the process, but don't...
7447                  */
7448                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
7449                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
7450                         return (0);
7451                 }
7452
7453                 /*
7454                  * Need to be in the same zone unless we possess the
7455                  * privilege to examine all zones.
7456                  */
7457                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
7458                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
7459                         return (0);
7460                 }
7461         }
7462
7463         return (1);
7464 }
7465
7466 /*
7467  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
7468  * consists of input pattern strings and an ops-vector to evaluate them.
7469  * This function returns >0 for match, 0 for no match, and <0 for error.
7470  */
7471 static int
7472 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
7473     uint32_t priv, uid_t uid, zoneid_t zoneid)
7474 {
7475         dtrace_provider_t *pvp = prp->dtpr_provider;
7476         int rv;
7477
7478         if (pvp->dtpv_defunct)
7479                 return (0);
7480
7481         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
7482                 return (rv);
7483
7484         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
7485                 return (rv);
7486
7487         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
7488                 return (rv);
7489
7490         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
7491                 return (rv);
7492
7493         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
7494                 return (0);
7495
7496         return (rv);
7497 }
7498
7499 /*
7500  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
7501  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
7502  * libc's version, the kernel version only applies to 8-bit ASCII strings.
7503  * In addition, all of the recursion cases except for '*' matching have been
7504  * unwound.  For '*', we still implement recursive evaluation, but a depth
7505  * counter is maintained and matching is aborted if we recurse too deep.
7506  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
 *
 * s     - input string; a NULL string is treated as the empty string.
 * p     - glob pattern; a NULL pattern never matches.
 * depth - current recursion depth; the function calls itself with depth + 1
 *         and gives up (-1) once depth exceeds DTRACE_PROBEKEY_MAXDEPTH.
 *
 * Pattern elements handled below: '?' (any single character), '*' (any run
 * of characters, including none), '[...]' (character class with optional
 * leading '!' negation and 'a-z' style ranges) and '\' (take the following
 * pattern character literally).
7507  */
7508 static int
7509 dtrace_match_glob(const char *s, const char *p, int depth)
7510 {
7511         const char *olds;
7512         char s1, c;
7513         int gs;
7514
7515         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
7516                 return (-1);
7517
7518         if (s == NULL)
7519                 s = ""; /* treat NULL as empty string */
7520
             /*
              * Each pass through 'top' consumes one character of the string
              * (s1) and one element of the pattern.  'olds' remembers where
              * s1 was read from so that '*' can restart from that position.
              */
7521 top:
7522         olds = s;
7523         s1 = *s++;
7524
7525         if (p == NULL)
7526                 return (0);
7527
             /* Pattern exhausted: a match only if the string ended too. */
7528         if ((c = *p++) == '\0')
7529                 return (s1 == '\0');
7530
7531         switch (c) {
7532         case '[': {
7533                 int ok = 0, notflag = 0;
7534                 char lc = '\0';
7535
                     /* A class must consume a character. */
7536                 if (s1 == '\0')
7537                         return (0);
7538
                     /* A leading '!' negates the class. */
7539                 if (*p == '!') {
7540                         notflag = 1;
7541                         p++;
7542                 }
7543
                     /* An unterminated class never matches. */
7544                 if ((c = *p++) == '\0')
7545                         return (0);
7546
7547                 do {
                             /*
                              * "lc-c" range: lc is the previous class member
                              * and the character after '-' becomes the upper
                              * bound.  A '-' that starts the class or that
                              * directly precedes ']' is an ordinary member.
                              */
7548                         if (c == '-' && lc != '\0' && *p != ']') {
7549                                 if ((c = *p++) == '\0')
7550                                         return (0);
7551                                 if (c == '\\' && (c = *p++) == '\0')
7552                                         return (0);
7553
7554                                 if (notflag) {
7555                                         if (s1 < lc || s1 > c)
7556                                                 ok++;
7557                                         else
7558                                                 return (0);
7559                                 } else if (lc <= s1 && s1 <= c)
7560                                         ok++;
7561
7562                         } else if (c == '\\' && (c = *p++) == '\0')
7563                                 return (0);
7564
7565                         lc = c; /* save left-hand 'c' for next iteration */
7566
                             /*
                              * Single-member test.  In a negated class any
                              * listed character that equals s1 rejects the
                              * match outright.
                              */
7567                         if (notflag) {
7568                                 if (s1 != c)
7569                                         ok++;
7570                                 else
7571                                         return (0);
7572                         } else if (s1 == c)
7573                                 ok++;
7574
7575                         if ((c = *p++) == '\0')
7576                                 return (0);
7577
7578                 } while (c != ']');
7579
7580                 if (ok)
7581                         goto top;
7582
7583                 return (0);
7584         }
7585
7586         case '\\':
                     /* Escape: compare the next pattern character literally. */
7587                 if ((c = *p++) == '\0')
7588                         return (0);
7589                 /*FALLTHRU*/
7590
7591         default:
7592                 if (c != s1)
7593                         return (0);
7594                 /*FALLTHRU*/
7595
7596         case '?':
                     /* Any one character matches, but there must be one. */
7597                 if (s1 != '\0')
7598                         goto top;
7599                 return (0);
7600
7601         case '*':
7602                 while (*p == '*')
7603                         p++; /* consecutive *'s are identical to a single one */
7604
                     /* A trailing '*' matches whatever is left of the string. */
7605                 if (*p == '\0')
7606                         return (1);
7607
                     /*
                      * Otherwise try the remainder of the pattern against every
                      * suffix of the string, starting at the character read on
                      * this pass.  Any non-zero result (a match or a recursion
                      * error) is returned to the caller as-is.
                      */
7608                 for (s = olds; *s != '\0'; s++) {
7609                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
7610                                 return (gs);
7611                 }
7612
7613                 return (0);
7614         }
7615 }
7616
/*
 * Exact-match comparator.  Succeeds only when 's' is non-NULL and is the very
 * same pointer as 'p'; the characters themselves are never examined.  This
 * relies on both strings being canonical pointers (the keys built by
 * dtrace_probekey() come from dtrace_strref(), which presumably hands out one
 * pointer per distinct string).  'depth' is ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
        if (s == NULL)
                return (0);

        return (s == p);
}
7624
/*
 * Module-name comparator.  'p' matches 's' when 's' begins with 'p' and the
 * character of 's' immediately after that prefix is either the end of the
 * string or a '.'.  A NULL string or NULL pattern never matches.  'depth' is
 * ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_module(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
        size_t plen;
        char tail;

        if (p == NULL || s == NULL)
                return (0);

        plen = strlen(p);
        if (strncmp(p, s, plen) != 0)
                return (0);

        /* The prefix matched, so s[plen] is within (or terminates) 's'. */
        tail = s[plen];

        return (tail == '\0' || tail == '.');
}
7644
/*
 * Comparator used for an empty pattern: every input matches, so all three
 * arguments are ignored and the result is always 1.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
#pragma unused(s, p, depth) /* __APPLE__ */
        /* An empty pattern places no constraint on the field. */
        return (1);
}
7652
/*
 * Comparator that accepts any non-empty input string.  Returns 1 when 's' is
 * non-NULL and has at least one character, 0 otherwise.  The pattern and the
 * depth are ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
#pragma unused(p, depth) /* __APPLE__ */
        if (s == NULL)
                return (0);

        return (*s != '\0');
}
7660
7661 static int
7662 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
7663     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
7664 {
7665         dtrace_probe_t *probe;
7666         dtrace_provider_t prov_template = {
7667                 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
7668         };
7669
7670         dtrace_probe_t template = {
7671                 .dtpr_provider = &prov_template,
7672                 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
7673                 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
7674                 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
7675         };
7676
7677         dtrace_hash_t *hash = NULL;
7678         int len, rc, best = INT_MAX, nmatched = 0;
7679         dtrace_id_t i;
7680
7681         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7682
7683         /*
7684          * If the probe ID is specified in the key, just lookup by ID and
7685          * invoke the match callback once if a matching probe is found.
7686          */
7687         if (pkp->dtpk_id != DTRACE_IDNONE) {
7688                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
7689                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
7690                         if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
7691                                return (DTRACE_MATCH_FAIL);
7692                         nmatched++;
7693                 }
7694                 return (nmatched);
7695         }
7696
7697         /*
7698          * We want to find the most distinct of the provider name, module name,
7699          * function name, and name.  So for each one that is not a glob
7700          * pattern or empty string, we perform a lookup in the corresponding
7701          * hash and use the hash table with the fewest collisions to do our
7702          * search.
7703          */
7704         if (pkp->dtpk_pmatch == &dtrace_match_string &&
7705             (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
7706                 best = len;
7707                 hash = dtrace_byprov;
7708         }
7709
7710         if (pkp->dtpk_mmatch == &dtrace_match_string &&
7711             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
7712                 best = len;
7713                 hash = dtrace_bymod;
7714         }
7715
7716         if (pkp->dtpk_fmatch == &dtrace_match_string &&
7717             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
7718                 best = len;
7719                 hash = dtrace_byfunc;
7720         }
7721
7722         if (pkp->dtpk_nmatch == &dtrace_match_string &&
7723             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
7724                 best = len;
7725                 hash = dtrace_byname;
7726         }
7727
7728         /*
7729          * If we did not select a hash table, iterate over every probe and
7730          * invoke our callback for each one that matches our input probe key.
7731          */
7732         if (hash == NULL) {
7733                 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
7734                         if ((probe = dtrace_probes[i]) == NULL ||
7735                             dtrace_match_probe(probe, pkp, priv, uid,
7736                             zoneid) <= 0)
7737                                 continue;
7738
7739                         nmatched++;
7740
7741                        if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
7742                                if (rc == DTRACE_MATCH_FAIL)
7743                                        return (DTRACE_MATCH_FAIL);
7744                                break;
7745                        }
7746                 }
7747
7748                 return (nmatched);
7749         }
7750
7751         /*
7752          * If we selected a hash table, iterate over each probe of the same key
7753          * name and invoke the callback for every probe that matches the other
7754          * attributes of our input probe key.
7755          */
7756         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
7757             probe = *(DTRACE_HASHNEXT(hash, probe))) {
7758
7759                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
7760                         continue;
7761
7762                 nmatched++;
7763
7764                 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
7765                     if (rc == DTRACE_MATCH_FAIL)
7766                         return (DTRACE_MATCH_FAIL);
7767                     break;
7768                 }
7769         }
7770
7771         return (nmatched);
7772 }
7773
7774 /*
7775  * Return the function pointer dtrace_probecmp() should use to compare the
7776  * specified pattern with a string.  For NULL or empty patterns, we select
7777  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
7778  * For non-empty non-glob strings, we use dtrace_match_string().
7779  */
7780 static dtrace_probekey_f *
7781 dtrace_probekey_func(const char *p)
7782 {
7783         char c;
7784
7785         if (p == NULL || *p == '\0')
7786                 return (&dtrace_match_nul);
7787
7788         while ((c = *p++) != '\0') {
7789                 if (c == '[' || c == '?' || c == '*' || c == '\\')
7790                         return (&dtrace_match_glob);
7791         }
7792
7793         return (&dtrace_match_string);
7794 }
7795
7796 static dtrace_probekey_f *
7797 dtrace_probekey_module_func(const char *p)
7798 {
7799         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7800
7801         dtrace_probekey_f *f = dtrace_probekey_func(p);
7802         if (f == &dtrace_match_string) {
7803                 dtrace_probe_t template = {
7804                         .dtpr_mod = (char *)(uintptr_t)p,
7805                 };
7806                 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
7807                         return (&dtrace_match_module);
7808                 }
7809                 return (&dtrace_match_string);
7810         }
7811         return f;
7812 }
7813
7814 /*
7815  * Build a probe comparison key for use with dtrace_match_probe() from the
7816  * given probe description.  By convention, a null key only matches anchored
7817  * probes: if each field is the empty string, reset dtpk_fmatch to
7818  * dtrace_match_nonzero().
7819  */
7820 static void
7821 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
7822 {
7823
7824         pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
7825         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
7826
7827         pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
7828         pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
7829
7830         pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
7831         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
7832
7833         pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
7834         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
7835
7836         pkp->dtpk_id = pdp->dtpd_id;
7837
7838         if (pkp->dtpk_id == DTRACE_IDNONE &&
7839             pkp->dtpk_pmatch == &dtrace_match_nul &&
7840             pkp->dtpk_mmatch == &dtrace_match_nul &&
7841             pkp->dtpk_fmatch == &dtrace_match_nul &&
7842             pkp->dtpk_nmatch == &dtrace_match_nul)
7843                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
7844 }
7845
7846 static void
7847 dtrace_probekey_release(dtrace_probekey_t *pkp)
7848 {
7849         dtrace_strunref(pkp->dtpk_prov);
7850         dtrace_strunref(pkp->dtpk_mod);
7851         dtrace_strunref(pkp->dtpk_func);
7852         dtrace_strunref(pkp->dtpk_name);
7853 }
7854
7855 static int
7856 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
7857 {
7858         if (desc == NULL)
7859                 return 1;
7860
7861         dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
7862
7863         return func((char*)data, desc->dtpd_provider, 0);
7864 }
7865
7866 /*
7867  * DTrace Provider-to-Framework API Functions
7868  *
7869  * These functions implement much of the Provider-to-Framework API, as
7870  * described in <sys/dtrace.h>.  The parts of the API not in this section are
7871  * the functions in the API for probe management (found below), and
7872  * dtrace_probe() itself (found above).
7873  */
7874
7875 /*
7876  * Register the calling provider with the DTrace framework.  This should
7877  * generally be called by DTrace providers in their attach(9E) entry point.
 *
 * name - provider name; must be non-empty and acceptable to dtrace_badname().
 * pap  - provider attributes; copied into the new provider.
 * priv - DTRACE_PRIV_* flags; any bit outside DTRACE_PRIV_ALL is rejected.
 * cr   - optional credential; when non-NULL its uid and zone id are recorded
 *        in the provider's privilege description.
 * pops - provider operations vector; copied into the new provider.
 * arg  - opaque value stored as the provider's dtpv_arg.
 * idp  - out parameter that receives the new provider's id.
 *
 * Returns 0 on success, or EINVAL (after logging a warning) if any argument
 * fails validation.
7878  */
7879 int
7880 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
7881     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
7882 {
7883         dtrace_provider_t *provider;
7884
7885         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
7886                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7887                     "arguments", name ? name : "<NULL>");
7888                 return (EINVAL);
7889         }
7890
7891         if (name[0] == '\0' || dtrace_badname(name)) {
7892                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7893                     "provider name", name);
7894                 return (EINVAL);
7895         }
7896
             /*
              * Required ops: at least one of dtps_provide and
              * dtps_provide_module, plus dtps_enable, dtps_disable and
              * dtps_destroy.  dtps_suspend and dtps_resume must be supplied
              * together or not at all.
              */
7897         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
7898             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
7899             pops->dtps_destroy == NULL ||
7900             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
7901                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7902                     "provider ops", name);
7903                 return (EINVAL);
7904         }
7905
7906         if (dtrace_badattr(&pap->dtpa_provider) ||
7907             dtrace_badattr(&pap->dtpa_mod) ||
7908             dtrace_badattr(&pap->dtpa_func) ||
7909             dtrace_badattr(&pap->dtpa_name) ||
7910             dtrace_badattr(&pap->dtpa_args)) {
7911                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7912                     "provider attributes", name);
7913                 return (EINVAL);
7914         }
7915
7916         if (priv & ~DTRACE_PRIV_ALL) {
7917                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
7918                     "privilege attributes", name);
7919                 return (EINVAL);
7920         }
7921
             /*
              * A provider that asks for the kernel privilege together with
              * the user or owner privilege must supply dtps_usermode().
              */
7922         if ((priv & DTRACE_PRIV_KERNEL) &&
7923             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
7924             pops->dtps_usermode == NULL) {
7925                 cmn_err(CE_WARN, "failed to register provider '%s': need "
7926                     "dtps_usermode() op for given privilege attributes", name);
7927                 return (EINVAL);
7928         }
7929
7930         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
7931
7932         provider->dtpv_attr = *pap;
7933         provider->dtpv_priv.dtpp_flags = priv;
7934         if (cr != NULL) {
7935                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
7936                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
7937         }
7938         provider->dtpv_pops = *pops;
7939
             /*
              * Replace the optional entry points that were left NULL with
              * dtrace_nullop in the provider's private copy of the ops.
              */
7940         if (pops->dtps_provide == NULL) {
7941                 ASSERT(pops->dtps_provide_module != NULL);
7942                 provider->dtpv_pops.dtps_provide =
7943                     (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop;
7944         }
7945
7946         if (pops->dtps_provide_module == NULL) {
7947                 ASSERT(pops->dtps_provide != NULL);
7948                 provider->dtpv_pops.dtps_provide_module =
7949                     (void (*)(void *, struct modctl *))dtrace_nullop;
7950         }
7951
7952         if (pops->dtps_suspend == NULL) {
7953                 ASSERT(pops->dtps_resume == NULL);
7954                 provider->dtpv_pops.dtps_suspend =
7955                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7956                 provider->dtpv_pops.dtps_resume =
7957                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
7958         }
7959
7960         provider->dtpv_arg = arg;
7961         *idp = (dtrace_provider_id_t)provider;
7962
             /*
              * DTrace registering itself as a provider: the caller already
              * holds dtrace_provider_lock and dtrace_lock (asserted below),
              * so neither lock is taken or dropped on this path.
              */
7963         if (pops == &dtrace_provider_ops) {
7964                 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
7965                 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
7966
7967                 provider->dtpv_name = dtrace_strref(name);
7968
7969                 ASSERT(dtrace_anon.dta_enabling == NULL);
7970
7971                 /*
7972                  * We make sure that the DTrace provider is at the head of
7973                  * the provider chain.
7974                  */
7975                 provider->dtpv_next = dtrace_provider;
7976                 dtrace_provider = provider;
7977                 return (0);
7978         }
7979
7980         lck_mtx_lock(&dtrace_provider_lock);
7981         lck_mtx_lock(&dtrace_lock);
7982
7983         provider->dtpv_name = dtrace_strref(name);
7984
7985         /*
7986          * If there is at least one provider registered, we'll add this
7987          * provider after the first provider.
7988          */
7989         if (dtrace_provider != NULL) {
7990                 provider->dtpv_next = dtrace_provider->dtpv_next;
7991                 dtrace_provider->dtpv_next = provider;
7992         } else {
7993                 dtrace_provider = provider;
7994         }
7995
             /*
              * If there are retained enablings, let the new provider create
              * its probes for them and then re-run matching restricted to
              * this provider's name.
              */
7996         if (dtrace_retained != NULL) {
7997                 dtrace_enabling_provide(provider);
7998
7999                 /*
8000                  * Now we need to call dtrace_enabling_matchall_with_cond() --
8001                  * with a condition matching the provider name we just added,
8002                  * which will acquire cpu_lock and dtrace_lock.  We therefore need
8003                  * to drop all of our locks before calling into it...
8004                  */
8005                 lck_mtx_unlock(&dtrace_lock);
8006                 lck_mtx_unlock(&dtrace_provider_lock);
8007
8008                 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8009                 dtrace_enabling_matchall_with_cond(&cond);
8010
8011                 return (0);
8012         }
8013
8014         lck_mtx_unlock(&dtrace_lock);
8015         lck_mtx_unlock(&dtrace_provider_lock);
8016
8017         return (0);
8018 }
8019
8020 /*
8021  * Unregister the specified provider from the DTrace framework.  This should
8022  * generally be called by DTrace providers in their detach(9E) entry point.
 *
 * id - the provider id that dtrace_register() stored through its 'idp'
 *      argument.
 *
 * Returns 0 once the provider and all of its probes have been removed and
 * freed.  Returns EBUSY, leaving everything in place, when:
 *   - DTrace itself is being unregistered while other providers remain;
 *   - the provider has not been invalidated and either /dev/dtrace is open
 *     or an anonymous state has ECBs; or
 *   - the provider still has at least one ECB (dtpv_ecb_count != 0).
 *
 * Panics if the provider cannot be found on the provider list.
8023  */
8024 int
8025 dtrace_unregister(dtrace_provider_id_t id)
8026 {
8027         dtrace_provider_t *old = (dtrace_provider_t *)id;
8028         dtrace_provider_t *prev = NULL;
8029         int self = 0;
8030         dtrace_probe_t *probe, *first = NULL, *next = NULL;
             /* Lookup key for the dtrace_byprov hash. */
8031         dtrace_probe_t template = {
8032                 .dtpr_provider = old
8033         };
8034
8035         if (old->dtpv_pops.dtps_enable ==
8036             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8037                 /*
8038                  * If DTrace itself is the provider, we're called with locks
8039                  * already held.
8040                  */
8041                 ASSERT(old == dtrace_provider);
8042                 ASSERT(dtrace_devi != NULL);
8043                 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8044                 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8045                 self = 1;
8046
8047                 if (dtrace_provider->dtpv_next != NULL) {
8048                         /*
8049                          * There's another provider here; return failure.
8050                          */
8051                         return (EBUSY);
8052                 }
8053         } else {
                     /*
                      * Ordinary provider: take all three locks here.  Every
                      * exit path below releases them in the reverse order.
                      */
8054                 lck_mtx_lock(&dtrace_provider_lock);
8055                 lck_mtx_lock(&mod_lock);
8056                 lck_mtx_lock(&dtrace_lock);
8057         }
8058
8059         /*
8060          * If anyone has /dev/dtrace open, or if there are anonymous enabled
8061          * probes, we refuse to let providers slither away, unless this
8062          * provider has already been explicitly invalidated.
8063          */
8064         if (!old->dtpv_defunct &&
8065             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8066             dtrace_anon.dta_state->dts_necbs > 0))) {
8067                 if (!self) {
8068                         lck_mtx_unlock(&dtrace_lock);
8069                         lck_mtx_unlock(&mod_lock);
8070                         lck_mtx_unlock(&dtrace_provider_lock);
8071                 }
8072                 return (EBUSY);
8073         }
8074
8075         /*
8076          * Refuse to unregister while any ECB still refers to this provider.
8077          */
8078         if (old->dtpv_ecb_count!=0) {
8079                 /*
8080                  * We have at least one ECB; we can't remove this provider.
8081                  */
8082                 if (!self) {
8083                         lck_mtx_unlock(&dtrace_lock);
8084                         lck_mtx_unlock(&mod_lock);
8085                         lck_mtx_unlock(&dtrace_provider_lock);
8086                 }
8087                 return (EBUSY);
8088         }
8089
8090         /*
8091          * All of the probes for this provider are disabled; we can safely
8092          * remove all of them from their hash chains and from the probe array.
              *
              * This loop walks the dtrace_byprov chain, so the probes are only
              * removed from the other three hashes here; they are collected
              * on a private list (linked through dtpr_nextmod, which is free
              * once the probe has left dtrace_bymod) and removed from
              * dtrace_byprov in the loop that follows.
8093          */
8094         for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8095             probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8096                 if (probe->dtpr_provider != old)
8097                         continue;
8098
                     /* Probe ids are 1-based; the probe array is 0-based. */
8099                 dtrace_probes[probe->dtpr_id - 1] = NULL;
8100                 old->dtpv_probe_count--;
8101
8102                 dtrace_hash_remove(dtrace_bymod, probe);
8103                 dtrace_hash_remove(dtrace_byfunc, probe);
8104                 dtrace_hash_remove(dtrace_byname, probe);
8105
8106                 if (first == NULL) {
8107                         first = probe;
8108                         probe->dtpr_nextmod = NULL;
8109                 } else {
8110                         /*
8111                          * Use nextmod as the chain of probes to remove
8112                          */
8113                         probe->dtpr_nextmod = first;
8114                         first = probe;
8115                 }
8116         }
8117
8118         for (probe = first; probe != NULL; probe = next) {
8119                 next = probe->dtpr_nextmod;
8120                 dtrace_hash_remove(dtrace_byprov, probe);
8121         }
8122
8123         /*
8124          * The provider's probes have been removed from the hash chains and
8125          * from the probe array.  Now issue a dtrace_sync() to be sure that
8126          * everyone has cleared out from any probe array processing.
8127          */
8128         dtrace_sync();
8129
             /*
              * Nothing can reach the collected probes any more: let the
              * provider destroy each one, then release its strings, its id
              * and the probe structure itself.
              */
8130         for (probe = first; probe != NULL; probe = next) {
8131                 next = probe->dtpr_nextmod;
8132
8133                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8134                     probe->dtpr_arg);
8135                 dtrace_strunref(probe->dtpr_mod);
8136                 dtrace_strunref(probe->dtpr_func);
8137                 dtrace_strunref(probe->dtpr_name);
8138                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8139                 zfree(dtrace_probe_t_zone, probe);
8140         }
8141
             /* Unlink the provider from the singly-linked provider list. */
8142         if ((prev = dtrace_provider) == old) {
8143                 ASSERT(self || dtrace_devi == NULL);
8144                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8145                 dtrace_provider = old->dtpv_next;
8146         } else {
8147                 while (prev != NULL && prev->dtpv_next != old)
8148                         prev = prev->dtpv_next;
8149
8150                 if (prev == NULL) {
8151                         panic("attempt to unregister non-existent "
8152                             "dtrace provider %p\n", (void *)id);
8153                 }
8154
8155                 prev->dtpv_next = old->dtpv_next;
8156         }
8157
8158         dtrace_strunref(old->dtpv_name);
8159
8160         if (!self) {
8161                 lck_mtx_unlock(&dtrace_lock);
8162                 lck_mtx_unlock(&mod_lock);
8163                 lck_mtx_unlock(&dtrace_provider_lock);
8164         }
8165
             /* The provider is off the list; free it outside the locks. */
8166         kmem_free(old, sizeof (dtrace_provider_t));
8167
8168         return (0);
8169 }
8170
8171 /*
8172  * Invalidate the specified provider.  All subsequent probe lookups for the
8173  * specified provider will fail, but its probes will not be removed.
8174  */
8175 void
8176 dtrace_invalidate(dtrace_provider_id_t id)
8177 {
8178         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8179
8180         ASSERT(pvp->dtpv_pops.dtps_enable !=
8181             (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8182
8183         lck_mtx_lock(&dtrace_provider_lock);
8184         lck_mtx_lock(&dtrace_lock);
8185
8186         pvp->dtpv_defunct = 1;
8187
8188         lck_mtx_unlock(&dtrace_lock);
8189         lck_mtx_unlock(&dtrace_provider_lock);
8190 }
8191
8192 /*
8193  * Indicate whether or not DTrace has attached.
8194  */
8195 int
8196 dtrace_attached(void)
8197 {
8198         /*
8199          * dtrace_provider will be non-NULL iff the DTrace driver has
8200          * attached.  (It's non-NULL because DTrace is always itself a
8201          * provider.)
8202          */
8203         return (dtrace_provider != NULL);
8204 }
8205
8206 /*
8207  * Remove all the unenabled probes for the given provider.  This function is
8208  * not unlike dtrace_unregister(), except that it doesn't remove the provider
8209  * -- just as many of its associated probes as it can.
 *
 * id - id of the provider whose probes should be condensed; must not be the
 *      DTrace provider itself (asserted).
 *
 * Probes that still have an ECB attached are left alone.  Always returns 0.
8210  */
8211 int
8212 dtrace_condense(dtrace_provider_id_t id)
8213 {
8214         dtrace_provider_t *prov = (dtrace_provider_t *)id;
8215         dtrace_probe_t *probe, *first = NULL;
             /* Lookup key for the dtrace_byprov hash. */
8216         dtrace_probe_t template = {
8217                 .dtpr_provider = prov
8218         };
8219
8220         /*
8221          * Make sure this isn't the dtrace provider itself.
8222          */
8223         ASSERT(prov->dtpv_pops.dtps_enable !=
8224           (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8225
8226         lck_mtx_lock(&dtrace_provider_lock);
8227         lck_mtx_lock(&dtrace_lock);
8228
8229         /*
8230          * Attempt to destroy the probes associated with this provider.
              *
              * This loop walks the dtrace_byprov chain, so removal from that
              * particular hash is deferred: each destroyed probe is pushed
              * onto a private list (linked through dtpr_nextmod, which is
              * free once the probe has left dtrace_bymod) and finished off
              * in the loop that follows.
8231          */
8232         for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8233             probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8234
8235                 if (probe->dtpr_provider != prov)
8236                         continue;
8237
                     /* Still enabled: this probe has to stay. */
8238                 if (probe->dtpr_ecb != NULL)
8239                         continue;
8240
                     /* Probe ids are 1-based; the probe array is 0-based. */
8241                 dtrace_probes[probe->dtpr_id - 1] = NULL;
8242                 prov->dtpv_probe_count--;
8243
8244                 dtrace_hash_remove(dtrace_bymod, probe);
8245                 dtrace_hash_remove(dtrace_byfunc, probe);
8246                 dtrace_hash_remove(dtrace_byname, probe);
8247
8248                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8249                     probe->dtpr_arg);
8250                 dtrace_strunref(probe->dtpr_mod);
8251                 dtrace_strunref(probe->dtpr_func);
8252                 dtrace_strunref(probe->dtpr_name);
8253                 if (first == NULL) {
8254                         first = probe;
8255                         probe->dtpr_nextmod = NULL;
8256                 } else {
8257                         /*
8258                          * Use nextmod as the chain of probes to remove
8259                          */
8260                         probe->dtpr_nextmod = first;
8261                         first = probe;
8262                 }
8263         }
8264
             /*
              * Second pass: take each collected probe out of dtrace_byprov,
              * give its id back to dtrace_arena and free the structure.
              */
8265         for (probe = first; probe != NULL; probe = first) {
8266                 first = probe->dtpr_nextmod;
8267                 dtrace_hash_remove(dtrace_byprov, probe);
8268                 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
8269                 zfree(dtrace_probe_t_zone, probe);
8270         }
8271
8272         lck_mtx_unlock(&dtrace_lock);
8273         lck_mtx_unlock(&dtrace_provider_lock);
8274
8275         return (0);
8276 }
8277
8278 /*
8279  * DTrace Probe Management Functions
8280  *
8281  * The functions in this section perform the DTrace probe management,
8282  * including functions to create probes, look-up probes, and call into the
8283  * providers to request that probes be provided.  Some of these functions are
8284  * in the Provider-to-Framework API; these functions can be identified by the
8285  * fact that they are not declared "static".
8286  */
8287
8288 /*
8289  * Create a probe with the specified module name, function name, and name.
8290  */
8291 dtrace_id_t
8292 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8293     const char *func, const char *name, int aframes, void *arg)
8294 {
8295         dtrace_probe_t *probe, **probes;
8296         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8297         dtrace_id_t id;
8298
8299         if (provider == dtrace_provider) {
8300                 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8301         } else {
8302                 lck_mtx_lock(&dtrace_lock);
8303         }
8304
8305         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8306             VM_BESTFIT | VM_SLEEP);
8307
8308         probe = zalloc(dtrace_probe_t_zone);
8309         bzero(probe, sizeof (dtrace_probe_t));
8310
8311         probe->dtpr_id = id;
8312         probe->dtpr_gen = dtrace_probegen++;
8313         probe->dtpr_mod = dtrace_strref(mod);
8314         probe->dtpr_func = dtrace_strref(func);
8315         probe->dtpr_name = dtrace_strref(name);
8316         probe->dtpr_arg = arg;
8317         probe->dtpr_aframes = aframes;
8318         probe->dtpr_provider = provider;
8319
8320         dtrace_hash_add(dtrace_byprov, probe);
8321         dtrace_hash_add(dtrace_bymod, probe);
8322         dtrace_hash_add(dtrace_byfunc, probe);
8323         dtrace_hash_add(dtrace_byname, probe);
8324
8325         if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
8326                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8327                 size_t nsize = osize << 1;
8328
8329                 if (nsize == 0) {
8330                         ASSERT(osize == 0);
8331                         ASSERT(dtrace_probes == NULL);
8332                         nsize = sizeof (dtrace_probe_t *);
8333                 }
8334
8335                 probes = kmem_zalloc(nsize, KM_SLEEP);
8336
8337                 if (dtrace_probes == NULL) {
8338                         ASSERT(osize == 0);
8339                         dtrace_probes = probes;
8340                         dtrace_nprobes = 1;
8341                 } else {
8342                         dtrace_probe_t **oprobes = dtrace_probes;
8343
8344                         bcopy(oprobes, probes, osize);
8345                         dtrace_membar_producer();
8346                         dtrace_probes = probes;
8347
8348                         dtrace_sync();
8349
8350                         /*
8351                          * All CPUs are now seeing the new probes array; we can
8352                          * safely free the old array.
8353                          */
8354                         kmem_free(oprobes, osize);
8355                         dtrace_nprobes <<= 1;
8356                 }
8357
8358                 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
8359         }
8360
8361         ASSERT(dtrace_probes[id - 1] == NULL);
8362         dtrace_probes[id - 1] = probe;
8363         provider->dtpv_probe_count++;
8364
8365         if (provider != dtrace_provider)
8366                 lck_mtx_unlock(&dtrace_lock);
8367
8368         return (id);
8369 }
8370
8371 static dtrace_probe_t *
8372 dtrace_probe_lookup_id(dtrace_id_t id)
8373 {
8374         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8375
8376         if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
8377                 return (NULL);
8378
8379         return (dtrace_probes[id - 1]);
8380 }
8381
8382 static int
8383 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
8384 {
8385 #pragma unused(arg2)
8386         *((dtrace_id_t *)arg1) = probe->dtpr_id;
8387
8388         return (DTRACE_MATCH_DONE);
8389 }
8390
8391 /*
8392  * Look up a probe based on provider and one or more of module name, function
8393  * name and probe name.
8394  */
8395 dtrace_id_t
8396 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
8397     const char *func, const char *name)
8398 {
8399         dtrace_probekey_t pkey;
8400         dtrace_id_t id;
8401         int match;
8402
8403         lck_mtx_lock(&dtrace_lock);
8404
8405         pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
8406         pkey.dtpk_pmatch = &dtrace_match_string;
8407         pkey.dtpk_mod = dtrace_strref(mod);
8408         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
8409         pkey.dtpk_func = dtrace_strref(func);
8410         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
8411         pkey.dtpk_name = dtrace_strref(name);
8412         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
8413         pkey.dtpk_id = DTRACE_IDNONE;
8414
8415         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
8416             dtrace_probe_lookup_match, &id, NULL);
8417
8418         dtrace_probekey_release(&pkey);
8419
8420         lck_mtx_unlock(&dtrace_lock);
8421
8422         ASSERT(match == 1 || match == 0);
8423         return (match ? id : 0);
8424 }
8425
8426 /*
8427  * Returns the probe argument associated with the specified probe.
8428  */
8429 void *
8430 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
8431 {
8432         dtrace_probe_t *probe;
8433         void *rval = NULL;
8434
8435         lck_mtx_lock(&dtrace_lock);
8436
8437         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
8438             probe->dtpr_provider == (dtrace_provider_t *)id)
8439                 rval = probe->dtpr_arg;
8440
8441         lck_mtx_unlock(&dtrace_lock);
8442
8443         return (rval);
8444 }
8445
8446 /*
8447  * Copy a probe into a probe description.
8448  */
8449 static void
8450 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
8451 {
8452         bzero(pdp, sizeof (dtrace_probedesc_t));
8453         pdp->dtpd_id = prp->dtpr_id;
8454
8455         /* APPLE NOTE: Darwin employs size bounded string operation. */
8456         (void) strlcpy(pdp->dtpd_provider,
8457             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
8458
8459         (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
8460         (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
8461         (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
8462 }
8463
8464 /*
8465  * Called to indicate that a probe -- or probes -- should be provided by a
8466  * specfied provider.  If the specified description is NULL, the provider will
8467  * be told to provide all of its probes.  (This is done whenever a new
8468  * consumer comes along, or whenever a retained enabling is to be matched.) If
8469  * the specified description is non-NULL, the provider is given the
8470  * opportunity to dynamically provide the specified probe, allowing providers
8471  * to support the creation of probes on-the-fly.  (So-called _autocreated_
8472  * probes.)  If the provider is NULL, the operations will be applied to all
8473  * providers; if the provider is non-NULL the operations will only be applied
8474  * to the specified provider.  The dtrace_provider_lock must be held, and the
8475  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
8476  * will need to grab the dtrace_lock when it reenters the framework through
8477  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
8478  */
8479 static void
8480 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
8481 {
8482         struct modctl *ctl;
8483         int all = 0;
8484
8485         LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8486
8487         if (prv == NULL) {
8488                 all = 1;
8489                 prv = dtrace_provider;
8490         }
8491
8492         do {
8493                 /*
8494                  * First, call the blanket provide operation.
8495                  */
8496                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
8497
8498                 /*
8499                  * Now call the per-module provide operation.  We will grab
8500                  * mod_lock to prevent the list from being modified.  Note
8501                  * that this also prevents the mod_busy bits from changing.
8502                  * (mod_busy can only be changed with mod_lock held.)
8503                  */
8504                 lck_mtx_lock(&mod_lock);
8505
8506                 ctl = dtrace_modctl_list;
8507                 while (ctl) {
8508                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
8509                         ctl = ctl->mod_next;
8510                 }
8511
8512                 lck_mtx_unlock(&mod_lock);
8513         } while (all && (prv = prv->dtpv_next) != NULL);
8514 }
8515
8516 /*
8517  * Iterate over each probe, and call the Framework-to-Provider API function
8518  * denoted by offs.
8519  */
8520 static void
8521 dtrace_probe_foreach(uintptr_t offs)
8522 {
8523         dtrace_provider_t *prov;
8524         void (*func)(void *, dtrace_id_t, void *);
8525         dtrace_probe_t *probe;
8526         dtrace_icookie_t cookie;
8527         int i;
8528
8529         /*
8530          * We disable interrupts to walk through the probe array.  This is
8531          * safe -- the dtrace_sync() in dtrace_unregister() assures that we
8532          * won't see stale data.
8533          */
8534         cookie = dtrace_interrupt_disable();
8535
8536         for (i = 0; i < dtrace_nprobes; i++) {
8537                 if ((probe = dtrace_probes[i]) == NULL)
8538                         continue;
8539
8540                 if (probe->dtpr_ecb == NULL) {
8541                         /*
8542                          * This probe isn't enabled -- don't call the function.
8543                          */
8544                         continue;
8545                 }
8546
8547                 prov = probe->dtpr_provider;
8548                 func = *((void(**)(void *, dtrace_id_t, void *))
8549                     ((uintptr_t)&prov->dtpv_pops + offs));
8550
8551                 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
8552         }
8553
8554         dtrace_interrupt_enable(cookie);
8555 }
8556
8557 static int
8558 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
8559 {
8560         dtrace_probekey_t pkey;
8561         uint32_t priv;
8562         uid_t uid;
8563         zoneid_t zoneid;
8564         int err;
8565
8566         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8567
8568         dtrace_ecb_create_cache = NULL;
8569
8570         if (desc == NULL) {
8571                 /*
8572                  * If we're passed a NULL description, we're being asked to
8573                  * create an ECB with a NULL probe.
8574                  */
8575                 (void) dtrace_ecb_create_enable(NULL, enab, ep);
8576                 return (0);
8577         }
8578
8579         dtrace_probekey(desc, &pkey);
8580         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
8581             &priv, &uid, &zoneid);
8582
8583         err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
8584
8585         dtrace_probekey_release(&pkey);
8586
8587         return err;
8588 }
8589
8590 /*
8591  * DTrace Helper Provider Functions
8592  */
8593 static void
8594 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
8595 {
8596         attr->dtat_name = DOF_ATTR_NAME(dofattr);
8597         attr->dtat_data = DOF_ATTR_DATA(dofattr);
8598         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
8599 }
8600
8601 static void
8602 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
8603     const dof_provider_t *dofprov, char *strtab)
8604 {
8605         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
8606         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
8607             dofprov->dofpv_provattr);
8608         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
8609             dofprov->dofpv_modattr);
8610         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
8611             dofprov->dofpv_funcattr);
8612         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
8613             dofprov->dofpv_nameattr);
8614         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
8615             dofprov->dofpv_argsattr);
8616 }
8617
8618 static void
8619 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
8620 {
8621         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8622         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8623         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
8624         dof_provider_t *provider;
8625         dof_probe_t *probe;
8626         uint32_t *off, *enoff;
8627         uint8_t *arg;
8628         char *strtab;
8629         uint_t i, nprobes;
8630         dtrace_helper_provdesc_t dhpv;
8631         dtrace_helper_probedesc_t dhpb;
8632         dtrace_meta_t *meta = dtrace_meta_pid;
8633         dtrace_mops_t *mops = &meta->dtm_mops;
8634         void *parg;
8635
8636         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8637         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8638             provider->dofpv_strtab * dof->dofh_secsize);
8639         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8640             provider->dofpv_probes * dof->dofh_secsize);
8641         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8642             provider->dofpv_prargs * dof->dofh_secsize);
8643         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8644             provider->dofpv_proffs * dof->dofh_secsize);
8645
8646         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8647         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
8648         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
8649         enoff = NULL;
8650
8651         /*
8652          * See dtrace_helper_provider_validate().
8653          */
8654         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
8655             provider->dofpv_prenoffs != DOF_SECT_NONE) {
8656                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8657                     provider->dofpv_prenoffs * dof->dofh_secsize);
8658                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
8659         }
8660
8661         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
8662
8663         /*
8664          * Create the provider.
8665          */
8666         dtrace_dofprov2hprov(&dhpv, provider, strtab);
8667
8668         if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
8669                 return;
8670
8671         meta->dtm_count++;
8672
8673         /*
8674          * Create the probes.
8675          */
8676         for (i = 0; i < nprobes; i++) {
8677                 probe = (dof_probe_t *)(uintptr_t)(daddr +
8678                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
8679
8680                 dhpb.dthpb_mod = dhp->dofhp_mod;
8681                 dhpb.dthpb_func = strtab + probe->dofpr_func;
8682                 dhpb.dthpb_name = strtab + probe->dofpr_name;
8683 #if !defined(__APPLE__)
8684                 dhpb.dthpb_base = probe->dofpr_addr;
8685 #else
8686                 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
8687 #endif
8688                 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
8689                 dhpb.dthpb_noffs = probe->dofpr_noffs;
8690                 if (enoff != NULL) {
8691                         dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
8692                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
8693                 } else {
8694                         dhpb.dthpb_enoffs = NULL;
8695                         dhpb.dthpb_nenoffs = 0;
8696                 }
8697                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
8698                 dhpb.dthpb_nargc = probe->dofpr_nargc;
8699                 dhpb.dthpb_xargc = probe->dofpr_xargc;
8700                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
8701                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
8702
8703                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
8704         }
8705
8706         /*
8707          * Since we just created probes, we need to match our enablings
8708          * against those, with a precondition knowing that we have only
8709          * added probes from this provider
8710          */
8711         char *prov_name = mops->dtms_provider_name(parg);
8712         ASSERT(prov_name != NULL);
8713         dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
8714
8715         dtrace_enabling_matchall_with_cond(&cond);
8716 }
8717
8718 static void
8719 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
8720 {
8721         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8722         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8723         uint32_t i;
8724
8725         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8726
8727         for (i = 0; i < dof->dofh_secnum; i++) {
8728                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8729                     dof->dofh_secoff + i * dof->dofh_secsize);
8730
8731                 if (sec->dofs_type != DOF_SECT_PROVIDER)
8732                         continue;
8733
8734                 dtrace_helper_provide_one(dhp, sec, p);
8735         }
8736 }
8737
8738 static void
8739 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
8740 {
8741         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8742         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8743         dof_sec_t *str_sec;
8744         dof_provider_t *provider;
8745         char *strtab;
8746         dtrace_helper_provdesc_t dhpv;
8747         dtrace_meta_t *meta = dtrace_meta_pid;
8748         dtrace_mops_t *mops = &meta->dtm_mops;
8749
8750         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
8751         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
8752             provider->dofpv_strtab * dof->dofh_secsize);
8753
8754         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
8755
8756         /*
8757          * Create the provider.
8758          */
8759         dtrace_dofprov2hprov(&dhpv, provider, strtab);
8760
8761         mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
8762
8763         meta->dtm_count--;
8764 }
8765
8766 static void
8767 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
8768 {
8769         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
8770         dof_hdr_t *dof = (dof_hdr_t *)daddr;
8771         uint32_t i;
8772
8773         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
8774
8775         for (i = 0; i < dof->dofh_secnum; i++) {
8776                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
8777                     dof->dofh_secoff + i * dof->dofh_secsize);
8778
8779                 if (sec->dofs_type != DOF_SECT_PROVIDER)
8780                         continue;
8781
8782                 dtrace_helper_provider_remove_one(dhp, sec, p);
8783         }
8784 }
8785
8786 /*
8787  * DTrace Meta Provider-to-Framework API Functions
8788  *
8789  * These functions implement the Meta Provider-to-Framework API, as described
8790  * in <sys/dtrace.h>.
8791  */
8792 int
8793 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
8794     dtrace_meta_provider_id_t *idp)
8795 {
8796         dtrace_meta_t *meta;
8797         dtrace_helpers_t *help, *next;
8798         uint_t i;
8799
8800         *idp = DTRACE_METAPROVNONE;
8801
8802         /*
8803          * We strictly don't need the name, but we hold onto it for
8804          * debuggability. All hail error queues!
8805          */
8806         if (name == NULL) {
8807                 cmn_err(CE_WARN, "failed to register meta-provider: "
8808                     "invalid name");
8809                 return (EINVAL);
8810         }
8811
8812         if (mops == NULL ||
8813             mops->dtms_create_probe == NULL ||
8814             mops->dtms_provide_proc == NULL ||
8815             mops->dtms_remove_proc == NULL) {
8816                 cmn_err(CE_WARN, "failed to register meta-register %s: "
8817                     "invalid ops", name);
8818                 return (EINVAL);
8819         }
8820
8821         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
8822         meta->dtm_mops = *mops;
8823         meta->dtm_arg = arg;
8824
8825         lck_mtx_lock(&dtrace_meta_lock);
8826         lck_mtx_lock(&dtrace_lock);
8827
8828         if (dtrace_meta_pid != NULL) {
8829                 lck_mtx_unlock(&dtrace_lock);
8830                 lck_mtx_unlock(&dtrace_meta_lock);
8831                 cmn_err(CE_WARN, "failed to register meta-register %s: "
8832                     "user-land meta-provider exists", name);
8833                 kmem_free(meta, sizeof (dtrace_meta_t));
8834                 return (EINVAL);
8835         }
8836
8837         meta->dtm_name = dtrace_strref(name);
8838
8839         dtrace_meta_pid = meta;
8840         *idp = (dtrace_meta_provider_id_t)meta;
8841
8842         /*
8843          * If there are providers and probes ready to go, pass them
8844          * off to the new meta provider now.
8845          */
8846
8847         help = dtrace_deferred_pid;
8848         dtrace_deferred_pid = NULL;
8849
8850         lck_mtx_unlock(&dtrace_lock);
8851
8852         while (help != NULL) {
8853                 for (i = 0; i < help->dthps_nprovs; i++) {
8854                         proc_t *p = proc_find(help->dthps_pid);
8855                         if (p == PROC_NULL)
8856                                 continue;
8857                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
8858                             p);
8859                         proc_rele(p);
8860                 }
8861
8862                 next = help->dthps_next;
8863                 help->dthps_next = NULL;
8864                 help->dthps_prev = NULL;
8865                 help->dthps_deferred = 0;
8866                 help = next;
8867         }
8868
8869         lck_mtx_unlock(&dtrace_meta_lock);
8870
8871         return (0);
8872 }
8873
8874 int
8875 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
8876 {
8877         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
8878
8879         lck_mtx_lock(&dtrace_meta_lock);
8880         lck_mtx_lock(&dtrace_lock);
8881
8882         if (old == dtrace_meta_pid) {
8883                 pp = &dtrace_meta_pid;
8884         } else {
8885                 panic("attempt to unregister non-existent "
8886                     "dtrace meta-provider %p\n", (void *)old);
8887         }
8888
8889         if (old->dtm_count != 0) {
8890                 lck_mtx_unlock(&dtrace_lock);
8891                 lck_mtx_unlock(&dtrace_meta_lock);
8892                 return (EBUSY);
8893         }
8894
8895         *pp = NULL;
8896
8897         dtrace_strunref(old->dtm_name);
8898
8899         lck_mtx_unlock(&dtrace_lock);
8900         lck_mtx_unlock(&dtrace_meta_lock);
8901
8902         kmem_free(old, sizeof (dtrace_meta_t));
8903
8904         return (0);
8905 }
8906
8907
8908 /*
8909  * DTrace DIF Object Functions
8910  */
8911 static int
8912 dtrace_difo_err(uint_t pc, const char *format, ...)
8913 {
8914         if (dtrace_err_verbose) {
8915                 va_list alist;
8916
8917                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
8918                 va_start(alist, format);
8919                 (void) vuprintf(format, alist);
8920                 va_end(alist);
8921         }
8922
8923 #ifdef DTRACE_ERRDEBUG
8924         dtrace_errdebug(format);
8925 #endif
8926         return (1);
8927 }
8928
8929 /*
8930  * Validate a DTrace DIF object by checking the IR instructions.  The following
8931  * rules are currently enforced by dtrace_difo_validate():
8932  *
8933  * 1. Each instruction must have a valid opcode
8934  * 2. Each register, string, variable, or subroutine reference must be valid
8935  * 3. No instruction can modify register %r0 (must be zero)
8936  * 4. All instruction reserved bits must be set to zero
8937  * 5. The last instruction must be a "ret" instruction
8938  * 6. All branch targets must reference a valid instruction _after_ the branch
8939  */
8940 static int
8941 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
8942     cred_t *cr)
8943 {
8944         int err = 0;
8945         uint_t i;
8946
8947         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
8948         int kcheckload;
8949         uint_t pc;
8950         int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
8951
8952         kcheckload = cr == NULL ||
8953             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
8954
8955         dp->dtdo_destructive = 0;
8956
8957         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
8958                 dif_instr_t instr = dp->dtdo_buf[pc];
8959
8960                 uint_t r1 = DIF_INSTR_R1(instr);
8961                 uint_t r2 = DIF_INSTR_R2(instr);
8962                 uint_t rd = DIF_INSTR_RD(instr);
8963                 uint_t rs = DIF_INSTR_RS(instr);
8964                 uint_t label = DIF_INSTR_LABEL(instr);
8965                 uint_t v = DIF_INSTR_VAR(instr);
8966                 uint_t subr = DIF_INSTR_SUBR(instr);
8967                 uint_t type = DIF_INSTR_TYPE(instr);
8968                 uint_t op = DIF_INSTR_OP(instr);
8969
8970                 switch (op) {
8971                 case DIF_OP_OR:
8972                 case DIF_OP_XOR:
8973                 case DIF_OP_AND:
8974                 case DIF_OP_SLL:
8975                 case DIF_OP_SRL:
8976                 case DIF_OP_SRA:
8977                 case DIF_OP_SUB:
8978                 case DIF_OP_ADD:
8979                 case DIF_OP_MUL:
8980                 case DIF_OP_SDIV:
8981                 case DIF_OP_UDIV:
8982                 case DIF_OP_SREM:
8983                 case DIF_OP_UREM:
8984                 case DIF_OP_COPYS:
8985                         if (r1 >= nregs)
8986                                 err += efunc(pc, "invalid register %u\n", r1);
8987                         if (r2 >= nregs)
8988                                 err += efunc(pc, "invalid register %u\n", r2);
8989                         if (rd >= nregs)
8990                                 err += efunc(pc, "invalid register %u\n", rd);
8991                         if (rd == 0)
8992                                 err += efunc(pc, "cannot write to %r0\n");
8993                         break;
8994                 case DIF_OP_NOT:
8995                 case DIF_OP_MOV:
8996                 case DIF_OP_ALLOCS:
8997                         if (r1 >= nregs)
8998                                 err += efunc(pc, "invalid register %u\n", r1);
8999                         if (r2 != 0)
9000                                 err += efunc(pc, "non-zero reserved bits\n");
9001                         if (rd >= nregs)
9002                                 err += efunc(pc, "invalid register %u\n", rd);
9003                         if (rd == 0)
9004                                 err += efunc(pc, "cannot write to %r0\n");
9005                         break;
9006                 case DIF_OP_LDSB:
9007                 case DIF_OP_LDSH:
9008                 case DIF_OP_LDSW:
9009                 case DIF_OP_LDUB:
9010                 case DIF_OP_LDUH:
9011                 case DIF_OP_LDUW:
9012                 case DIF_OP_LDX:
9013                         if (r1 >= nregs)
9014                                 err += efunc(pc, "invalid register %u\n", r1);
9015                         if (r2 != 0)
9016                                 err += efunc(pc, "non-zero reserved bits\n");
9017                         if (rd >= nregs)
9018                                 err += efunc(pc, "invalid register %u\n", rd);
9019                         if (rd == 0)
9020                                 err += efunc(pc, "cannot write to %r0\n");
9021                         if (kcheckload)
9022                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9023                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9024                         break;
9025                 case DIF_OP_RLDSB:
9026                 case DIF_OP_RLDSH:
9027                 case DIF_OP_RLDSW:
9028                 case DIF_OP_RLDUB:
9029                 case DIF_OP_RLDUH:
9030                 case DIF_OP_RLDUW:
9031                 case DIF_OP_RLDX:
9032                         if (r1 >= nregs)
9033                                 err += efunc(pc, "invalid register %u\n", r1);
9034                         if (r2 != 0)
9035                                 err += efunc(pc, "non-zero reserved bits\n");
9036                         if (rd >= nregs)
9037                                 err += efunc(pc, "invalid register %u\n", rd);
9038                         if (rd == 0)
9039                                 err += efunc(pc, "cannot write to %r0\n");
9040                         break;
9041                 case DIF_OP_ULDSB:
9042                 case DIF_OP_ULDSH:
9043                 case DIF_OP_ULDSW:
9044                 case DIF_OP_ULDUB:
9045                 case DIF_OP_ULDUH:
9046                 case DIF_OP_ULDUW:
9047                 case DIF_OP_ULDX:
9048                         if (r1 >= nregs)
9049                                 err += efunc(pc, "invalid register %u\n", r1);
9050                         if (r2 != 0)
9051                                 err += efunc(pc, "non-zero reserved bits\n");
9052                         if (rd >= nregs)
9053                                 err += efunc(pc, "invalid register %u\n", rd);
9054                         if (rd == 0)
9055                                 err += efunc(pc, "cannot write to %r0\n");
9056                         break;
9057                 case DIF_OP_STB:
9058                 case DIF_OP_STH:
9059                 case DIF_OP_STW:
9060                 case DIF_OP_STX:
9061                         if (r1 >= nregs)
9062                                 err += efunc(pc, "invalid register %u\n", r1);
9063                         if (r2 != 0)
9064                                 err += efunc(pc, "non-zero reserved bits\n");
9065                         if (rd >= nregs)
9066                                 err += efunc(pc, "invalid register %u\n", rd);
9067                         if (rd == 0)
9068                                 err += efunc(pc, "cannot write to 0 address\n");
9069                         break;
9070                 case DIF_OP_CMP:
9071                 case DIF_OP_SCMP:
9072                         if (r1 >= nregs)
9073                                 err += efunc(pc, "invalid register %u\n", r1);
9074                         if (r2 >= nregs)
9075                                 err += efunc(pc, "invalid register %u\n", r2);
9076                         if (rd != 0)
9077                                 err += efunc(pc, "non-zero reserved bits\n");
9078                         break;
9079                 case DIF_OP_TST:
9080                         if (r1 >= nregs)
9081                                 err += efunc(pc, "invalid register %u\n", r1);
9082                         if (r2 != 0 || rd != 0)
9083                                 err += efunc(pc, "non-zero reserved bits\n");
9084                         break;
9085                 case DIF_OP_BA:
9086                 case DIF_OP_BE:
9087                 case DIF_OP_BNE:
9088                 case DIF_OP_BG:
9089                 case DIF_OP_BGU:
9090                 case DIF_OP_BGE:
9091                 case DIF_OP_BGEU:
9092                 case DIF_OP_BL:
9093                 case DIF_OP_BLU:
9094                 case DIF_OP_BLE:
9095                 case DIF_OP_BLEU:
9096                         if (label >= dp->dtdo_len) {
9097                                 err += efunc(pc, "invalid branch target %u\n",
9098                                     label);
9099                         }
9100                         if (label <= pc) {
9101                                 err += efunc(pc, "backward branch to %u\n",
9102                                     label);
9103                         }
9104                         break;
9105                 case DIF_OP_RET:
9106                         if (r1 != 0 || r2 != 0)
9107                                 err += efunc(pc, "non-zero reserved bits\n");
9108                         if (rd >= nregs)
9109                                 err += efunc(pc, "invalid register %u\n", rd);
9110                         break;
9111                 case DIF_OP_NOP:
9112                 case DIF_OP_POPTS:
9113                 case DIF_OP_FLUSHTS:
9114                         if (r1 != 0 || r2 != 0 || rd != 0)
9115                                 err += efunc(pc, "non-zero reserved bits\n");
9116                         break;
9117                 case DIF_OP_SETX:
9118                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9119                                 err += efunc(pc, "invalid integer ref %u\n",
9120                                     DIF_INSTR_INTEGER(instr));
9121                         }
9122                         if (rd >= nregs)
9123                                 err += efunc(pc, "invalid register %u\n", rd);
9124                         if (rd == 0)
9125                                 err += efunc(pc, "cannot write to %r0\n");
9126                         break;
9127                 case DIF_OP_SETS:
9128                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9129                                 err += efunc(pc, "invalid string ref %u\n",
9130                                     DIF_INSTR_STRING(instr));
9131                         }
9132                         if (rd >= nregs)
9133                                 err += efunc(pc, "invalid register %u\n", rd);
9134                         if (rd == 0)
9135                                 err += efunc(pc, "cannot write to %r0\n");
9136                         break;
9137                 case DIF_OP_LDGA:
9138                 case DIF_OP_LDTA:
9139                         if (r1 > DIF_VAR_ARRAY_MAX)
9140                                 err += efunc(pc, "invalid array %u\n", r1);
9141                         if (r2 >= nregs)
9142                                 err += efunc(pc, "invalid register %u\n", r2);
9143                         if (rd >= nregs)
9144                                 err += efunc(pc, "invalid register %u\n", rd);
9145                         if (rd == 0)
9146                                 err += efunc(pc, "cannot write to %r0\n");
9147                         break;
9148                 case DIF_OP_LDGS:
9149                 case DIF_OP_LDTS:
9150                 case DIF_OP_LDLS:
9151                 case DIF_OP_LDGAA:
9152                 case DIF_OP_LDTAA:
9153                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9154                                 err += efunc(pc, "invalid variable %u\n", v);
9155                         if (rd >= nregs)
9156                                 err += efunc(pc, "invalid register %u\n", rd);
9157                         if (rd == 0)
9158                                 err += efunc(pc, "cannot write to %r0\n");
9159                         break;
9160                 case DIF_OP_STGS:
9161                 case DIF_OP_STTS:
9162                 case DIF_OP_STLS:
9163                 case DIF_OP_STGAA:
9164                 case DIF_OP_STTAA:
9165                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9166                                 err += efunc(pc, "invalid variable %u\n", v);
9167                         if (rs >= nregs)
9168                                 err += efunc(pc, "invalid register %u\n", rd);
9169                         break;
9170                 case DIF_OP_CALL:
9171                         if (subr > DIF_SUBR_MAX &&
9172                            !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9173                                 err += efunc(pc, "invalid subr %u\n", subr);
9174                         if (rd >= nregs)
9175                                 err += efunc(pc, "invalid register %u\n", rd);
9176                         if (rd == 0)
9177                                 err += efunc(pc, "cannot write to %r0\n");
9178
9179                         if (subr == DIF_SUBR_COPYOUT ||
9180                             subr == DIF_SUBR_COPYOUTSTR ||
9181                             subr == DIF_SUBR_KDEBUG_TRACE ||
9182                             subr == DIF_SUBR_KDEBUG_TRACE_STRING) {
9183                                 dp->dtdo_destructive = 1;
9184                         }
9185                         break;
9186                 case DIF_OP_PUSHTR:
9187                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9188                                 err += efunc(pc, "invalid ref type %u\n", type);
9189                         if (r2 >= nregs)
9190                                 err += efunc(pc, "invalid register %u\n", r2);
9191                         if (rs >= nregs)
9192                                 err += efunc(pc, "invalid register %u\n", rs);
9193                         break;
9194                 case DIF_OP_PUSHTV:
9195                         if (type != DIF_TYPE_CTF)
9196                                 err += efunc(pc, "invalid val type %u\n", type);
9197                         if (r2 >= nregs)
9198                                 err += efunc(pc, "invalid register %u\n", r2);
9199                         if (rs >= nregs)
9200                                 err += efunc(pc, "invalid register %u\n", rs);
9201                         break;
9202                 default:
9203                         err += efunc(pc, "invalid opcode %u\n",
9204                             DIF_INSTR_OP(instr));
9205                 }
9206         }
9207
9208         if (dp->dtdo_len != 0 &&
9209             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9210                 err += efunc(dp->dtdo_len - 1,
9211                     "expected 'ret' as last DIF instruction\n");
9212         }
9213
9214         if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9215                 /*
9216                  * If we're not returning by reference, the size must be either
9217                  * 0 or the size of one of the base types.
9218                  */
9219                 switch (dp->dtdo_rtype.dtdt_size) {
9220                 case 0:
9221                 case sizeof (uint8_t):
9222                 case sizeof (uint16_t):
9223                 case sizeof (uint32_t):
9224                 case sizeof (uint64_t):
9225                         break;
9226
9227                 default:
9228                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
9229                 }
9230         }
9231
9232         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9233                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9234                 dtrace_diftype_t *vt, *et;
9235                 uint_t id;
9236                 int ndx;
9237
9238                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9239                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
9240                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9241                         err += efunc(i, "unrecognized variable scope %d\n",
9242                             v->dtdv_scope);
9243                         break;
9244                 }
9245
9246                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9247                     v->dtdv_kind != DIFV_KIND_SCALAR) {
9248                         err += efunc(i, "unrecognized variable type %d\n",
9249                             v->dtdv_kind);
9250                         break;
9251                 }
9252
9253                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9254                         err += efunc(i, "%d exceeds variable id limit\n", id);
9255                         break;
9256                 }
9257
9258                 if (id < DIF_VAR_OTHER_UBASE)
9259                         continue;
9260
9261                 /*
9262                  * For user-defined variables, we need to check that this
9263                  * definition is identical to any previous definition that we
9264                  * encountered.
9265                  */
9266                 ndx = id - DIF_VAR_OTHER_UBASE;
9267
9268                 switch (v->dtdv_scope) {
9269                 case DIFV_SCOPE_GLOBAL:
9270                         if (maxglobal == -1 || ndx > maxglobal)
9271                                 maxglobal = ndx;
9272
9273                         if (ndx < vstate->dtvs_nglobals) {
9274                                 dtrace_statvar_t *svar;
9275
9276                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9277                                         existing = &svar->dtsv_var;
9278                         }
9279
9280                         break;
9281
9282                 case DIFV_SCOPE_THREAD:
9283                         if (maxtlocal == -1 || ndx > maxtlocal)
9284                                 maxtlocal = ndx;
9285
9286                         if (ndx < vstate->dtvs_ntlocals)
9287                                 existing = &vstate->dtvs_tlocals[ndx];
9288                         break;
9289
9290                 case DIFV_SCOPE_LOCAL:
9291                         if (maxlocal == -1 || ndx > maxlocal)
9292                                 maxlocal = ndx;
9293                         if (ndx < vstate->dtvs_nlocals) {
9294                                 dtrace_statvar_t *svar;
9295
9296                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9297                                         existing = &svar->dtsv_var;
9298                         }
9299
9300                         break;
9301                 }
9302
9303                 vt = &v->dtdv_type;
9304
9305                 if (vt->dtdt_flags & DIF_TF_BYREF) {
9306                         if (vt->dtdt_size == 0) {
9307                                 err += efunc(i, "zero-sized variable\n");
9308                                 break;
9309                         }
9310
9311                         if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
9312                             v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
9313                             vt->dtdt_size > dtrace_statvar_maxsize) {
9314                                 err += efunc(i, "oversized by-ref static\n");
9315                                 break;
9316                         }
9317                 }
9318
9319                 if (existing == NULL || existing->dtdv_id == 0)
9320                         continue;
9321
9322                 ASSERT(existing->dtdv_id == v->dtdv_id);
9323                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9324
9325                 if (existing->dtdv_kind != v->dtdv_kind)
9326                         err += efunc(i, "%d changed variable kind\n", id);
9327
9328                 et = &existing->dtdv_type;
9329
9330                 if (vt->dtdt_flags != et->dtdt_flags) {
9331                         err += efunc(i, "%d changed variable type flags\n", id);
9332                         break;
9333                 }
9334
9335                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9336                         err += efunc(i, "%d changed variable type size\n", id);
9337                         break;
9338                 }
9339         }
9340
9341         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9342                 dif_instr_t instr = dp->dtdo_buf[pc];
9343
9344                 uint_t v = DIF_INSTR_VAR(instr);
9345                 uint_t op = DIF_INSTR_OP(instr);
9346
9347                 switch (op) {
9348                 case DIF_OP_LDGS:
9349                 case DIF_OP_LDGAA:
9350                 case DIF_OP_STGS:
9351                 case DIF_OP_STGAA:
9352                         if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
9353                                 err += efunc(pc, "invalid variable %u\n", v);
9354                         break;
9355                 case DIF_OP_LDTS:
9356                 case DIF_OP_LDTAA:
9357                 case DIF_OP_STTS:
9358                 case DIF_OP_STTAA:
9359                         if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
9360                                 err += efunc(pc, "invalid variable %u\n", v);
9361                         break;
9362                 case DIF_OP_LDLS:
9363                 case DIF_OP_STLS:
9364                         if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
9365                                 err += efunc(pc, "invalid variable %u\n", v);
9366                         break;
9367                 default:
9368                         break;
9369                 }
9370         }
9371
9372         return (err);
9373 }
9374
9375 /*
9376  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
9377  * are much more constrained than normal DIFOs.  Specifically, they may
9378  * not:
9379  *
9380  * 1. Make calls to subroutines other than copyin(), copyinstr() or
9381  *    miscellaneous string routines
9382  * 2. Access DTrace variables other than the args[] array, and the
9383  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9384  * 3. Have thread-local variables.
9385  * 4. Have dynamic variables.
9386  */
9387 static int
9388 dtrace_difo_validate_helper(dtrace_difo_t *dp)
9389 {
9390         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9391         int err = 0;
9392         uint_t pc;
9393
9394         for (pc = 0; pc < dp->dtdo_len; pc++) {
9395                 dif_instr_t instr = dp->dtdo_buf[pc];
9396
9397                 uint_t v = DIF_INSTR_VAR(instr);
9398                 uint_t subr = DIF_INSTR_SUBR(instr);
9399                 uint_t op = DIF_INSTR_OP(instr);
9400
9401                 switch (op) {
9402                 case DIF_OP_OR:
9403                 case DIF_OP_XOR:
9404                 case DIF_OP_AND:
9405                 case DIF_OP_SLL:
9406                 case DIF_OP_SRL:
9407                 case DIF_OP_SRA:
9408                 case DIF_OP_SUB:
9409                 case DIF_OP_ADD:
9410                 case DIF_OP_MUL:
9411                 case DIF_OP_SDIV:
9412                 case DIF_OP_UDIV:
9413                 case DIF_OP_SREM:
9414                 case DIF_OP_UREM:
9415                 case DIF_OP_COPYS:
9416                 case DIF_OP_NOT:
9417                 case DIF_OP_MOV:
9418                 case DIF_OP_RLDSB:
9419                 case DIF_OP_RLDSH:
9420                 case DIF_OP_RLDSW:
9421                 case DIF_OP_RLDUB:
9422                 case DIF_OP_RLDUH:
9423                 case DIF_OP_RLDUW:
9424                 case DIF_OP_RLDX:
9425                 case DIF_OP_ULDSB:
9426                 case DIF_OP_ULDSH:
9427                 case DIF_OP_ULDSW:
9428                 case DIF_OP_ULDUB:
9429                 case DIF_OP_ULDUH:
9430                 case DIF_OP_ULDUW:
9431                 case DIF_OP_ULDX:
9432                 case DIF_OP_STB:
9433                 case DIF_OP_STH:
9434                 case DIF_OP_STW:
9435                 case DIF_OP_STX:
9436                 case DIF_OP_ALLOCS:
9437                 case DIF_OP_CMP:
9438                 case DIF_OP_SCMP:
9439                 case DIF_OP_TST:
9440                 case DIF_OP_BA:
9441                 case DIF_OP_BE:
9442                 case DIF_OP_BNE:
9443                 case DIF_OP_BG:
9444                 case DIF_OP_BGU:
9445                 case DIF_OP_BGE:
9446                 case DIF_OP_BGEU:
9447                 case DIF_OP_BL:
9448                 case DIF_OP_BLU:
9449                 case DIF_OP_BLE:
9450                 case DIF_OP_BLEU:
9451                 case DIF_OP_RET:
9452                 case DIF_OP_NOP:
9453                 case DIF_OP_POPTS:
9454                 case DIF_OP_FLUSHTS:
9455                 case DIF_OP_SETX:
9456                 case DIF_OP_SETS:
9457                 case DIF_OP_LDGA:
9458                 case DIF_OP_LDLS:
9459                 case DIF_OP_STGS:
9460                 case DIF_OP_STLS:
9461                 case DIF_OP_PUSHTR:
9462                 case DIF_OP_PUSHTV:
9463                         break;
9464
9465                 case DIF_OP_LDGS:
9466                         if (v >= DIF_VAR_OTHER_UBASE)
9467                                 break;
9468
9469                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
9470                                 break;
9471
9472                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
9473                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
9474                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
9475                             v == DIF_VAR_UID || v == DIF_VAR_GID)
9476                                 break;
9477
9478                         err += efunc(pc, "illegal variable %u\n", v);
9479                         break;
9480
9481                 case DIF_OP_LDTA:
9482                 case DIF_OP_LDTS:
9483                 case DIF_OP_LDGAA:
9484                 case DIF_OP_LDTAA:
9485                         err += efunc(pc, "illegal dynamic variable load\n");
9486                         break;
9487
9488                 case DIF_OP_STTS:
9489                 case DIF_OP_STGAA:
9490                 case DIF_OP_STTAA:
9491                         err += efunc(pc, "illegal dynamic variable store\n");
9492                         break;
9493
9494                 case DIF_OP_CALL:
9495                         if (subr == DIF_SUBR_ALLOCA ||
9496                             subr == DIF_SUBR_BCOPY ||
9497                             subr == DIF_SUBR_COPYIN ||
9498                             subr == DIF_SUBR_COPYINTO ||
9499                             subr == DIF_SUBR_COPYINSTR ||
9500                             subr == DIF_SUBR_INDEX ||
9501                             subr == DIF_SUBR_INET_NTOA ||
9502                             subr == DIF_SUBR_INET_NTOA6 ||
9503                             subr == DIF_SUBR_INET_NTOP ||
9504                             subr == DIF_SUBR_LLTOSTR ||
9505                             subr == DIF_SUBR_RINDEX ||
9506                             subr == DIF_SUBR_STRCHR ||
9507                             subr == DIF_SUBR_STRJOIN ||
9508                             subr == DIF_SUBR_STRRCHR ||
9509                             subr == DIF_SUBR_STRSTR ||
9510                             subr == DIF_SUBR_KDEBUG_TRACE ||
9511                             subr == DIF_SUBR_KDEBUG_TRACE_STRING ||
9512                             subr == DIF_SUBR_HTONS ||
9513                             subr == DIF_SUBR_HTONL ||
9514                             subr == DIF_SUBR_HTONLL ||
9515                             subr == DIF_SUBR_NTOHS ||
9516                             subr == DIF_SUBR_NTOHL ||
9517                             subr == DIF_SUBR_NTOHLL)
9518                                 break;
9519
9520                         err += efunc(pc, "invalid subr %u\n", subr);
9521                         break;
9522
9523                 default:
9524                         err += efunc(pc, "invalid opcode %u\n",
9525                             DIF_INSTR_OP(instr));
9526                 }
9527         }
9528
9529         return (err);
9530 }
9531
9532 /*
9533  * Returns 1 if the expression in the DIF object can be cached on a per-thread
9534  * basis; 0 if not.
9535  */
9536 static int
9537 dtrace_difo_cacheable(dtrace_difo_t *dp)
9538 {
9539         uint_t i;
9540
9541         if (dp == NULL)
9542                 return (0);
9543
9544         for (i = 0; i < dp->dtdo_varlen; i++) {
9545                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9546
9547                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
9548                         continue;
9549
9550                 switch (v->dtdv_id) {
9551                 case DIF_VAR_CURTHREAD:
9552                 case DIF_VAR_PID:
9553                 case DIF_VAR_TID:
9554                 case DIF_VAR_EXECNAME:
9555                 case DIF_VAR_ZONENAME:
9556                         break;
9557
9558                 default:
9559                         return (0);
9560                 }
9561         }
9562
9563         /*
9564          * This DIF object may be cacheable.  Now we need to look for any
9565          * array loading instructions, any memory loading instructions, or
9566          * any stores to thread-local variables.
9567          */
9568         for (i = 0; i < dp->dtdo_len; i++) {
9569                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
9570
9571                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
9572                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
9573                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
9574                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
9575                         return (0);
9576         }
9577
9578         return (1);
9579 }
9580
9581 static void
9582 dtrace_difo_hold(dtrace_difo_t *dp)
9583 {
9584         uint_t i;
9585
9586         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9587
9588         dp->dtdo_refcnt++;
9589         ASSERT(dp->dtdo_refcnt != 0);
9590
9591         /*
9592          * We need to check this DIF object for references to the variable
9593          * DIF_VAR_VTIMESTAMP.
9594          */
9595         for (i = 0; i < dp->dtdo_varlen; i++) {
9596                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9597
9598                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9599                         continue;
9600
9601                 if (dtrace_vtime_references++ == 0)
9602                         dtrace_vtime_enable();
9603         }
9604 }
9605
9606 /*
9607  * This routine calculates the dynamic variable chunksize for a given DIF
9608  * object.  The calculation is not fool-proof, and can probably be tricked by
9609  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
9610  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
9611  * if a dynamic variable size exceeds the chunksize.
9612  */
9613 static void
9614 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9615 {
9616         uint64_t sval = 0;
9617         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
9618         const dif_instr_t *text = dp->dtdo_buf;
9619         uint_t pc, srd = 0;
9620         uint_t ttop = 0;
9621         size_t size, ksize;
9622         uint_t id, i;
9623
9624         for (pc = 0; pc < dp->dtdo_len; pc++) {
9625                 dif_instr_t instr = text[pc];
9626                 uint_t op = DIF_INSTR_OP(instr);
9627                 uint_t rd = DIF_INSTR_RD(instr);
9628                 uint_t r1 = DIF_INSTR_R1(instr);
9629                 uint_t nkeys = 0;
9630                 uchar_t scope;
9631
9632                 dtrace_key_t *key = tupregs;
9633
9634                 switch (op) {
9635                 case DIF_OP_SETX:
9636                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
9637                         srd = rd;
9638                         continue;
9639
9640                 case DIF_OP_STTS:
9641                         key = &tupregs[DIF_DTR_NREGS];
9642                         key[0].dttk_size = 0;
9643                         key[1].dttk_size = 0;
9644                         nkeys = 2;
9645                         scope = DIFV_SCOPE_THREAD;
9646                         break;
9647
9648                 case DIF_OP_STGAA:
9649                 case DIF_OP_STTAA:
9650                         nkeys = ttop;
9651
9652                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
9653                                 key[nkeys++].dttk_size = 0;
9654
9655                         key[nkeys++].dttk_size = 0;
9656
9657                         if (op == DIF_OP_STTAA) {
9658                                 scope = DIFV_SCOPE_THREAD;
9659                         } else {
9660                                 scope = DIFV_SCOPE_GLOBAL;
9661                         }
9662
9663                         break;
9664
9665                 case DIF_OP_PUSHTR:
9666                         if (ttop == DIF_DTR_NREGS)
9667                                 return;
9668
9669                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
9670                                 /*
9671                                  * If the register for the size of the "pushtr"
9672                                  * is %r0 (or the value is 0) and the type is
9673                                  * a string, we'll use the system-wide default
9674                                  * string size.
9675                                  */
9676                                 tupregs[ttop++].dttk_size =
9677                                     dtrace_strsize_default;
9678                         } else {
9679                                 if (srd == 0)
9680                                         return;
9681
9682                                 if (sval > LONG_MAX)
9683                                         return;
9684
9685                                 tupregs[ttop++].dttk_size = sval;
9686                         }
9687
9688                         break;
9689
9690                 case DIF_OP_PUSHTV:
9691                         if (ttop == DIF_DTR_NREGS)
9692                                 return;
9693
9694                         tupregs[ttop++].dttk_size = 0;
9695                         break;
9696
9697                 case DIF_OP_FLUSHTS:
9698                         ttop = 0;
9699                         break;
9700
9701                 case DIF_OP_POPTS:
9702                         if (ttop != 0)
9703                                 ttop--;
9704                         break;
9705                 }
9706
9707                 sval = 0;
9708                 srd = 0;
9709
9710                 if (nkeys == 0)
9711                         continue;
9712
9713                 /*
9714                  * We have a dynamic variable allocation; calculate its size.
9715                  */
9716                 for (ksize = 0, i = 0; i < nkeys; i++)
9717                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
9718
9719                 size = sizeof (dtrace_dynvar_t);
9720                 size += sizeof (dtrace_key_t) * (nkeys - 1);
9721                 size += ksize;
9722
9723                 /*
9724                  * Now we need to determine the size of the stored data.
9725                  */
9726                 id = DIF_INSTR_VAR(instr);
9727
9728                 for (i = 0; i < dp->dtdo_varlen; i++) {
9729                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
9730
9731                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
9732                                 size += v->dtdv_type.dtdt_size;
9733                                 break;
9734                         }
9735                 }
9736
9737                 if (i == dp->dtdo_varlen)
9738                         return;
9739
9740                 /*
9741                  * We have the size.  If this is larger than the chunk size
9742                  * for our dynamic variable state, reset the chunk size.
9743                  */
9744                 size = P2ROUNDUP(size, sizeof (uint64_t));
9745
9746                 /*
9747                  * Before setting the chunk size, check that we're not going
9748                  * to set it to a negative value...
9749                  */
9750                 if (size > LONG_MAX)
9751                         return;
9752
9753                 /*
9754                  * ...and make certain that we didn't badly overflow.
9755                  */
9756                 if (size < ksize || size < sizeof (dtrace_dynvar_t))
9757                         return;
9758
9759                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
9760                         vstate->dtvs_dynvars.dtds_chunksize = size;
9761         }
9762 }
9763
9764 static void
9765 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9766 {
9767         int oldsvars, osz, nsz, otlocals, ntlocals;
9768         uint_t i, id;
9769
9770         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9771         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
9772
9773         for (i = 0; i < dp->dtdo_varlen; i++) {
9774                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9775                 dtrace_statvar_t *svar;
9776                 dtrace_statvar_t ***svarp = NULL;
9777                 size_t dsize = 0;
9778                 uint8_t scope = v->dtdv_scope;
9779                 int *np = (int *)NULL;
9780
9781                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9782                         continue;
9783
9784                 id -= DIF_VAR_OTHER_UBASE;
9785
9786                 switch (scope) {
9787                 case DIFV_SCOPE_THREAD:
9788                         while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
9789                                 dtrace_difv_t *tlocals;
9790
9791                                 if ((ntlocals = (otlocals << 1)) == 0)
9792                                         ntlocals = 1;
9793
9794                                 osz = otlocals * sizeof (dtrace_difv_t);
9795                                 nsz = ntlocals * sizeof (dtrace_difv_t);
9796
9797                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
9798
9799                                 if (osz != 0) {
9800                                         bcopy(vstate->dtvs_tlocals,
9801                                             tlocals, osz);
9802                                         kmem_free(vstate->dtvs_tlocals, osz);
9803                                 }
9804
9805                                 vstate->dtvs_tlocals = tlocals;
9806                                 vstate->dtvs_ntlocals = ntlocals;
9807                         }
9808
9809                         vstate->dtvs_tlocals[id] = *v;
9810                         continue;
9811
9812                 case DIFV_SCOPE_LOCAL:
9813                         np = &vstate->dtvs_nlocals;
9814                         svarp = &vstate->dtvs_locals;
9815
9816                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9817                                 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
9818                                     sizeof (uint64_t));
9819                         else
9820                                 dsize = (int)NCPU * sizeof (uint64_t);
9821
9822                         break;
9823
9824                 case DIFV_SCOPE_GLOBAL:
9825                         np = &vstate->dtvs_nglobals;
9826                         svarp = &vstate->dtvs_globals;
9827
9828                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
9829                                 dsize = v->dtdv_type.dtdt_size +
9830                                     sizeof (uint64_t);
9831
9832                         break;
9833
9834                 default:
9835                         ASSERT(0);
9836                 }
9837
9838                 while (id >= (uint_t)(oldsvars = *np)) {
9839                         dtrace_statvar_t **statics;
9840                         int newsvars, oldsize, newsize;
9841
9842                         if ((newsvars = (oldsvars << 1)) == 0)
9843                                 newsvars = 1;
9844
9845                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
9846                         newsize = newsvars * sizeof (dtrace_statvar_t *);
9847
9848                         statics = kmem_zalloc(newsize, KM_SLEEP);
9849
9850                         if (oldsize != 0) {
9851                                 bcopy(*svarp, statics, oldsize);
9852                                 kmem_free(*svarp, oldsize);
9853                         }
9854
9855                         *svarp = statics;
9856                         *np = newsvars;
9857                 }
9858
9859                 if ((svar = (*svarp)[id]) == NULL) {
9860                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
9861                         svar->dtsv_var = *v;
9862
9863                         if ((svar->dtsv_size = dsize) != 0) {
9864                                 svar->dtsv_data = (uint64_t)(uintptr_t)
9865                                     kmem_zalloc(dsize, KM_SLEEP);
9866                         }
9867
9868                         (*svarp)[id] = svar;
9869                 }
9870
9871                 svar->dtsv_refcnt++;
9872         }
9873
9874         dtrace_difo_chunksize(dp, vstate);
9875         dtrace_difo_hold(dp);
9876 }
9877
9878 static dtrace_difo_t *
9879 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9880 {
9881         dtrace_difo_t *new;
9882         size_t sz;
9883
9884         ASSERT(dp->dtdo_buf != NULL);
9885         ASSERT(dp->dtdo_refcnt != 0);
9886
9887         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
9888
9889         ASSERT(dp->dtdo_buf != NULL);
9890         sz = dp->dtdo_len * sizeof (dif_instr_t);
9891         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
9892         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
9893         new->dtdo_len = dp->dtdo_len;
9894
9895         if (dp->dtdo_strtab != NULL) {
9896                 ASSERT(dp->dtdo_strlen != 0);
9897                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
9898                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
9899                 new->dtdo_strlen = dp->dtdo_strlen;
9900         }
9901
9902         if (dp->dtdo_inttab != NULL) {
9903                 ASSERT(dp->dtdo_intlen != 0);
9904                 sz = dp->dtdo_intlen * sizeof (uint64_t);
9905                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
9906                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
9907                 new->dtdo_intlen = dp->dtdo_intlen;
9908         }
9909
9910         if (dp->dtdo_vartab != NULL) {
9911                 ASSERT(dp->dtdo_varlen != 0);
9912                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
9913                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
9914                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
9915                 new->dtdo_varlen = dp->dtdo_varlen;
9916         }
9917
9918         dtrace_difo_init(new, vstate);
9919         return (new);
9920 }
9921
9922 static void
9923 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9924 {
9925         uint_t i;
9926
9927         ASSERT(dp->dtdo_refcnt == 0);
9928
9929         for (i = 0; i < dp->dtdo_varlen; i++) {
9930                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9931                 dtrace_statvar_t *svar;
9932                 dtrace_statvar_t **svarp = NULL;
9933                 uint_t id;
9934                 uint8_t scope = v->dtdv_scope;
9935                 int *np = NULL;
9936
9937                 switch (scope) {
9938                 case DIFV_SCOPE_THREAD:
9939                         continue;
9940
9941                 case DIFV_SCOPE_LOCAL:
9942                         np = &vstate->dtvs_nlocals;
9943                         svarp = vstate->dtvs_locals;
9944                         break;
9945
9946                 case DIFV_SCOPE_GLOBAL:
9947                         np = &vstate->dtvs_nglobals;
9948                         svarp = vstate->dtvs_globals;
9949                         break;
9950
9951                 default:
9952                         ASSERT(0);
9953                 }
9954
9955                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
9956                         continue;
9957
9958                 id -= DIF_VAR_OTHER_UBASE;
9959
9960                 ASSERT(id < (uint_t)*np);
9961
9962                 svar = svarp[id];
9963                 ASSERT(svar != NULL);
9964                 ASSERT(svar->dtsv_refcnt > 0);
9965
9966                 if (--svar->dtsv_refcnt > 0)
9967                         continue;
9968
9969                 if (svar->dtsv_size != 0) {
9970                         ASSERT(svar->dtsv_data != 0);
9971                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
9972                             svar->dtsv_size);
9973                 }
9974
9975                 kmem_free(svar, sizeof (dtrace_statvar_t));
9976                 svarp[id] = NULL;
9977         }
9978
9979         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
9980         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
9981         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
9982         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
9983
9984         kmem_free(dp, sizeof (dtrace_difo_t));
9985 }
9986
9987 static void
9988 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
9989 {
9990         uint_t i;
9991
9992         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9993         ASSERT(dp->dtdo_refcnt != 0);
9994
9995         for (i = 0; i < dp->dtdo_varlen; i++) {
9996                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
9997
9998                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
9999                         continue;
10000
10001                 ASSERT(dtrace_vtime_references > 0);
10002                 if (--dtrace_vtime_references == 0)
10003                         dtrace_vtime_disable();
10004         }
10005
10006         if (--dp->dtdo_refcnt == 0)
10007                 dtrace_difo_destroy(dp, vstate);
10008 }
10009
10010 /*
10011  * DTrace Format Functions
10012  */
10013 static uint16_t
10014 dtrace_format_add(dtrace_state_t *state, char *str)
10015 {
10016         char *fmt, **new;
10017         uint16_t ndx, len = strlen(str) + 1;
10018
10019         fmt = kmem_zalloc(len, KM_SLEEP);
10020         bcopy(str, fmt, len);
10021
10022         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10023                 if (state->dts_formats[ndx] == NULL) {
10024                         state->dts_formats[ndx] = fmt;
10025                         return (ndx + 1);
10026                 }
10027         }
10028
10029         if (state->dts_nformats == USHRT_MAX) {
10030                 /*
10031                  * This is only likely if a denial-of-service attack is being
10032                  * attempted.  As such, it's okay to fail silently here.
10033                  */
10034                 kmem_free(fmt, len);
10035                 return (0);
10036         }
10037
10038         /*
10039          * For simplicity, we always resize the formats array to be exactly the
10040          * number of formats.
10041          */
10042         ndx = state->dts_nformats++;
10043         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10044
10045         if (state->dts_formats != NULL) {
10046                 ASSERT(ndx != 0);
10047                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10048                 kmem_free(state->dts_formats, ndx * sizeof (char *));
10049         }
10050
10051         state->dts_formats = new;
10052         state->dts_formats[ndx] = fmt;
10053
10054         return (ndx + 1);
10055 }
10056
10057 static void
10058 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10059 {
10060         char *fmt;
10061
10062         ASSERT(state->dts_formats != NULL);
10063         ASSERT(format <= state->dts_nformats);
10064         ASSERT(state->dts_formats[format - 1] != NULL);
10065
10066         fmt = state->dts_formats[format - 1];
10067         kmem_free(fmt, strlen(fmt) + 1);
10068         state->dts_formats[format - 1] = NULL;
10069 }
10070
10071 static void
10072 dtrace_format_destroy(dtrace_state_t *state)
10073 {
10074         int i;
10075
10076         if (state->dts_nformats == 0) {
10077                 ASSERT(state->dts_formats == NULL);
10078                 return;
10079         }
10080
10081         ASSERT(state->dts_formats != NULL);
10082
10083         for (i = 0; i < state->dts_nformats; i++) {
10084                 char *fmt = state->dts_formats[i];
10085
10086                 if (fmt == NULL)
10087                         continue;
10088
10089                 kmem_free(fmt, strlen(fmt) + 1);
10090         }
10091
10092         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10093         state->dts_nformats = 0;
10094         state->dts_formats = NULL;
10095 }
10096
10097 /*
10098  * DTrace Predicate Functions
10099  */
10100 static dtrace_predicate_t *
10101 dtrace_predicate_create(dtrace_difo_t *dp)
10102 {
10103         dtrace_predicate_t *pred;
10104
10105         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10106         ASSERT(dp->dtdo_refcnt != 0);
10107
10108         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10109         pred->dtp_difo = dp;
10110         pred->dtp_refcnt = 1;
10111
10112         if (!dtrace_difo_cacheable(dp))
10113                 return (pred);
10114
10115         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10116                 /*
10117                  * This is only theoretically possible -- we have had 2^32
10118                  * cacheable predicates on this machine.  We cannot allow any
10119                  * more predicates to become cacheable:  as unlikely as it is,
10120                  * there may be a thread caching a (now stale) predicate cache
10121                  * ID. (N.B.: the temptation is being successfully resisted to
10122                  * have this cmn_err() "Holy shit -- we executed this code!")
10123                  */
10124                 return (pred);
10125         }
10126
10127         pred->dtp_cacheid = dtrace_predcache_id++;
10128
10129         return (pred);
10130 }
10131
10132 static void
10133 dtrace_predicate_hold(dtrace_predicate_t *pred)
10134 {
10135         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10136         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10137         ASSERT(pred->dtp_refcnt > 0);
10138
10139         pred->dtp_refcnt++;
10140 }
10141
10142 static void
10143 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10144 {
10145         dtrace_difo_t *dp = pred->dtp_difo;
10146 #pragma unused(dp) /* __APPLE__ */
10147
10148         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10149         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10150         ASSERT(pred->dtp_refcnt > 0);
10151
10152         if (--pred->dtp_refcnt == 0) {
10153                 dtrace_difo_release(pred->dtp_difo, vstate);
10154                 kmem_free(pred, sizeof (dtrace_predicate_t));
10155         }
10156 }
10157
10158 /*
10159  * DTrace Action Description Functions
10160  */
10161 static dtrace_actdesc_t *
10162 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10163     uint64_t uarg, uint64_t arg)
10164 {
10165         dtrace_actdesc_t *act;
10166
10167         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10168             arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10169
10170         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10171         act->dtad_kind = kind;
10172         act->dtad_ntuple = ntuple;
10173         act->dtad_uarg = uarg;
10174         act->dtad_arg = arg;
10175         act->dtad_refcnt = 1;
10176
10177         return (act);
10178 }
10179
10180 static void
10181 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10182 {
10183         ASSERT(act->dtad_refcnt >= 1);
10184         act->dtad_refcnt++;
10185 }
10186
10187 static void
10188 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10189 {
10190         dtrace_actkind_t kind = act->dtad_kind;
10191         dtrace_difo_t *dp;
10192
10193         ASSERT(act->dtad_refcnt >= 1);
10194
10195         if (--act->dtad_refcnt != 0)
10196                 return;
10197
10198         if ((dp = act->dtad_difo) != NULL)
10199                 dtrace_difo_release(dp, vstate);
10200
10201         if (DTRACEACT_ISPRINTFLIKE(kind)) {
10202                 char *str = (char *)(uintptr_t)act->dtad_arg;
10203
10204                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10205                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10206
10207                 if (str != NULL)
10208                         kmem_free(str, strlen(str) + 1);
10209         }
10210
10211         kmem_free(act, sizeof (dtrace_actdesc_t));
10212 }
10213
10214 /*
10215  * DTrace ECB Functions
10216  */
10217 static dtrace_ecb_t *
10218 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10219 {
10220         dtrace_ecb_t *ecb;
10221         dtrace_epid_t epid;
10222
10223         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10224
10225         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10226         ecb->dte_predicate = NULL;
10227         ecb->dte_probe = probe;
10228
10229         /*
10230          * The default size is the size of the default action: recording
10231          * the header.
10232          */
10233         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10234         ecb->dte_alignment = sizeof (dtrace_epid_t);
10235
10236         epid = state->dts_epid++;
10237
10238         if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
10239                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10240                 int necbs = state->dts_necbs << 1;
10241
10242                 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
10243
10244                 if (necbs == 0) {
10245                         ASSERT(oecbs == NULL);
10246                         necbs = 1;
10247                 }
10248
10249                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10250
10251                 if (oecbs != NULL)
10252                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10253
10254                 dtrace_membar_producer();
10255                 state->dts_ecbs = ecbs;
10256
10257                 if (oecbs != NULL) {
10258                         /*
10259                          * If this state is active, we must dtrace_sync()
10260                          * before we can free the old dts_ecbs array:  we're
10261                          * coming in hot, and there may be active ring
10262                          * buffer processing (which indexes into the dts_ecbs
10263                          * array) on another CPU.
10264                          */
10265                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10266                                 dtrace_sync();
10267
10268                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10269                 }
10270
10271                 dtrace_membar_producer();
10272                 state->dts_necbs = necbs;
10273         }
10274
10275         ecb->dte_state = state;
10276
10277         ASSERT(state->dts_ecbs[epid - 1] == NULL);
10278         dtrace_membar_producer();
10279         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10280
10281         return (ecb);
10282 }
10283
10284 static int
10285 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10286 {
10287         dtrace_probe_t *probe = ecb->dte_probe;
10288
10289         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
10290         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10291         ASSERT(ecb->dte_next == NULL);
10292
10293         if (probe == NULL) {
10294                 /*
10295                  * This is the NULL probe -- there's nothing to do.
10296                  */
10297             return(0);
10298         }
10299
10300         probe->dtpr_provider->dtpv_ecb_count++;
10301         if (probe->dtpr_ecb == NULL) {
10302                 dtrace_provider_t *prov = probe->dtpr_provider;
10303
10304                 /*
10305                  * We're the first ECB on this probe.
10306                  */
10307                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10308
10309                 if (ecb->dte_predicate != NULL)
10310                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10311
10312                 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10313                     probe->dtpr_id, probe->dtpr_arg));
10314         } else {
10315                 /*
10316                  * This probe is already active.  Swing the last pointer to
10317                  * point to the new ECB, and issue a dtrace_sync() to assure
10318                  * that all CPUs have seen the change.
10319                  */
10320                 ASSERT(probe->dtpr_ecb_last != NULL);
10321                 probe->dtpr_ecb_last->dte_next = ecb;
10322                 probe->dtpr_ecb_last = ecb;
10323                 probe->dtpr_predcache = 0;
10324
10325                 dtrace_sync();
10326                 return(0);
10327         }
10328 }
10329
10330 static int
10331 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10332 {
10333         dtrace_action_t *act;
10334         uint32_t curneeded = UINT32_MAX;
10335         uint32_t aggbase = UINT32_MAX;
10336
10337         /*
10338          * If we record anything, we always record the dtrace_rechdr_t.  (And
10339          * we always record it first.)
10340          */
10341         ecb->dte_size = sizeof (dtrace_rechdr_t);
10342         ecb->dte_alignment = sizeof (dtrace_epid_t);
10343
10344         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10345                 dtrace_recdesc_t *rec = &act->dta_rec;
10346                 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10347
10348                 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
10349
10350                 if (DTRACEACT_ISAGG(act->dta_kind)) {
10351                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10352
10353                         ASSERT(rec->dtrd_size != 0);
10354                         ASSERT(agg->dtag_first != NULL);
10355                         ASSERT(act->dta_prev->dta_intuple);
10356                         ASSERT(aggbase != UINT32_MAX);
10357                         ASSERT(curneeded != UINT32_MAX);
10358
10359                         agg->dtag_base = aggbase;
10360                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10361                         rec->dtrd_offset = curneeded;
10362                         if (curneeded + rec->dtrd_size < curneeded)
10363                                 return (EINVAL);
10364                         curneeded += rec->dtrd_size;
10365                         ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10366
10367                         aggbase = UINT32_MAX;
10368                         curneeded = UINT32_MAX;
10369                 } else if (act->dta_intuple) {
10370                         if (curneeded == UINT32_MAX) {
10371                                 /*
10372                                  * This is the first record in a tuple.  Align
10373                                  * curneeded to be at offset 4 in an 8-byte
10374                                  * aligned block.
10375                                  */
10376                                 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
10377                                 ASSERT(aggbase == UINT32_MAX);
10378
10379                                 curneeded = P2PHASEUP(ecb->dte_size,
10380                                     sizeof (uint64_t), sizeof (dtrace_aggid_t));
10381
10382                                 aggbase = curneeded - sizeof (dtrace_aggid_t);
10383                                 ASSERT(IS_P2ALIGNED(aggbase,
10384                                     sizeof (uint64_t)));
10385                         }
10386
10387                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10388                         rec->dtrd_offset = curneeded;
10389                         curneeded += rec->dtrd_size;
10390                         if (curneeded + rec->dtrd_size < curneeded)
10391                                 return (EINVAL);
10392                 } else {
10393                         /* tuples must be followed by an aggregation */
10394                         ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
10395                         ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
10396                         rec->dtrd_offset = ecb->dte_size;
10397                         if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
10398                                 return (EINVAL);
10399                         ecb->dte_size += rec->dtrd_size;
10400                         ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10401                 }
10402         }
10403
10404         if ((act = ecb->dte_action) != NULL &&
10405             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10406             ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10407                 /*
10408                  * If the size is still sizeof (dtrace_rechdr_t), then all
10409                  * actions store no data; set the size to 0.
10410                  */
10411                 ecb->dte_size = 0;
10412         }
10413
10414         ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10415         ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10416         ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
10417         return (0);
10418 }
10419
10420 static dtrace_action_t *
10421 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10422 {
10423         dtrace_aggregation_t *agg;
10424         size_t size = sizeof (uint64_t);
10425         int ntuple = desc->dtad_ntuple;
10426         dtrace_action_t *act;
10427         dtrace_recdesc_t *frec;
10428         dtrace_aggid_t aggid;
10429         dtrace_state_t *state = ecb->dte_state;
10430
10431         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10432         agg->dtag_ecb = ecb;
10433
10434         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10435
10436         switch (desc->dtad_kind) {
10437         case DTRACEAGG_MIN:
10438                 agg->dtag_initial = INT64_MAX;
10439                 agg->dtag_aggregate = dtrace_aggregate_min;
10440                 break;
10441
10442         case DTRACEAGG_MAX:
10443                 agg->dtag_initial = INT64_MIN;
10444                 agg->dtag_aggregate = dtrace_aggregate_max;
10445                 break;
10446
10447         case DTRACEAGG_COUNT:
10448                 agg->dtag_aggregate = dtrace_aggregate_count;
10449                 break;
10450
10451         case DTRACEAGG_QUANTIZE:
10452                 agg->dtag_aggregate = dtrace_aggregate_quantize;
10453                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
10454                     sizeof (uint64_t);
10455                 break;
10456
10457         case DTRACEAGG_LQUANTIZE: {
10458                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
10459                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
10460
10461                 agg->dtag_initial = desc->dtad_arg;
10462                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
10463
10464                 if (step == 0 || levels == 0)
10465                         goto err;
10466
10467                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
10468                 break;
10469         }
10470
10471         case DTRACEAGG_LLQUANTIZE: {
10472                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
10473                 uint16_t low    = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
10474                 uint16_t high   = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
10475                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
10476                 int64_t v;
10477
10478                 agg->dtag_initial = desc->dtad_arg;
10479                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
10480
10481                 if (factor < 2 || low >= high || nsteps < factor)
10482                         goto err;
10483
10484                 /*
10485                  * Now check that the number of steps evenly divides a power
10486                  * of the factor.  (This assures both integer bucket size and
10487                  * linearity within each magnitude.)
10488                  */
10489                 for (v = factor; v < nsteps; v *= factor)
10490                         continue;
10491
10492                 if ((v % nsteps) || (nsteps % factor))
10493                         goto err;
10494
10495                 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
10496                 break;
10497   }
10498
10499         case DTRACEAGG_AVG:
10500                 agg->dtag_aggregate = dtrace_aggregate_avg;
10501                 size = sizeof (uint64_t) * 2;
10502                 break;
10503
10504         case DTRACEAGG_STDDEV:
10505                 agg->dtag_aggregate = dtrace_aggregate_stddev;
10506                 size = sizeof (uint64_t) * 4;
10507                 break;
10508
10509         case DTRACEAGG_SUM:
10510                 agg->dtag_aggregate = dtrace_aggregate_sum;
10511                 break;
10512
10513         default:
10514                 goto err;
10515         }
10516
10517         agg->dtag_action.dta_rec.dtrd_size = size;
10518
10519         if (ntuple == 0)
10520                 goto err;
10521
10522         /*
10523          * We must make sure that we have enough actions for the n-tuple.
10524          */
10525         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
10526                 if (DTRACEACT_ISAGG(act->dta_kind))
10527                         break;
10528
10529                 if (--ntuple == 0) {
10530                         /*
10531                          * This is the action with which our n-tuple begins.
10532                          */
10533                         agg->dtag_first = act;
10534                         goto success;
10535                 }
10536         }
10537
10538         /*
10539          * This n-tuple is short by ntuple elements.  Return failure.
10540          */
10541         ASSERT(ntuple != 0);
10542 err:
10543         kmem_free(agg, sizeof (dtrace_aggregation_t));
10544         return (NULL);
10545
10546 success:
10547         /*
10548          * If the last action in the tuple has a size of zero, it's actually
10549          * an expression argument for the aggregating action.
10550          */
10551         ASSERT(ecb->dte_action_last != NULL);
10552         act = ecb->dte_action_last;
10553
10554         if (act->dta_kind == DTRACEACT_DIFEXPR) {
10555                 ASSERT(act->dta_difo != NULL);
10556
10557                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
10558                         agg->dtag_hasarg = 1;
10559         }
10560
10561         /*
10562          * We need to allocate an id for this aggregation.
10563          */
10564         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
10565             VM_BESTFIT | VM_SLEEP);
10566
10567         if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
10568                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
10569                 dtrace_aggregation_t **aggs;
10570                 int naggs = state->dts_naggregations << 1;
10571                 int onaggs = state->dts_naggregations;
10572
10573                 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
10574
10575                 if (naggs == 0) {
10576                         ASSERT(oaggs == NULL);
10577                         naggs = 1;
10578                 }
10579
10580                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
10581
10582                 if (oaggs != NULL) {
10583                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
10584                         kmem_free(oaggs, onaggs * sizeof (*aggs));
10585                 }
10586
10587                 state->dts_aggregations = aggs;
10588                 state->dts_naggregations = naggs;
10589         }
10590
10591         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
10592         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
10593
10594         frec = &agg->dtag_first->dta_rec;
10595         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
10596                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
10597
10598         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
10599                 ASSERT(!act->dta_intuple);
10600                 act->dta_intuple = 1;
10601         }
10602
10603         return (&agg->dtag_action);
10604 }
10605
10606 static void
10607 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
10608 {
10609         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10610         dtrace_state_t *state = ecb->dte_state;
10611         dtrace_aggid_t aggid = agg->dtag_id;
10612
10613         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
10614         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
10615
10616         ASSERT(state->dts_aggregations[aggid - 1] == agg);
10617         state->dts_aggregations[aggid - 1] = NULL;
10618
10619         kmem_free(agg, sizeof (dtrace_aggregation_t));
10620 }
10621
10622 static int
10623 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10624 {
10625         dtrace_action_t *action, *last;
10626         dtrace_difo_t *dp = desc->dtad_difo;
10627         uint32_t size = 0, align = sizeof (uint8_t), mask;
10628         uint16_t format = 0;
10629         dtrace_recdesc_t *rec;
10630         dtrace_state_t *state = ecb->dte_state;
10631         dtrace_optval_t *opt = state->dts_options;
10632         dtrace_optval_t nframes=0, strsize;
10633         uint64_t arg = desc->dtad_arg;
10634
10635         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10636         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
10637
10638         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
10639                 /*
10640                  * If this is an aggregating action, there must be neither
10641                  * a speculate nor a commit on the action chain.
10642                  */
10643                 dtrace_action_t *act;
10644
10645                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10646                         if (act->dta_kind == DTRACEACT_COMMIT)
10647                                 return (EINVAL);
10648
10649                         if (act->dta_kind == DTRACEACT_SPECULATE)
10650                                 return (EINVAL);
10651                 }
10652
10653                 action = dtrace_ecb_aggregation_create(ecb, desc);
10654
10655                 if (action == NULL)
10656                         return (EINVAL);
10657         } else {
10658                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
10659                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
10660                     dp != NULL && dp->dtdo_destructive)) {
10661                         state->dts_destructive = 1;
10662                 }
10663
10664                 switch (desc->dtad_kind) {
10665                 case DTRACEACT_PRINTF:
10666                 case DTRACEACT_PRINTA:
10667                 case DTRACEACT_SYSTEM:
10668                 case DTRACEACT_FREOPEN:
10669                 case DTRACEACT_DIFEXPR:
10670                         /*
10671                          * We know that our arg is a string -- turn it into a
10672                          * format.
10673                          */
10674                         if (arg == 0) {
10675                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
10676                                        desc->dtad_kind == DTRACEACT_DIFEXPR);
10677                                 format = 0;
10678                         } else {
10679                                 ASSERT(arg != 0);
10680                                 ASSERT(arg > KERNELBASE);
10681                                 format = dtrace_format_add(state,
10682                                     (char *)(uintptr_t)arg);
10683                         }
10684
10685                         /*FALLTHROUGH*/
10686                 case DTRACEACT_LIBACT:
10687                 case DTRACEACT_TRACEMEM:
10688                 case DTRACEACT_TRACEMEM_DYNSIZE:
10689                 case DTRACEACT_APPLEBINARY:     /* __APPLE__ */
10690                         if (dp == NULL)
10691                                 return (EINVAL);
10692
10693                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
10694                                 break;
10695
10696                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
10697                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10698                                         return (EINVAL);
10699
10700                                 size = opt[DTRACEOPT_STRSIZE];
10701                         }
10702
10703                         break;
10704
10705                 case DTRACEACT_STACK:
10706                         if ((nframes = arg) == 0) {
10707                                 nframes = opt[DTRACEOPT_STACKFRAMES];
10708                                 ASSERT(nframes > 0);
10709                                 arg = nframes;
10710                         }
10711
10712                         size = nframes * sizeof (pc_t);
10713                         break;
10714
10715                 case DTRACEACT_JSTACK:
10716                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
10717                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
10718
10719                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
10720                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
10721
10722                         arg = DTRACE_USTACK_ARG(nframes, strsize);
10723
10724                         /*FALLTHROUGH*/
10725                 case DTRACEACT_USTACK:
10726                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
10727                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
10728                                 strsize = DTRACE_USTACK_STRSIZE(arg);
10729                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
10730                                 ASSERT(nframes > 0);
10731                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
10732                         }
10733
10734                         /*
10735                          * Save a slot for the pid.
10736                          */
10737                         size = (nframes + 1) * sizeof (uint64_t);
10738                         size += DTRACE_USTACK_STRSIZE(arg);
10739                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
10740
10741                         break;
10742
10743                 case DTRACEACT_SYM:
10744                 case DTRACEACT_MOD:
10745                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
10746                             sizeof (uint64_t)) ||
10747                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10748                                 return (EINVAL);
10749                         break;
10750
10751                 case DTRACEACT_USYM:
10752                 case DTRACEACT_UMOD:
10753                 case DTRACEACT_UADDR:
10754                         if (dp == NULL ||
10755                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
10756                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10757                                 return (EINVAL);
10758
10759                         /*
10760                          * We have a slot for the pid, plus a slot for the
10761                          * argument.  To keep things simple (aligned with
10762                          * bitness-neutral sizing), we store each as a 64-bit
10763                          * quantity.
10764                          */
10765                         size = 2 * sizeof (uint64_t);
10766                         break;
10767
10768                 case DTRACEACT_STOP:
10769                 case DTRACEACT_BREAKPOINT:
10770                 case DTRACEACT_PANIC:
10771                         break;
10772
10773                 case DTRACEACT_CHILL:
10774                 case DTRACEACT_DISCARD:
10775                 case DTRACEACT_RAISE:
10776                 case DTRACEACT_PIDRESUME:       /* __APPLE__ */
10777                         if (dp == NULL)
10778                                 return (EINVAL);
10779                         break;
10780
10781                 case DTRACEACT_EXIT:
10782                         if (dp == NULL ||
10783                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
10784                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
10785                                 return (EINVAL);
10786                         break;
10787
10788                 case DTRACEACT_SPECULATE:
10789                         if (ecb->dte_size > sizeof (dtrace_rechdr_t))
10790                                 return (EINVAL);
10791
10792                         if (dp == NULL)
10793                                 return (EINVAL);
10794
10795                         state->dts_speculates = 1;
10796                         break;
10797
10798                 case DTRACEACT_COMMIT: {
10799                         dtrace_action_t *act = ecb->dte_action;
10800
10801                         for (; act != NULL; act = act->dta_next) {
10802                                 if (act->dta_kind == DTRACEACT_COMMIT)
10803                                         return (EINVAL);
10804                         }
10805
10806                         if (dp == NULL)
10807                                 return (EINVAL);
10808                         break;
10809                 }
10810
10811                 default:
10812                         return (EINVAL);
10813                 }
10814
10815                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
10816                         /*
10817                          * If this is a data-storing action or a speculate,
10818                          * we must be sure that there isn't a commit on the
10819                          * action chain.
10820                          */
10821                         dtrace_action_t *act = ecb->dte_action;
10822
10823                         for (; act != NULL; act = act->dta_next) {
10824                                 if (act->dta_kind == DTRACEACT_COMMIT)
10825                                         return (EINVAL);
10826                         }
10827                 }
10828
10829                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
10830                 action->dta_rec.dtrd_size = size;
10831         }
10832
10833         action->dta_refcnt = 1;
10834         rec = &action->dta_rec;
10835         size = rec->dtrd_size;
10836
10837         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
10838                 if (!(size & mask)) {
10839                         align = mask + 1;
10840                         break;
10841                 }
10842         }
10843
10844         action->dta_kind = desc->dtad_kind;
10845
10846         if ((action->dta_difo = dp) != NULL)
10847                 dtrace_difo_hold(dp);
10848
10849         rec->dtrd_action = action->dta_kind;
10850         rec->dtrd_arg = arg;
10851         rec->dtrd_uarg = desc->dtad_uarg;
10852         rec->dtrd_alignment = (uint16_t)align;
10853         rec->dtrd_format = format;
10854
10855         if ((last = ecb->dte_action_last) != NULL) {
10856                 ASSERT(ecb->dte_action != NULL);
10857                 action->dta_prev = last;
10858                 last->dta_next = action;
10859         } else {
10860                 ASSERT(ecb->dte_action == NULL);
10861                 ecb->dte_action = action;
10862         }
10863
10864         ecb->dte_action_last = action;
10865
10866         return (0);
10867 }
10868
10869 static void
10870 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
10871 {
10872         dtrace_action_t *act = ecb->dte_action, *next;
10873         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
10874         dtrace_difo_t *dp;
10875         uint16_t format;
10876
10877         if (act != NULL && act->dta_refcnt > 1) {
10878                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
10879                 act->dta_refcnt--;
10880         } else {
10881                 for (; act != NULL; act = next) {
10882                         next = act->dta_next;
10883                         ASSERT(next != NULL || act == ecb->dte_action_last);
10884                         ASSERT(act->dta_refcnt == 1);
10885
10886                         if ((format = act->dta_rec.dtrd_format) != 0)
10887                                 dtrace_format_remove(ecb->dte_state, format);
10888
10889                         if ((dp = act->dta_difo) != NULL)
10890                                 dtrace_difo_release(dp, vstate);
10891
10892                         if (DTRACEACT_ISAGG(act->dta_kind)) {
10893                                 dtrace_ecb_aggregation_destroy(ecb, act);
10894                         } else {
10895                                 kmem_free(act, sizeof (dtrace_action_t));
10896                         }
10897                 }
10898         }
10899
10900         ecb->dte_action = NULL;
10901         ecb->dte_action_last = NULL;
10902         ecb->dte_size = 0;
10903 }
10904
10905 static void
10906 dtrace_ecb_disable(dtrace_ecb_t *ecb)
10907 {
10908         /*
10909          * We disable the ECB by removing it from its probe.
10910          */
10911         dtrace_ecb_t *pecb, *prev = NULL;
10912         dtrace_probe_t *probe = ecb->dte_probe;
10913
10914         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10915
10916         if (probe == NULL) {
10917                 /*
10918                  * This is the NULL probe; there is nothing to disable.
10919                  */
10920                 return;
10921         }
10922
10923         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
10924                 if (pecb == ecb)
10925                         break;
10926                 prev = pecb;
10927         }
10928
10929         ASSERT(pecb != NULL);
10930
10931         if (prev == NULL) {
10932                 probe->dtpr_ecb = ecb->dte_next;
10933         } else {
10934                 prev->dte_next = ecb->dte_next;
10935         }
10936
10937         if (ecb == probe->dtpr_ecb_last) {
10938                 ASSERT(ecb->dte_next == NULL);
10939                 probe->dtpr_ecb_last = prev;
10940         }
10941
10942         probe->dtpr_provider->dtpv_ecb_count--;
10943         /*
10944          * The ECB has been disconnected from the probe; now sync to assure
10945          * that all CPUs have seen the change before returning.
10946          */
10947         dtrace_sync();
10948
10949         if (probe->dtpr_ecb == NULL) {
10950                 /*
10951                  * That was the last ECB on the probe; clear the predicate
10952                  * cache ID for the probe, disable it and sync one more time
10953                  * to assure that we'll never hit it again.
10954                  */
10955                 dtrace_provider_t *prov = probe->dtpr_provider;
10956
10957                 ASSERT(ecb->dte_next == NULL);
10958                 ASSERT(probe->dtpr_ecb_last == NULL);
10959                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
10960                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
10961                     probe->dtpr_id, probe->dtpr_arg);
10962                 dtrace_sync();
10963         } else {
10964                 /*
10965                  * There is at least one ECB remaining on the probe.  If there
10966                  * is _exactly_ one, set the probe's predicate cache ID to be
10967                  * the predicate cache ID of the remaining ECB.
10968                  */
10969                 ASSERT(probe->dtpr_ecb_last != NULL);
10970                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
10971
10972                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
10973                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
10974
10975                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
10976
10977                         if (p != NULL)
10978                                 probe->dtpr_predcache = p->dtp_cacheid;
10979                 }
10980
10981                 ecb->dte_next = NULL;
10982         }
10983 }
10984
10985 static void
10986 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
10987 {
10988         dtrace_state_t *state = ecb->dte_state;
10989         dtrace_vstate_t *vstate = &state->dts_vstate;
10990         dtrace_predicate_t *pred;
10991         dtrace_epid_t epid = ecb->dte_epid;
10992
10993         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10994         ASSERT(ecb->dte_next == NULL);
10995         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
10996
10997         if ((pred = ecb->dte_predicate) != NULL)
10998                 dtrace_predicate_release(pred, vstate);
10999
11000         dtrace_ecb_action_remove(ecb);
11001
11002         ASSERT(state->dts_ecbs[epid - 1] == ecb);
11003         state->dts_ecbs[epid - 1] = NULL;
11004
11005         kmem_free(ecb, sizeof (dtrace_ecb_t));
11006 }
11007
11008 static dtrace_ecb_t *
11009 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11010     dtrace_enabling_t *enab)
11011 {
11012         dtrace_ecb_t *ecb;
11013         dtrace_predicate_t *pred;
11014         dtrace_actdesc_t *act;
11015         dtrace_provider_t *prov;
11016         dtrace_ecbdesc_t *desc = enab->dten_current;
11017
11018         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11019         ASSERT(state != NULL);
11020
11021         ecb = dtrace_ecb_add(state, probe);
11022         ecb->dte_uarg = desc->dted_uarg;
11023
11024         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11025                 dtrace_predicate_hold(pred);
11026                 ecb->dte_predicate = pred;
11027         }
11028
11029         if (probe != NULL) {
11030                 /*
11031                  * If the provider shows more leg than the consumer is old
11032                  * enough to see, we need to enable the appropriate implicit
11033                  * predicate bits to prevent the ecb from activating at
11034                  * revealing times.
11035                  *
11036                  * Providers specifying DTRACE_PRIV_USER at register time
11037                  * are stating that they need the /proc-style privilege
11038                  * model to be enforced, and this is what DTRACE_COND_OWNER
11039                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
11040                  */
11041                 prov = probe->dtpr_provider;
11042                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11043                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11044                         ecb->dte_cond |= DTRACE_COND_OWNER;
11045
11046                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11047                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11048                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11049
11050                 /*
11051                  * If the provider shows us kernel innards and the user
11052                  * is lacking sufficient privilege, enable the
11053                  * DTRACE_COND_USERMODE implicit predicate.
11054                  */
11055                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11056                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11057                         ecb->dte_cond |= DTRACE_COND_USERMODE;
11058         }
11059
11060         if (dtrace_ecb_create_cache != NULL) {
11061                 /*
11062                  * If we have a cached ecb, we'll use its action list instead
11063                  * of creating our own (saving both time and space).
11064                  */
11065                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11066                 dtrace_action_t *act_if = cached->dte_action;
11067
11068                 if (act_if != NULL) {
11069                         ASSERT(act_if->dta_refcnt > 0);
11070                         act_if->dta_refcnt++;
11071                         ecb->dte_action = act_if;
11072                         ecb->dte_action_last = cached->dte_action_last;
11073                         ecb->dte_needed = cached->dte_needed;
11074                         ecb->dte_size = cached->dte_size;
11075                         ecb->dte_alignment = cached->dte_alignment;
11076                 }
11077
11078                 return (ecb);
11079         }
11080
11081         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11082                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11083                         dtrace_ecb_destroy(ecb);
11084                         return (NULL);
11085                 }
11086         }
11087
11088         if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11089                 dtrace_ecb_destroy(ecb);
11090                 return (NULL);
11091         }
11092
11093         return (dtrace_ecb_create_cache = ecb);
11094 }
11095
11096 static int
11097 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11098 {
11099         dtrace_ecb_t *ecb;
11100         dtrace_enabling_t *enab = arg1;
11101         dtrace_ecbdesc_t *ep = arg2;
11102         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11103
11104         ASSERT(state != NULL);
11105
11106         if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11107                 /*
11108                  * This probe was created in a generation for which this
11109                  * enabling has previously created ECBs; we don't want to
11110                  * enable it again, so just kick out.
11111                  */
11112                 return (DTRACE_MATCH_NEXT);
11113         }
11114
11115         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11116                 return (DTRACE_MATCH_DONE);
11117
11118         if (dtrace_ecb_enable(ecb) < 0)
11119                return (DTRACE_MATCH_FAIL);
11120
11121         return (DTRACE_MATCH_NEXT);
11122 }
11123
11124 static dtrace_ecb_t *
11125 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11126 {
11127         dtrace_ecb_t *ecb;
11128 #pragma unused(ecb) /* __APPLE__ */
11129
11130         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11131
11132         if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11133                 return (NULL);
11134
11135         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11136         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11137
11138         return (state->dts_ecbs[id - 1]);
11139 }
11140
11141 static dtrace_aggregation_t *
11142 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11143 {
11144         dtrace_aggregation_t *agg;
11145 #pragma unused(agg) /* __APPLE__ */
11146
11147         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11148
11149         if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11150                 return (NULL);
11151
11152         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11153         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11154             agg->dtag_id == id);
11155
11156         return (state->dts_aggregations[id - 1]);
11157 }
11158
11159 /*
11160  * DTrace Buffer Functions
11161  *
11162  * The following functions manipulate DTrace buffers.  Most of these functions
11163  * are called in the context of establishing or processing consumer state;
11164  * exceptions are explicitly noted.
11165  */
11166
11167 /*
11168  * Note:  called from cross call context.  This function switches the two
11169  * buffers on a given CPU.  The atomicity of this operation is assured by
11170  * disabling interrupts while the actual switch takes place; the disabling of
11171  * interrupts serializes the execution with any execution of dtrace_probe() on
11172  * the same CPU.
11173  */
11174 static void
11175 dtrace_buffer_switch(dtrace_buffer_t *buf)
11176 {
11177         caddr_t tomax = buf->dtb_tomax;
11178         caddr_t xamot = buf->dtb_xamot;
11179         dtrace_icookie_t cookie;
11180         hrtime_t now;
11181
11182         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11183         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11184
11185         cookie = dtrace_interrupt_disable();
11186         now = dtrace_gethrtime();
11187         buf->dtb_tomax = xamot;
11188         buf->dtb_xamot = tomax;
11189         buf->dtb_xamot_drops = buf->dtb_drops;
11190         buf->dtb_xamot_offset = buf->dtb_offset;
11191         buf->dtb_xamot_errors = buf->dtb_errors;
11192         buf->dtb_xamot_flags = buf->dtb_flags;
11193         buf->dtb_offset = 0;
11194         buf->dtb_drops = 0;
11195         buf->dtb_errors = 0;
11196         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11197         buf->dtb_interval = now - buf->dtb_switched;
11198         buf->dtb_switched = now;
11199         buf->dtb_cur_limit = buf->dtb_limit;
11200
11201         dtrace_interrupt_enable(cookie);
11202 }
11203
11204 /*
11205  * Note:  called from cross call context.  This function activates a buffer
11206  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11207  * is guaranteed by the disabling of interrupts.
11208  */
11209 static void
11210 dtrace_buffer_activate(dtrace_state_t *state)
11211 {
11212         dtrace_buffer_t *buf;
11213         dtrace_icookie_t cookie = dtrace_interrupt_disable();
11214
11215         buf = &state->dts_buffer[CPU->cpu_id];
11216
11217         if (buf->dtb_tomax != NULL) {
11218                 /*
11219                  * We might like to assert that the buffer is marked inactive,
11220                  * but this isn't necessarily true:  the buffer for the CPU
11221                  * that processes the BEGIN probe has its buffer activated
11222                  * manually.  In this case, we take the (harmless) action
11223                  * re-clearing the bit INACTIVE bit.
11224                  */
11225                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11226         }
11227
11228         dtrace_interrupt_enable(cookie);
11229 }
11230
11231 static int
11232 dtrace_buffer_canalloc(size_t size)
11233 {
11234         if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
11235                 return (B_FALSE);
11236         if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
11237                 return (B_FALSE);
11238
11239         return (B_TRUE);
11240 }
11241
11242 static int
11243 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
11244     processorid_t cpu)
11245 {
11246         dtrace_cpu_t *cp;
11247         dtrace_buffer_t *buf;
11248         size_t size_before_alloc = dtrace_buffer_memory_inuse;
11249
11250         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11251         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11252
11253         if (size > (size_t)dtrace_nonroot_maxsize &&
11254             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11255                 return (EFBIG);
11256
11257         cp = cpu_list;
11258
11259         do {
11260                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11261                         continue;
11262
11263                 buf = &bufs[cp->cpu_id];
11264
11265                 /*
11266                  * If there is already a buffer allocated for this CPU, it
11267                  * is only possible that this is a DR event.  In this case,
11268                  * the buffer size must match our specified size.
11269                  */
11270                 if (buf->dtb_tomax != NULL) {
11271                         ASSERT(buf->dtb_size == size);
11272                         continue;
11273                 }
11274
11275                 ASSERT(buf->dtb_xamot == NULL);
11276
11277                 /* DTrace, please do not eat all the memory. */
11278                 if (dtrace_buffer_canalloc(size) == B_FALSE)
11279                         goto err;
11280                 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11281                         goto err;
11282                 dtrace_buffer_memory_inuse += size;
11283
11284                 /* Unsure that limit is always lower than size */
11285                 limit = limit == size ? limit - 1 : limit;
11286                 buf->dtb_cur_limit = limit;
11287                 buf->dtb_limit = limit;
11288                 buf->dtb_size = size;
11289                 buf->dtb_flags = flags;
11290                 buf->dtb_offset = 0;
11291                 buf->dtb_drops = 0;
11292
11293                 if (flags & DTRACEBUF_NOSWITCH)
11294                         continue;
11295
11296                 /* DTrace, please do not eat all the memory. */
11297                 if (dtrace_buffer_canalloc(size) == B_FALSE)
11298                         goto err;
11299                 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
11300                         goto err;
11301                 dtrace_buffer_memory_inuse += size;
11302         } while ((cp = cp->cpu_next) != cpu_list);
11303
11304         ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
11305
11306         return (0);
11307
11308 err:
11309         cp = cpu_list;
11310
11311         do {
11312                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11313                         continue;
11314
11315                 buf = &bufs[cp->cpu_id];
11316
11317                 if (buf->dtb_xamot != NULL) {
11318                         ASSERT(buf->dtb_tomax != NULL);
11319                         ASSERT(buf->dtb_size == size);
11320                         kmem_free(buf->dtb_xamot, size);
11321                 }
11322
11323                 if (buf->dtb_tomax != NULL) {
11324                         ASSERT(buf->dtb_size == size);
11325                         kmem_free(buf->dtb_tomax, size);
11326                 }
11327
11328                 buf->dtb_tomax = NULL;
11329                 buf->dtb_xamot = NULL;
11330                 buf->dtb_size = 0;
11331         } while ((cp = cp->cpu_next) != cpu_list);
11332
11333         /* Restore the size saved before allocating memory */
11334         dtrace_buffer_memory_inuse = size_before_alloc;
11335
11336         return (ENOMEM);
11337 }
11338
11339 /*
11340  * Note:  called from probe context.  This function just increments the drop
11341  * count on a buffer.  It has been made a function to allow for the
11342  * possibility of understanding the source of mysterious drop counts.  (A
11343  * problem for which one may be particularly disappointed that DTrace cannot
11344  * be used to understand DTrace.)
11345  */
11346 static void
11347 dtrace_buffer_drop(dtrace_buffer_t *buf)
11348 {
11349         buf->dtb_drops++;
11350 }
11351
11352 /*
11353  * Note:  called from probe context.  This function is called to reserve space
11354  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11355  * mstate.  Returns the new offset in the buffer, or a negative value if an
11356  * error has occurred.
11357  */
11358 static intptr_t
11359 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11360     dtrace_state_t *state, dtrace_mstate_t *mstate)
11361 {
11362         intptr_t offs = buf->dtb_offset, soffs;
11363         intptr_t woffs;
11364         caddr_t tomax;
11365         size_t total_off;
11366
11367         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11368                 return (-1);
11369
11370         if ((tomax = buf->dtb_tomax) == NULL) {
11371                 dtrace_buffer_drop(buf);
11372                 return (-1);
11373         }
11374
11375         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
11376                 while (offs & (align - 1)) {
11377                         /*
11378                          * Assert that our alignment is off by a number which
11379                          * is itself sizeof (uint32_t) aligned.
11380                          */
11381                         ASSERT(!((align - (offs & (align - 1))) &
11382                             (sizeof (uint32_t) - 1)));
11383                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11384                         offs += sizeof (uint32_t);
11385                 }
11386
11387                 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
11388                         if (buf->dtb_cur_limit == buf->dtb_limit) {
11389                                 buf->dtb_cur_limit = buf->dtb_size;
11390
11391                                 atomic_add_32(&state->dts_buf_over_limit, 1);
11392                                 /**
11393                                  * Set an AST on the current processor
11394                                  * so that we can wake up the process
11395                                  * outside of probe context, when we know
11396                                  * it is safe to do so
11397                                  */
11398                                 minor_t minor = getminor(state->dts_dev);
11399                                 ASSERT(minor < 32);
11400
11401                                 atomic_or_32(&dtrace_wake_clients, 1 << minor);
11402                                 ast_dtrace_on();
11403                         }
11404                         if ((uint64_t)soffs > buf->dtb_size) {
11405                                 dtrace_buffer_drop(buf);
11406                                 return (-1);
11407                         }
11408                 }
11409
11410                 if (mstate == NULL)
11411                         return (offs);
11412
11413                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
11414                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
11415                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11416
11417                 return (offs);
11418         }
11419
11420         if (buf->dtb_flags & DTRACEBUF_FILL) {
11421                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
11422                     (buf->dtb_flags & DTRACEBUF_FULL))
11423                         return (-1);
11424                 goto out;
11425         }
11426
11427         total_off = needed + (offs & (align - 1));
11428
11429         /*
11430          * For a ring buffer, life is quite a bit more complicated.  Before
11431          * we can store any padding, we need to adjust our wrapping offset.
11432          * (If we've never before wrapped or we're not about to, no adjustment
11433          * is required.)
11434          */
11435         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
11436             offs + total_off > buf->dtb_size) {
11437                 woffs = buf->dtb_xamot_offset;
11438
11439                 if (offs + total_off > buf->dtb_size) {
11440                         /*
11441                          * We can't fit in the end of the buffer.  First, a
11442                          * sanity check that we can fit in the buffer at all.
11443                          */
11444                         if (total_off > buf->dtb_size) {
11445                                 dtrace_buffer_drop(buf);
11446                                 return (-1);
11447                         }
11448
11449                         /*
11450                          * We're going to be storing at the top of the buffer,
11451                          * so now we need to deal with the wrapped offset.  We
11452                          * only reset our wrapped offset to 0 if it is
11453                          * currently greater than the current offset.  If it
11454                          * is less than the current offset, it is because a
11455                          * previous allocation induced a wrap -- but the
11456                          * allocation didn't subsequently take the space due
11457                          * to an error or false predicate evaluation.  In this
11458                          * case, we'll just leave the wrapped offset alone: if
11459                          * the wrapped offset hasn't been advanced far enough
11460                          * for this allocation, it will be adjusted in the
11461                          * lower loop.
11462                          */
11463                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
11464                                 if (woffs >= offs)
11465                                         woffs = 0;
11466                         } else {
11467                                 woffs = 0;
11468                         }
11469
11470                         /*
11471                          * Now we know that we're going to be storing to the
11472                          * top of the buffer and that there is room for us
11473                          * there.  We need to clear the buffer from the current
11474                          * offset to the end (there may be old gunk there).
11475                          */
11476                         while ((uint64_t)offs < buf->dtb_size)
11477                                 tomax[offs++] = 0;
11478
11479                         /*
11480                          * We need to set our offset to zero.  And because we
11481                          * are wrapping, we need to set the bit indicating as
11482                          * much.  We can also adjust our needed space back
11483                          * down to the space required by the ECB -- we know
11484                          * that the top of the buffer is aligned.
11485                          */
11486                         offs = 0;
11487                         total_off = needed;
11488                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
11489                 } else {
11490                         /*
11491                          * There is room for us in the buffer, so we simply
11492                          * need to check the wrapped offset.
11493                          */
11494                         if (woffs < offs) {
11495                                 /*
11496                                  * The wrapped offset is less than the offset.
11497                                  * This can happen if we allocated buffer space
11498                                  * that induced a wrap, but then we didn't
11499                                  * subsequently take the space due to an error
11500                                  * or false predicate evaluation.  This is
11501                                  * okay; we know that _this_ allocation isn't
11502                                  * going to induce a wrap.  We still can't
11503                                  * reset the wrapped offset to be zero,
11504                                  * however: the space may have been trashed in
11505                                  * the previous failed probe attempt.  But at
11506                                  * least the wrapped offset doesn't need to
11507                                  * be adjusted at all...
11508                                  */
11509                                 goto out;
11510                         }
11511                 }
11512
11513                 while (offs + total_off > (size_t)woffs) {
11514                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
11515                         size_t size;
11516
11517                         if (epid == DTRACE_EPIDNONE) {
11518                                 size = sizeof (uint32_t);
11519                         } else {
11520                                 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
11521                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
11522
11523                                 size = state->dts_ecbs[epid - 1]->dte_size;
11524                         }
11525
11526                         ASSERT(woffs + size <= buf->dtb_size);
11527                         ASSERT(size != 0);
11528
11529                         if (woffs + size == buf->dtb_size) {
11530                                 /*
11531                                  * We've reached the end of the buffer; we want
11532                                  * to set the wrapped offset to 0 and break
11533                                  * out.  However, if the offs is 0, then we're
11534                                  * in a strange edge-condition:  the amount of
11535                                  * space that we want to reserve plus the size
11536                                  * of the record that we're overwriting is
11537                                  * greater than the size of the buffer.  This
11538                                  * is problematic because if we reserve the
11539                                  * space but subsequently don't consume it (due
11540                                  * to a failed predicate or error) the wrapped
11541                                  * offset will be 0 -- yet the EPID at offset 0
11542                                  * will not be committed.  This situation is
11543                                  * relatively easy to deal with:  if we're in
11544                                  * this case, the buffer is indistinguishable
11545                                  * from one that hasn't wrapped; we need only
11546                                  * finish the job by clearing the wrapped bit,
11547                                  * explicitly setting the offset to be 0, and
11548                                  * zero'ing out the old data in the buffer.
11549                                  */
11550                                 if (offs == 0) {
11551                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
11552                                         buf->dtb_offset = 0;
11553                                         woffs = total_off;
11554
11555                                         while ((uint64_t)woffs < buf->dtb_size)
11556                                                 tomax[woffs++] = 0;
11557                                 }
11558
11559                                 woffs = 0;
11560                                 break;
11561                         }
11562
11563                         woffs += size;
11564                 }
11565
11566                 /*
11567                  * We have a wrapped offset.  It may be that the wrapped offset
11568                  * has become zero -- that's okay.
11569                  */
11570                 buf->dtb_xamot_offset = woffs;
11571         }
11572
11573 out:
11574         /*
11575          * Now we can plow the buffer with any necessary padding.
11576          */
11577         while (offs & (align - 1)) {
11578                 /*
11579                  * Assert that our alignment is off by a number which
11580                  * is itself sizeof (uint32_t) aligned.
11581                  */
11582                 ASSERT(!((align - (offs & (align - 1))) &
11583                     (sizeof (uint32_t) - 1)));
11584                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
11585                 offs += sizeof (uint32_t);
11586         }
11587
11588         if (buf->dtb_flags & DTRACEBUF_FILL) {
11589                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
11590                         buf->dtb_flags |= DTRACEBUF_FULL;
11591                         return (-1);
11592                 }
11593         }
11594
11595         if (mstate == NULL)
11596                 return (offs);
11597
11598         /*
11599          * For ring buffers and fill buffers, the scratch space is always
11600          * the inactive buffer.
11601          */
11602         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
11603         mstate->dtms_scratch_size = buf->dtb_size;
11604         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
11605
11606         return (offs);
11607 }
11608
11609 static void
11610 dtrace_buffer_polish(dtrace_buffer_t *buf)
11611 {
11612         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
11613         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11614
11615         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
11616                 return;
11617
11618         /*
11619          * We need to polish the ring buffer.  There are three cases:
11620          *
11621          * - The first (and presumably most common) is that there is no gap
11622          *   between the buffer offset and the wrapped offset.  In this case,
11623          *   there is nothing in the buffer that isn't valid data; we can
11624          *   mark the buffer as polished and return.
11625          *
11626          * - The second (less common than the first but still more common
11627          *   than the third) is that there is a gap between the buffer offset
11628          *   and the wrapped offset, and the wrapped offset is larger than the
11629          *   buffer offset.  This can happen because of an alignment issue, or
11630          *   can happen because of a call to dtrace_buffer_reserve() that
11631          *   didn't subsequently consume the buffer space.  In this case,
11632          *   we need to zero the data from the buffer offset to the wrapped
11633          *   offset.
11634          *
11635          * - The third (and least common) is that there is a gap between the
11636          *   buffer offset and the wrapped offset, but the wrapped offset is
11637          *   _less_ than the buffer offset.  This can only happen because a
11638          *   call to dtrace_buffer_reserve() induced a wrap, but the space
11639          *   was not subsequently consumed.  In this case, we need to zero the
11640          *   space from the offset to the end of the buffer _and_ from the
11641          *   top of the buffer to the wrapped offset.
11642          */
11643         if (buf->dtb_offset < buf->dtb_xamot_offset) {
11644                 bzero(buf->dtb_tomax + buf->dtb_offset,
11645                     buf->dtb_xamot_offset - buf->dtb_offset);
11646         }
11647
11648         if (buf->dtb_offset > buf->dtb_xamot_offset) {
11649                 bzero(buf->dtb_tomax + buf->dtb_offset,
11650                     buf->dtb_size - buf->dtb_offset);
11651                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
11652         }
11653 }
11654
11655 static void
11656 dtrace_buffer_free(dtrace_buffer_t *bufs)
11657 {
11658         int i;
11659
11660         for (i = 0; i < (int)NCPU; i++) {
11661                 dtrace_buffer_t *buf = &bufs[i];
11662
11663                 if (buf->dtb_tomax == NULL) {
11664                         ASSERT(buf->dtb_xamot == NULL);
11665                         ASSERT(buf->dtb_size == 0);
11666                         continue;
11667                 }
11668
11669                 if (buf->dtb_xamot != NULL) {
11670                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11671                         kmem_free(buf->dtb_xamot, buf->dtb_size);
11672
11673                         ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11674                         dtrace_buffer_memory_inuse -= buf->dtb_size;
11675                 }
11676
11677                 kmem_free(buf->dtb_tomax, buf->dtb_size);
11678                 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
11679                 dtrace_buffer_memory_inuse -= buf->dtb_size;
11680
11681                 buf->dtb_size = 0;
11682                 buf->dtb_tomax = NULL;
11683                 buf->dtb_xamot = NULL;
11684         }
11685 }
11686
11687 /*
11688  * DTrace Enabling Functions
11689  */
11690 static dtrace_enabling_t *
11691 dtrace_enabling_create(dtrace_vstate_t *vstate)
11692 {
11693         dtrace_enabling_t *enab;
11694
11695         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
11696         enab->dten_vstate = vstate;
11697
11698         return (enab);
11699 }
11700
11701 static void
11702 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
11703 {
11704         dtrace_ecbdesc_t **ndesc;
11705         size_t osize, nsize;
11706
11707         /*
11708          * We can't add to enablings after we've enabled them, or after we've
11709          * retained them.
11710          */
11711         ASSERT(enab->dten_probegen == 0);
11712         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11713
11714         /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
11715         if (ecb == NULL) return;
11716
11717         if (enab->dten_ndesc < enab->dten_maxdesc) {
11718                 enab->dten_desc[enab->dten_ndesc++] = ecb;
11719                 return;
11720         }
11721
11722         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11723
11724         if (enab->dten_maxdesc == 0) {
11725                 enab->dten_maxdesc = 1;
11726         } else {
11727                 enab->dten_maxdesc <<= 1;
11728         }
11729
11730         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
11731
11732         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
11733         ndesc = kmem_zalloc(nsize, KM_SLEEP);
11734         bcopy(enab->dten_desc, ndesc, osize);
11735         kmem_free(enab->dten_desc, osize);
11736
11737         enab->dten_desc = ndesc;
11738         enab->dten_desc[enab->dten_ndesc++] = ecb;
11739 }
11740
11741 static void
11742 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
11743     dtrace_probedesc_t *pd)
11744 {
11745         dtrace_ecbdesc_t *new;
11746         dtrace_predicate_t *pred;
11747         dtrace_actdesc_t *act;
11748
11749         /*
11750          * We're going to create a new ECB description that matches the
11751          * specified ECB in every way, but has the specified probe description.
11752          */
11753         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
11754
11755         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
11756                 dtrace_predicate_hold(pred);
11757
11758         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
11759                 dtrace_actdesc_hold(act);
11760
11761         new->dted_action = ecb->dted_action;
11762         new->dted_pred = ecb->dted_pred;
11763         new->dted_probe = *pd;
11764         new->dted_uarg = ecb->dted_uarg;
11765
11766         dtrace_enabling_add(enab, new);
11767 }
11768
11769 static void
11770 dtrace_enabling_dump(dtrace_enabling_t *enab)
11771 {
11772         int i;
11773
11774         for (i = 0; i < enab->dten_ndesc; i++) {
11775                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
11776
11777                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
11778                     desc->dtpd_provider, desc->dtpd_mod,
11779                     desc->dtpd_func, desc->dtpd_name);
11780         }
11781 }
11782
11783 static void
11784 dtrace_enabling_destroy(dtrace_enabling_t *enab)
11785 {
11786         int i;
11787         dtrace_ecbdesc_t *ep;
11788         dtrace_vstate_t *vstate = enab->dten_vstate;
11789
11790         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11791
11792         for (i = 0; i < enab->dten_ndesc; i++) {
11793                 dtrace_actdesc_t *act, *next;
11794                 dtrace_predicate_t *pred;
11795
11796                 ep = enab->dten_desc[i];
11797
11798                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
11799                         dtrace_predicate_release(pred, vstate);
11800
11801                 for (act = ep->dted_action; act != NULL; act = next) {
11802                         next = act->dtad_next;
11803                         dtrace_actdesc_release(act, vstate);
11804                 }
11805
11806                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
11807         }
11808
11809         kmem_free(enab->dten_desc,
11810             enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
11811
11812         /*
11813          * If this was a retained enabling, decrement the dts_nretained count
11814          * and take it off of the dtrace_retained list.
11815          */
11816         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
11817             dtrace_retained == enab) {
11818                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11819                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
11820                 enab->dten_vstate->dtvs_state->dts_nretained--;
11821                 dtrace_retained_gen++;
11822         }
11823
11824         if (enab->dten_prev == NULL) {
11825                 if (dtrace_retained == enab) {
11826                         dtrace_retained = enab->dten_next;
11827
11828                         if (dtrace_retained != NULL)
11829                                 dtrace_retained->dten_prev = NULL;
11830                 }
11831         } else {
11832                 ASSERT(enab != dtrace_retained);
11833                 ASSERT(dtrace_retained != NULL);
11834                 enab->dten_prev->dten_next = enab->dten_next;
11835         }
11836
11837         if (enab->dten_next != NULL) {
11838                 ASSERT(dtrace_retained != NULL);
11839                 enab->dten_next->dten_prev = enab->dten_prev;
11840         }
11841
11842         kmem_free(enab, sizeof (dtrace_enabling_t));
11843 }
11844
11845 static int
11846 dtrace_enabling_retain(dtrace_enabling_t *enab)
11847 {
11848         dtrace_state_t *state;
11849
11850         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11851         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
11852         ASSERT(enab->dten_vstate != NULL);
11853
11854         state = enab->dten_vstate->dtvs_state;
11855         ASSERT(state != NULL);
11856
11857         /*
11858          * We only allow each state to retain dtrace_retain_max enablings.
11859          */
11860         if (state->dts_nretained >= dtrace_retain_max)
11861                 return (ENOSPC);
11862
11863         state->dts_nretained++;
11864         dtrace_retained_gen++;
11865
11866         if (dtrace_retained == NULL) {
11867                 dtrace_retained = enab;
11868                 return (0);
11869         }
11870
11871         enab->dten_next = dtrace_retained;
11872         dtrace_retained->dten_prev = enab;
11873         dtrace_retained = enab;
11874
11875         return (0);
11876 }
11877
11878 static int
11879 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
11880     dtrace_probedesc_t *create)
11881 {
11882         dtrace_enabling_t *new, *enab;
11883         int found = 0, err = ENOENT;
11884
11885         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11886         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
11887         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
11888         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
11889         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
11890
11891         new = dtrace_enabling_create(&state->dts_vstate);
11892
11893         /*
11894          * Iterate over all retained enablings, looking for enablings that
11895          * match the specified state.
11896          */
11897         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
11898                 int i;
11899
11900                 /*
11901                  * dtvs_state can only be NULL for helper enablings -- and
11902                  * helper enablings can't be retained.
11903                  */
11904                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11905
11906                 if (enab->dten_vstate->dtvs_state != state)
11907                         continue;
11908
11909                 /*
11910                  * Now iterate over each probe description; we're looking for
11911                  * an exact match to the specified probe description.
11912                  */
11913                 for (i = 0; i < enab->dten_ndesc; i++) {
11914                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11915                         dtrace_probedesc_t *pd = &ep->dted_probe;
11916
11917                         /* APPLE NOTE: Darwin employs size bounded string operation. */
11918                         if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
11919                                 continue;
11920
11921                         if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
11922                                 continue;
11923
11924                         if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
11925                                 continue;
11926
11927                         if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
11928                                 continue;
11929
11930                         /*
11931                          * We have a winning probe!  Add it to our growing
11932                          * enabling.
11933                          */
11934                         found = 1;
11935                         dtrace_enabling_addlike(new, ep, create);
11936                 }
11937         }
11938
11939         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
11940                 dtrace_enabling_destroy(new);
11941                 return (err);
11942         }
11943
11944         return (0);
11945 }
11946
11947 static void
11948 dtrace_enabling_retract(dtrace_state_t *state)
11949 {
11950         dtrace_enabling_t *enab, *next;
11951
11952         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11953
11954         /*
11955          * Iterate over all retained enablings, destroy the enablings retained
11956          * for the specified state.
11957          */
11958         for (enab = dtrace_retained; enab != NULL; enab = next) {
11959                 next = enab->dten_next;
11960
11961                 /*
11962                  * dtvs_state can only be NULL for helper enablings -- and
11963                  * helper enablings can't be retained.
11964                  */
11965                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
11966
11967                 if (enab->dten_vstate->dtvs_state == state) {
11968                         ASSERT(state->dts_nretained > 0);
11969                         dtrace_enabling_destroy(enab);
11970                 }
11971         }
11972
11973         ASSERT(state->dts_nretained == 0);
11974 }
11975
11976 static int
11977 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
11978 {
11979         int i = 0;
11980         int total_matched = 0, matched = 0;
11981
11982         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11983         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11984
11985         for (i = 0; i < enab->dten_ndesc; i++) {
11986                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
11987
11988                 enab->dten_current = ep;
11989                 enab->dten_error = 0;
11990
11991                 /**
11992                  * Before doing a dtrace_probe_enable, which is really
11993                  * expensive, check that this enabling matches the matching precondition
11994                  * if we have one
11995                  */
11996                 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
11997                         continue;
11998                 }
11999                 /*
12000                  * If a provider failed to enable a probe then get out and
12001                  * let the consumer know we failed.
12002                  */
12003                 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12004                         return (EBUSY);
12005
12006                 total_matched += matched;
12007
12008                 if (enab->dten_error != 0) {
12009                         /*
12010                          * If we get an error half-way through enabling the
12011                          * probes, we kick out -- perhaps with some number of
12012                          * them enabled.  Leaving enabled probes enabled may
12013                          * be slightly confusing for user-level, but we expect
12014                          * that no one will attempt to actually drive on in
12015                          * the face of such errors.  If this is an anonymous
12016                          * enabling (indicated with a NULL nmatched pointer),
12017                          * we cmn_err() a message.  We aren't expecting to
12018                          * get such an error -- such as it can exist at all,
12019                          * it would be a result of corrupted DOF in the driver
12020                          * properties.
12021                          */
12022                         if (nmatched == NULL) {
12023                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
12024                                     "error on %p: %d", (void *)ep,
12025                                     enab->dten_error);
12026                         }
12027
12028                         return (enab->dten_error);
12029                 }
12030
12031                 ep->dted_probegen = dtrace_probegen;
12032         }
12033
12034         if (nmatched != NULL)
12035                 *nmatched = total_matched;
12036
12037         return (0);
12038 }
12039
12040 static void
12041 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12042 {
12043         dtrace_enabling_t *enab;
12044
12045         lck_mtx_lock(&cpu_lock);
12046         lck_mtx_lock(&dtrace_lock);
12047
12048         /*
12049          * Iterate over all retained enablings to see if any probes match
12050          * against them.  We only perform this operation on enablings for which
12051          * we have sufficient permissions by virtue of being in the global zone
12052          * or in the same zone as the DTrace client.  Because we can be called
12053          * after dtrace_detach() has been called, we cannot assert that there
12054          * are retained enablings.  We can safely load from dtrace_retained,
12055          * however:  the taskq_destroy() at the end of dtrace_detach() will
12056          * block pending our completion.
12057          */
12058
12059         /*
12060          * Darwin doesn't do zones.
12061          * Behave as if always in "global" zone."
12062          */
12063         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12064                 (void) dtrace_enabling_match(enab, NULL, cond);
12065         }
12066
12067         lck_mtx_unlock(&dtrace_lock);
12068         lck_mtx_unlock(&cpu_lock);
12069
12070 }
12071
12072 static void
12073 dtrace_enabling_matchall(void)
12074 {
12075         dtrace_enabling_matchall_with_cond(NULL);
12076 }
12077
12078
12079
12080 /*
12081  * If an enabling is to be enabled without having matched probes (that is, if
12082  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12083  * enabling must be _primed_ by creating an ECB for every ECB description.
12084  * This must be done to assure that we know the number of speculations, the
12085  * number of aggregations, the minimum buffer size needed, etc. before we
12086  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
 * enabling any probes, we create ECBs for every ECB description, but with a
12088  * NULL probe -- which is exactly what this function does.
12089  */
12090 static void
12091 dtrace_enabling_prime(dtrace_state_t *state)
12092 {
12093         dtrace_enabling_t *enab;
12094         int i;
12095
12096         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12097                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12098
12099                 if (enab->dten_vstate->dtvs_state != state)
12100                         continue;
12101
12102                 /*
12103                  * We don't want to prime an enabling more than once, lest
12104                  * we allow a malicious user to induce resource exhaustion.
12105                  * (The ECBs that result from priming an enabling aren't
12106                  * leaked -- but they also aren't deallocated until the
12107                  * consumer state is destroyed.)
12108                  */
12109                 if (enab->dten_primed)
12110                         continue;
12111
12112                 for (i = 0; i < enab->dten_ndesc; i++) {
12113                         enab->dten_current = enab->dten_desc[i];
12114                         (void) dtrace_probe_enable(NULL, enab, NULL);
12115                 }
12116
12117                 enab->dten_primed = 1;
12118         }
12119 }
12120
12121 /*
12122  * Called to indicate that probes should be provided due to retained
12123  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12124  * must take an initial lap through the enabling calling the dtps_provide()
12125  * entry point explicitly to allow for autocreated probes.
12126  */
12127 static void
12128 dtrace_enabling_provide(dtrace_provider_t *prv)
12129 {
12130         int i, all = 0;
12131         dtrace_probedesc_t desc;
12132         dtrace_genid_t gen;
12133
12134         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12135         LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12136
12137         if (prv == NULL) {
12138                 all = 1;
12139                 prv = dtrace_provider;
12140         }
12141
12142         do {
12143                 dtrace_enabling_t *enab;
12144                 void *parg = prv->dtpv_arg;
12145
12146 retry:
12147                 gen = dtrace_retained_gen;
12148                 for (enab = dtrace_retained; enab != NULL;
12149                     enab = enab->dten_next) {
12150                         for (i = 0; i < enab->dten_ndesc; i++) {
12151                                 desc = enab->dten_desc[i]->dted_probe;
12152                                 lck_mtx_unlock(&dtrace_lock);
12153                                 prv->dtpv_pops.dtps_provide(parg, &desc);
12154                                 lck_mtx_lock(&dtrace_lock);
12155                                 /*
12156                                  * Process the retained enablings again if
12157                                  * they have changed while we weren't holding
12158                                  * dtrace_lock.
12159                                  */
12160                                 if (gen != dtrace_retained_gen)
12161                                         goto retry;
12162                         }
12163                 }
12164         } while (all && (prv = prv->dtpv_next) != NULL);
12165
12166         lck_mtx_unlock(&dtrace_lock);
12167         dtrace_probe_provide(NULL, all ? NULL : prv);
12168         lck_mtx_lock(&dtrace_lock);
12169 }
12170
12171 /*
12172  * DTrace DOF Functions
12173  */
12174 /*ARGSUSED*/
12175 static void
12176 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12177 {
12178 #pragma unused(dof) /* __APPLE__ */
12179         if (dtrace_err_verbose)
12180                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12181
12182 #ifdef DTRACE_ERRDEBUG
12183         dtrace_errdebug(str);
12184 #endif
12185 }
12186
12187 /*
12188  * Create DOF out of a currently enabled state.  Right now, we only create
12189  * DOF containing the run-time options -- but this could be expanded to create
12190  * complete DOF representing the enabled state.
12191  */
12192 static dof_hdr_t *
12193 dtrace_dof_create(dtrace_state_t *state)
12194 {
12195         dof_hdr_t *dof;
12196         dof_sec_t *sec;
12197         dof_optdesc_t *opt;
12198         int i, len = sizeof (dof_hdr_t) +
12199             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12200             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12201
12202         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12203
12204         dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12205         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12206         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12207         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12208         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12209
12210         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12211         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12212         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12213         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12214         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12215         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12216
12217         dof->dofh_flags = 0;
12218         dof->dofh_hdrsize = sizeof (dof_hdr_t);
12219         dof->dofh_secsize = sizeof (dof_sec_t);
12220         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
12221         dof->dofh_secoff = sizeof (dof_hdr_t);
12222         dof->dofh_loadsz = len;
12223         dof->dofh_filesz = len;
12224         dof->dofh_pad = 0;
12225
12226         /*
12227          * Fill in the option section header...
12228          */
12229         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12230         sec->dofs_type = DOF_SECT_OPTDESC;
12231         sec->dofs_align = sizeof (uint64_t);
12232         sec->dofs_flags = DOF_SECF_LOAD;
12233         sec->dofs_entsize = sizeof (dof_optdesc_t);
12234
12235         opt = (dof_optdesc_t *)((uintptr_t)sec +
12236             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12237
12238         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12239         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12240
12241         for (i = 0; i < DTRACEOPT_MAX; i++) {
12242                 opt[i].dofo_option = i;
12243                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12244                 opt[i].dofo_value = state->dts_options[i];
12245         }
12246
12247         return (dof);
12248 }
12249
12250 static dof_hdr_t *
12251 dtrace_dof_copyin(user_addr_t uarg, int *errp)
12252 {
12253         dof_hdr_t hdr, *dof;
12254
12255         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12256
12257         /*
12258          * First, we're going to copyin() the sizeof (dof_hdr_t).
12259          */
12260         if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
12261                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12262                 *errp = EFAULT;
12263                 return (NULL);
12264         }
12265
12266         /*
12267          * Now we'll allocate the entire DOF and copy it in -- provided
12268          * that the length isn't outrageous.
12269          */
12270         if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12271                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12272                 *errp = E2BIG;
12273                 return (NULL);
12274         }
12275
12276         if (hdr.dofh_loadsz < sizeof (hdr)) {
12277                 dtrace_dof_error(&hdr, "invalid load size");
12278                 *errp = EINVAL;
12279                 return (NULL);
12280         }
12281
12282         dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12283
12284         if (copyin(uarg, dof, hdr.dofh_loadsz) != 0  ||
12285           dof->dofh_loadsz != hdr.dofh_loadsz) {
12286             kmem_free_aligned(dof, hdr.dofh_loadsz);
12287             *errp = EFAULT;
12288             return (NULL);
12289         }
12290
12291         return (dof);
12292 }
12293
12294 static dof_hdr_t *
12295 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
12296 {
12297         dof_hdr_t hdr, *dof;
12298
12299         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
12300
12301         /*
12302          * First, we're going to copyin() the sizeof (dof_hdr_t).
12303          */
12304         if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
12305                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12306                 *errp = EFAULT;
12307                 return (NULL);
12308         }
12309
12310         /*
12311          * Now we'll allocate the entire DOF and copy it in -- provided
12312          * that the length isn't outrageous.
12313          */
12314         if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12315                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12316                 *errp = E2BIG;
12317                 return (NULL);
12318         }
12319
12320         if (hdr.dofh_loadsz < sizeof (hdr)) {
12321                 dtrace_dof_error(&hdr, "invalid load size");
12322                 *errp = EINVAL;
12323                 return (NULL);
12324         }
12325
12326         dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
12327
12328         if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
12329                 kmem_free_aligned(dof, hdr.dofh_loadsz);
12330                 *errp = EFAULT;
12331                 return (NULL);
12332         }
12333
12334         return (dof);
12335 }
12336
12337 static void
12338 dtrace_dof_destroy(dof_hdr_t *dof)
12339 {
12340         kmem_free_aligned(dof, dof->dofh_loadsz);
12341 }
12342
12343 static dof_hdr_t *
12344 dtrace_dof_property(const char *name)
12345 {
12346         unsigned int len = 0;
12347         dof_hdr_t *dof;
12348
12349         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
12350                 return NULL;
12351         }
12352
12353         if (!PEReadNVRAMProperty(name, NULL, &len)) {
12354                 return NULL;
12355         }
12356
12357         dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
12358
12359         if (!PEReadNVRAMProperty(name, dof, &len)) {
12360                 dtrace_dof_destroy(dof);
12361                 dtrace_dof_error(NULL, "unreadable DOF");
12362                 return NULL;
12363         }
12364
12365         if (len < sizeof (dof_hdr_t)) {
12366                 dtrace_dof_destroy(dof);
12367                 dtrace_dof_error(NULL, "truncated header");
12368                 return (NULL);
12369         }
12370
12371         if (len < dof->dofh_loadsz) {
12372                 dtrace_dof_destroy(dof);
12373                 dtrace_dof_error(NULL, "truncated DOF");
12374                 return (NULL);
12375         }
12376
12377         if (len != dof->dofh_loadsz) {
12378                 dtrace_dof_destroy(dof);
12379                 dtrace_dof_error(NULL, "invalid DOF size");
12380                 return (NULL);
12381         }
12382
12383         if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
12384                 dtrace_dof_destroy(dof);
12385                 dtrace_dof_error(NULL, "oversized DOF");
12386                 return (NULL);
12387         }
12388
12389         return (dof);
12390 }
12391
12392 /*
12393  * Return the dof_sec_t pointer corresponding to a given section index.  If the
12394  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
12395  * a type other than DOF_SECT_NONE is specified, the header is checked against
12396  * this type and NULL is returned if the types do not match.
12397  */
12398 static dof_sec_t *
12399 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
12400 {
12401         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
12402             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
12403
12404         if (i >= dof->dofh_secnum) {
12405                 dtrace_dof_error(dof, "referenced section index is invalid");
12406                 return (NULL);
12407         }
12408
12409         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
12410                 dtrace_dof_error(dof, "referenced section is not loadable");
12411                 return (NULL);
12412         }
12413
12414         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
12415                 dtrace_dof_error(dof, "referenced section is the wrong type");
12416                 return (NULL);
12417         }
12418
12419         return (sec);
12420 }
12421
12422 static dtrace_probedesc_t *
12423 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
12424 {
12425         dof_probedesc_t *probe;
12426         dof_sec_t *strtab;
12427         uintptr_t daddr = (uintptr_t)dof;
12428         uintptr_t str;
12429         size_t size;
12430
12431         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
12432                 dtrace_dof_error(dof, "invalid probe section");
12433                 return (NULL);
12434         }
12435
12436         if (sec->dofs_align != sizeof (dof_secidx_t)) {
12437                 dtrace_dof_error(dof, "bad alignment in probe description");
12438                 return (NULL);
12439         }
12440
12441         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
12442                 dtrace_dof_error(dof, "truncated probe description");
12443                 return (NULL);
12444         }
12445
12446         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
12447         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
12448
12449         if (strtab == NULL)
12450                 return (NULL);
12451
12452         str = daddr + strtab->dofs_offset;
12453         size = strtab->dofs_size;
12454
12455         if (probe->dofp_provider >= strtab->dofs_size) {
12456                 dtrace_dof_error(dof, "corrupt probe provider");
12457                 return (NULL);
12458         }
12459
12460         (void) strncpy(desc->dtpd_provider,
12461             (char *)(str + probe->dofp_provider),
12462             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
12463
12464         /* APPLE NOTE: Darwin employs size bounded string operation. */
12465         desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
12466
12467         if (probe->dofp_mod >= strtab->dofs_size) {
12468                 dtrace_dof_error(dof, "corrupt probe module");
12469                 return (NULL);
12470         }
12471
12472         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
12473             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
12474
12475         /* APPLE NOTE: Darwin employs size bounded string operation. */
12476         desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
12477
12478         if (probe->dofp_func >= strtab->dofs_size) {
12479                 dtrace_dof_error(dof, "corrupt probe function");
12480                 return (NULL);
12481         }
12482
12483         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
12484             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
12485
12486         /* APPLE NOTE: Darwin employs size bounded string operation. */
12487         desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
12488
12489         if (probe->dofp_name >= strtab->dofs_size) {
12490                 dtrace_dof_error(dof, "corrupt probe name");
12491                 return (NULL);
12492         }
12493
12494         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
12495             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
12496
12497         /* APPLE NOTE: Darwin employs size bounded string operation. */
12498         desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
12499
12500         return (desc);
12501 }
12502
/*
 * Build an in-kernel DIF object (dtrace_difo_t) from the DIFO header section
 * sec of the DOF image dof.
 *
 * The DIFO header carries a return type followed by an array of section
 * indices (dofd_links).  Every linked section whose type appears in the
 * difo[] table below (DIF text, integer table, string table, variable table)
 * is copied into a newly allocated buffer attached to the new dtrace_difo_t.
 * The combined size of all linked sections may not exceed
 * dtrace_difo_maxsize.  The finished object is checked by
 * dtrace_difo_validate() and then handed to dtrace_difo_init().
 *
 * Returns the new DIF object, or NULL if the DOF is malformed or fails
 * validation; in that case everything allocated here is freed again.
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
        dtrace_difo_t *dp;
        size_t ttl = 0;
        dof_difohdr_t *dofd;
        uintptr_t daddr = (uintptr_t)dof;
        size_t max_size = dtrace_difo_maxsize;
        uint_t i;
        int l, n;


        /*
         * One row per kind of sub-section a DIFO may link to:  the section
         * type, where in dtrace_difo_t its buffer pointer and length live,
         * the entry size and alignment the section must declare, and the
         * error to report if the same kind of section is linked twice.  The
         * DOF_SECT_NONE row terminates the table.
         */
        static const struct {
                int section;
                int bufoffs;
                int lenoffs;
                int entsize;
                int align;
                const char *msg;
        } difo[] = {
                { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
                offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
                sizeof (dif_instr_t), "multiple DIF sections" },

                { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
                offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
                sizeof (uint64_t), "multiple integer tables" },

                { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
                offsetof(dtrace_difo_t, dtdo_strlen), 0,
                sizeof (char), "multiple string tables" },

                { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
                offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
                sizeof (uint_t), "multiple variable tables" },

                { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
        };

        if (sec->dofs_type != DOF_SECT_DIFOHDR) {
                dtrace_dof_error(dof, "invalid DIFO header section");
                return (NULL);
        }

        if (sec->dofs_align != sizeof (dof_secidx_t)) {
                dtrace_dof_error(dof, "bad alignment in DIFO header");
                return (NULL);
        }

        /*
         * The section must hold at least one dof_difohdr_t and be a whole
         * number of section indices in size.
         */
        if (sec->dofs_size < sizeof (dof_difohdr_t) ||
            sec->dofs_size % sizeof (dof_secidx_t)) {
                dtrace_dof_error(dof, "bad size in DIFO header");
                return (NULL);
        }

        /*
         * n is the number of links to visit:  one, plus one for every
         * dof_secidx_t by which the section exceeds sizeof (dof_difohdr_t)
         * (presumably because the header itself already contains the first
         * dofd_links[] slot).
         */
        dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
        n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

        /*
         * The object is zero-filled, so every buffer pointer starts out NULL
         * and every length zero; the duplicate-section check in the loop and
         * the cleanup under "err" both depend on that.
         */
        dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
        dp->dtdo_rtype = dofd->dofd_rtype;

        for (l = 0; l < n; l++) {
                dof_sec_t *subsec;
                void **bufp;
                uint32_t *lenp;

                if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
                    dofd->dofd_links[l])) == NULL)
                        goto err; /* invalid section link */

                /*
                 * Enforce the cap on the running total of linked section
                 * sizes before copying anything from this section.
                 */
                if (ttl + subsec->dofs_size > max_size) {
                        dtrace_dof_error(dof, "exceeds maximum size");
                        goto err;
                }

                ttl += subsec->dofs_size;

                /*
                 * Find the table row for this section's type.  On a match
                 * the section is validated and copied and the loop is left
                 * via "break"; if no row matches, i ends up indexing the
                 * DOF_SECT_NONE terminator.
                 */
                for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {

                        if (subsec->dofs_type != (uint32_t)difo[i].section)
                                continue;

                        if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
                                dtrace_dof_error(dof, "section not loaded");
                                goto err;
                        }

                        if (subsec->dofs_align != (uint32_t)difo[i].align) {
                                dtrace_dof_error(dof, "bad alignment");
                                goto err;
                        }

                        /*
                         * Locate the buffer pointer and length fields of dp
                         * that this kind of section populates.
                         */
                        bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
                        lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

                        /* A buffer already in place means a duplicate. */
                        if (*bufp != NULL) {
                                dtrace_dof_error(dof, difo[i].msg);
                                goto err;
                        }

                        if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
                                dtrace_dof_error(dof, "entry size mismatch");
                                goto err;
                        }

                        if (subsec->dofs_entsize != 0 &&
                            (subsec->dofs_size % subsec->dofs_entsize) != 0) {
                                dtrace_dof_error(dof, "corrupt entry size");
                                goto err;
                        }

                        /*
                         * Copy the section contents.  The length is stored
                         * in bytes first and then converted to a count of
                         * entries when the section has a non-zero entry
                         * size (the string table row has entry size 0, so
                         * its length stays in bytes).
                         */
                        *lenp = subsec->dofs_size;
                        *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
                        bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
                            *bufp, subsec->dofs_size);

                        if (subsec->dofs_entsize != 0)
                                *lenp /= subsec->dofs_entsize;

                        break;
                }

                /*
                 * If we encounter a loadable DIFO sub-section that is not
                 * known to us, assume this is a broken program and fail.
                 */
                if (difo[i].section == DOF_SECT_NONE &&
                    (subsec->dofs_flags & DOF_SECF_LOAD)) {
                        dtrace_dof_error(dof, "unrecognized DIFO subsection");
                        goto err;
                }
        }

        if (dp->dtdo_buf == NULL) {
                /*
                 * We can't have a DIF object without DIF text.
                 */
                dtrace_dof_error(dof, "missing DIF text");
                goto err;
        }

        /*
         * Before we validate the DIF object, run through the variable table
         * looking for strings -- if any of them has a size of zero, we'll set
         * its size to be the system-wide default string size.  Note that
         * this should _not_ happen if the "strsize" option has been set --
         * in this case, the compiler should have set the size to reflect the
         * setting of the option.  Variables with an ID below
         * DIF_VAR_OTHER_UBASE are left untouched.
         */
        for (i = 0; i < dp->dtdo_varlen; i++) {
                dtrace_difv_t *v = &dp->dtdo_vartab[i];
                dtrace_diftype_t *t = &v->dtdv_type;

                if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
                        continue;

                if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
                        t->dtdt_size = dtrace_strsize_default;
        }

        if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
                goto err;

        dtrace_difo_init(dp, vstate);
        return (dp);

err:
        /*
         * Free whatever was copied.  The lengths of the DIF text, integer
         * table and variable table are entry counts at this point, so they
         * are scaled back to bytes; the string table length is already in
         * bytes.  Sections that were never seen still have a NULL buffer and
         * a zero length.
         */
        kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
        kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
        kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
        kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

        kmem_free(dp, sizeof (dtrace_difo_t));
        return (NULL);
}
12679
12680 static dtrace_predicate_t *
12681 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12682     cred_t *cr)
12683 {
12684         dtrace_difo_t *dp;
12685
12686         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
12687                 return (NULL);
12688
12689         return (dtrace_predicate_create(dp));
12690 }
12691
12692 static dtrace_actdesc_t *
12693 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12694     cred_t *cr)
12695 {
12696         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
12697         dof_actdesc_t *desc;
12698         dof_sec_t *difosec;
12699         size_t offs;
12700         uintptr_t daddr = (uintptr_t)dof;
12701         uint64_t arg;
12702         dtrace_actkind_t kind;
12703
12704         if (sec->dofs_type != DOF_SECT_ACTDESC) {
12705                 dtrace_dof_error(dof, "invalid action section");
12706                 return (NULL);
12707         }
12708
12709         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
12710                 dtrace_dof_error(dof, "truncated action description");
12711                 return (NULL);
12712         }
12713
12714         if (sec->dofs_align != sizeof (uint64_t)) {
12715                 dtrace_dof_error(dof, "bad alignment in action description");
12716                 return (NULL);
12717         }
12718
12719         if (sec->dofs_size < sec->dofs_entsize) {
12720                 dtrace_dof_error(dof, "section entry size exceeds total size");
12721                 return (NULL);
12722         }
12723
12724         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
12725                 dtrace_dof_error(dof, "bad entry size in action description");
12726                 return (NULL);
12727         }
12728
12729         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
12730                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
12731                 return (NULL);
12732         }
12733
12734         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
12735                 desc = (dof_actdesc_t *)(daddr +
12736                     (uintptr_t)sec->dofs_offset + offs);
12737                 kind = (dtrace_actkind_t)desc->dofa_kind;
12738
12739                 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
12740                     (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
12741                     (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
12742                 {
12743                         dof_sec_t *strtab;
12744                         char *str, *fmt;
12745                         uint64_t i;
12746
12747                         /*
12748                          * The argument to these actions is an index into the
12749                          * DOF string table.  For printf()-like actions, this
12750                          * is the format string.  For print(), this is the
12751                          * CTF type of the expression result.
12752                          */
12753                         if ((strtab = dtrace_dof_sect(dof,
12754                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
12755                                 goto err;
12756
12757                         str = (char *)((uintptr_t)dof +
12758                             (uintptr_t)strtab->dofs_offset);
12759
12760                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
12761                                 if (str[i] == '\0')
12762                                         break;
12763                         }
12764
12765                         if (i >= strtab->dofs_size) {
12766                                 dtrace_dof_error(dof, "bogus format string");
12767                                 goto err;
12768                         }
12769
12770                         if (i == desc->dofa_arg) {
12771                                 dtrace_dof_error(dof, "empty format string");
12772                                 goto err;
12773                         }
12774
12775                         i -= desc->dofa_arg;
12776                         fmt = kmem_alloc(i + 1, KM_SLEEP);
12777                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
12778                         arg = (uint64_t)(uintptr_t)fmt;
12779                 } else {
12780                         if (kind == DTRACEACT_PRINTA) {
12781                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
12782                                 arg = 0;
12783                         } else {
12784                                 arg = desc->dofa_arg;
12785                         }
12786                 }
12787
12788                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
12789                     desc->dofa_uarg, arg);
12790
12791                 if (last != NULL) {
12792                         last->dtad_next = act;
12793                 } else {
12794                         first = act;
12795                 }
12796
12797                 last = act;
12798
12799                 if (desc->dofa_difo == DOF_SECIDX_NONE)
12800                         continue;
12801
12802                 if ((difosec = dtrace_dof_sect(dof,
12803                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
12804                         goto err;
12805
12806                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
12807
12808                 if (act->dtad_difo == NULL)
12809                         goto err;
12810         }
12811
12812         ASSERT(first != NULL);
12813         return (first);
12814
12815 err:
12816         for (act = first; act != NULL; act = next) {
12817                 next = act->dtad_next;
12818                 dtrace_actdesc_release(act, vstate);
12819         }
12820
12821         return (NULL);
12822 }
12823
12824 static dtrace_ecbdesc_t *
12825 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
12826     cred_t *cr)
12827 {
12828         dtrace_ecbdesc_t *ep;
12829         dof_ecbdesc_t *ecb;
12830         dtrace_probedesc_t *desc;
12831         dtrace_predicate_t *pred = NULL;
12832
12833         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
12834                 dtrace_dof_error(dof, "truncated ECB description");
12835                 return (NULL);
12836         }
12837
12838         if (sec->dofs_align != sizeof (uint64_t)) {
12839                 dtrace_dof_error(dof, "bad alignment in ECB description");
12840                 return (NULL);
12841         }
12842
12843         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
12844         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
12845
12846         if (sec == NULL)
12847                 return (NULL);
12848
12849         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12850         ep->dted_uarg = ecb->dofe_uarg;
12851         desc = &ep->dted_probe;
12852
12853         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
12854                 goto err;
12855
12856         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
12857                 if ((sec = dtrace_dof_sect(dof,
12858                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
12859                         goto err;
12860
12861                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
12862                         goto err;
12863
12864                 ep->dted_pred.dtpdd_predicate = pred;
12865         }
12866
12867         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
12868                 if ((sec = dtrace_dof_sect(dof,
12869                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
12870                         goto err;
12871
12872                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
12873
12874                 if (ep->dted_action == NULL)
12875                         goto err;
12876         }
12877
12878         return (ep);
12879
12880 err:
12881         if (pred != NULL)
12882                 dtrace_predicate_release(pred, vstate);
12883         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12884         return (NULL);
12885 }
12886
12887 /*
12888  * APPLE NOTE: dyld handles dof relocation.
12889  * Darwin does not need dtrace_dof_relocate()
12890  */
12891
12892 /*
12893  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
12894  * header:  it should be at the front of a memory region that is at least
12895  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
12896  * size.  It need not be validated in any other way.
12897  */
12898 static int
12899 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
12900     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
12901 {
12902 #pragma unused(ubase) /* __APPLE__ */
12903         uint64_t len = dof->dofh_loadsz, seclen;
12904         uintptr_t daddr = (uintptr_t)dof;
12905         dtrace_ecbdesc_t *ep;
12906         dtrace_enabling_t *enab;
12907         uint_t i;
12908
12909         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12910         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
12911
12912         /*
12913          * Check the DOF header identification bytes.  In addition to checking
12914          * valid settings, we also verify that unused bits/bytes are zeroed so
12915          * we can use them later without fear of regressing existing binaries.
12916          */
12917         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
12918             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
12919                 dtrace_dof_error(dof, "DOF magic string mismatch");
12920                 return (-1);
12921         }
12922
12923         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
12924             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
12925                 dtrace_dof_error(dof, "DOF has invalid data model");
12926                 return (-1);
12927         }
12928
12929         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
12930                 dtrace_dof_error(dof, "DOF encoding mismatch");
12931                 return (-1);
12932         }
12933
12934         /*
12935          * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
12936          */
12937         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
12938                 dtrace_dof_error(dof, "DOF version mismatch");
12939                 return (-1);
12940         }
12941
12942         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
12943                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
12944                 return (-1);
12945         }
12946
12947         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
12948                 dtrace_dof_error(dof, "DOF uses too many integer registers");
12949                 return (-1);
12950         }
12951
12952         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
12953                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
12954                 return (-1);
12955         }
12956
12957         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
12958                 if (dof->dofh_ident[i] != 0) {
12959                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
12960                         return (-1);
12961                 }
12962         }
12963
12964         if (dof->dofh_flags & ~DOF_FL_VALID) {
12965                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
12966                 return (-1);
12967         }
12968
12969         if (dof->dofh_secsize < sizeof(dof_sec_t)) {
12970                 dtrace_dof_error(dof, "invalid section header size");
12971                 return (-1);
12972         }
12973
12974         /*
12975          * Check that the section headers don't exceed the amount of DOF
12976          * data.  Note that we cast the section size and number of sections
12977          * to uint64_t's to prevent possible overflow in the multiplication.
12978          */
12979         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
12980
12981         if (dof->dofh_secoff > len || seclen > len ||
12982             dof->dofh_secoff + seclen > len) {
12983                 dtrace_dof_error(dof, "truncated section headers");
12984                 return (-1);
12985         }
12986
12987         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
12988                 dtrace_dof_error(dof, "misaligned section headers");
12989                 return (-1);
12990         }
12991
12992         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
12993                 dtrace_dof_error(dof, "misaligned section size");
12994                 return (-1);
12995         }
12996
12997         /*
12998          * Take an initial pass through the section headers to be sure that
12999          * the headers don't have stray offsets.  If the 'noprobes' flag is
13000          * set, do not permit sections relating to providers, probes, or args.
13001          */
13002         for (i = 0; i < dof->dofh_secnum; i++) {
13003                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13004                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13005
13006                 if (noprobes) {
13007                         switch (sec->dofs_type) {
13008                         case DOF_SECT_PROVIDER:
13009                         case DOF_SECT_PROBES:
13010                         case DOF_SECT_PRARGS:
13011                         case DOF_SECT_PROFFS:
13012                                 dtrace_dof_error(dof, "illegal sections "
13013                                     "for enabling");
13014                                 return (-1);
13015                         }
13016                 }
13017
13018                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13019                         continue; /* just ignore non-loadable sections */
13020
13021                 if (sec->dofs_align & (sec->dofs_align - 1)) {
13022                         dtrace_dof_error(dof, "bad section alignment");
13023                         return (-1);
13024                 }
13025
13026                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13027                         dtrace_dof_error(dof, "misaligned section");
13028                         return (-1);
13029                 }
13030
13031                 if (sec->dofs_offset > len || sec->dofs_size > len ||
13032                     sec->dofs_offset + sec->dofs_size > len) {
13033                         dtrace_dof_error(dof, "corrupt section header");
13034                         return (-1);
13035                 }
13036
13037                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13038                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13039                         dtrace_dof_error(dof, "non-terminating string table");
13040                         return (-1);
13041                 }
13042         }
13043
13044         /*
13045          * APPLE NOTE: We have no further relocation to perform.
13046          * All dof values are relative offsets.
13047          */
13048
13049         if ((enab = *enabp) == NULL)
13050                 enab = *enabp = dtrace_enabling_create(vstate);
13051
13052         for (i = 0; i < dof->dofh_secnum; i++) {
13053                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13054                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13055
13056                 if (sec->dofs_type != DOF_SECT_ECBDESC)
13057                         continue;
13058
13059                 /*
13060                  * APPLE NOTE: Defend against gcc 4.0 botch on x86.
13061                  * not all paths out of inlined dtrace_dof_ecbdesc
13062                  * are checked for the NULL return value.
13063                  * Check for NULL explicitly here.
13064                 */
13065                 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13066                 if (ep == NULL) {
13067                         dtrace_enabling_destroy(enab);
13068                         *enabp = NULL;
13069                         return (-1);
13070                 }
13071
13072                 dtrace_enabling_add(enab, ep);
13073         }
13074
13075         return (0);
13076 }
13077
13078 /*
13079  * Process DOF for any options.  This routine assumes that the DOF has been
13080  * at least processed by dtrace_dof_slurp().
13081  */
13082 static int
13083 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13084 {
13085         uint_t i;
13086         int rval;
13087         uint32_t entsize;
13088         size_t offs;
13089         dof_optdesc_t *desc;
13090
13091         for (i = 0; i < dof->dofh_secnum; i++) {
13092                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13093                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13094
13095                 if (sec->dofs_type != DOF_SECT_OPTDESC)
13096                         continue;
13097
13098                 if (sec->dofs_align != sizeof (uint64_t)) {
13099                         dtrace_dof_error(dof, "bad alignment in "
13100                             "option description");
13101                         return (EINVAL);
13102                 }
13103
13104                 if ((entsize = sec->dofs_entsize) == 0) {
13105                         dtrace_dof_error(dof, "zeroed option entry size");
13106                         return (EINVAL);
13107                 }
13108
13109                 if (entsize < sizeof (dof_optdesc_t)) {
13110                         dtrace_dof_error(dof, "bad option entry size");
13111                         return (EINVAL);
13112                 }
13113
13114                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13115                         desc = (dof_optdesc_t *)((uintptr_t)dof +
13116                             (uintptr_t)sec->dofs_offset + offs);
13117
13118                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13119                                 dtrace_dof_error(dof, "non-zero option string");
13120                                 return (EINVAL);
13121                         }
13122
13123                         if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13124                                 dtrace_dof_error(dof, "unset option");
13125                                 return (EINVAL);
13126                         }
13127
13128                         if ((rval = dtrace_state_option(state,
13129                             desc->dofo_option, desc->dofo_value)) != 0) {
13130                                 dtrace_dof_error(dof, "rejected option");
13131                                 return (rval);
13132                         }
13133                 }
13134         }
13135
13136         return (0);
13137 }
13138
13139 /*
13140  * DTrace Consumer State Functions
13141  */
13142 static int
13143 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13144 {
13145         size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13146         void *base;
13147         uintptr_t limit;
13148         dtrace_dynvar_t *dvar, *next, *start;
13149         size_t i;
13150
13151         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13152         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13153
13154         bzero(dstate, sizeof (dtrace_dstate_t));
13155
13156         if ((dstate->dtds_chunksize = chunksize) == 0)
13157                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13158
13159         VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13160
13161         if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13162                 size = min_size;
13163
13164         if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13165                 return (ENOMEM);
13166
13167         dstate->dtds_size = size;
13168         dstate->dtds_base = base;
13169         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13170         bzero(dstate->dtds_percpu, (int)NCPU * sizeof (dtrace_dstate_percpu_t));
13171
13172         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13173
13174         if (hashsize != 1 && (hashsize & 1))
13175                 hashsize--;
13176
13177         dstate->dtds_hashsize = hashsize;
13178         dstate->dtds_hash = dstate->dtds_base;
13179
13180         /*
13181          * Set all of our hash buckets to point to the single sink, and (if
13182          * it hasn't already been set), set the sink's hash value to be the
13183          * sink sentinel value.  The sink is needed for dynamic variable
13184          * lookups to know that they have iterated over an entire, valid hash
13185          * chain.
13186          */
13187         for (i = 0; i < hashsize; i++)
13188                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13189
13190         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13191                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13192
13193         /*
13194          * Determine number of active CPUs.  Divide free list evenly among
13195          * active CPUs.
13196          */
13197         start = (dtrace_dynvar_t *)
13198             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13199         limit = (uintptr_t)base + size;
13200
13201         VERIFY((uintptr_t)start < limit);
13202         VERIFY((uintptr_t)start >= (uintptr_t)base);
13203
13204         maxper = (limit - (uintptr_t)start) / (int)NCPU;
13205         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13206
13207         for (i = 0; i < NCPU; i++) {
13208                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13209
13210                 /*
13211                  * If we don't even have enough chunks to make it once through
13212                  * NCPUs, we're just going to allocate everything to the first
13213                  * CPU.  And if we're on the last CPU, we're going to allocate
13214                  * whatever is left over.  In either case, we set the limit to
13215                  * be the limit of the dynamic variable space.
13216                  */
13217                 if (maxper == 0 || i == NCPU - 1) {
13218                         limit = (uintptr_t)base + size;
13219                         start = NULL;
13220                 } else {
13221                         limit = (uintptr_t)start + maxper;
13222                         start = (dtrace_dynvar_t *)limit;
13223                 }
13224
13225                 VERIFY(limit <= (uintptr_t)base + size);
13226
13227                 for (;;) {
13228                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
13229                             dstate->dtds_chunksize);
13230
13231                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
13232                                 break;
13233
13234                         VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
13235                             (uintptr_t)dvar <= (uintptr_t)base + size);
13236                         dvar->dtdv_next = next;
13237                         dvar = next;
13238                 }
13239
13240                 if (maxper == 0)
13241                         break;
13242         }
13243
13244         return (0);
13245 }
13246
13247 static void
13248 dtrace_dstate_fini(dtrace_dstate_t *dstate)
13249 {
13250         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13251
13252         if (dstate->dtds_base == NULL)
13253                 return;
13254
13255         kmem_free(dstate->dtds_base, dstate->dtds_size);
13256         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
13257 }
13258
13259 static void
13260 dtrace_vstate_fini(dtrace_vstate_t *vstate)
13261 {
13262         /*
13263          * Logical XOR, where are you?
13264          */
13265         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
13266
13267         if (vstate->dtvs_nglobals > 0) {
13268                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
13269                     sizeof (dtrace_statvar_t *));
13270         }
13271
13272         if (vstate->dtvs_ntlocals > 0) {
13273                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
13274                     sizeof (dtrace_difv_t));
13275         }
13276
13277         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
13278
13279         if (vstate->dtvs_nlocals > 0) {
13280                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
13281                     sizeof (dtrace_statvar_t *));
13282         }
13283 }
13284
13285 static void
13286 dtrace_state_clean(dtrace_state_t *state)
13287 {
13288         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
13289                 return;
13290
13291         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
13292         dtrace_speculation_clean(state);
13293 }
13294
13295 static void
13296 dtrace_state_deadman(dtrace_state_t *state)
13297 {
13298         hrtime_t now;
13299
13300         dtrace_sync();
13301
13302         now = dtrace_gethrtime();
13303
13304         if (state != dtrace_anon.dta_state &&
13305             now - state->dts_laststatus >= dtrace_deadman_user)
13306                 return;
13307
13308         /*
13309          * We must be sure that dts_alive never appears to be less than the
13310          * value upon entry to dtrace_state_deadman(), and because we lack a
13311          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
13312          * store INT64_MAX to it, followed by a memory barrier, followed by
13313          * the new value.  This assures that dts_alive never appears to be
13314          * less than its true value, regardless of the order in which the
13315          * stores to the underlying storage are issued.
13316          */
13317         state->dts_alive = INT64_MAX;
13318         dtrace_membar_producer();
13319         state->dts_alive = now;
13320 }
13321
13322 static int
13323 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
13324 {
13325         minor_t minor;
13326         major_t major;
13327         char c[30];
13328         dtrace_state_t *state;
13329         dtrace_optval_t *opt;
13330         int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
13331
13332         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13333         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13334
13335         /* Cause restart */
13336         *new_state = NULL;
13337
13338         if (devp != NULL) {
13339                 minor = getminor(*devp);
13340         }
13341         else {
13342                 minor = DTRACE_NCLIENTS - 1;
13343         }
13344
13345         state = dtrace_state_allocate(minor);
13346         if (NULL == state) {
13347                 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
13348                 return (ERESTART);      /* can't reacquire */
13349         }
13350
13351         state->dts_epid = DTRACE_EPIDNONE + 1;
13352
13353         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
13354         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
13355             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
13356
13357         if (devp != NULL) {
13358                 major = getemajor(*devp);
13359         } else {
13360                 major = ddi_driver_major(dtrace_devi);
13361         }
13362
13363         state->dts_dev = makedev(major, minor);
13364
13365         if (devp != NULL)
13366                 *devp = state->dts_dev;
13367
13368         /*
13369          * We allocate NCPU buffers.  On the one hand, this can be quite
13370          * a bit of memory per instance (nearly 36K on a Starcat).  On the
13371          * other hand, it saves an additional memory reference in the probe
13372          * path.
13373          */
13374         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
13375         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
13376         state->dts_buf_over_limit = 0;
13377         state->dts_cleaner = CYCLIC_NONE;
13378         state->dts_deadman = CYCLIC_NONE;
13379         state->dts_vstate.dtvs_state = state;
13380
13381         for (i = 0; i < DTRACEOPT_MAX; i++)
13382                 state->dts_options[i] = DTRACEOPT_UNSET;
13383
13384         /*
13385          * Set the default options.
13386          */
13387         opt = state->dts_options;
13388         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
13389         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
13390         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
13391         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
13392         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
13393         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
13394         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
13395         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
13396         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
13397         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
13398         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
13399         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
13400         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
13401         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
13402         opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
13403
13404         /*
13405          * Depending on the user credentials, we set flag bits which alter probe
13406          * visibility or the amount of destructiveness allowed.  In the case of
13407          * actual anonymous tracing, or the possession of all privileges, all of
13408          * the normal checks are bypassed.
13409          */
13410 #if defined(__APPLE__)
13411         if (cr != NULL) {
13412                 kauth_cred_ref(cr);
13413                 state->dts_cred.dcr_cred = cr;
13414         }
13415         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13416                 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13417                         /*
13418                          * Allow only proc credentials when DTrace is
13419                          * restricted by the current security policy
13420                          */
13421                         state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
13422                         state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13423                 }
13424                 else {
13425                         state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13426                         state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13427                 }
13428         }
13429
13430 #else
13431         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
13432                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
13433                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
13434         }
13435         else {
13436                 /*
13437                  * Set up the credentials for this instantiation.  We take a
13438                  * hold on the credential to prevent it from disappearing on
13439                  * us; this in turn prevents the zone_t referenced by this
13440                  * credential from disappearing.  This means that we can
13441                  * examine the credential and the zone from probe context.
13442                  */
13443                 crhold(cr);
13444                 state->dts_cred.dcr_cred = cr;
13445
13446                 /*
13447                  * CRA_PROC means "we have *some* privilege for dtrace" and
13448                  * unlocks the use of variables like pid, zonename, etc.
13449                  */
13450                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
13451                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13452                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
13453                 }
13454
13455                 /*
13456                  * dtrace_user allows use of syscall and profile providers.
13457                  * If the user also has proc_owner and/or proc_zone, we
13458                  * extend the scope to include additional visibility and
13459                  * destructive power.
13460                  */
13461                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
13462                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
13463                                 state->dts_cred.dcr_visible |=
13464                                     DTRACE_CRV_ALLPROC;
13465
13466                                 state->dts_cred.dcr_action |=
13467                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13468                         }
13469
13470                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
13471                                 state->dts_cred.dcr_visible |=
13472                                     DTRACE_CRV_ALLZONE;
13473
13474                                 state->dts_cred.dcr_action |=
13475                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13476                         }
13477
13478                         /*
13479                          * If we have all privs in whatever zone this is,
13480                          * we can do destructive things to processes which
13481                          * have altered credentials.
13482                          *
13483                          * APPLE NOTE: Darwin doesn't do zones.
13484                          * Behave as if zone always has destructive privs.
13485                          */
13486
13487                         state->dts_cred.dcr_action |=
13488                                 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13489                 }
13490
13491                 /*
13492                  * Holding the dtrace_kernel privilege also implies that
13493                  * the user has the dtrace_user privilege from a visibility
13494                  * perspective.  But without further privileges, some
13495                  * destructive actions are not available.
13496                  */
13497                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
13498                         /*
13499                          * Make all probes in all zones visible.  However,
13500                          * this doesn't mean that all actions become available
13501                          * to all zones.
13502                          */
13503                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
13504                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
13505
13506                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
13507                             DTRACE_CRA_PROC;
13508                         /*
13509                          * Holding proc_owner means that destructive actions
13510                          * for *this* zone are allowed.
13511                          */
13512                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13513                                 state->dts_cred.dcr_action |=
13514                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13515
13516                         /*
13517                          * Holding proc_zone means that destructive actions
13518                          * for this user/group ID in all zones is allowed.
13519                          */
13520                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13521                                 state->dts_cred.dcr_action |=
13522                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13523
13524                         /*
13525                          * If we have all privs in whatever zone this is,
13526                          * we can do destructive things to processes which
13527                          * have altered credentials.
13528                          *
13529                          * APPLE NOTE: Darwin doesn't do zones.
13530                          * Behave as if zone always has destructive privs.
13531                          */
13532                         state->dts_cred.dcr_action |=
13533                                 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
13534                 }
13535
13536                 /*
13537                  * Holding the dtrace_proc privilege gives control over fasttrap
13538                  * and pid providers.  We need to grant wider destructive
13539                  * privileges in the event that the user has proc_owner and/or
13540                  * proc_zone.
13541                  */
13542                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
13543                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
13544                                 state->dts_cred.dcr_action |=
13545                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
13546
13547                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
13548                                 state->dts_cred.dcr_action |=
13549                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
13550                 }
13551         }
13552 #endif
13553
13554         *new_state = state;
13555         return(0);  /* Success */
13556 }
13557
13558 static int
13559 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
13560 {
13561         dtrace_optval_t *opt = state->dts_options, size;
13562         processorid_t cpu = 0;
13563         size_t limit = buf->dtb_size;
13564         int flags = 0, rval;
13565
13566         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13567         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
13568         ASSERT(which < DTRACEOPT_MAX);
13569         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
13570             (state == dtrace_anon.dta_state &&
13571             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
13572
13573         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
13574                 return (0);
13575
13576         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
13577                 cpu = opt[DTRACEOPT_CPU];
13578
13579         if (which == DTRACEOPT_SPECSIZE)
13580                 flags |= DTRACEBUF_NOSWITCH;
13581
13582         if (which == DTRACEOPT_BUFSIZE) {
13583                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
13584                         flags |= DTRACEBUF_RING;
13585
13586                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
13587                         flags |= DTRACEBUF_FILL;
13588
13589                 if (state != dtrace_anon.dta_state ||
13590                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
13591                         flags |= DTRACEBUF_INACTIVE;
13592         }
13593
13594         for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
13595                 /*
13596                  * The size must be 8-byte aligned.  If the size is not 8-byte
13597                  * aligned, drop it down by the difference.
13598                  */
13599                 if (size & (sizeof (uint64_t) - 1))
13600                         size -= size & (sizeof (uint64_t) - 1);
13601
13602                 if (size < state->dts_reserve) {
13603                         /*
13604                          * Buffers always must be large enough to accommodate
13605                          * their prereserved space.  We return E2BIG instead
13606                          * of ENOMEM in this case to allow for user-level
13607                          * software to differentiate the cases.
13608                          */
13609                         return (E2BIG);
13610                 }
13611                 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
13612                 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
13613
13614                 if (rval != ENOMEM) {
13615                         opt[which] = size;
13616                         return (rval);
13617                 }
13618
13619                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13620                         return (rval);
13621         }
13622
13623         return (ENOMEM);
13624 }
13625
13626 static int
13627 dtrace_state_buffers(dtrace_state_t *state)
13628 {
13629         dtrace_speculation_t *spec = state->dts_speculations;
13630         int rval, i;
13631
13632         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
13633             DTRACEOPT_BUFSIZE)) != 0)
13634                 return (rval);
13635
13636         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
13637             DTRACEOPT_AGGSIZE)) != 0)
13638                 return (rval);
13639
13640         for (i = 0; i < state->dts_nspeculations; i++) {
13641                 if ((rval = dtrace_state_buffer(state,
13642                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
13643                         return (rval);
13644         }
13645
13646         return (0);
13647 }
13648
13649 static void
13650 dtrace_state_prereserve(dtrace_state_t *state)
13651 {
13652         dtrace_ecb_t *ecb;
13653         dtrace_probe_t *probe;
13654
13655         state->dts_reserve = 0;
13656
13657         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
13658                 return;
13659
13660         /*
13661          * If our buffer policy is a "fill" buffer policy, we need to set the
13662          * prereserved space to be the space required by the END probes.
13663          */
13664         probe = dtrace_probes[dtrace_probeid_end - 1];
13665         ASSERT(probe != NULL);
13666
13667         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
13668                 if (ecb->dte_state != state)
13669                         continue;
13670
13671                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
13672         }
13673 }
13674
13675 static int
13676 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
13677 {
13678         dtrace_optval_t *opt = state->dts_options, sz, nspec;
13679         dtrace_speculation_t *spec;
13680         dtrace_buffer_t *buf;
13681         cyc_handler_t hdlr;
13682         cyc_time_t when;
13683         int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
13684         dtrace_icookie_t cookie;
13685
13686         lck_mtx_lock(&cpu_lock);
13687         lck_mtx_lock(&dtrace_lock);
13688
13689         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
13690                 rval = EBUSY;
13691                 goto out;
13692         }
13693
13694         /*
13695          * Before we can perform any checks, we must prime all of the
13696          * retained enablings that correspond to this state.
13697          */
13698         dtrace_enabling_prime(state);
13699
13700         if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
13701                 rval = EACCES;
13702                 goto out;
13703         }
13704
13705         dtrace_state_prereserve(state);
13706
13707         /*
13708          * Now we want to do is try to allocate our speculations.
13709          * We do not automatically resize the number of speculations; if
13710          * this fails, we will fail the operation.
13711          */
13712         nspec = opt[DTRACEOPT_NSPEC];
13713         ASSERT(nspec != DTRACEOPT_UNSET);
13714
13715         if (nspec > INT_MAX) {
13716                 rval = ENOMEM;
13717                 goto out;
13718         }
13719
13720         spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
13721
13722         if (spec == NULL) {
13723                 rval = ENOMEM;
13724                 goto out;
13725         }
13726
13727         state->dts_speculations = spec;
13728         state->dts_nspeculations = (int)nspec;
13729
13730         for (i = 0; i < nspec; i++) {
13731                 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
13732                         rval = ENOMEM;
13733                         goto err;
13734                 }
13735
13736                 spec[i].dtsp_buffer = buf;
13737         }
13738
13739         if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
13740                 if (dtrace_anon.dta_state == NULL) {
13741                         rval = ENOENT;
13742                         goto out;
13743                 }
13744
13745                 if (state->dts_necbs != 0) {
13746                         rval = EALREADY;
13747                         goto out;
13748                 }
13749
13750                 state->dts_anon = dtrace_anon_grab();
13751                 ASSERT(state->dts_anon != NULL);
13752                 state = state->dts_anon;
13753
13754                 /*
13755                  * We want "grabanon" to be set in the grabbed state, so we'll
13756                  * copy that option value from the grabbing state into the
13757                  * grabbed state.
13758                  */
13759                 state->dts_options[DTRACEOPT_GRABANON] =
13760                     opt[DTRACEOPT_GRABANON];
13761
13762                 *cpu = dtrace_anon.dta_beganon;
13763
13764                 /*
13765                  * If the anonymous state is active (as it almost certainly
13766                  * is if the anonymous enabling ultimately matched anything),
13767                  * we don't allow any further option processing -- but we
13768                  * don't return failure.
13769                  */
13770                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
13771                         goto out;
13772         }
13773
13774         if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
13775             opt[DTRACEOPT_AGGSIZE] != 0) {
13776                 if (state->dts_aggregations == NULL) {
13777                         /*
13778                          * We're not going to create an aggregation buffer
13779                          * because we don't have any ECBs that contain
13780                          * aggregations -- set this option to 0.
13781                          */
13782                         opt[DTRACEOPT_AGGSIZE] = 0;
13783                 } else {
13784                         /*
13785                          * If we have an aggregation buffer, we must also have
13786                          * a buffer to use as scratch.
13787                          */
13788                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
13789                           (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
13790                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
13791                         }
13792                 }
13793         }
13794
13795         if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
13796             opt[DTRACEOPT_SPECSIZE] != 0) {
13797                 if (!state->dts_speculates) {
13798                         /*
13799                          * We're not going to create speculation buffers
13800                          * because we don't have any ECBs that actually
13801                          * speculate -- set the speculation size to 0.
13802                          */
13803                         opt[DTRACEOPT_SPECSIZE] = 0;
13804                 }
13805         }
13806
13807         /*
13808          * The bare minimum size for any buffer that we're actually going to
13809          * do anything to is sizeof (uint64_t).
13810          */
13811         sz = sizeof (uint64_t);
13812
13813         if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
13814             (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
13815             (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
13816                 /*
13817                  * A buffer size has been explicitly set to 0 (or to a size
13818                  * that will be adjusted to 0) and we need the space -- we
13819                  * need to return failure.  We return ENOSPC to differentiate
13820                  * it from failing to allocate a buffer due to failure to meet
13821                  * the reserve (for which we return E2BIG).
13822                  */
13823                 rval = ENOSPC;
13824                 goto out;
13825         }
13826
13827         if ((rval = dtrace_state_buffers(state)) != 0)
13828                 goto err;
13829
13830         if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
13831                 sz = dtrace_dstate_defsize;
13832
13833         do {
13834                 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
13835
13836                 if (rval == 0)
13837                         break;
13838
13839                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
13840                         goto err;
13841         } while (sz >>= 1);
13842
13843         opt[DTRACEOPT_DYNVARSIZE] = sz;
13844
13845         if (rval != 0)
13846                 goto err;
13847
13848         if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
13849                 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
13850
13851         if (opt[DTRACEOPT_CLEANRATE] == 0)
13852                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13853
13854         if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
13855                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
13856
13857         if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
13858                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
13859
13860         if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
13861                 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
13862
13863         if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
13864                 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
13865
13866         if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
13867                 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
13868
13869         if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
13870                 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
13871
13872         hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
13873         hdlr.cyh_arg = state;
13874         hdlr.cyh_level = CY_LOW_LEVEL;
13875
13876         when.cyt_when = 0;
13877         when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
13878
13879         state->dts_cleaner = cyclic_add(&hdlr, &when);
13880
13881         hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
13882         hdlr.cyh_arg = state;
13883         hdlr.cyh_level = CY_LOW_LEVEL;
13884
13885         when.cyt_when = 0;
13886         when.cyt_interval = dtrace_deadman_interval;
13887
13888         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
13889         state->dts_deadman = cyclic_add(&hdlr, &when);
13890
13891         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
13892
13893         /*
13894          * Now it's time to actually fire the BEGIN probe.  We need to disable
13895          * interrupts here both to record the CPU on which we fired the BEGIN
13896          * probe (the data from this CPU will be processed first at user
13897          * level) and to manually activate the buffer for this CPU.
13898          */
13899         cookie = dtrace_interrupt_disable();
13900         *cpu = CPU->cpu_id;
13901         ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
13902         state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
13903
13904         dtrace_probe(dtrace_probeid_begin,
13905             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
13906         dtrace_interrupt_enable(cookie);
13907         /*
13908          * We may have had an exit action from a BEGIN probe; only change our
13909          * state to ACTIVE if we're still in WARMUP.
13910          */
13911         ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
13912             state->dts_activity == DTRACE_ACTIVITY_DRAINING);
13913
13914         if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
13915                 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
13916
13917         /*
13918          * Regardless of whether or not now we're in ACTIVE or DRAINING, we
13919          * want each CPU to transition its principal buffer out of the
13920          * INACTIVE state.  Doing this assures that no CPU will suddenly begin
13921          * processing an ECB halfway down a probe's ECB chain; all CPUs will
13922          * atomically transition from processing none of a state's ECBs to
13923          * processing all of them.
13924          */
13925         dtrace_xcall(DTRACE_CPUALL,
13926             (dtrace_xcall_t)dtrace_buffer_activate, state);
13927         goto out;
13928
13929 err:
13930         dtrace_buffer_free(state->dts_buffer);
13931         dtrace_buffer_free(state->dts_aggbuffer);
13932
13933         if ((nspec = state->dts_nspeculations) == 0) {
13934                 ASSERT(state->dts_speculations == NULL);
13935                 goto out;
13936         }
13937
13938         spec = state->dts_speculations;
13939         ASSERT(spec != NULL);
13940
13941         for (i = 0; i < state->dts_nspeculations; i++) {
13942                 if ((buf = spec[i].dtsp_buffer) == NULL)
13943                         break;
13944
13945                 dtrace_buffer_free(buf);
13946                 kmem_free(buf, bufsize);
13947         }
13948
13949         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
13950         state->dts_nspeculations = 0;
13951         state->dts_speculations = NULL;
13952
13953 out:
13954         lck_mtx_unlock(&dtrace_lock);
13955         lck_mtx_unlock(&cpu_lock);
13956
13957         return (rval);
13958 }
13959
13960 static int
13961 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
13962 {
13963         dtrace_icookie_t cookie;
13964
13965         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13966
13967         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
13968             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
13969                 return (EINVAL);
13970
13971         /*
13972          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
13973          * to be sure that every CPU has seen it.  See below for the details
13974          * on why this is done.
13975          */
13976         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
13977         dtrace_sync();
13978
13979         /*
13980          * By this point, it is impossible for any CPU to be still processing
13981          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
13982          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
13983          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
13984          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
13985          * iff we're in the END probe.
13986          */
13987         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
13988         dtrace_sync();
13989         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
13990
13991         /*
13992          * Finally, we can release the reserve and call the END probe.  We
13993          * disable interrupts across calling the END probe to allow us to
13994          * return the CPU on which we actually called the END probe.  This
13995          * allows user-land to be sure that this CPU's principal buffer is
13996          * processed last.
13997          */
13998         state->dts_reserve = 0;
13999
14000         cookie = dtrace_interrupt_disable();
14001         *cpu = CPU->cpu_id;
14002         dtrace_probe(dtrace_probeid_end,
14003             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14004         dtrace_interrupt_enable(cookie);
14005
14006         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14007         dtrace_sync();
14008
14009         return (0);
14010 }
14011
14012 static int
14013 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14014     dtrace_optval_t val)
14015 {
14016         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14017
14018         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14019                 return (EBUSY);
14020
14021         if (option >= DTRACEOPT_MAX)
14022                 return (EINVAL);
14023
14024         if (option != DTRACEOPT_CPU && val < 0)
14025                 return (EINVAL);
14026
14027         switch (option) {
14028         case DTRACEOPT_DESTRUCTIVE:
14029                 /*
14030                  * Prevent consumers from enabling destructive actions if DTrace
14031                  * is running in a restricted environment, or if actions are
14032                  * disallowed.
14033                  */
14034                 if (dtrace_is_restricted() || dtrace_destructive_disallow)
14035                         return (EACCES);
14036
14037                 state->dts_cred.dcr_destructive = 1;
14038                 break;
14039
14040         case DTRACEOPT_BUFSIZE:
14041         case DTRACEOPT_DYNVARSIZE:
14042         case DTRACEOPT_AGGSIZE:
14043         case DTRACEOPT_SPECSIZE:
14044         case DTRACEOPT_STRSIZE:
14045                 if (val < 0)
14046                         return (EINVAL);
14047
14048                 if (val >= LONG_MAX) {
14049                         /*
14050                          * If this is an otherwise negative value, set it to
14051                          * the highest multiple of 128m less than LONG_MAX.
14052                          * Technically, we're adjusting the size without
14053                          * regard to the buffer resizing policy, but in fact,
14054                          * this has no effect -- if we set the buffer size to
14055                          * ~LONG_MAX and the buffer policy is ultimately set to
14056                          * be "manual", the buffer allocation is guaranteed to
14057                          * fail, if only because the allocation requires two
14058                          * buffers.  (We set the the size to the highest
14059                          * multiple of 128m because it ensures that the size
14060                          * will remain a multiple of a megabyte when
14061                          * repeatedly halved -- all the way down to 15m.)
14062                          */
14063                         val = LONG_MAX - (1 << 27) + 1;
14064                 }
14065         }
14066
14067         state->dts_options[option] = val;
14068
14069         return (0);
14070 }
14071
14072 static void
14073 dtrace_state_destroy(dtrace_state_t *state)
14074 {
14075         dtrace_ecb_t *ecb;
14076         dtrace_vstate_t *vstate = &state->dts_vstate;
14077         minor_t minor = getminor(state->dts_dev);
14078         int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14079         dtrace_speculation_t *spec = state->dts_speculations;
14080         int nspec = state->dts_nspeculations;
14081         uint32_t match;
14082
14083         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14084         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14085
14086         /*
14087          * First, retract any retained enablings for this state.
14088          */
14089         dtrace_enabling_retract(state);
14090         ASSERT(state->dts_nretained == 0);
14091
14092         if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14093             state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14094                 /*
14095                  * We have managed to come into dtrace_state_destroy() on a
14096                  * hot enabling -- almost certainly because of a disorderly
14097                  * shutdown of a consumer.  (That is, a consumer that is
14098                  * exiting without having called dtrace_stop().) In this case,
14099                  * we're going to set our activity to be KILLED, and then
14100                  * issue a sync to be sure that everyone is out of probe
14101                  * context before we start blowing away ECBs.
14102                  */
14103                 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14104                 dtrace_sync();
14105         }
14106
14107         /*
14108          * Release the credential hold we took in dtrace_state_create().
14109          */
14110         if (state->dts_cred.dcr_cred != NULL)
14111                 kauth_cred_unref(&state->dts_cred.dcr_cred);
14112
14113         /*
14114          * Now we can safely disable and destroy any enabled probes.  Because
14115          * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14116          * (especially if they're all enabled), we take two passes through the
14117          * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14118          * in the second we disable whatever is left over.
14119          */
14120         for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14121                 for (i = 0; i < state->dts_necbs; i++) {
14122                         if ((ecb = state->dts_ecbs[i]) == NULL)
14123                                 continue;
14124
14125                         if (match && ecb->dte_probe != NULL) {
14126                                 dtrace_probe_t *probe = ecb->dte_probe;
14127                                 dtrace_provider_t *prov = probe->dtpr_provider;
14128
14129                                 if (!(prov->dtpv_priv.dtpp_flags & match))
14130                                         continue;
14131                         }
14132
14133                         dtrace_ecb_disable(ecb);
14134                         dtrace_ecb_destroy(ecb);
14135                 }
14136
14137                 if (!match)
14138                         break;
14139         }
14140
14141         /*
14142          * Before we free the buffers, perform one more sync to assure that
14143          * every CPU is out of probe context.
14144          */
14145         dtrace_sync();
14146
14147         dtrace_buffer_free(state->dts_buffer);
14148         dtrace_buffer_free(state->dts_aggbuffer);
14149
14150         for (i = 0; i < nspec; i++)
14151                 dtrace_buffer_free(spec[i].dtsp_buffer);
14152
14153         if (state->dts_cleaner != CYCLIC_NONE)
14154                 cyclic_remove(state->dts_cleaner);
14155
14156         if (state->dts_deadman != CYCLIC_NONE)
14157                 cyclic_remove(state->dts_deadman);
14158
14159         dtrace_dstate_fini(&vstate->dtvs_dynvars);
14160         dtrace_vstate_fini(vstate);
14161         kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14162
14163         if (state->dts_aggregations != NULL) {
14164 #if DEBUG
14165                 for (i = 0; i < state->dts_naggregations; i++)
14166                         ASSERT(state->dts_aggregations[i] == NULL);
14167 #endif
14168                 ASSERT(state->dts_naggregations > 0);
14169                 kmem_free(state->dts_aggregations,
14170                     state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14171         }
14172
14173         kmem_free(state->dts_buffer, bufsize);
14174         kmem_free(state->dts_aggbuffer, bufsize);
14175
14176         for (i = 0; i < nspec; i++)
14177                 kmem_free(spec[i].dtsp_buffer, bufsize);
14178
14179         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14180
14181         dtrace_format_destroy(state);
14182
14183         vmem_destroy(state->dts_aggid_arena);
14184         dtrace_state_free(minor);
14185 }
14186
14187 /*
14188  * DTrace Anonymous Enabling Functions
14189  */
14190
14191 int
14192 dtrace_keep_kernel_symbols(void)
14193 {
14194         if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14195                 return 0;
14196         }
14197
14198         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14199                 return 1;
14200
14201         return 0;
14202 }
14203
14204 static dtrace_state_t *
14205 dtrace_anon_grab(void)
14206 {
14207         dtrace_state_t *state;
14208
14209         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14210
14211         if ((state = dtrace_anon.dta_state) == NULL) {
14212                 ASSERT(dtrace_anon.dta_enabling == NULL);
14213                 return (NULL);
14214         }
14215
14216         ASSERT(dtrace_anon.dta_enabling != NULL);
14217         ASSERT(dtrace_retained != NULL);
14218
14219         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
14220         dtrace_anon.dta_enabling = NULL;
14221         dtrace_anon.dta_state = NULL;
14222
14223         return (state);
14224 }
14225
14226 static void
14227 dtrace_anon_property(void)
14228 {
14229         int i, rv;
14230         dtrace_state_t *state;
14231         dof_hdr_t *dof;
14232         char c[32];             /* enough for "dof-data-" + digits */
14233
14234         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14235         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14236
14237         for (i = 0; ; i++) {
14238                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
14239
14240                 dtrace_err_verbose = 1;
14241
14242                 if ((dof = dtrace_dof_property(c)) == NULL) {
14243                         dtrace_err_verbose = 0;
14244                         break;
14245                 }
14246
14247 #ifdef illumos
14248                 /*
14249                  * We want to create anonymous state, so we need to transition
14250                  * the kernel debugger to indicate that DTrace is active.  If
14251                  * this fails (e.g. because the debugger has modified text in
14252                  * some way), we won't continue with the processing.
14253                  */
14254                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
14255                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
14256                             "enabling ignored.");
14257                         dtrace_dof_destroy(dof);
14258                         break;
14259                 }
14260 #endif
14261
14262                 /*
14263                  * If we haven't allocated an anonymous state, we'll do so now.
14264                  */
14265                 if ((state = dtrace_anon.dta_state) == NULL) {
14266                         rv = dtrace_state_create(NULL, NULL, &state);
14267                         dtrace_anon.dta_state = state;
14268                         if (rv != 0 || state == NULL) {
14269                                 /*
14270                                  * This basically shouldn't happen:  the only
14271                                  * failure mode from dtrace_state_create() is a
14272                                  * failure of ddi_soft_state_zalloc() that
14273                                  * itself should never happen.  Still, the
14274                                  * interface allows for a failure mode, and
14275                                  * we want to fail as gracefully as possible:
14276                                  * we'll emit an error message and cease
14277                                  * processing anonymous state in this case.
14278                                  */
14279                                 cmn_err(CE_WARN, "failed to create "
14280                                     "anonymous state");
14281                                 dtrace_dof_destroy(dof);
14282                                 break;
14283                         }
14284                 }
14285
14286                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
14287                     &dtrace_anon.dta_enabling, 0, B_TRUE);
14288
14289                 if (rv == 0)
14290                         rv = dtrace_dof_options(dof, state);
14291
14292                 dtrace_err_verbose = 0;
14293                 dtrace_dof_destroy(dof);
14294
14295                 if (rv != 0) {
14296                         /*
14297                          * This is malformed DOF; chuck any anonymous state
14298                          * that we created.
14299                          */
14300                         ASSERT(dtrace_anon.dta_enabling == NULL);
14301                         dtrace_state_destroy(state);
14302                         dtrace_anon.dta_state = NULL;
14303                         break;
14304                 }
14305
14306                 ASSERT(dtrace_anon.dta_enabling != NULL);
14307         }
14308
14309         if (dtrace_anon.dta_enabling != NULL) {
14310                 int rval;
14311
14312                 /*
14313                  * dtrace_enabling_retain() can only fail because we are
14314                  * trying to retain more enablings than are allowed -- but
14315                  * we only have one anonymous enabling, and we are guaranteed
14316                  * to be allowed at least one retained enabling; we assert
14317                  * that dtrace_enabling_retain() returns success.
14318                  */
14319                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
14320                 ASSERT(rval == 0);
14321
14322                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
14323         }
14324 }
14325
14326 /*
14327  * DTrace Helper Functions
14328  */
14329 static void
14330 dtrace_helper_trace(dtrace_helper_action_t *helper,
14331     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
14332 {
14333         uint32_t size, next, nnext;
14334         int i;
14335         dtrace_helptrace_t *ent;
14336         uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14337
14338         if (!dtrace_helptrace_enabled)
14339                 return;
14340
14341         ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
14342
14343         /*
14344          * What would a tracing framework be without its own tracing
14345          * framework?  (Well, a hell of a lot simpler, for starters...)
14346          */
14347         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
14348             sizeof (uint64_t) - sizeof (uint64_t);
14349
14350         /*
14351          * Iterate until we can allocate a slot in the trace buffer.
14352          */
14353         do {
14354                 next = dtrace_helptrace_next;
14355
14356                 if (next + size < dtrace_helptrace_bufsize) {
14357                         nnext = next + size;
14358                 } else {
14359                         nnext = size;
14360                 }
14361         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
14362
14363         /*
14364          * We have our slot; fill it in.
14365          */
14366         if (nnext == size)
14367                 next = 0;
14368
14369         ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
14370         ent->dtht_helper = helper;
14371         ent->dtht_where = where;
14372         ent->dtht_nlocals = vstate->dtvs_nlocals;
14373
14374         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
14375             mstate->dtms_fltoffs : -1;
14376         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
14377         ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
14378
14379         for (i = 0; i < vstate->dtvs_nlocals; i++) {
14380                 dtrace_statvar_t *svar;
14381
14382                 if ((svar = vstate->dtvs_locals[i]) == NULL)
14383                         continue;
14384
14385                 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
14386                 ent->dtht_locals[i] =
14387                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
14388         }
14389 }
14390
14391 static uint64_t
14392 dtrace_helper(int which, dtrace_mstate_t *mstate,
14393     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
14394 {
14395         uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
14396         uint64_t sarg0 = mstate->dtms_arg[0];
14397         uint64_t sarg1 = mstate->dtms_arg[1];
14398         uint64_t rval = 0;
14399         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
14400         dtrace_helper_action_t *helper;
14401         dtrace_vstate_t *vstate;
14402         dtrace_difo_t *pred;
14403         int i, trace = dtrace_helptrace_enabled;
14404
14405         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
14406
14407         if (helpers == NULL)
14408                 return (0);
14409
14410         if ((helper = helpers->dthps_actions[which]) == NULL)
14411                 return (0);
14412
14413         vstate = &helpers->dthps_vstate;
14414         mstate->dtms_arg[0] = arg0;
14415         mstate->dtms_arg[1] = arg1;
14416
14417         /*
14418          * Now iterate over each helper.  If its predicate evaluates to 'true',
14419          * we'll call the corresponding actions.  Note that the below calls
14420          * to dtrace_dif_emulate() may set faults in machine state.  This is
14421          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
14422          * the stored DIF offset with its own (which is the desired behavior).
14423          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
14424          * from machine state; this is okay, too.
14425          */
14426         for (; helper != NULL; helper = helper->dtha_next) {
14427                 if ((pred = helper->dtha_predicate) != NULL) {
14428                         if (trace)
14429                                 dtrace_helper_trace(helper, mstate, vstate, 0);
14430
14431                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
14432                                 goto next;
14433
14434                         if (*flags & CPU_DTRACE_FAULT)
14435                                 goto err;
14436                 }
14437
14438                 for (i = 0; i < helper->dtha_nactions; i++) {
14439                         if (trace)
14440                                 dtrace_helper_trace(helper,
14441                                     mstate, vstate, i + 1);
14442
14443                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
14444                             mstate, vstate, state);
14445
14446                         if (*flags & CPU_DTRACE_FAULT)
14447                                 goto err;
14448                 }
14449
14450 next:
14451                 if (trace)
14452                         dtrace_helper_trace(helper, mstate, vstate,
14453                             DTRACE_HELPTRACE_NEXT);
14454         }
14455
14456         if (trace)
14457                 dtrace_helper_trace(helper, mstate, vstate,
14458                     DTRACE_HELPTRACE_DONE);
14459
14460         /*
14461          * Restore the arg0 that we saved upon entry.
14462          */
14463         mstate->dtms_arg[0] = sarg0;
14464         mstate->dtms_arg[1] = sarg1;
14465
14466         return (rval);
14467
14468 err:
14469         if (trace)
14470                 dtrace_helper_trace(helper, mstate, vstate,
14471                     DTRACE_HELPTRACE_ERR);
14472
14473         /*
14474          * Restore the arg0 that we saved upon entry.
14475          */
14476         mstate->dtms_arg[0] = sarg0;
14477         mstate->dtms_arg[1] = sarg1;
14478
14479         return (0);
14480 }
14481
14482 static void
14483 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
14484     dtrace_vstate_t *vstate)
14485 {
14486         int i;
14487
14488         if (helper->dtha_predicate != NULL)
14489                 dtrace_difo_release(helper->dtha_predicate, vstate);
14490
14491         for (i = 0; i < helper->dtha_nactions; i++) {
14492                 ASSERT(helper->dtha_actions[i] != NULL);
14493                 dtrace_difo_release(helper->dtha_actions[i], vstate);
14494         }
14495
14496         kmem_free(helper->dtha_actions,
14497             helper->dtha_nactions * sizeof (dtrace_difo_t *));
14498         kmem_free(helper, sizeof (dtrace_helper_action_t));
14499 }
14500
14501 static int
14502 dtrace_helper_destroygen(proc_t* p, int gen)
14503 {
14504         dtrace_helpers_t *help = p->p_dtrace_helpers;
14505         dtrace_vstate_t *vstate;
14506         uint_t i;
14507
14508         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
14509         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14510
14511         if (help == NULL || gen > help->dthps_generation)
14512                 return (EINVAL);
14513
14514         vstate = &help->dthps_vstate;
14515
14516         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
14517                 dtrace_helper_action_t *last = NULL, *h, *next;
14518
14519                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
14520                         next = h->dtha_next;
14521
14522                         if (h->dtha_generation == gen) {
14523                                 if (last != NULL) {
14524                                         last->dtha_next = next;
14525                                 } else {
14526                                         help->dthps_actions[i] = next;
14527                                 }
14528
14529                                 dtrace_helper_action_destroy(h, vstate);
14530                         } else {
14531                                 last = h;
14532                         }
14533                 }
14534         }
14535
14536         /*
14537          * Interate until we've cleared out all helper providers with the
14538          * given generation number.
14539          */
14540         for (;;) {
14541                 dtrace_helper_provider_t *prov = NULL;
14542
14543                 /*
14544                  * Look for a helper provider with the right generation. We
14545                  * have to start back at the beginning of the list each time
14546                  * because we drop dtrace_lock. It's unlikely that we'll make
14547                  * more than two passes.
14548                  */
14549                 for (i = 0; i < help->dthps_nprovs; i++) {
14550                         prov = help->dthps_provs[i];
14551
14552                         if (prov->dthp_generation == gen)
14553                                 break;
14554                 }
14555
14556                 /*
14557                  * If there were no matches, we're done.
14558                  */
14559                 if (i == help->dthps_nprovs)
14560                         break;
14561
14562                 /*
14563                  * Move the last helper provider into this slot.
14564                  */
14565                 help->dthps_nprovs--;
14566                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
14567                 help->dthps_provs[help->dthps_nprovs] = NULL;
14568
14569                 lck_mtx_unlock(&dtrace_lock);
14570
14571                 /*
14572                  * If we have a meta provider, remove this helper provider.
14573                  */
14574                 if (dtrace_meta_pid != NULL) {
14575                         ASSERT(dtrace_deferred_pid == NULL);
14576                         dtrace_helper_provider_remove(&prov->dthp_prov,
14577                             p);
14578                 }
14579
14580                 dtrace_helper_provider_destroy(prov);
14581
14582                 lck_mtx_lock(&dtrace_lock);
14583         }
14584
14585         return (0);
14586 }
14587
14588 static int
14589 dtrace_helper_validate(dtrace_helper_action_t *helper)
14590 {
14591         int err = 0, i;
14592         dtrace_difo_t *dp;
14593
14594         if ((dp = helper->dtha_predicate) != NULL)
14595                 err += dtrace_difo_validate_helper(dp);
14596
14597         for (i = 0; i < helper->dtha_nactions; i++)
14598                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
14599
14600         return (err == 0);
14601 }
14602
14603 static int
14604 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
14605 {
14606         dtrace_helpers_t *help;
14607         dtrace_helper_action_t *helper, *last;
14608         dtrace_actdesc_t *act;
14609         dtrace_vstate_t *vstate;
14610         dtrace_predicate_t *pred;
14611         int count = 0, nactions = 0, i;
14612
14613         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
14614                 return (EINVAL);
14615
14616         help = p->p_dtrace_helpers;
14617         last = help->dthps_actions[which];
14618         vstate = &help->dthps_vstate;
14619
14620         for (count = 0; last != NULL; last = last->dtha_next) {
14621                 count++;
14622                 if (last->dtha_next == NULL)
14623                         break;
14624         }
14625
14626         /*
14627          * If we already have dtrace_helper_actions_max helper actions for this
14628          * helper action type, we'll refuse to add a new one.
14629          */
14630         if (count >= dtrace_helper_actions_max)
14631                 return (ENOSPC);
14632
14633         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
14634         helper->dtha_generation = help->dthps_generation;
14635
14636         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
14637                 ASSERT(pred->dtp_difo != NULL);
14638                 dtrace_difo_hold(pred->dtp_difo);
14639                 helper->dtha_predicate = pred->dtp_difo;
14640         }
14641
14642         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
14643                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
14644                         goto err;
14645
14646                 if (act->dtad_difo == NULL)
14647                         goto err;
14648
14649                 nactions++;
14650         }
14651
14652         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
14653             (helper->dtha_nactions = nactions), KM_SLEEP);
14654
14655         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
14656                 dtrace_difo_hold(act->dtad_difo);
14657                 helper->dtha_actions[i++] = act->dtad_difo;
14658         }
14659
14660         if (!dtrace_helper_validate(helper))
14661                 goto err;
14662
14663         if (last == NULL) {
14664                 help->dthps_actions[which] = helper;
14665         } else {
14666                 last->dtha_next = helper;
14667         }
14668
14669         if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
14670                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
14671                 dtrace_helptrace_next = 0;
14672         }
14673
14674         return (0);
14675 err:
14676         dtrace_helper_action_destroy(helper, vstate);
14677         return (EINVAL);
14678 }
14679
14680 static void
14681 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
14682     dof_helper_t *dofhp)
14683 {
14684         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
14685         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
14686
14687         lck_mtx_lock(&dtrace_lock);
14688
14689         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
14690                 /*
14691                  * If the dtrace module is loaded but not attached, or if
14692                  * there aren't isn't a meta provider registered to deal with
14693                  * these provider descriptions, we need to postpone creating
14694                  * the actual providers until later.
14695                  */
14696
14697                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
14698                     dtrace_deferred_pid != help) {
14699                         help->dthps_deferred = 1;
14700                         help->dthps_pid = p->p_pid;
14701                         help->dthps_next = dtrace_deferred_pid;
14702                         help->dthps_prev = NULL;
14703                         if (dtrace_deferred_pid != NULL)
14704                                 dtrace_deferred_pid->dthps_prev = help;
14705                         dtrace_deferred_pid = help;
14706                 }
14707
14708                 lck_mtx_unlock(&dtrace_lock);
14709
14710         } else if (dofhp != NULL) {
14711                 /*
14712                  * If the dtrace module is loaded and we have a particular
14713                  * helper provider description, pass that off to the
14714                  * meta provider.
14715                  */
14716
14717                 lck_mtx_unlock(&dtrace_lock);
14718
14719                 dtrace_helper_provide(dofhp, p);
14720
14721         } else {
14722                 /*
14723                  * Otherwise, just pass all the helper provider descriptions
14724                  * off to the meta provider.
14725                  */
14726
14727                 uint_t i;
14728                 lck_mtx_unlock(&dtrace_lock);
14729
14730                 for (i = 0; i < help->dthps_nprovs; i++) {
14731                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
14732                                 p);
14733                 }
14734         }
14735 }
14736
14737 static int
14738 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
14739 {
14740         dtrace_helpers_t *help;
14741         dtrace_helper_provider_t *hprov, **tmp_provs;
14742         uint_t tmp_maxprovs, i;
14743
14744         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14745         help = p->p_dtrace_helpers;
14746         ASSERT(help != NULL);
14747
14748         /*
14749          * If we already have dtrace_helper_providers_max helper providers,
14750          * we're refuse to add a new one.
14751          */
14752         if (help->dthps_nprovs >= dtrace_helper_providers_max)
14753                 return (ENOSPC);
14754
14755         /*
14756          * Check to make sure this isn't a duplicate.
14757          */
14758         for (i = 0; i < help->dthps_nprovs; i++) {
14759                 if (dofhp->dofhp_addr ==
14760                     help->dthps_provs[i]->dthp_prov.dofhp_addr)
14761                         return (EALREADY);
14762         }
14763
14764         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
14765         hprov->dthp_prov = *dofhp;
14766         hprov->dthp_ref = 1;
14767         hprov->dthp_generation = gen;
14768
14769         /*
14770          * Allocate a bigger table for helper providers if it's already full.
14771          */
14772         if (help->dthps_maxprovs == help->dthps_nprovs) {
14773                 tmp_maxprovs = help->dthps_maxprovs;
14774                 tmp_provs = help->dthps_provs;
14775
14776                 if (help->dthps_maxprovs == 0)
14777                         help->dthps_maxprovs = 2;
14778                 else
14779                         help->dthps_maxprovs *= 2;
14780                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
14781                         help->dthps_maxprovs = dtrace_helper_providers_max;
14782
14783                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
14784
14785                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
14786                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
14787
14788                 if (tmp_provs != NULL) {
14789                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
14790                             sizeof (dtrace_helper_provider_t *));
14791                         kmem_free(tmp_provs, tmp_maxprovs *
14792                             sizeof (dtrace_helper_provider_t *));
14793                 }
14794         }
14795
14796         help->dthps_provs[help->dthps_nprovs] = hprov;
14797         help->dthps_nprovs++;
14798
14799         return (0);
14800 }
14801
14802 static void
14803 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
14804 {
14805         lck_mtx_lock(&dtrace_lock);
14806
14807         if (--hprov->dthp_ref == 0) {
14808                 dof_hdr_t *dof;
14809                 lck_mtx_unlock(&dtrace_lock);
14810                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
14811                 dtrace_dof_destroy(dof);
14812                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
14813         } else {
14814                 lck_mtx_unlock(&dtrace_lock);
14815         }
14816 }
14817
14818 static int
14819 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
14820 {
14821         uintptr_t daddr = (uintptr_t)dof;
14822         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
14823         dof_provider_t *provider;
14824         dof_probe_t *probe;
14825         uint8_t *arg;
14826         char *strtab, *typestr;
14827         dof_stridx_t typeidx;
14828         size_t typesz;
14829         uint_t nprobes, j, k;
14830
14831         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
14832
14833         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
14834                 dtrace_dof_error(dof, "misaligned section offset");
14835                 return (-1);
14836         }
14837
14838         /*
14839          * The section needs to be large enough to contain the DOF provider
14840          * structure appropriate for the given version.
14841          */
14842         if (sec->dofs_size <
14843             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
14844             offsetof(dof_provider_t, dofpv_prenoffs) :
14845             sizeof (dof_provider_t))) {
14846                 dtrace_dof_error(dof, "provider section too small");
14847                 return (-1);
14848         }
14849
14850         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
14851         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
14852         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
14853         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
14854         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
14855
14856         if (str_sec == NULL || prb_sec == NULL ||
14857             arg_sec == NULL || off_sec == NULL)
14858                 return (-1);
14859
14860         enoff_sec = NULL;
14861
14862         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
14863             provider->dofpv_prenoffs != DOF_SECT_NONE &&
14864             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
14865             provider->dofpv_prenoffs)) == NULL)
14866                 return (-1);
14867
14868         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
14869
14870         if (provider->dofpv_name >= str_sec->dofs_size ||
14871             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
14872                 dtrace_dof_error(dof, "invalid provider name");
14873                 return (-1);
14874         }
14875
14876         if (prb_sec->dofs_entsize == 0 ||
14877             prb_sec->dofs_entsize > prb_sec->dofs_size) {
14878                 dtrace_dof_error(dof, "invalid entry size");
14879                 return (-1);
14880         }
14881
14882         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
14883                 dtrace_dof_error(dof, "misaligned entry size");
14884                 return (-1);
14885         }
14886
14887         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
14888                 dtrace_dof_error(dof, "invalid entry size");
14889                 return (-1);
14890         }
14891
14892         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
14893                 dtrace_dof_error(dof, "misaligned section offset");
14894                 return (-1);
14895         }
14896
14897         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
14898                 dtrace_dof_error(dof, "invalid entry size");
14899                 return (-1);
14900         }
14901
14902         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
14903
14904         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
14905
14906         /*
14907          * Take a pass through the probes to check for errors.
14908          */
14909         for (j = 0; j < nprobes; j++) {
14910                 probe = (dof_probe_t *)(uintptr_t)(daddr +
14911                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
14912
14913                 if (probe->dofpr_func >= str_sec->dofs_size) {
14914                         dtrace_dof_error(dof, "invalid function name");
14915                         return (-1);
14916                 }
14917
14918                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
14919                         dtrace_dof_error(dof, "function name too long");
14920                         return (-1);
14921                 }
14922
14923                 if (probe->dofpr_name >= str_sec->dofs_size ||
14924                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
14925                         dtrace_dof_error(dof, "invalid probe name");
14926                         return (-1);
14927                 }
14928
14929                 /*
14930                  * The offset count must not wrap the index, and the offsets
14931                  * must also not overflow the section's data.
14932                  */
14933                 if (probe->dofpr_offidx + probe->dofpr_noffs <
14934                     probe->dofpr_offidx ||
14935                     (probe->dofpr_offidx + probe->dofpr_noffs) *
14936                     off_sec->dofs_entsize > off_sec->dofs_size) {
14937                         dtrace_dof_error(dof, "invalid probe offset");
14938                         return (-1);
14939                 }
14940
14941                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
14942                         /*
14943                          * If there's no is-enabled offset section, make sure
14944                          * there aren't any is-enabled offsets. Otherwise
14945                          * perform the same checks as for probe offsets
14946                          * (immediately above).
14947                          */
14948                         if (enoff_sec == NULL) {
14949                                 if (probe->dofpr_enoffidx != 0 ||
14950                                     probe->dofpr_nenoffs != 0) {
14951                                         dtrace_dof_error(dof, "is-enabled "
14952                                             "offsets with null section");
14953                                         return (-1);
14954                                 }
14955                         } else if (probe->dofpr_enoffidx +
14956                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
14957                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
14958                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
14959                                 dtrace_dof_error(dof, "invalid is-enabled "
14960                                     "offset");
14961                                 return (-1);
14962                         }
14963
14964                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
14965                                 dtrace_dof_error(dof, "zero probe and "
14966                                     "is-enabled offsets");
14967                                 return (-1);
14968                         }
14969                 } else if (probe->dofpr_noffs == 0) {
14970                         dtrace_dof_error(dof, "zero probe offsets");
14971                         return (-1);
14972                 }
14973
14974                 if (probe->dofpr_argidx + probe->dofpr_xargc <
14975                     probe->dofpr_argidx ||
14976                     (probe->dofpr_argidx + probe->dofpr_xargc) *
14977                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
14978                         dtrace_dof_error(dof, "invalid args");
14979                         return (-1);
14980                 }
14981
14982                 typeidx = probe->dofpr_nargv;
14983                 typestr = strtab + probe->dofpr_nargv;
14984                 for (k = 0; k < probe->dofpr_nargc; k++) {
14985                         if (typeidx >= str_sec->dofs_size) {
14986                                 dtrace_dof_error(dof, "bad "
14987                                     "native argument type");
14988                                 return (-1);
14989                         }
14990
14991                         typesz = strlen(typestr) + 1;
14992                         if (typesz > DTRACE_ARGTYPELEN) {
14993                                 dtrace_dof_error(dof, "native "
14994                                     "argument type too long");
14995                                 return (-1);
14996                         }
14997                         typeidx += typesz;
14998                         typestr += typesz;
14999                 }
15000
15001                 typeidx = probe->dofpr_xargv;
15002                 typestr = strtab + probe->dofpr_xargv;
15003                 for (k = 0; k < probe->dofpr_xargc; k++) {
15004                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15005                                 dtrace_dof_error(dof, "bad "
15006                                     "native argument index");
15007                                 return (-1);
15008                         }
15009
15010                         if (typeidx >= str_sec->dofs_size) {
15011                                 dtrace_dof_error(dof, "bad "
15012                                     "translated argument type");
15013                                 return (-1);
15014                         }
15015
15016                         typesz = strlen(typestr) + 1;
15017                         if (typesz > DTRACE_ARGTYPELEN) {
15018                                 dtrace_dof_error(dof, "translated argument "
15019                                     "type too long");
15020                                 return (-1);
15021                         }
15022
15023                         typeidx += typesz;
15024                         typestr += typesz;
15025                 }
15026         }
15027
15028         return (0);
15029 }
15030
15031 static int
15032 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15033 {
15034         dtrace_helpers_t *help;
15035         dtrace_vstate_t *vstate;
15036         dtrace_enabling_t *enab = NULL;
15037         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15038         uintptr_t daddr = (uintptr_t)dof;
15039
15040         LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15041         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15042
15043         if ((help = p->p_dtrace_helpers) == NULL)
15044                 help = dtrace_helpers_create(p);
15045
15046         vstate = &help->dthps_vstate;
15047
15048         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15049             dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15050                 dtrace_dof_destroy(dof);
15051                 return (rv);
15052         }
15053
15054         /*
15055          * Look for helper providers and validate their descriptions.
15056          */
15057         if (dhp != NULL) {
15058                 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15059                         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15060                             dof->dofh_secoff + i * dof->dofh_secsize);
15061
15062                         if (sec->dofs_type != DOF_SECT_PROVIDER)
15063                                 continue;
15064
15065                         if (dtrace_helper_provider_validate(dof, sec) != 0) {
15066                                 dtrace_enabling_destroy(enab);
15067                                 dtrace_dof_destroy(dof);
15068                                 return (-1);
15069                         }
15070
15071                         nprovs++;
15072                 }
15073         }
15074
15075         /*
15076          * Now we need to walk through the ECB descriptions in the enabling.
15077          */
15078         for (i = 0; i < enab->dten_ndesc; i++) {
15079                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15080                 dtrace_probedesc_t *desc = &ep->dted_probe;
15081
15082                 /* APPLE NOTE: Darwin employs size bounded string operation. */
15083                 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15084                         continue;
15085
15086                 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15087                         continue;
15088
15089                 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15090                         continue;
15091
15092                 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15093                     ep)) != 0) {
15094                         /*
15095                          * Adding this helper action failed -- we are now going
15096                          * to rip out the entire generation and return failure.
15097                          */
15098                         (void) dtrace_helper_destroygen(p, help->dthps_generation);
15099                         dtrace_enabling_destroy(enab);
15100                         dtrace_dof_destroy(dof);
15101                         return (-1);
15102                 }
15103
15104                 nhelpers++;
15105         }
15106
15107         if (nhelpers < enab->dten_ndesc)
15108                 dtrace_dof_error(dof, "unmatched helpers");
15109
15110         gen = help->dthps_generation++;
15111         dtrace_enabling_destroy(enab);
15112
15113         if (dhp != NULL && nprovs > 0) {
15114                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15115                 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15116                         lck_mtx_unlock(&dtrace_lock);
15117                         dtrace_helper_provider_register(p, help, dhp);
15118                         lck_mtx_lock(&dtrace_lock);
15119
15120                         destroy = 0;
15121                 }
15122         }
15123
15124         if (destroy)
15125                 dtrace_dof_destroy(dof);
15126
15127         return (gen);
15128 }
15129
15130 /*
15131  * APPLE NOTE:  DTrace lazy dof implementation
15132  *
15133  * DTrace user static probes (USDT probes) and helper actions are loaded
15134  * in a process by processing dof sections. The dof sections are passed
15135  * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15136  * expensive to process dof for a process that will never use it. There
15137  * is a memory cost (allocating the providers/probes), and a cpu cost
15138  * (creating the providers/probes).
15139  *
15140  * To reduce this cost, we use "lazy dof". The normal procedure for
15141  * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15142  * block, and invoke dof_slurp_helper() on them. When "lazy dof" is
15143  * used, each process retains the dof_ioctl_data_t block, instead of
15144  * copying in the data it points to.
15145  *
15146  * The dof_ioctl_data_t blocks are managed as if they were the actual
15147  * processed dof; on fork the block is copied to the child, on exec and
15148  * exit the block is freed.
15149  *
15150  * If the process loads library(s) containing additional dof, the
15151  * new dof_ioctl_data_t is merged with the existing block.
15152  *
15153  * There are a few catches that make this slightly more difficult.
15154  * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15155  * identifier value for each dof in the block. In non-lazy dof terms,
15156  * this is the generation that dof was loaded in. If we hand back
15157  * a UID for a lazy dof, that same UID must be able to unload the
15158  * dof once it has become non-lazy. To meet this requirement, the
15159  * code that loads lazy dof requires that the UID's for dof(s) in
15160  * the lazy dof be sorted, and in ascending order. It is okay to skip
15161  * UID's, I.E., 1 -> 5 -> 6 is legal.
15162  *
15163  * Once a process has become non-lazy, it will stay non-lazy. All
15164  * future dof operations for that process will be non-lazy, even
15165  * if the dof mode transitions back to lazy.
15166  *
15167  * Always do lazy dof checks before non-lazy (I.E. In fork, exit, exec.).
15168  * That way if the lazy check fails due to transitioning to non-lazy, the
15169  * right thing is done with the newly faulted in dof.
15170  */
15171
15172 /*
15173  * This method is a bit squicky. It must handle:
15174  *
15175  * dof should not be lazy.
15176  * dof should have been handled lazily, but there was an error
15177  * dof was handled lazily, and needs to be freed.
15178  * dof was handled lazily, and must not be freed.
15179  *
15180  *
15181  * Returns EACCES if dof should be handled non-lazily.
15182  *
15183  * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15184  *
15185  * If the dofs data is claimed by this method, dofs_claimed will be set.
15186  * Callers should not free claimed dofs.
15187  */
15188 static int
15189 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15190 {
15191         ASSERT(p);
15192         ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15193
15194         int rval = 0;
15195         *dofs_claimed = 0;
15196
15197         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15198
15199         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15200         ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15201
15202         /*
15203          * Any existing helpers force non-lazy behavior.
15204          */
15205         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15206                 dtrace_sprlock(p);
15207
15208                 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15209                 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
15210                 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
15211
15212                 /*
15213                  * Range check...
15214                  */
15215                 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
15216                         dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
15217                         rval = EINVAL;
15218                         goto unlock;
15219                 }
15220
15221                 /*
15222                  * Each dof being added must be assigned a unique generation.
15223                  */
15224                 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
15225                 for (i=0; i<incoming_dofs->dofiod_count; i++) {
15226                         /*
15227                          * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
15228                          */
15229                         ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
15230                         incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
15231                 }
15232
15233
15234                 if (existing_dofs) {
15235                         /*
15236                          * Merge the existing and incoming dofs
15237                          */
15238                         size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
15239                         dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
15240
15241                         bcopy(&existing_dofs->dofiod_helpers[0],
15242                               &merged_dofs->dofiod_helpers[0],
15243                               sizeof(dof_helper_t) * existing_dofs_count);
15244                         bcopy(&incoming_dofs->dofiod_helpers[0],
15245                               &merged_dofs->dofiod_helpers[existing_dofs_count],
15246                               sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
15247
15248                         merged_dofs->dofiod_count = merged_dofs_count;
15249
15250                         kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15251
15252                         p->p_dtrace_lazy_dofs = merged_dofs;
15253                 } else {
15254                         /*
15255                          * Claim the incoming dofs
15256                          */
15257                         *dofs_claimed = 1;
15258                         p->p_dtrace_lazy_dofs = incoming_dofs;
15259                 }
15260
15261 #if DEBUG
15262                 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15263                 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15264                         ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15265                 }
15266 #endif /* DEBUG */
15267
15268 unlock:
15269                 dtrace_sprunlock(p);
15270         } else {
15271                 rval = EACCES;
15272         }
15273
15274         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15275
15276         return rval;
15277 }
15278
15279 /*
15280  * Returns:
15281  *
15282  * EINVAL: lazy dof is enabled, but the requested generation was not found.
15283  * EACCES: This removal needs to be handled non-lazily.
15284  */
15285 static int
15286 dtrace_lazy_dofs_remove(proc_t *p, int generation)
15287 {
15288         int rval = EINVAL;
15289
15290         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15291
15292         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15293         ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15294
15295         /*
15296          * Any existing helpers force non-lazy behavior.
15297          */
15298         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15299                 dtrace_sprlock(p);
15300
15301                 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
15302
15303                 if (existing_dofs) {
15304                         int index, existing_dofs_count = existing_dofs->dofiod_count;
15305                         for (index=0; index<existing_dofs_count; index++) {
15306                                 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
15307                                         dof_ioctl_data_t* removed_dofs = NULL;
15308
15309                                         /*
15310                                          * If there is only 1 dof, we'll delete it and swap in NULL.
15311                                          */
15312                                         if (existing_dofs_count > 1) {
15313                                                 int removed_dofs_count = existing_dofs_count - 1;
15314                                                 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
15315
15316                                                 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
15317                                                 removed_dofs->dofiod_count = removed_dofs_count;
15318
15319                                                 /*
15320                                                  * copy the remaining data.
15321                                                  */
15322                                                 if (index > 0) {
15323                                                         bcopy(&existing_dofs->dofiod_helpers[0],
15324                                                               &removed_dofs->dofiod_helpers[0],
15325                                                               index * sizeof(dof_helper_t));
15326                                                 }
15327
15328                                                 if (index < existing_dofs_count-1) {
15329                                                         bcopy(&existing_dofs->dofiod_helpers[index+1],
15330                                                               &removed_dofs->dofiod_helpers[index],
15331                                                               (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
15332                                                 }
15333                                         }
15334
15335                                         kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
15336
15337                                         p->p_dtrace_lazy_dofs = removed_dofs;
15338
15339                                         rval = KERN_SUCCESS;
15340
15341                                         break;
15342                                 }
15343                         }
15344
15345 #if DEBUG
15346                         dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
15347                         if (all_dofs) {
15348                                 unsigned int i;
15349                                 for (i=0; i<all_dofs->dofiod_count-1; i++) {
15350                                         ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
15351                                 }
15352                         }
15353 #endif
15354
15355                 }
15356                 dtrace_sprunlock(p);
15357         } else {
15358                 rval = EACCES;
15359         }
15360
15361         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15362
15363         return rval;
15364 }
15365
15366 void
15367 dtrace_lazy_dofs_destroy(proc_t *p)
15368 {
15369         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15370         dtrace_sprlock(p);
15371
15372         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15373
15374         dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15375         p->p_dtrace_lazy_dofs = NULL;
15376
15377         dtrace_sprunlock(p);
15378         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15379
15380         if (lazy_dofs) {
15381                 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15382         }
15383 }
15384
15385 static int
15386 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
15387 {
15388 #pragma unused(ignored)
15389         /*
15390          * Okay to NULL test without taking the sprlock.
15391          */
15392         return p->p_dtrace_lazy_dofs != NULL;
15393 }
15394
15395 static void
15396 dtrace_lazy_dofs_process(proc_t *p) {
15397         /*
15398          * It is possible this process may exit during our attempt to
15399          * fault in the dof. We could fix this by holding locks longer,
15400          * but the errors are benign.
15401          */
15402         dtrace_sprlock(p);
15403
15404
15405         ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15406         ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
15407
15408         dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
15409         p->p_dtrace_lazy_dofs = NULL;
15410
15411         dtrace_sprunlock(p);
15412         lck_mtx_lock(&dtrace_meta_lock);
15413         /*
15414          * Process each dof_helper_t
15415          */
15416         if (lazy_dofs != NULL) {
15417                 unsigned int i;
15418                 int rval;
15419
15420                 for (i=0; i<lazy_dofs->dofiod_count; i++) {
15421                         /*
15422                          * When loading lazy dof, we depend on the generations being sorted in ascending order.
15423                          */
15424                         ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
15425
15426                         dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
15427
15428                         /*
15429                          * We stored the generation in dofhp_dof. Save it, and restore the original value.
15430                          */
15431                         int generation = dhp->dofhp_dof;
15432                         dhp->dofhp_dof = dhp->dofhp_addr;
15433
15434                         dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
15435
15436                         if (dof != NULL) {
15437                                 dtrace_helpers_t *help;
15438
15439                                 lck_mtx_lock(&dtrace_lock);
15440
15441                                 /*
15442                                  * This must be done with the dtrace_lock held
15443                                  */
15444                                 if ((help = p->p_dtrace_helpers) == NULL)
15445                                         help = dtrace_helpers_create(p);
15446
15447                                 /*
15448                                  * If the generation value has been bumped, someone snuck in
15449                                  * when we released the dtrace lock. We have to dump this generation,
15450                                  * there is no safe way to load it.
15451                                  */
15452                                 if (help->dthps_generation <= generation) {
15453                                         help->dthps_generation = generation;
15454
15455                                         /*
15456                                          * dtrace_helper_slurp() takes responsibility for the dof --
15457                                          * it may free it now or it may save it and free it later.
15458                                          */
15459                                         if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
15460                                                 dtrace_dof_error(NULL, "returned value did not match expected generation");
15461                                         }
15462                                 }
15463
15464                                 lck_mtx_unlock(&dtrace_lock);
15465                         }
15466                 }
15467                 lck_mtx_unlock(&dtrace_meta_lock);
15468                 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
15469         } else {
15470                 lck_mtx_unlock(&dtrace_meta_lock);
15471         }
15472 }
15473
15474 static int
15475 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
15476 {
15477 #pragma unused(ignored)
15478
15479         dtrace_lazy_dofs_process(p);
15480
15481         return PROC_RETURNED;
15482 }
15483
15484 #define DTRACE_LAZY_DOFS_DUPLICATED 1
15485
15486 static int
15487 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
15488 {
15489         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15490         LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15491         LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
15492
15493         lck_rw_lock_shared(&dtrace_dof_mode_lock);
15494         dtrace_sprlock(parent);
15495
15496         /*
15497          * We need to make sure that the transition to lazy dofs -> helpers
15498          * was atomic for our parent
15499          */
15500         ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
15501         /*
15502          * In theory we should hold the child sprlock, but this is safe...
15503          */
15504         ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
15505
15506         dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
15507         dof_ioctl_data_t* child_dofs = NULL;
15508         if (parent_dofs) {
15509                 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
15510                 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
15511                 bcopy(parent_dofs, child_dofs, parent_dofs_size);
15512         }
15513
15514         dtrace_sprunlock(parent);
15515
15516         if (child_dofs) {
15517                 dtrace_sprlock(child);
15518                 child->p_dtrace_lazy_dofs = child_dofs;
15519                 dtrace_sprunlock(child);
15520                 /**
15521                  * We process the DOF at this point if the mode is set to
15522                  * LAZY_OFF. This can happen if DTrace is still processing the
15523                  * DOF of other process (which can happen because the
15524                  * protected pager can have a huge latency)
15525                  * but has not processed our parent yet
15526                  */
15527                 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
15528                         dtrace_lazy_dofs_process(child);
15529                 }
15530                 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15531
15532                 return DTRACE_LAZY_DOFS_DUPLICATED;
15533         }
15534         lck_rw_unlock_shared(&dtrace_dof_mode_lock);
15535
15536         return 0;
15537 }
15538
15539 static dtrace_helpers_t *
15540 dtrace_helpers_create(proc_t *p)
15541 {
15542         dtrace_helpers_t *help;
15543
15544         LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15545         ASSERT(p->p_dtrace_helpers == NULL);
15546
15547         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
15548         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
15549             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
15550
15551         p->p_dtrace_helpers = help;
15552         dtrace_helpers++;
15553
15554         return (help);
15555 }
15556
15557 static void
15558 dtrace_helpers_destroy(proc_t* p)
15559 {
15560         dtrace_helpers_t *help;
15561         dtrace_vstate_t *vstate;
15562         uint_t i;
15563
15564         lck_mtx_lock(&dtrace_meta_lock);
15565         lck_mtx_lock(&dtrace_lock);
15566
15567         ASSERT(p->p_dtrace_helpers != NULL);
15568         ASSERT(dtrace_helpers > 0);
15569
15570         help = p->p_dtrace_helpers;
15571         vstate = &help->dthps_vstate;
15572
15573         /*
15574          * We're now going to lose the help from this process.
15575          */
15576         p->p_dtrace_helpers = NULL;
15577         dtrace_sync();
15578
15579         /*
15580          * Destory the helper actions.
15581          */
15582         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15583                 dtrace_helper_action_t *h, *next;
15584
15585                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15586                         next = h->dtha_next;
15587                         dtrace_helper_action_destroy(h, vstate);
15588                         h = next;
15589                 }
15590         }
15591
15592         lck_mtx_unlock(&dtrace_lock);
15593
15594         /*
15595          * Destroy the helper providers.
15596          */
15597         if (help->dthps_maxprovs > 0) {
15598                 if (dtrace_meta_pid != NULL) {
15599                         ASSERT(dtrace_deferred_pid == NULL);
15600
15601                         for (i = 0; i < help->dthps_nprovs; i++) {
15602                                 dtrace_helper_provider_remove(
15603                                     &help->dthps_provs[i]->dthp_prov, p);
15604                         }
15605                 } else {
15606                         lck_mtx_lock(&dtrace_lock);
15607                         ASSERT(help->dthps_deferred == 0 ||
15608                             help->dthps_next != NULL ||
15609                             help->dthps_prev != NULL ||
15610                             help == dtrace_deferred_pid);
15611
15612                         /*
15613                          * Remove the helper from the deferred list.
15614                          */
15615                         if (help->dthps_next != NULL)
15616                                 help->dthps_next->dthps_prev = help->dthps_prev;
15617                         if (help->dthps_prev != NULL)
15618                                 help->dthps_prev->dthps_next = help->dthps_next;
15619                         if (dtrace_deferred_pid == help) {
15620                                 dtrace_deferred_pid = help->dthps_next;
15621                                 ASSERT(help->dthps_prev == NULL);
15622                         }
15623
15624                         lck_mtx_unlock(&dtrace_lock);
15625                 }
15626
15627
15628                 for (i = 0; i < help->dthps_nprovs; i++) {
15629                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
15630                 }
15631
15632                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
15633                     sizeof (dtrace_helper_provider_t *));
15634         }
15635
15636         lck_mtx_lock(&dtrace_lock);
15637
15638         dtrace_vstate_fini(&help->dthps_vstate);
15639         kmem_free(help->dthps_actions,
15640             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
15641         kmem_free(help, sizeof (dtrace_helpers_t));
15642
15643         --dtrace_helpers;
15644         lck_mtx_unlock(&dtrace_lock);
15645         lck_mtx_unlock(&dtrace_meta_lock);
15646 }
15647
15648 static void
15649 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
15650 {
15651         dtrace_helpers_t *help, *newhelp;
15652         dtrace_helper_action_t *helper, *new, *last;
15653         dtrace_difo_t *dp;
15654         dtrace_vstate_t *vstate;
15655         uint_t i;
15656         int j, sz, hasprovs = 0;
15657
15658         lck_mtx_lock(&dtrace_meta_lock);
15659         lck_mtx_lock(&dtrace_lock);
15660         ASSERT(from->p_dtrace_helpers != NULL);
15661         ASSERT(dtrace_helpers > 0);
15662
15663         help = from->p_dtrace_helpers;
15664         newhelp = dtrace_helpers_create(to);
15665         ASSERT(to->p_dtrace_helpers != NULL);
15666
15667         newhelp->dthps_generation = help->dthps_generation;
15668         vstate = &newhelp->dthps_vstate;
15669
15670         /*
15671          * Duplicate the helper actions.
15672          */
15673         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15674                 if ((helper = help->dthps_actions[i]) == NULL)
15675                         continue;
15676
15677                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
15678                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
15679                             KM_SLEEP);
15680                         new->dtha_generation = helper->dtha_generation;
15681
15682                         if ((dp = helper->dtha_predicate) != NULL) {
15683                                 dp = dtrace_difo_duplicate(dp, vstate);
15684                                 new->dtha_predicate = dp;
15685                         }
15686
15687                         new->dtha_nactions = helper->dtha_nactions;
15688                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
15689                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
15690
15691                         for (j = 0; j < new->dtha_nactions; j++) {
15692                                 dtrace_difo_t *dpj = helper->dtha_actions[j];
15693
15694                                 ASSERT(dpj != NULL);
15695                                 dpj = dtrace_difo_duplicate(dpj, vstate);
15696                                 new->dtha_actions[j] = dpj;
15697                         }
15698
15699                         if (last != NULL) {
15700                                 last->dtha_next = new;
15701                         } else {
15702                                 newhelp->dthps_actions[i] = new;
15703                         }
15704
15705                         last = new;
15706                 }
15707         }
15708
15709         /*
15710          * Duplicate the helper providers and register them with the
15711          * DTrace framework.
15712          */
15713         if (help->dthps_nprovs > 0) {
15714                 newhelp->dthps_nprovs = help->dthps_nprovs;
15715                 newhelp->dthps_maxprovs = help->dthps_nprovs;
15716                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
15717                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15718                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
15719                         newhelp->dthps_provs[i] = help->dthps_provs[i];
15720                         newhelp->dthps_provs[i]->dthp_ref++;
15721                 }
15722
15723                 hasprovs = 1;
15724         }
15725
15726         lck_mtx_unlock(&dtrace_lock);
15727
15728         if (hasprovs)
15729                 dtrace_helper_provider_register(to, newhelp, NULL);
15730
15731         lck_mtx_unlock(&dtrace_meta_lock);
15732 }
15733
15734 /**
15735  * DTrace Process functions
15736  */
15737
15738 void
15739 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
15740 {
15741         /*
15742          * This code applies to new processes who are copying the task
15743          * and thread state and address spaces of their parent process.
15744          */
15745         if (!spawn) {
15746                 /*
15747                  * APPLE NOTE: Solaris does a sprlock() and drops the
15748                  * proc_lock here. We're cheating a bit and only taking
15749                  * the p_dtrace_sprlock lock. A full sprlock would
15750                  * task_suspend the parent.
15751                  */
15752                 dtrace_sprlock(parent_proc);
15753
15754                 /*
15755                  * Remove all DTrace tracepoints from the child process. We
15756                  * need to do this _before_ duplicating USDT providers since
15757                  * any associated probes may be immediately enabled.
15758                  */
15759                 if (parent_proc->p_dtrace_count > 0) {
15760                         dtrace_fasttrap_fork(parent_proc, child_proc);
15761                 }
15762
15763                 dtrace_sprunlock(parent_proc);
15764
15765                 /*
15766                  * Duplicate any lazy dof(s). This must be done while NOT
15767                  * holding the parent sprlock! Lock ordering is
15768                  * dtrace_dof_mode_lock, then sprlock.  It is imperative we
15769                  * always call dtrace_lazy_dofs_duplicate, rather than null
15770                  * check and call if !NULL. If we NULL test, during lazy dof
15771                  * faulting we can race with the faulting code and proceed
15772                  * from here to beyond the helpers copy. The lazy dof
15773                  * faulting will then fail to copy the helpers to the child
15774                  * process. We return if we duplicated lazy dofs as a process
15775                  * can only have one at the same time to avoid a race between
15776                  * a dtrace client and dtrace_proc_fork where a process would
15777                  * end up with both lazy dofs and helpers.
15778                  */
15779                 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
15780                         return;
15781                 }
15782
15783                 /*
15784                  * Duplicate any helper actions and providers if they haven't
15785                  * already.
15786                  */
15787 #if !defined(__APPLE__)
15788                  /*
15789                  * The SFORKING
15790                  * we set above informs the code to enable USDT probes that
15791                  * sprlock() may fail because the child is being forked.
15792                  */
15793 #endif
15794                 /*
15795                  * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
15796                  * never fails to find the child. We do not set SFORKING.
15797                  */
15798                 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
15799                         (*dtrace_helpers_fork)(parent_proc, child_proc);
15800                 }
15801         }
15802 }
15803
15804 void
15805 dtrace_proc_exec(proc_t *p)
15806 {
15807         /*
15808          * Invalidate any predicate evaluation already cached for this thread by DTrace.
15809          * That's because we've just stored to p_comm and DTrace refers to that when it
15810          * evaluates the "execname" special variable. uid and gid may have changed as well.
15811          */
15812         dtrace_set_thread_predcache(current_thread(), 0);
15813
15814         /*
15815          * Free any outstanding lazy dof entries. It is imperative we
15816          * always call dtrace_lazy_dofs_destroy, rather than null check
15817          * and call if !NULL. If we NULL test, during lazy dof faulting
15818          * we can race with the faulting code and proceed from here to
15819          * beyond the helpers cleanup. The lazy dof faulting will then
15820          * install new helpers which no longer belong to this process!
15821          */
15822         dtrace_lazy_dofs_destroy(p);
15823
15824
15825         /*
15826          * Clean up any DTrace helpers for the process.
15827          */
15828         if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
15829                 (*dtrace_helpers_cleanup)(p);
15830         }
15831
15832         /*
15833          * Cleanup the DTrace provider associated with this process.
15834          */
15835         proc_lock(p);
15836         if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
15837                 (*dtrace_fasttrap_exec_ptr)(p);
15838         }
15839         proc_unlock(p);
15840 }
15841
15842 void
15843 dtrace_proc_exit(proc_t *p)
15844 {
15845         /*
15846          * Free any outstanding lazy dof entries. It is imperative we
15847          * always call dtrace_lazy_dofs_destroy, rather than null check
15848          * and call if !NULL. If we NULL test, during lazy dof faulting
15849          * we can race with the faulting code and proceed from here to
15850          * beyond the helpers cleanup. The lazy dof faulting will then
15851          * install new helpers which will never be cleaned up, and leak.
15852          */
15853         dtrace_lazy_dofs_destroy(p);
15854
15855         /*
15856          * Clean up any DTrace helper actions or probes for the process.
15857          */
15858         if (p->p_dtrace_helpers != NULL) {
15859                 (*dtrace_helpers_cleanup)(p);
15860         }
15861
15862         /*
15863          * Clean up any DTrace probes associated with this process.
15864          */
15865         /*
15866          * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
15867          * call this after dtrace_helpers_cleanup()
15868          */
15869         proc_lock(p);
15870         if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
15871                 (*dtrace_fasttrap_exit_ptr)(p);
15872         }
15873         proc_unlock(p);
15874 }
15875
15876 /*
15877  * DTrace Hook Functions
15878  */
15879
15880 /*
15881  * APPLE NOTE:  dtrace_modctl_* routines for kext support.
15882  * Used to manipulate the modctl list within dtrace xnu.
15883  */
15884
15885 modctl_t *dtrace_modctl_list;
15886
15887 static void
15888 dtrace_modctl_add(struct modctl * newctl)
15889 {
15890         struct modctl *nextp, *prevp;
15891
15892         ASSERT(newctl != NULL);
15893         LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15894
15895         // Insert new module at the front of the list,
15896
15897         newctl->mod_next = dtrace_modctl_list;
15898         dtrace_modctl_list = newctl;
15899
15900         /*
15901          * If a module exists with the same name, then that module
15902          * must have been unloaded with enabled probes. We will move
15903          * the unloaded module to the new module's stale chain and
15904          * then stop traversing the list.
15905          */
15906
15907         prevp = newctl;
15908         nextp = newctl->mod_next;
15909
15910         while (nextp != NULL) {
15911                 if (nextp->mod_loaded) {
15912                         /* This is a loaded module. Keep traversing. */
15913                         prevp = nextp;
15914                         nextp = nextp->mod_next;
15915                         continue;
15916                 }
15917                 else {
15918                         /* Found an unloaded module */
15919                         if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
15920                                 /* Names don't match. Keep traversing. */
15921                                 prevp = nextp;
15922                                 nextp = nextp->mod_next;
15923                                 continue;
15924                         }
15925                         else {
15926                                 /* We found a stale entry, move it. We're done. */
15927                                 prevp->mod_next = nextp->mod_next;
15928                                 newctl->mod_stale = nextp;
15929                                 nextp->mod_next = NULL;
15930                                 break;
15931                         }
15932                 }
15933         }
15934 }
15935
15936 static modctl_t *
15937 dtrace_modctl_lookup(struct kmod_info * kmod)
15938 {
15939     LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15940
15941     struct modctl * ctl;
15942
15943     for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
15944         if (ctl->mod_id == kmod->id)
15945             return(ctl);
15946     }
15947     return (NULL);
15948 }
15949
15950 /*
15951  * This routine is called from dtrace_module_unloaded().
15952  * It removes a modctl structure and its stale chain
15953  * from the kext shadow list.
15954  */
15955 static void
15956 dtrace_modctl_remove(struct modctl * ctl)
15957 {
15958         ASSERT(ctl != NULL);
15959         LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
15960         modctl_t *prevp, *nextp, *curp;
15961
15962         // Remove stale chain first
15963         for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
15964                 nextp = curp->mod_stale;
15965                 /* There should NEVER be user symbols allocated at this point */
15966                 ASSERT(curp->mod_user_symbols == NULL);
15967                 kmem_free(curp, sizeof(modctl_t));
15968         }
15969
15970         prevp = NULL;
15971         curp = dtrace_modctl_list;
15972
15973         while (curp != ctl) {
15974                 prevp = curp;
15975                 curp = curp->mod_next;
15976         }
15977
15978         if (prevp != NULL) {
15979                 prevp->mod_next = ctl->mod_next;
15980         }
15981         else {
15982                 dtrace_modctl_list = ctl->mod_next;
15983         }
15984
15985         /* There should NEVER be user symbols allocated at this point */
15986         ASSERT(ctl->mod_user_symbols == NULL);
15987
15988         kmem_free (ctl, sizeof(modctl_t));
15989 }
15990
15991 /*
15992  * APPLE NOTE: The kext loader will call dtrace_module_loaded
15993  * when the kext is loaded in memory, but before calling the
15994  * kext's start routine.
15995  *
15996  * Return 0 on success
15997  * Return -1 on failure
15998  */
15999
16000 static int
16001 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16002 {
16003         dtrace_provider_t *prv;
16004
16005         /*
16006          * If kernel symbols have been disabled, return immediately
16007          * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks
16008          */
16009         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16010                 return 0;
16011
16012         struct modctl *ctl = NULL;
16013         if (!kmod || kmod->address == 0 || kmod->size == 0)
16014                 return(-1);
16015
16016         lck_mtx_lock(&dtrace_provider_lock);
16017         lck_mtx_lock(&mod_lock);
16018
16019         /*
16020          * Have we seen this kext before?
16021          */
16022
16023         ctl = dtrace_modctl_lookup(kmod);
16024
16025         if (ctl != NULL) {
16026                 /* bail... we already have this kext in the modctl list */
16027                 lck_mtx_unlock(&mod_lock);
16028                 lck_mtx_unlock(&dtrace_provider_lock);
16029                 if (dtrace_err_verbose)
16030                         cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16031                 return(-1);
16032         }
16033         else {
16034                 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16035                 if (ctl == NULL) {
16036                         if (dtrace_err_verbose)
16037                                 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16038                         lck_mtx_unlock(&mod_lock);
16039                         lck_mtx_unlock(&dtrace_provider_lock);
16040                         return (-1);
16041                 }
16042                 ctl->mod_next = NULL;
16043                 ctl->mod_stale = NULL;
16044                 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16045                 ctl->mod_loadcnt = kmod->id;
16046                 ctl->mod_nenabled = 0;
16047                 ctl->mod_address  = kmod->address;
16048                 ctl->mod_size = kmod->size;
16049                 ctl->mod_id = kmod->id;
16050                 ctl->mod_loaded = 1;
16051                 ctl->mod_flags = 0;
16052                 ctl->mod_user_symbols = NULL;
16053
16054                 /*
16055                  * Find the UUID for this module, if it has one
16056                  */
16057                 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16058                 struct load_command* load_cmd = (struct load_command *)&header[1];
16059                 uint32_t i;
16060                 for (i = 0; i < header->ncmds; i++) {
16061                         if (load_cmd->cmd == LC_UUID) {
16062                                 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16063                                 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16064                                 ctl->mod_flags |= MODCTL_HAS_UUID;
16065                                 break;
16066                         }
16067                         load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16068                 }
16069
16070                 if (ctl->mod_address == g_kernel_kmod_info.address) {
16071                         ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16072                         memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16073                 }
16074                 /*
16075                  * Static kexts have a UUID that is not used for symbolication, as all their
16076                  * symbols are in kernel
16077                  */
16078                 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16079                         memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16080                         ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16081                 }
16082         }
16083         dtrace_modctl_add(ctl);
16084
16085         /*
16086          * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
16087          */
16088         lck_mtx_lock(&dtrace_lock);
16089
16090         /*
16091          * DTrace must decide if it will instrument modules lazily via
16092          * userspace symbols (default mode), or instrument immediately via
16093          * kernel symbols (non-default mode)
16094          *
16095          * When in default/lazy mode, DTrace will only support modules
16096          * built with a valid UUID.
16097          *
16098          * Overriding the default can be done explicitly in one of
16099          * the following two ways.
16100          *
16101          * A module can force symbols from kernel space using the plist key,
16102          * OSBundleForceDTraceInit (see kmod.h).  If this per kext state is set,
16103          * we fall through and instrument this module now.
16104          *
16105          * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16106          * from kernel space (see dtrace_impl.h).  If this system state is set
16107          * to a non-userspace mode, we fall through and instrument the module now.
16108          */
16109
16110         if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16111             (!(flag & KMOD_DTRACE_FORCE_INIT)))
16112         {
16113                 /* We will instrument the module lazily -- this is the default */
16114                 lck_mtx_unlock(&dtrace_lock);
16115                 lck_mtx_unlock(&mod_lock);
16116                 lck_mtx_unlock(&dtrace_provider_lock);
16117                 return 0;
16118         }
16119
16120         /* We will instrument the module immediately using kernel symbols */
16121         ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16122
16123         lck_mtx_unlock(&dtrace_lock);
16124
16125         /*
16126          * We're going to call each providers per-module provide operation
16127          * specifying only this module.
16128          */
16129         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16130                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16131
16132         /*
16133          * APPLE NOTE: The contract with the kext loader is that once this function
16134          * has completed, it may delete kernel symbols at will.
16135          * We must set this while still holding the mod_lock.
16136          */
16137         ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16138
16139         lck_mtx_unlock(&mod_lock);
16140         lck_mtx_unlock(&dtrace_provider_lock);
16141
16142         /*
16143          * If we have any retained enablings, we need to match against them.
16144          * Enabling probes requires that cpu_lock be held, and we cannot hold
16145          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16146          * module.  (In particular, this happens when loading scheduling
16147          * classes.)  So if we have any retained enablings, we need to dispatch
16148          * our task queue to do the match for us.
16149          */
16150         lck_mtx_lock(&dtrace_lock);
16151
16152         if (dtrace_retained == NULL) {
16153                 lck_mtx_unlock(&dtrace_lock);
16154                 return 0;
16155         }
16156
16157         /* APPLE NOTE!
16158          *
16159          * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually
16160          * holds it for any reason. Thus the comment above is invalid, we can directly invoke
16161          * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16162          * the delay call as well.
16163          */
16164         lck_mtx_unlock(&dtrace_lock);
16165
16166         dtrace_enabling_matchall();
16167
16168         return 0;
16169 }
16170
16171 /*
16172  * Return 0 on success
16173  * Return -1 on failure
16174  */
16175 static int
16176 dtrace_module_unloaded(struct kmod_info *kmod)
16177 {
16178         dtrace_probe_t template, *probe, *first, *next;
16179         dtrace_provider_t *prov;
16180         struct modctl *ctl = NULL;
16181         struct modctl *syncctl = NULL;
16182         struct modctl *nextsyncctl = NULL;
16183         int syncmode = 0;
16184
16185         lck_mtx_lock(&dtrace_provider_lock);
16186         lck_mtx_lock(&mod_lock);
16187         lck_mtx_lock(&dtrace_lock);
16188
16189         if (kmod == NULL) {
16190             syncmode = 1;
16191         }
16192         else {
16193             ctl = dtrace_modctl_lookup(kmod);
16194             if (ctl == NULL)
16195             {
16196                 lck_mtx_unlock(&dtrace_lock);
16197                 lck_mtx_unlock(&mod_lock);
16198                 lck_mtx_unlock(&dtrace_provider_lock);
16199                 return (-1);
16200             }
16201             ctl->mod_loaded = 0;
16202             ctl->mod_address = 0;
16203             ctl->mod_size = 0;
16204         }
16205
16206         if (dtrace_bymod == NULL) {
16207                 /*
16208                  * The DTrace module is loaded (obviously) but not attached;
16209                  * we don't have any work to do.
16210                  */
16211                  if (ctl != NULL)
16212                          (void)dtrace_modctl_remove(ctl);
16213                  lck_mtx_unlock(&dtrace_lock);
16214                  lck_mtx_unlock(&mod_lock);
16215                  lck_mtx_unlock(&dtrace_provider_lock);
16216                  return(0);
16217         }
16218
16219         /* Syncmode set means we target and traverse entire modctl list. */
16220         if (syncmode)
16221             nextsyncctl = dtrace_modctl_list;
16222
16223 syncloop:
16224         if (syncmode)
16225         {
16226             /* find a stale modctl struct */
16227             for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
16228                 if (syncctl->mod_address == 0)
16229                     break;
16230             }
16231             if (syncctl==NULL)
16232             {
16233                 /* We have no more work to do */
16234                 lck_mtx_unlock(&dtrace_lock);
16235                 lck_mtx_unlock(&mod_lock);
16236                 lck_mtx_unlock(&dtrace_provider_lock);
16237                 return(0);
16238             }
16239             else {
16240                 /* keep track of next syncctl in case this one is removed */
16241                 nextsyncctl = syncctl->mod_next;
16242                 ctl = syncctl;
16243             }
16244         }
16245
16246         template.dtpr_mod = ctl->mod_modname;
16247
16248         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16249             probe != NULL; probe = probe->dtpr_nextmod) {
16250                 if (probe->dtpr_ecb != NULL) {
16251                         /*
16252                          * This shouldn't _actually_ be possible -- we're
16253                          * unloading a module that has an enabled probe in it.
16254                          * (It's normally up to the provider to make sure that
16255                          * this can't happen.)  However, because dtps_enable()
16256                          * doesn't have a failure mode, there can be an
16257                          * enable/unload race.  Upshot:  we don't want to
16258                          * assert, but we're not going to disable the
16259                          * probe, either.
16260                          */
16261
16262
16263                         if (syncmode) {
16264                             /* We're syncing, let's look at next in list */
16265                             goto syncloop;
16266                         }
16267
16268                         lck_mtx_unlock(&dtrace_lock);
16269                         lck_mtx_unlock(&mod_lock);
16270                         lck_mtx_unlock(&dtrace_provider_lock);
16271
16272                         if (dtrace_err_verbose) {
16273                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16274                                     "enabled probes", ctl->mod_modname);
16275                         }
16276                         return(-1);
16277                 }
16278         }
16279
16280         probe = first;
16281
16282         for (first = NULL; probe != NULL; probe = next) {
16283                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16284
16285                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16286                 probe->dtpr_provider->dtpv_probe_count--;
16287
16288                 next = probe->dtpr_nextmod;
16289                 dtrace_hash_remove(dtrace_byprov, probe);
16290                 dtrace_hash_remove(dtrace_bymod, probe);
16291                 dtrace_hash_remove(dtrace_byfunc, probe);
16292                 dtrace_hash_remove(dtrace_byname, probe);
16293
16294                 if (first == NULL) {
16295                         first = probe;
16296                         probe->dtpr_nextmod = NULL;
16297                 } else {
16298                         probe->dtpr_nextmod = first;
16299                         first = probe;
16300                 }
16301         }
16302
16303         /*
16304          * We've removed all of the module's probes from the hash chains and
16305          * from the probe array.  Now issue a dtrace_sync() to be sure that
16306          * everyone has cleared out from any probe array processing.
16307          */
16308         dtrace_sync();
16309
16310         for (probe = first; probe != NULL; probe = first) {
16311                 first = probe->dtpr_nextmod;
16312                 prov = probe->dtpr_provider;
16313                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16314                     probe->dtpr_arg);
16315                 dtrace_strunref(probe->dtpr_mod);
16316                 dtrace_strunref(probe->dtpr_func);
16317                 dtrace_strunref(probe->dtpr_name);
16318                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16319
16320                 zfree(dtrace_probe_t_zone, probe);
16321         }
16322
16323         dtrace_modctl_remove(ctl);
16324
16325         if (syncmode)
16326             goto syncloop;
16327
16328         lck_mtx_unlock(&dtrace_lock);
16329         lck_mtx_unlock(&mod_lock);
16330         lck_mtx_unlock(&dtrace_provider_lock);
16331
16332         return(0);
16333 }
16334
16335 void
16336 dtrace_suspend(void)
16337 {
16338         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16339 }
16340
16341 void
16342 dtrace_resume(void)
16343 {
16344         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16345 }
16346
16347 static int
16348 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16349 {
16350         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16351         lck_mtx_lock(&dtrace_lock);
16352
16353         switch (what) {
16354         case CPU_CONFIG: {
16355                 dtrace_state_t *state;
16356                 dtrace_optval_t *opt, rs, c;
16357
16358                 /*
16359                  * For now, we only allocate a new buffer for anonymous state.
16360                  */
16361                 if ((state = dtrace_anon.dta_state) == NULL)
16362                         break;
16363
16364                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16365                         break;
16366
16367                 opt = state->dts_options;
16368                 c = opt[DTRACEOPT_CPU];
16369
16370                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16371                         break;
16372
16373                 /*
16374                  * Regardless of what the actual policy is, we're going to
16375                  * temporarily set our resize policy to be manual.  We're
16376                  * also going to temporarily set our CPU option to denote
16377                  * the newly configured CPU.
16378                  */
16379                 rs = opt[DTRACEOPT_BUFRESIZE];
16380                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16381                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16382
16383                 (void) dtrace_state_buffers(state);
16384
16385                 opt[DTRACEOPT_BUFRESIZE] = rs;
16386                 opt[DTRACEOPT_CPU] = c;
16387
16388                 break;
16389         }
16390
16391         case CPU_UNCONFIG:
16392                 /*
16393                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
16394                  * buffer will be freed when the consumer exits.)
16395                  */
16396                 break;
16397
16398         default:
16399                 break;
16400         }
16401
16402         lck_mtx_unlock(&dtrace_lock);
16403         return (0);
16404 }
16405
16406 static void
16407 dtrace_cpu_setup_initial(processorid_t cpu)
16408 {
16409         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16410 }
16411
16412 static void
16413 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16414 {
16415         if (dtrace_toxranges >= dtrace_toxranges_max) {
16416                 int osize, nsize;
16417                 dtrace_toxrange_t *range;
16418
16419                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16420
16421                 if (osize == 0) {
16422                         ASSERT(dtrace_toxrange == NULL);
16423                         ASSERT(dtrace_toxranges_max == 0);
16424                         dtrace_toxranges_max = 1;
16425                 } else {
16426                         dtrace_toxranges_max <<= 1;
16427                 }
16428
16429                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16430                 range = kmem_zalloc(nsize, KM_SLEEP);
16431
16432                 if (dtrace_toxrange != NULL) {
16433                         ASSERT(osize != 0);
16434                         bcopy(dtrace_toxrange, range, osize);
16435                         kmem_free(dtrace_toxrange, osize);
16436                 }
16437
16438                 dtrace_toxrange = range;
16439         }
16440
16441         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16442         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16443
16444         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16445         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16446         dtrace_toxranges++;
16447 }
16448
16449 /*
16450  * DTrace Driver Cookbook Functions
16451  */
16452 /*ARGSUSED*/
16453 static int
16454 dtrace_attach(dev_info_t *devi)
16455 {
16456         dtrace_provider_id_t id;
16457         dtrace_state_t *state = NULL;
16458         dtrace_enabling_t *enab;
16459
16460         lck_mtx_lock(&cpu_lock);
16461         lck_mtx_lock(&dtrace_provider_lock);
16462         lck_mtx_lock(&dtrace_lock);
16463
16464         /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */
16465         dtrace_devi = devi;
16466
16467         dtrace_modload = dtrace_module_loaded;
16468         dtrace_modunload = dtrace_module_unloaded;
16469         dtrace_cpu_init = dtrace_cpu_setup_initial;
16470         dtrace_helpers_cleanup = dtrace_helpers_destroy;
16471         dtrace_helpers_fork = dtrace_helpers_duplicate;
16472         dtrace_cpustart_init = dtrace_suspend;
16473         dtrace_cpustart_fini = dtrace_resume;
16474         dtrace_debugger_init = dtrace_suspend;
16475         dtrace_debugger_fini = dtrace_resume;
16476
16477         register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16478
16479         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16480
16481         dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16482             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16483
16484         dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16485             sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
16486             NULL, NULL, NULL, NULL, NULL, 0);
16487
16488         LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
16489
16490         dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
16491             0, /* unused */
16492             offsetof(dtrace_probe_t, dtpr_nextprov),
16493             offsetof(dtrace_probe_t, dtpr_prevprov));
16494
16495         dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
16496             offsetof(dtrace_probe_t, dtpr_mod),
16497             offsetof(dtrace_probe_t, dtpr_nextmod),
16498             offsetof(dtrace_probe_t, dtpr_prevmod));
16499
16500         dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
16501             offsetof(dtrace_probe_t, dtpr_func),
16502             offsetof(dtrace_probe_t, dtpr_nextfunc),
16503             offsetof(dtrace_probe_t, dtpr_prevfunc));
16504
16505         dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
16506             offsetof(dtrace_probe_t, dtpr_name),
16507             offsetof(dtrace_probe_t, dtpr_nextname),
16508             offsetof(dtrace_probe_t, dtpr_prevname));
16509
16510         if (dtrace_retain_max < 1) {
16511                 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16512                     "setting to 1", dtrace_retain_max);
16513                 dtrace_retain_max = 1;
16514         }
16515
16516         /*
16517          * Now discover our toxic ranges.
16518          */
16519         dtrace_toxic_ranges(dtrace_toxrange_add);
16520
16521         /*
16522          * Before we register ourselves as a provider to our own framework,
16523          * we would like to assert that dtrace_provider is NULL -- but that's
16524          * not true if we were loaded as a dependency of a DTrace provider.
16525          * Once we've registered, we can assert that dtrace_provider is our
16526          * pseudo provider.
16527          */
16528         (void) dtrace_register("dtrace", &dtrace_provider_attr,
16529             DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16530
16531         ASSERT(dtrace_provider != NULL);
16532         ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16533
16534 #if defined (__x86_64__)
16535         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16536             dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
16537         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16538             dtrace_provider, NULL, NULL, "END", 0, NULL);
16539         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16540             dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
16541 #elif (defined(__arm__) || defined(__arm64__))
16542         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16543             dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
16544         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16545             dtrace_provider, NULL, NULL, "END", 1, NULL);
16546         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16547             dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
16548 #else
16549 #error Unknown Architecture
16550 #endif
16551
16552         dtrace_anon_property();
16553         lck_mtx_unlock(&cpu_lock);
16554
16555         /*
16556          * If DTrace helper tracing is enabled, we need to allocate the
16557          * trace buffer and initialize the values.
16558          */
16559         if (dtrace_helptrace_enabled) {
16560                 ASSERT(dtrace_helptrace_buffer == NULL);
16561                 dtrace_helptrace_buffer =
16562                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16563                 dtrace_helptrace_next = 0;
16564         }
16565
16566         /*
16567          * If there are already providers, we must ask them to provide their
16568          * probes, and then match any anonymous enabling against them.  Note
16569          * that there should be no other retained enablings at this time:
16570          * the only retained enablings at this time should be the anonymous
16571          * enabling.
16572          */
16573         if (dtrace_anon.dta_enabling != NULL) {
16574                 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16575
16576                 /*
16577                  * APPLE NOTE: if handling anonymous dof, switch symbol modes.
16578                  */
16579                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16580                         dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16581                 }
16582
16583                 dtrace_enabling_provide(NULL);
16584                 state = dtrace_anon.dta_state;
16585
16586                 /*
16587                  * We couldn't hold cpu_lock across the above call to
16588                  * dtrace_enabling_provide(), but we must hold it to actually
16589                  * enable the probes.  We have to drop all of our locks, pick
16590                  * up cpu_lock, and regain our locks before matching the
16591                  * retained anonymous enabling.
16592                  */
16593                 lck_mtx_unlock(&dtrace_lock);
16594                 lck_mtx_unlock(&dtrace_provider_lock);
16595
16596                 lck_mtx_lock(&cpu_lock);
16597                 lck_mtx_lock(&dtrace_provider_lock);
16598                 lck_mtx_lock(&dtrace_lock);
16599
16600                 if ((enab = dtrace_anon.dta_enabling) != NULL)
16601                         (void) dtrace_enabling_match(enab, NULL, NULL);
16602
16603                 lck_mtx_unlock(&cpu_lock);
16604         }
16605
16606         lck_mtx_unlock(&dtrace_lock);
16607         lck_mtx_unlock(&dtrace_provider_lock);
16608
16609         if (state != NULL) {
16610                 /*
16611                  * If we created any anonymous state, set it going now.
16612                  */
16613                 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16614         }
16615
16616         return (DDI_SUCCESS);
16617 }
16618
16619 /*ARGSUSED*/
16620 static int
16621 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16622 {
16623 #pragma unused(flag, otyp)
16624         dtrace_state_t *state;
16625         uint32_t priv;
16626         uid_t uid;
16627         zoneid_t zoneid;
16628         int rv;
16629
16630         /* APPLE: Darwin puts Helper on its own major device. */
16631
16632         /*
16633          * If no DTRACE_PRIV_* bits are set in the credential, then the
16634          * caller lacks sufficient permission to do anything with DTrace.
16635          */
16636         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16637         if (priv == DTRACE_PRIV_NONE)
16638                 return (EACCES);
16639
16640         /*
16641          * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
16642          * It certainly can't be later than now!
16643          */
16644         fasttrap_init();
16645
16646         /*
16647          * Ask all providers to provide all their probes.
16648          */
16649         lck_mtx_lock(&dtrace_provider_lock);
16650         dtrace_probe_provide(NULL, NULL);
16651         lck_mtx_unlock(&dtrace_provider_lock);
16652
16653         lck_mtx_lock(&cpu_lock);
16654         lck_mtx_lock(&dtrace_lock);
16655         dtrace_opens++;
16656         dtrace_membar_producer();
16657
16658 #ifdef illumos
16659         /*
16660          * If the kernel debugger is active (that is, if the kernel debugger
16661          * modified text in some way), we won't allow the open.
16662          */
16663         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16664                 dtrace_opens--;
16665                 lck_mtx_unlock(&dtrace_lock);
16666                 lck_mtx_unlock(&cpu_lock);
16667                 return (EBUSY);
16668         }
16669 #endif
16670
16671         rv = dtrace_state_create(devp, cred_p, &state);
16672         lck_mtx_unlock(&cpu_lock);
16673
16674         if (rv != 0 || state == NULL) {
16675                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
16676 #ifdef illumos
16677                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16678 #endif
16679                 }
16680                 lck_mtx_unlock(&dtrace_lock);
16681                 /* propagate EAGAIN or ERESTART */
16682                 return (rv);
16683         }
16684
16685         lck_mtx_unlock(&dtrace_lock);
16686
16687         lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
16688
16689         /*
16690          * If we are currently lazy, transition states.
16691          *
16692          * Unlike dtrace_close, we do not need to check the
16693          * value of dtrace_opens, as any positive value (and
16694          * we count as 1) means we transition states.
16695          */
16696         if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
16697                 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
16698                 /*
16699                  * We do not need to hold the exclusive lock while processing
16700                  * DOF on processes. We do need to make sure the mode does not get
16701                  * changed to DTRACE_DOF_MODE_LAZY_ON during that stage though
16702                  * (which should not happen anyway since it only happens in
16703                  * dtrace_close). There is no way incomplete USDT probes can be
16704                  * activated by any DTrace clients here since they all have to
16705                  * call dtrace_open and be blocked on dtrace_dof_mode_lock.
16706                  */
16707                 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
16708                 /*
16709                  * Iterate all existing processes and load lazy dofs.
16710                  */
16711                 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
16712                              dtrace_lazy_dofs_proc_iterate_doit,
16713                              NULL,
16714                              dtrace_lazy_dofs_proc_iterate_filter,
16715                              NULL);
16716
16717                 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16718         }
16719         else {
16720                 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
16721         }
16722
16723
16724         /*
16725          * Update kernel symbol state.
16726          *
16727          * We must own the provider and dtrace locks.
16728          *
16729          * NOTE! It may appear there is a race by setting this value so late
16730          * after dtrace_probe_provide. However, any kext loaded after the
16731          * call to probe provide and before we set LAZY_OFF will be marked as
16732          * eligible for symbols from userspace. The same dtrace that is currently
16733          * calling dtrace_open() (this call!) will get a list of kexts needing
16734          * symbols and fill them in, thus closing the race window.
16735          *
16736          * We want to set this value only after it is certain it will succeed, as
16737          * this significantly reduces the complexity of error exits.
16738          */
16739         lck_mtx_lock(&dtrace_lock);
16740         if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
16741                 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
16742         }
16743         lck_mtx_unlock(&dtrace_lock);
16744
16745         return (0);
16746 }
16747
16748 /*ARGSUSED*/
16749 static int
16750 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16751 {
16752 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
16753         minor_t minor = getminor(dev);
16754         dtrace_state_t *state;
16755
16756         /* APPLE NOTE: Darwin puts Helper on its own major device. */
16757         state = dtrace_state_get(minor);
16758
16759         lck_mtx_lock(&cpu_lock);
16760         lck_mtx_lock(&dtrace_lock);
16761
16762         if (state->dts_anon) {
16763                 /*
16764                  * There is anonymous state. Destroy that first.
16765                  */
16766                 ASSERT(dtrace_anon.dta_state == NULL);
16767                 dtrace_state_destroy(state->dts_anon);
16768         }
16769
16770         dtrace_state_destroy(state);
16771         ASSERT(dtrace_opens > 0);
16772
16773         /*
16774          * Only relinquish control of the kernel debugger interface when there
16775          * are no consumers and no anonymous enablings.
16776          */
16777         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
16778 #ifdef illumos
16779                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16780 #endif
16781         }
16782
16783         lck_mtx_unlock(&dtrace_lock);
16784         lck_mtx_unlock(&cpu_lock);
16785
16786         /*
16787          * Lock ordering requires the dof mode lock be taken before
16788          * the dtrace_lock.
16789          */
16790         lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
16791         lck_mtx_lock(&dtrace_lock);
16792
16793         if (dtrace_opens == 0) {
16794                 /*
16795                  * If we are currently lazy-off, and this is the last close, transition to
16796                  * lazy state.
16797                  */
16798                 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16799                         dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
16800                 }
16801
16802                 /*
16803                  * If we are the last dtrace client, switch back to lazy (from userspace) symbols
16804                  */
16805                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
16806                         dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
16807                 }
16808         }
16809
16810         lck_mtx_unlock(&dtrace_lock);
16811         lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
16812
16813         /*
16814          * Kext probes may be retained past the end of the kext's lifespan. The
16815          * probes are kept until the last reference to them has been removed.
16816          * Since closing an active dtrace context is likely to drop that last reference,
16817          * lets take a shot at cleaning out the orphaned probes now.
16818          */
16819         dtrace_module_unloaded(NULL);
16820
16821         return (0);
16822 }
16823
16824 /*ARGSUSED*/
16825 static int
16826 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
16827 {
16828 #pragma unused(rv)
16829         /*
16830          * Safe to check this outside the dof mode lock
16831          */
16832         if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
16833                 return KERN_SUCCESS;
16834
16835         switch (cmd) {
16836 #if defined (__arm64__)
16837         case DTRACEHIOC_ADDDOF_U32:
16838         case DTRACEHIOC_ADDDOF_U64:
16839 #else
16840         case DTRACEHIOC_ADDDOF:
16841 #endif /* __arm64__*/
16842                         {
16843                         dof_helper_t *dhp = NULL;
16844                         size_t dof_ioctl_data_size;
16845                         dof_ioctl_data_t* multi_dof;
16846                         unsigned int i;
16847                         int rval = 0;
16848                         user_addr_t user_address = *(user_addr_t*)arg;
16849                         uint64_t dof_count;
16850                         int multi_dof_claimed = 0;
16851                         proc_t* p = current_proc();
16852
16853                         /*
16854                          * If this is a restricted process and dtrace is restricted,
16855                          * do not allow DOFs to be registered
16856                          */
16857                         if (dtrace_is_restricted() &&
16858                                 !dtrace_are_restrictions_relaxed() &&
16859                                 !dtrace_can_attach_to_proc(current_proc())) {
16860                                 return (EACCES);
16861                         }
16862
16863                         /*
16864                          * Read the number of DOF sections being passed in.
16865                          */
16866                         if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
16867                                    &dof_count,
16868                                    sizeof(dof_count))) {
16869                                 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
16870                                 return (EFAULT);
16871                         }
16872
16873                         /*
16874                          * Range check the count.
16875                          */
16876                         if (dof_count == 0 || dof_count > 1024) {
16877                                 dtrace_dof_error(NULL, "dofiod_count is not valid");
16878                                 return (EINVAL);
16879                         }
16880
16881                         /*
16882                          * Allocate a correctly sized structure and copyin the data.
16883                          */
16884                         dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
16885                         if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
16886                                 return (ENOMEM);
16887
16888                         /* NOTE! We can no longer exit this method via return */
16889                         if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
16890                                 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
16891                                 rval = EFAULT;
16892                                 goto cleanup;
16893                         }
16894
16895                         /*
16896                          * Check that the count didn't change between the first copyin and the second.
16897                          */
16898                         if (multi_dof->dofiod_count != dof_count) {
16899                                 rval = EINVAL;
16900                                 goto cleanup;
16901                         }
16902
16903                         /*
16904                          * Try to process lazily first.
16905                          */
16906                         rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
16907
16908                         /*
16909                          * If rval is EACCES, we must be non-lazy.
16910                          */
16911                         if (rval == EACCES) {
16912                                 rval = 0;
16913                                 /*
16914                                  * Process each dof_helper_t
16915                                  */
16916                                 i = 0;
16917                                 do {
16918                                         dhp = &multi_dof->dofiod_helpers[i];
16919
16920                                         dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
16921
16922                                         if (dof != NULL) {
16923                                                 lck_mtx_lock(&dtrace_meta_lock);
16924                                                 lck_mtx_lock(&dtrace_lock);
16925
16926                                                 /*
16927                                                  * dtrace_helper_slurp() takes responsibility for the dof --
16928                                                  * it may free it now or it may save it and free it later.
16929                                                  */
16930                                                 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
16931                                                         rval = EINVAL;
16932                                                 }
16933
16934                                                 lck_mtx_unlock(&dtrace_lock);
16935                                                 lck_mtx_unlock(&dtrace_meta_lock);
16936                                         }
16937                                 } while (++i < multi_dof->dofiod_count && rval == 0);
16938                         }
16939
16940                         /*
16941                          * We need to copyout the multi_dof struct, because it contains
16942                          * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
16943                          *
16944                          * This could certainly be better optimized.
16945                          */
16946                         if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
16947                                 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
16948                                 /* Don't overwrite pre-existing error code */
16949                                 if (rval == 0) rval = EFAULT;
16950                         }
16951
16952                 cleanup:
16953                         /*
16954                          * If we had to allocate struct memory, free it.
16955                          */
16956                         if (multi_dof != NULL && !multi_dof_claimed) {
16957                                 kmem_free(multi_dof, dof_ioctl_data_size);
16958                         }
16959
16960                         return rval;
16961                 }
16962
16963                 case DTRACEHIOC_REMOVE: {
16964                         int generation = *(int*)arg;
16965                         proc_t* p = current_proc();
16966
16967                         /*
16968                          * Try lazy first.
16969                          */
16970                         int rval = dtrace_lazy_dofs_remove(p, generation);
16971
16972                         /*
16973                          * EACCES means non-lazy
16974                          */
16975                         if (rval == EACCES) {
16976                                 lck_mtx_lock(&dtrace_meta_lock);
16977                                 lck_mtx_lock(&dtrace_lock);
16978                                 rval = dtrace_helper_destroygen(p, generation);
16979                                 lck_mtx_unlock(&dtrace_lock);
16980                                 lck_mtx_unlock(&dtrace_meta_lock);
16981                         }
16982
16983                         return (rval);
16984                 }
16985
16986                 default:
16987                         break;
16988         }
16989
16990         return ENOTTY;
16991 }
16992
16993 /*ARGSUSED*/
16994 static int
16995 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
16996 {
16997 #pragma unused(md)
16998         minor_t minor = getminor(dev);
16999         dtrace_state_t *state;
17000         int rval;
17001
17002         /* Darwin puts Helper on its own major device. */
17003
17004         state = dtrace_state_get(minor);
17005
17006         if (state->dts_anon) {
17007            ASSERT(dtrace_anon.dta_state == NULL);
17008            state = state->dts_anon;
17009         }
17010
17011         switch (cmd) {
17012         case DTRACEIOC_PROVIDER: {
17013                 dtrace_providerdesc_t pvd;
17014                 dtrace_provider_t *pvp;
17015
17016                 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17017                         return (EFAULT);
17018
17019                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17020                 lck_mtx_lock(&dtrace_provider_lock);
17021
17022                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17023                         if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17024                                 break;
17025                 }
17026
17027                 lck_mtx_unlock(&dtrace_provider_lock);
17028
17029                 if (pvp == NULL)
17030                         return (ESRCH);
17031
17032                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17033                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17034                 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17035                         return (EFAULT);
17036
17037                 return (0);
17038         }
17039
17040         case DTRACEIOC_EPROBE: {
17041                 dtrace_eprobedesc_t epdesc;
17042                 dtrace_ecb_t *ecb;
17043                 dtrace_action_t *act;
17044                 void *buf;
17045                 size_t size;
17046                 uintptr_t dest;
17047                 int nrecs;
17048
17049                 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17050                         return (EFAULT);
17051
17052                 lck_mtx_lock(&dtrace_lock);
17053
17054                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17055                         lck_mtx_unlock(&dtrace_lock);
17056                         return (EINVAL);
17057                 }
17058
17059                 if (ecb->dte_probe == NULL) {
17060                         lck_mtx_unlock(&dtrace_lock);
17061                         return (EINVAL);
17062                 }
17063
17064                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17065                 epdesc.dtepd_uarg = ecb->dte_uarg;
17066                 epdesc.dtepd_size = ecb->dte_size;
17067
17068                 nrecs = epdesc.dtepd_nrecs;
17069                 epdesc.dtepd_nrecs = 0;
17070                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17071                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17072                                 continue;
17073
17074                         epdesc.dtepd_nrecs++;
17075                 }
17076
17077                 /*
17078                  * Now that we have the size, we need to allocate a temporary
17079                  * buffer in which to store the complete description.  We need
17080                  * the temporary buffer to be able to drop dtrace_lock()
17081                  * across the copyout(), below.
17082                  */
17083                 size = sizeof (dtrace_eprobedesc_t) +
17084                         (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17085
17086                 buf = kmem_alloc(size, KM_SLEEP);
17087                 dest = (uintptr_t)buf;
17088
17089                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17090                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17091
17092                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17093                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17094                                 continue;
17095
17096                         if (nrecs-- == 0)
17097                                 break;
17098
17099                         bcopy(&act->dta_rec, (void *)dest,
17100                         sizeof (dtrace_recdesc_t));
17101                         dest += sizeof (dtrace_recdesc_t);
17102                 }
17103
17104                 lck_mtx_unlock(&dtrace_lock);
17105
17106                 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17107                         kmem_free(buf, size);
17108                         return (EFAULT);
17109                 }
17110
17111                 kmem_free(buf, size);
17112                 return (0);
17113         }
17114
17115         case DTRACEIOC_AGGDESC: {
17116                 dtrace_aggdesc_t aggdesc;
17117                 dtrace_action_t *act;
17118                 dtrace_aggregation_t *agg;
17119                 int nrecs;
17120                 uint32_t offs;
17121                 dtrace_recdesc_t *lrec;
17122                 void *buf;
17123                 size_t size;
17124                 uintptr_t dest;
17125
17126                 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17127                         return (EFAULT);
17128
17129                 lck_mtx_lock(&dtrace_lock);
17130
17131                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17132                         lck_mtx_unlock(&dtrace_lock);
17133                         return (EINVAL);
17134                 }
17135
17136                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17137
17138                 nrecs = aggdesc.dtagd_nrecs;
17139                 aggdesc.dtagd_nrecs = 0;
17140
17141                 offs = agg->dtag_base;
17142                 lrec = &agg->dtag_action.dta_rec;
17143                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17144
17145                 for (act = agg->dtag_first; ; act = act->dta_next) {
17146                         ASSERT(act->dta_intuple ||
17147                         DTRACEACT_ISAGG(act->dta_kind));
17148
17149                         /*
17150                          * If this action has a record size of zero, it
17151                          * denotes an argument to the aggregating action.
17152                          * Because the presence of this record doesn't (or
17153                          * shouldn't) affect the way the data is interpreted,
17154                          * we don't copy it out to save user-level the
17155                          * confusion of dealing with a zero-length record.
17156                          */
17157                         if (act->dta_rec.dtrd_size == 0) {
17158                                 ASSERT(agg->dtag_hasarg);
17159                                 continue;
17160                         }
17161
17162                         aggdesc.dtagd_nrecs++;
17163
17164                         if (act == &agg->dtag_action)
17165                                 break;
17166                 }
17167
17168                 /*
17169                  * Now that we have the size, we need to allocate a temporary
17170                  * buffer in which to store the complete description.  We need
17171                  * the temporary buffer to be able to drop dtrace_lock()
17172                  * across the copyout(), below.
17173                  */
17174                 size = sizeof (dtrace_aggdesc_t) +
17175                         (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17176
17177                 buf = kmem_alloc(size, KM_SLEEP);
17178                 dest = (uintptr_t)buf;
17179
17180                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17181                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17182
17183                 for (act = agg->dtag_first; ; act = act->dta_next) {
17184                         dtrace_recdesc_t rec = act->dta_rec;
17185
17186                         /*
17187                          * See the comment in the above loop for why we pass
17188                          * over zero-length records.
17189                          */
17190                         if (rec.dtrd_size == 0) {
17191                                 ASSERT(agg->dtag_hasarg);
17192                                 continue;
17193                         }
17194
17195                         if (nrecs-- == 0)
17196                                 break;
17197
17198                         rec.dtrd_offset -= offs;
17199                         bcopy(&rec, (void *)dest, sizeof (rec));
17200                         dest += sizeof (dtrace_recdesc_t);
17201
17202                         if (act == &agg->dtag_action)
17203                                 break;
17204                 }
17205
17206                 lck_mtx_unlock(&dtrace_lock);
17207
17208                 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17209                         kmem_free(buf, size);
17210                         return (EFAULT);
17211                 }
17212
17213                 kmem_free(buf, size);
17214                 return (0);
17215         }
17216
17217         case DTRACEIOC_ENABLE: {
17218                 dof_hdr_t *dof;
17219                 dtrace_enabling_t *enab = NULL;
17220                 dtrace_vstate_t *vstate;
17221                 int err = 0;
17222
17223                 *rv = 0;
17224
17225                 /*
17226                  * If a NULL argument has been passed, we take this as our
17227                  * cue to reevaluate our enablings.
17228                  */
17229                 if (arg == 0) {
17230                         dtrace_enabling_matchall();
17231
17232                         return (0);
17233                 }
17234
17235                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17236                         return (rval);
17237
17238                 lck_mtx_lock(&cpu_lock);
17239                 lck_mtx_lock(&dtrace_lock);
17240                 vstate = &state->dts_vstate;
17241
17242                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17243                         lck_mtx_unlock(&dtrace_lock);
17244                         lck_mtx_unlock(&cpu_lock);
17245                         dtrace_dof_destroy(dof);
17246                         return (EBUSY);
17247                 }
17248
17249                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17250                         lck_mtx_unlock(&dtrace_lock);
17251                         lck_mtx_unlock(&cpu_lock);
17252                         dtrace_dof_destroy(dof);
17253                         return (EINVAL);
17254                 }
17255
17256                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17257                         dtrace_enabling_destroy(enab);
17258                         lck_mtx_unlock(&dtrace_lock);
17259                         lck_mtx_unlock(&cpu_lock);
17260                         dtrace_dof_destroy(dof);
17261                         return (rval);
17262                 }
17263
17264                 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
17265                         err = dtrace_enabling_retain(enab);
17266                 } else {
17267                         dtrace_enabling_destroy(enab);
17268                 }
17269
17270                 lck_mtx_unlock(&dtrace_lock);
17271                 lck_mtx_unlock(&cpu_lock);
17272                 dtrace_dof_destroy(dof);
17273
17274                 return (err);
17275         }
17276
17277         case DTRACEIOC_REPLICATE: {
17278                 dtrace_repldesc_t desc;
17279                 dtrace_probedesc_t *match = &desc.dtrpd_match;
17280                 dtrace_probedesc_t *create = &desc.dtrpd_create;
17281                 int err;
17282
17283                 if (copyin(arg, &desc, sizeof (desc)) != 0)
17284                         return (EFAULT);
17285
17286                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17287                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17288                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17289                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17290
17291                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17292                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17293                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17294                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17295
17296                 lck_mtx_lock(&dtrace_lock);
17297                 err = dtrace_enabling_replicate(state, match, create);
17298                 lck_mtx_unlock(&dtrace_lock);
17299
17300                 return (err);
17301         }
17302
17303         case DTRACEIOC_PROBEMATCH:
17304         case DTRACEIOC_PROBES: {
17305                 dtrace_probe_t *probe = NULL;
17306                 dtrace_probedesc_t desc;
17307                 dtrace_probekey_t pkey;
17308                 dtrace_id_t i;
17309                 int m = 0;
17310                 uint32_t priv;
17311                 uid_t uid;
17312                 zoneid_t zoneid;
17313
17314                 if (copyin(arg, &desc, sizeof (desc)) != 0)
17315                         return (EFAULT);
17316
17317                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17318                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17319                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17320                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17321
17322                 /*
17323                  * Before we attempt to match this probe, we want to give
17324                  * all providers the opportunity to provide it.
17325                  */
17326                 if (desc.dtpd_id == DTRACE_IDNONE) {
17327                         lck_mtx_lock(&dtrace_provider_lock);
17328                         dtrace_probe_provide(&desc, NULL);
17329                         lck_mtx_unlock(&dtrace_provider_lock);
17330                         desc.dtpd_id++;
17331                 }
17332
17333                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17334
17335                 lck_mtx_lock(&dtrace_lock);
17336
17337                 if (cmd == DTRACEIOC_PROBEMATCH)  {
17338                         dtrace_probekey(&desc, &pkey);
17339                         pkey.dtpk_id = DTRACE_IDNONE;
17340
17341                         /* Quiet compiler warning */
17342                         for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17343                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17344                                         (m = dtrace_match_probe(probe, &pkey,
17345                                         priv, uid, zoneid)) != 0)
17346                                         break;
17347                         }
17348
17349                         if (m < 0) {
17350                                 lck_mtx_unlock(&dtrace_lock);
17351                                 return (EINVAL);
17352                         }
17353                         dtrace_probekey_release(&pkey);
17354
17355                 } else {
17356                         /* Quiet compiler warning */
17357                         for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
17358                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17359                                         dtrace_match_priv(probe, priv, uid, zoneid))
17360                                         break;
17361                         }
17362                 }
17363
17364                 if (probe == NULL) {
17365                         lck_mtx_unlock(&dtrace_lock);
17366                         return (ESRCH);
17367                 }
17368
17369                 dtrace_probe_description(probe, &desc);
17370                 lck_mtx_unlock(&dtrace_lock);
17371
17372                 if (copyout(&desc, arg, sizeof (desc)) != 0)
17373                         return (EFAULT);
17374
17375                 return (0);
17376         }
17377
17378         case DTRACEIOC_PROBEARG: {
17379                 dtrace_argdesc_t desc;
17380                 dtrace_probe_t *probe;
17381                 dtrace_provider_t *prov;
17382
17383                 if (copyin(arg, &desc, sizeof (desc)) != 0)
17384                         return (EFAULT);
17385
17386                 if (desc.dtargd_id == DTRACE_IDNONE)
17387                         return (EINVAL);
17388
17389                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17390                         return (EINVAL);
17391
17392                 lck_mtx_lock(&dtrace_provider_lock);
17393                 lck_mtx_lock(&mod_lock);
17394                 lck_mtx_lock(&dtrace_lock);
17395
17396                 /* Quiet compiler warning */
17397                 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
17398                         lck_mtx_unlock(&dtrace_lock);
17399                         lck_mtx_unlock(&mod_lock);
17400                         lck_mtx_unlock(&dtrace_provider_lock);
17401                         return (EINVAL);
17402                 }
17403
17404                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17405                         lck_mtx_unlock(&dtrace_lock);
17406                         lck_mtx_unlock(&mod_lock);
17407                         lck_mtx_unlock(&dtrace_provider_lock);
17408                         return (EINVAL);
17409                 }
17410
17411                 lck_mtx_unlock(&dtrace_lock);
17412
17413                 prov = probe->dtpr_provider;
17414
17415                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17416                 /*
17417                  * There isn't any typed information for this probe.
17418                  * Set the argument number to DTRACE_ARGNONE.
17419                  */
17420                         desc.dtargd_ndx = DTRACE_ARGNONE;
17421                 } else {
17422                         desc.dtargd_native[0] = '\0';
17423                         desc.dtargd_xlate[0] = '\0';
17424                         desc.dtargd_mapping = desc.dtargd_ndx;
17425
17426                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17427                         probe->dtpr_id, probe->dtpr_arg, &desc);
17428                 }
17429
17430                 lck_mtx_unlock(&mod_lock);
17431                 lck_mtx_unlock(&dtrace_provider_lock);
17432
17433                 if (copyout(&desc, arg, sizeof (desc)) != 0)
17434                         return (EFAULT);
17435
17436                 return (0);
17437         }
17438
17439         case DTRACEIOC_GO: {
17440                 processorid_t cpuid;
17441                 rval = dtrace_state_go(state, &cpuid);
17442
17443                 if (rval != 0)
17444                         return (rval);
17445
17446                 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
17447                         return (EFAULT);
17448
17449                 return (0);
17450         }
17451
17452         case DTRACEIOC_STOP: {
17453                 processorid_t cpuid;
17454
17455                 lck_mtx_lock(&dtrace_lock);
17456                 rval = dtrace_state_stop(state, &cpuid);
17457                 lck_mtx_unlock(&dtrace_lock);
17458
17459                 if (rval != 0)
17460                         return (rval);
17461
17462                 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
17463                         return (EFAULT);
17464
17465                 return (0);
17466         }
17467
17468         case DTRACEIOC_DOFGET: {
17469                 dof_hdr_t hdr, *dof;
17470                 uint64_t len;
17471
17472                 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
17473                         return (EFAULT);
17474
17475                 lck_mtx_lock(&dtrace_lock);
17476                 dof = dtrace_dof_create(state);
17477                 lck_mtx_unlock(&dtrace_lock);
17478
17479                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17480                 rval = copyout(dof, arg, len);
17481                 dtrace_dof_destroy(dof);
17482
17483                 return (rval == 0 ? 0 : EFAULT);
17484         }
17485
17486         case DTRACEIOC_SLEEP: {
17487                 int64_t time;
17488                 uint64_t abstime;
17489                 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
17490
17491                 if (copyin(arg, &time, sizeof(time)) != 0)
17492                         return (EFAULT);
17493
17494                 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
17495                 clock_absolutetime_interval_to_deadline(abstime, &abstime);
17496
17497                 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
17498                         if (state->dts_buf_over_limit > 0) {
17499                                 clear_wait(current_thread(), THREAD_INTERRUPTED);
17500                                 rvalue = DTRACE_WAKE_BUF_LIMIT;
17501                         } else {
17502                                 thread_block(THREAD_CONTINUE_NULL);
17503                                 if (state->dts_buf_over_limit > 0) {
17504                                         rvalue = DTRACE_WAKE_BUF_LIMIT;
17505                                 }
17506                         }
17507                 }
17508
17509                 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
17510                         return (EFAULT);
17511
17512                 return (0);
17513         }
17514
17515         case DTRACEIOC_SIGNAL: {
17516                 wakeup(state);
17517                 return (0);
17518         }
17519
17520         case DTRACEIOC_AGGSNAP:
17521         case DTRACEIOC_BUFSNAP: {
17522                 dtrace_bufdesc_t desc;
17523                 caddr_t cached;
17524                 boolean_t over_limit;
17525                 dtrace_buffer_t *buf;
17526
17527                 if (copyin(arg, &desc, sizeof (desc)) != 0)
17528                         return (EFAULT);
17529
17530                 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17531                         return (EINVAL);
17532
17533                 lck_mtx_lock(&dtrace_lock);
17534
17535                 if (cmd == DTRACEIOC_BUFSNAP) {
17536                         buf = &state->dts_buffer[desc.dtbd_cpu];
17537                 } else {
17538                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17539                 }
17540
17541                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17542                         size_t sz = buf->dtb_offset;
17543
17544                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17545                                 lck_mtx_unlock(&dtrace_lock);
17546                                 return (EBUSY);
17547                         }
17548
17549                         /*
17550                          * If this buffer has already been consumed, we're
17551                          * going to indicate that there's nothing left here
17552                          * to consume.
17553                          */
17554                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17555                                 lck_mtx_unlock(&dtrace_lock);
17556
17557                                 desc.dtbd_size = 0;
17558                                 desc.dtbd_drops = 0;
17559                                 desc.dtbd_errors = 0;
17560                                 desc.dtbd_oldest = 0;
17561                                 sz = sizeof (desc);
17562
17563                                 if (copyout(&desc, arg, sz) != 0)
17564                                         return (EFAULT);
17565
17566                                 return (0);
17567                         }
17568
17569                         /*
17570                          * If this is a ring buffer that has wrapped, we want
17571                          * to copy the whole thing out.
17572                          */
17573                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17574                                 dtrace_buffer_polish(buf);
17575                                 sz = buf->dtb_size;
17576                         }
17577
17578                         if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
17579                                 lck_mtx_unlock(&dtrace_lock);
17580                                 return (EFAULT);
17581                         }
17582
17583                         desc.dtbd_size = sz;
17584                         desc.dtbd_drops = buf->dtb_drops;
17585                         desc.dtbd_errors = buf->dtb_errors;
17586                         desc.dtbd_oldest = buf->dtb_xamot_offset;
17587                         desc.dtbd_timestamp = dtrace_gethrtime();
17588
17589                         lck_mtx_unlock(&dtrace_lock);
17590
17591                         if (copyout(&desc, arg, sizeof (desc)) != 0)
17592                                 return (EFAULT);
17593
17594                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
17595
17596                         return (0);
17597                 }
17598
17599                 if (buf->dtb_tomax == NULL) {
17600                         ASSERT(buf->dtb_xamot == NULL);
17601                         lck_mtx_unlock(&dtrace_lock);
17602                         return (ENOENT);
17603                 }
17604
17605                 cached = buf->dtb_tomax;
17606                 over_limit = buf->dtb_cur_limit == buf->dtb_size;
17607
17608                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17609
17610                 dtrace_xcall(desc.dtbd_cpu,
17611                         (dtrace_xcall_t)dtrace_buffer_switch, buf);
17612
17613                 state->dts_errors += buf->dtb_xamot_errors;
17614
17615                 /*
17616                 * If the buffers did not actually switch, then the cross call
17617                 * did not take place -- presumably because the given CPU is
17618                 * not in the ready set.  If this is the case, we'll return
17619                 * ENOENT.
17620                 */
17621                 if (buf->dtb_tomax == cached) {
17622                         ASSERT(buf->dtb_xamot != cached);
17623                         lck_mtx_unlock(&dtrace_lock);
17624                         return (ENOENT);
17625                 }
17626
17627                 ASSERT(cached == buf->dtb_xamot);
17628                 /*
17629                  * At this point we know the buffers have switched, so we
17630                  * can decrement the over limit count if the buffer was over
17631                  * its limit. The new buffer might already be over its limit
17632                  * too, but we don't care since we're guaranteed not to be
17633                  * checking the buffer over limit count at this point.
17634                  */
17635                 if (over_limit) {
17636                         uint32_t old = atomic_add_32(&state->dts_buf_over_limit, -1);
17637                         #pragma unused(old)
17638
17639                         /*
17640                          * Verify that we didn't underflow the value
17641                          */
17642                         ASSERT(old != 0);
17643                 }
17644
17645                 /*
17646                 * We have our snapshot; now copy it out.
17647                 */
17648                 if (dtrace_buffer_copyout(buf->dtb_xamot,
17649                                         (user_addr_t)desc.dtbd_data,
17650                                         buf->dtb_xamot_offset) != 0) {
17651                         lck_mtx_unlock(&dtrace_lock);
17652                         return (EFAULT);
17653                 }
17654
17655                 desc.dtbd_size = buf->dtb_xamot_offset;
17656                 desc.dtbd_drops = buf->dtb_xamot_drops;
17657                 desc.dtbd_errors = buf->dtb_xamot_errors;
17658                 desc.dtbd_oldest = 0;
17659                 desc.dtbd_timestamp = buf->dtb_switched;
17660
17661                 lck_mtx_unlock(&dtrace_lock);
17662
17663                 /*
17664                  * Finally, copy out the buffer description.
17665                  */
17666                 if (copyout(&desc, arg, sizeof (desc)) != 0)
17667                         return (EFAULT);
17668
17669                 return (0);
17670         }
17671
17672         case DTRACEIOC_CONF: {
17673                 dtrace_conf_t conf;
17674
17675                 bzero(&conf, sizeof (conf));
17676                 conf.dtc_difversion = DIF_VERSION;
17677                 conf.dtc_difintregs = DIF_DIR_NREGS;
17678                 conf.dtc_diftupregs = DIF_DTR_NREGS;
17679                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17680
17681                 if (copyout(&conf, arg, sizeof (conf)) != 0)
17682                         return (EFAULT);
17683
17684                 return (0);
17685         }
17686
17687         case DTRACEIOC_STATUS: {
17688                 dtrace_status_t stat;
17689                 dtrace_dstate_t *dstate;
17690                 int i, j;
17691                 uint64_t nerrs;
17692
17693                 /*
17694                 * See the comment in dtrace_state_deadman() for the reason
17695                 * for setting dts_laststatus to INT64_MAX before setting
17696                 * it to the correct value.
17697                 */
17698                 state->dts_laststatus = INT64_MAX;
17699                 dtrace_membar_producer();
17700                 state->dts_laststatus = dtrace_gethrtime();
17701
17702                 bzero(&stat, sizeof (stat));
17703
17704                 lck_mtx_lock(&dtrace_lock);
17705
17706                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17707                         lck_mtx_unlock(&dtrace_lock);
17708                         return (ENOENT);
17709                 }
17710
17711                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17712                         stat.dtst_exiting = 1;
17713
17714                 nerrs = state->dts_errors;
17715                 dstate = &state->dts_vstate.dtvs_dynvars;
17716
17717                 for (i = 0; i < (int)NCPU; i++) {
17718                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17719
17720                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
17721                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17722                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17723
17724                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17725                                 stat.dtst_filled++;
17726
17727                         nerrs += state->dts_buffer[i].dtb_errors;
17728
17729                         for (j = 0; j < state->dts_nspeculations; j++) {
17730                                 dtrace_speculation_t *spec;
17731                                 dtrace_buffer_t *buf;
17732
17733                                 spec = &state->dts_speculations[j];
17734                                 buf = &spec->dtsp_buffer[i];
17735                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
17736                         }
17737                 }
17738
17739                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
17740                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17741                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17742                 stat.dtst_dblerrors = state->dts_dblerrors;
17743                 stat.dtst_killed =
17744                         (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17745                 stat.dtst_errors = nerrs;
17746
17747                 lck_mtx_unlock(&dtrace_lock);
17748
17749                 if (copyout(&stat, arg, sizeof (stat)) != 0)
17750                         return (EFAULT);
17751
17752                 return (0);
17753         }
17754
17755         case DTRACEIOC_FORMAT: {
17756                 dtrace_fmtdesc_t fmt;
17757                 char *str;
17758                 int len;
17759
17760                 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
17761                         return (EFAULT);
17762
17763                 lck_mtx_lock(&dtrace_lock);
17764
17765                 if (fmt.dtfd_format == 0 ||
17766                         fmt.dtfd_format > state->dts_nformats) {
17767                         lck_mtx_unlock(&dtrace_lock);
17768                         return (EINVAL);
17769                 }
17770
17771                 /*
17772                  * Format strings are allocated contiguously and they are
17773                  * never freed; if a format index is less than the number
17774                  * of formats, we can assert that the format map is non-NULL
17775                  * and that the format for the specified index is non-NULL.
17776                  */
17777                 ASSERT(state->dts_formats != NULL);
17778                 str = state->dts_formats[fmt.dtfd_format - 1];
17779                 ASSERT(str != NULL);
17780
17781                 len = strlen(str) + 1;
17782
17783                 if (len > fmt.dtfd_length) {
17784                         fmt.dtfd_length = len;
17785
17786                         if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
17787                                 lck_mtx_unlock(&dtrace_lock);
17788                                 return (EINVAL);
17789                         }
17790                 } else {
17791                         if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
17792                                 lck_mtx_unlock(&dtrace_lock);
17793                                 return (EINVAL);
17794                         }
17795                 }
17796
17797                 lck_mtx_unlock(&dtrace_lock);
17798                 return (0);
17799         }
17800
17801         case DTRACEIOC_MODUUIDSLIST: {
17802                 size_t module_uuids_list_size;
17803                 dtrace_module_uuids_list_t* uuids_list;
17804                 uint64_t dtmul_count;
17805
17806                 /*
17807                  * Security restrictions make this operation illegal; if they are enabled,
17808                  * DTrace must refuse to provide any fbt probes.
17809                  */
17810                 if (dtrace_fbt_probes_restricted()) {
17811                         cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
17812                         return (EPERM);
17813                 }
17814
17815                 /*
17816                  * Fail if the kernel symbol mode makes this operation illegal.
17817                  * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
17818                  * for them without holding the dtrace_lock.
17819                  */
17820                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
17821                     dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
17822                         cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
17823                         return (EPERM);
17824                 }
17825
17826                 /*
17827                  * Read the number of UUID entries the caller says it is passing in.
17828                  */
17829                 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
17830                            &dtmul_count,
17831                            sizeof(dtmul_count))) {
17832                         cmn_err(CE_WARN, "failed to copyin dtmul_count");
17833                         return (EFAULT);
17834                 }
17835
17836                 /*
17837                  * Range check the count. More than 2k kexts is probably an error.
17838                  */
17839                 if (dtmul_count > 2048) {
17840                         cmn_err(CE_WARN, "dtmul_count is not valid");
17841                         return (EINVAL);
17842                 }
17843
17844                 /*
17845                  * For all queries, we return EINVAL when the user specified
17846                  * count does not match the actual number of modules we find
17847                  * available.
17848                  *
17849                  * If the user specified count is zero, then this serves as a
17850                  * simple query to count the available modules in need of symbols.
17851                  */
17852
17853                 rval = 0;
17854
17855                 if (dtmul_count == 0)
17856                 {
17857                         lck_mtx_lock(&mod_lock);
17858                         struct modctl* ctl = dtrace_modctl_list;
17859                         while (ctl) {
17860                                 /* Update the private probes bit */
17861                                 if (dtrace_provide_private_probes)
17862                                         ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17863
17864                                 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17865                                 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
17866                                         dtmul_count++;
17867                                         rval = EINVAL;
17868                                 }
17869                                 ctl = ctl->mod_next;
17870                         }
17871                         lck_mtx_unlock(&mod_lock);
17872
17873                         if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
17874                                 return (EFAULT);
17875                         else
17876                                 return (rval);
17877                 }
17878
17879                 /*
17880                  * If we reach this point, then we have a request for full list data.
17881                  * Allocate a correctly sized structure and copyin the data.
17882                  */
17883                 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
17884                 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
17885                         return (ENOMEM);
17886
17887                 /* NOTE! We can no longer exit this method via return */
17888                 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
17889                         cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
17890                         rval = EFAULT;
17891                         goto moduuidslist_cleanup;
17892                 }
17893
17894                 /*
17895                  * Check that the count didn't change between the first copyin and the second.
17896                  */
17897                 if (uuids_list->dtmul_count != dtmul_count) {
17898                         rval = EINVAL;
17899                         goto moduuidslist_cleanup;
17900                 }
17901
17902                 /*
17903                  * Build the list of UUIDs that need symbols
17904                  */
17905                 lck_mtx_lock(&mod_lock);
17906
17907                 dtmul_count = 0;
17908
17909                 struct modctl* ctl = dtrace_modctl_list;
17910                 while (ctl) {
17911                         /* Update the private probes bit */
17912                         if (dtrace_provide_private_probes)
17913                                 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
17914
17915                         /*
17916                          * We assume that userspace symbols will be "better" than kernel level symbols,
17917                          * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
17918                          * are available, add user syms if the module might use them.
17919                          */
17920                         ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
17921                         if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
17922                                 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
17923                                 if (dtmul_count++ < uuids_list->dtmul_count) {
17924                                         memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
17925                                 }
17926                         }
17927                         ctl = ctl->mod_next;
17928                 }
17929
17930                 lck_mtx_unlock(&mod_lock);
17931
17932                 if (uuids_list->dtmul_count < dtmul_count)
17933                         rval = EINVAL;
17934
17935                 uuids_list->dtmul_count = dtmul_count;
17936
17937                 /*
17938                  * Copy out the UUID list (or at least the count!)
17939                  */
17940                 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
17941                         cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
17942                         rval = EFAULT;
17943                 }
17944
17945         moduuidslist_cleanup:
17946                 /*
17947                  * If we had to allocate struct memory, free it.
17948                  */
17949                 if (uuids_list != NULL) {
17950                         kmem_free(uuids_list, module_uuids_list_size);
17951                 }
17952
17953                 return rval;
17954         }
17955
17956         case DTRACEIOC_PROVMODSYMS: {
17957                 size_t module_symbols_size;
17958                 dtrace_module_symbols_t* module_symbols;
17959                 uint64_t dtmodsyms_count;
17960
17961                 /*
17962                  * Security restrictions make this operation illegal; if they are enabled,
17963                  * DTrace must refuse to provide any fbt probes.
17964                  */
17965                 if (dtrace_fbt_probes_restricted()) {
17966                         cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
17967                         return (EPERM);
17968                 }
17969
17970                 /*
17971                  * Fail if the kernel symbol mode makes this operation illegal.
17972                  * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
17973                  * for them without holding the dtrace_lock.
17974                  */
17975                 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
17976                     dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
17977                         cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
17978                         return (EPERM);
17979                 }
17980
17981                 /*
17982                  * Read the number of module symbols structs being passed in.
17983                  */
17984                 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
17985                            &dtmodsyms_count,
17986                            sizeof(dtmodsyms_count))) {
17987                         cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
17988                         return (EFAULT);
17989                 }
17990
17991                 /*
17992                  * Range check the count. How much data can we pass around?
17993                  * FIX ME!
17994                  */
17995                 if (dtmodsyms_count == 0 || (dtmodsyms_count > 100 * 1024)) {
17996                         cmn_err(CE_WARN, "dtmodsyms_count is not valid");
17997                         return (EINVAL);
17998                 }
17999
18000                 /*
18001                  * Allocate a correctly sized structure and copyin the data.
18002                  */
18003                 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18004                 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18005                         return (ENOMEM);
18006
18007                 rval = 0;
18008
18009                 /* NOTE! We can no longer exit this method via return */
18010                 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18011                         cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18012                         rval = EFAULT;
18013                         goto module_symbols_cleanup;
18014                 }
18015
18016                 /*
18017                  * Check that the count didn't change between the first copyin and the second.
18018                  */
18019                 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18020                         rval = EINVAL;
18021                         goto module_symbols_cleanup;
18022                 }
18023
18024                 /*
18025                  * Find the modctl to add symbols to.
18026                  */
18027                 lck_mtx_lock(&dtrace_provider_lock);
18028                 lck_mtx_lock(&mod_lock);
18029
18030                 struct modctl* ctl = dtrace_modctl_list;
18031                 while (ctl) {
18032                         /* Update the private probes bit */
18033                         if (dtrace_provide_private_probes)
18034                                 ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
18035
18036                         ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18037                         if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18038                                 dtrace_provider_t *prv;
18039                                 ctl->mod_user_symbols = module_symbols;
18040
18041                                 /*
18042                                  * We're going to call each provider's per-module provide operation,
18043                                  * specifying only this module.
18044                                  */
18045                                 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18046                                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18047                                 /*
18048                                  * We gave every provider a chance to provide with the user syms, go ahead and clear them
18049                                  */
18050                                 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18051                         }
18052                         ctl = ctl->mod_next;
18053                 }
18054
18055                 lck_mtx_unlock(&mod_lock);
18056                 lck_mtx_unlock(&dtrace_provider_lock);
18057
18058         module_symbols_cleanup:
18059                 /*
18060                  * If we had to allocate struct memory, free it.
18061                  */
18062                 if (module_symbols != NULL) {
18063                         kmem_free(module_symbols, module_symbols_size);
18064                 }
18065
18066                 return rval;
18067         }
18068
18069         case DTRACEIOC_PROCWAITFOR: {
18070                 dtrace_procdesc_t pdesc = {
18071                         .p_name = {0},
18072                         .p_pid  = -1
18073                 };
18074
18075                 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18076                         goto proc_waitfor_error;
18077
18078                 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18079                         goto proc_waitfor_error;
18080
18081                 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18082                         goto proc_waitfor_error;
18083
18084                 return 0;
18085
18086         proc_waitfor_error:
18087                 /* The process was suspended, revert this since the client will not do it. */
18088                 if (pdesc.p_pid != -1) {
18089                         proc_t *proc = proc_find(pdesc.p_pid);
18090                         if (proc != PROC_NULL) {
18091                                 task_pidresume(proc->task);
18092                                 proc_rele(proc);
18093                         }
18094                 }
18095
18096                 return rval;
18097         }
18098
18099         default:
18100                 break;
18101         }
18102
18103         return (ENOTTY);
18104 }
18105
18106 /*
18107  * APPLE NOTE:  dtrace_detach not implemented
18108  */
18109 #if !defined(__APPLE__)
18110 /*ARGSUSED*/
18111 static int
18112 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18113 {
18114         dtrace_state_t *state;
18115
18116         switch (cmd) {
18117         case DDI_DETACH:
18118                 break;
18119
18120         case DDI_SUSPEND:
18121                 return (DDI_SUCCESS);
18122
18123         default:
18124                 return (DDI_FAILURE);
18125         }
18126
18127         lck_mtx_lock(&cpu_lock);
18128         lck_mtx_lock(&dtrace_provider_lock);
18129         lck_mtx_lock(&dtrace_lock);
18130
18131         ASSERT(dtrace_opens == 0);
18132
18133         if (dtrace_helpers > 0) {
18134                 lck_mtx_unlock(&dtrace_lock);
18135                 lck_mtx_unlock(&dtrace_provider_lock);
18136                 lck_mtx_unlock(&cpu_lock);
18137                 return (DDI_FAILURE);
18138         }
18139
18140         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18141                 lck_mtx_unlock(&dtrace_lock);
18142                 lck_mtx_unlock(&dtrace_provider_lock);
18143                 lck_mtx_unlock(&cpu_lock);
18144                 return (DDI_FAILURE);
18145         }
18146
18147         dtrace_provider = NULL;
18148
18149         if ((state = dtrace_anon_grab()) != NULL) {
18150                 /*
18151                  * If there were ECBs on this state, the provider should
18152                  * have not been allowed to detach; assert that there is
18153                  * none.
18154                  */
18155                 ASSERT(state->dts_necbs == 0);
18156                 dtrace_state_destroy(state);
18157
18158                 /*
18159                  * If we're being detached with anonymous state, we need to
18160                  * indicate to the kernel debugger that DTrace is now inactive.
18161                  */
18162                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18163         }
18164
18165         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18166         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18167         dtrace_cpu_init = NULL;
18168         dtrace_helpers_cleanup = NULL;
18169         dtrace_helpers_fork = NULL;
18170         dtrace_cpustart_init = NULL;
18171         dtrace_cpustart_fini = NULL;
18172         dtrace_debugger_init = NULL;
18173         dtrace_debugger_fini = NULL;
18174         dtrace_kreloc_init = NULL;
18175         dtrace_kreloc_fini = NULL;
18176         dtrace_modload = NULL;
18177         dtrace_modunload = NULL;
18178
18179         lck_mtx_unlock(&cpu_lock);
18180
18181         if (dtrace_helptrace_enabled) {
18182                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18183                 dtrace_helptrace_buffer = NULL;
18184         }
18185
18186         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18187         dtrace_probes = NULL;
18188         dtrace_nprobes = 0;
18189
18190         dtrace_hash_destroy(dtrace_strings);
18191         dtrace_hash_destroy(dtrace_byprov);
18192         dtrace_hash_destroy(dtrace_bymod);
18193         dtrace_hash_destroy(dtrace_byfunc);
18194         dtrace_hash_destroy(dtrace_byname);
18195         dtrace_strings = NULL;
18196         dtrace_byprov = NULL;
18197         dtrace_bymod = NULL;
18198         dtrace_byfunc = NULL;
18199         dtrace_byname = NULL;
18200
18201         kmem_cache_destroy(dtrace_state_cache);
18202         vmem_destroy(dtrace_arena);
18203
18204         if (dtrace_toxrange != NULL) {
18205                 kmem_free(dtrace_toxrange,
18206                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18207                 dtrace_toxrange = NULL;
18208                 dtrace_toxranges = 0;
18209                 dtrace_toxranges_max = 0;
18210         }
18211
18212         ddi_remove_minor_node(dtrace_devi, NULL);
18213         dtrace_devi = NULL;
18214
18215         ddi_soft_state_fini(&dtrace_softstate);
18216
18217         ASSERT(dtrace_vtime_references == 0);
18218         ASSERT(dtrace_opens == 0);
18219         ASSERT(dtrace_retained == NULL);
18220
18221         lck_mtx_unlock(&dtrace_lock);
18222         lck_mtx_unlock(&dtrace_provider_lock);
18223
18224 #ifdef illumos
18225         /*
18226          * We don't destroy the task queue until after we have dropped our
18227          * locks (taskq_destroy() may block on running tasks).  To prevent
18228          * attempting to do work after we have effectively detached but before
18229          * the task queue has been destroyed, all tasks dispatched via the
18230          * task queue must check that DTrace is still attached before
18231          * performing any operation.
18232          */
18233         taskq_destroy(dtrace_taskq);
18234         dtrace_taskq = NULL;
18235 #endif
18236
18237         return (DDI_SUCCESS);
18238 }
18239 #endif  /* __APPLE__ */
18240
/*
 * Forward declarations of the character-device entry points defined below,
 * declared through the d_open_t / d_close_t / d_ioctl_t function types so
 * their signatures match what the cdevsw tables in this file expect.
 */
d_open_t _dtrace_open, helper_open;
d_close_t _dtrace_close, helper_close;
d_ioctl_t _dtrace_ioctl, helper_ioctl;
18244
/*
 * cdevsw open entry point for the dtrace device.  Forwards to dtrace_open()
 * with the address of a private copy of the device number and the current
 * credential; the proc argument is not used.
 */
int
_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
	dev_t devcopy;

	(void)p;

	/* dtrace_open() takes a dev_t pointer, so give it a local to point at. */
	devcopy = dev;

	return (dtrace_open(&devcopy, flags, devtype, CRED()));
}
18253
/*
 * cdevsw open entry point for the helper device.  Opening needs no work:
 * every argument is ignored and 0 is always returned.
 */
int
helper_open(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return (0);
}
18260
/*
 * cdevsw close entry point for the dtrace device.  Forwards to
 * dtrace_close() with the current credential; the proc argument is not used.
 */
int
_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)p;

	return (dtrace_close(dev, flags, devtype, CRED()));
}
18267
/*
 * cdevsw close entry point for the helper device.  Closing needs no work:
 * every argument is ignored and 0 is always returned.
 */
int
helper_close(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return (0);
}
18274
18275 int
18276 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18277 {
18278 #pragma unused(p)
18279         int err, rv = 0;
18280     user_addr_t uaddrp;
18281
18282     if (proc_is64bit(p))
18283                 uaddrp = *(user_addr_t *)data;
18284         else
18285                 uaddrp = (user_addr_t) *(uint32_t *)data;
18286
18287         err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
18288
18289         /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18290         if (err != 0) {
18291                 ASSERT( (err & 0xfffff000) == 0 );
18292                 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18293         } else if (rv != 0) {
18294                 ASSERT( (rv & 0xfff00000) == 0 );
18295                 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18296         } else
18297                 return 0;
18298 }
18299
18300 int
18301 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
18302 {
18303 #pragma unused(dev,fflag,p)
18304         int err, rv = 0;
18305
18306         err = dtrace_ioctl_helper(cmd, data, &rv);
18307         /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
18308         if (err != 0) {
18309                 ASSERT( (err & 0xfffff000) == 0 );
18310                 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
18311         } else if (rv != 0) {
18312                 ASSERT( (rv & 0xfff00000) == 0 );
18313                 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
18314         } else
18315                 return 0;
18316 }
18317
/*
 * Major number handed to cdevsw_add() by helper_init().  Per the original
 * note, this value lets the kernel pick the device number.
 */
#define HELPER_MAJOR  -24 /* let the kernel pick the device number */

/*
 * A struct describing which functions will get invoked for certain
 * actions on the helper device.  Only open, close and ioctl are backed by
 * helper_* routines; the remaining slots hold the eno_* / nulldev
 * placeholders.
 */
static struct cdevsw helper_cdevsw =
{
	helper_open,		/* open */
	helper_close,		/* close */
	eno_rdwrt,			/* read */
	eno_rdwrt,			/* write */
	helper_ioctl,		/* ioctl */
	(stop_fcn_t *)nulldev, /* stop */
	(reset_fcn_t *)nulldev, /* reset */
	NULL,				/* tty's */
	eno_select,			/* select */
	eno_mmap,			/* mmap */
	eno_strat,			/* strategy */
	eno_getc,			/* getc */
	eno_putc,			/* putc */
	0					/* type */
};

/* Result of cdevsw_add() in helper_init(); stays 0 until that call is made. */
static int helper_majdevno = 0;

/*
 * Set to 1 at the end of a successful dtrace_init(); helper_init() panics
 * if it is still 0.
 */
static int gDTraceInited = 0;
18345
18346 void
18347 helper_init( void )
18348 {
18349         /*
18350          * Once the "helper" is initialized, it can take ioctl calls that use locks
18351          * and zones initialized in dtrace_init. Make certain dtrace_init was called
18352          * before us.
18353          */
18354
18355         if (!gDTraceInited) {
18356                 panic("helper_init before dtrace_init\n");
18357         }
18358
18359         if (0 >= helper_majdevno)
18360         {
18361                 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
18362
18363                 if (helper_majdevno < 0) {
18364                         printf("helper_init: failed to allocate a major number!\n");
18365                         return;
18366                 }
18367
18368                 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
18369                                         DTRACEMNR_HELPER, 0 )) {
18370                         printf("dtrace_init: failed to devfs_make_node for helper!\n");
18371                         return;
18372                 }
18373         } else
18374                 panic("helper_init: called twice!\n");
18375 }
18376
18377 #undef HELPER_MAJOR
18378
18379 static int
18380 dtrace_clone_func(dev_t dev, int action)
18381 {
18382 #pragma unused(dev)
18383
18384         if (action == DEVFS_CLONE_ALLOC) {
18385                 return dtrace_state_reserve();
18386         }
18387         else if (action == DEVFS_CLONE_FREE) {
18388                 return 0;
18389         }
18390         else return -1;
18391 }
18392
18393 void dtrace_ast(void);
18394
18395 void
18396 dtrace_ast(void)
18397 {
18398         int i;
18399         uint32_t clients = atomic_and_32(&dtrace_wake_clients, 0);
18400         if (clients == 0)
18401                 return;
18402         /**
18403          * We disable preemption here to be sure that we won't get
18404          * interrupted by a wakeup to a thread that is higher
18405          * priority than us, so that we do issue all wakeups
18406          */
18407         disable_preemption();
18408         for (i = 0; i < DTRACE_NCLIENTS; i++) {
18409                 if (clients & (1 << i)) {
18410                         dtrace_state_t *state = dtrace_state_get(i);
18411                         if (state) {
18412                                 wakeup(state);
18413                         }
18414
18415                 }
18416         }
18417         enable_preemption();
18418 }
18419
18420
/*
 * Major number handed to cdevsw_add() by dtrace_init().  Per the original
 * note, this value lets the kernel pick the device number.
 */
#define DTRACE_MAJOR  -24 /* let the kernel pick the device number */

/*
 * Character-device switch for the dtrace device.  Only open, close and
 * ioctl are backed by the _dtrace_* wrappers; the remaining slots hold the
 * eno_* / nulldev placeholders.
 */
static struct cdevsw dtrace_cdevsw =
{
	_dtrace_open,		/* open */
	_dtrace_close,		/* close */
	eno_rdwrt,			/* read */
	eno_rdwrt,			/* write */
	_dtrace_ioctl,		/* ioctl */
	(stop_fcn_t *)nulldev, /* stop */
	(reset_fcn_t *)nulldev, /* reset */
	NULL,				/* tty's */
	eno_select,			/* select */
	eno_mmap,			/* mmap */
	eno_strat,			/* strategy */
	eno_getc,			/* getc */
	eno_putc,			/* putc */
	0					/* type */
};
18440
/*
 * Lock attribute, lock-group attribute and lock group.  dtrace_init()
 * allocates all three and passes the group and attribute to each
 * lck_mtx_init() / lck_rw_init() call it makes.
 */
lck_attr_t* dtrace_lck_attr;
lck_grp_attr_t* dtrace_lck_grp_attr;
lck_grp_t* dtrace_lck_grp;

/*
 * Result of cdevsw_add() for the dtrace device, stored by dtrace_init().
 * dtrace_postinit() combines it with minor 0 to build the value it passes
 * to dtrace_attach().
 */
static int gMajDevNo;
18446
18447 void dtrace_early_init (void)
18448 {
18449         dtrace_restriction_policy_load();
18450
18451         /*
18452          * See dtrace_impl.h for a description of kernel symbol modes.
18453          * The default is to wait for symbols from userspace (lazy symbols).
18454          */
18455         if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
18456                 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
18457         }
18458 }
18459
18460 void
18461 dtrace_init( void )
18462 {
18463         if (0 == gDTraceInited) {
18464                 int i, ncpu;
18465                 size_t size = sizeof(dtrace_buffer_memory_maxsize);
18466
18467                 /*
18468                  * DTrace allocates buffers based on the maximum number
18469                  * of enabled cpus. This call avoids any race when finding
18470                  * that count.
18471                  */
18472                 ASSERT(dtrace_max_cpus == 0);
18473                 ncpu = dtrace_max_cpus = ml_get_max_cpus();
18474
18475                 /*
18476                  * Retrieve the size of the physical memory in order to define
18477                  * the state buffer memory maximal size.  If we cannot retrieve
18478                  * this value, we'll consider that we have 1Gb of memory per CPU, that's
18479                  * still better than raising a kernel panic.
18480                  */
18481                 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
18482                                              &size, NULL, 0))
18483                 {
18484                         dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
18485                         printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
18486                                dtrace_buffer_memory_maxsize);
18487                 }
18488
18489                 /*
18490                  * Finally, divide by three to prevent DTrace from eating too
18491                  * much memory.
18492                  */
18493                 dtrace_buffer_memory_maxsize /= 3;
18494                 ASSERT(dtrace_buffer_memory_maxsize > 0);
18495
18496                 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
18497
18498                 if (gMajDevNo < 0) {
18499                         printf("dtrace_init: failed to allocate a major number!\n");
18500                         gDTraceInited = 0;
18501                         return;
18502                 }
18503
18504                 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
18505                                         dtrace_clone_func, DTRACEMNR_DTRACE, 0 )) {
18506                         printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
18507                         gDTraceInited = 0;
18508                         return;
18509                 }
18510
18511                 /*
18512                  * Allocate the dtrace_probe_t zone
18513                  */
18514                 dtrace_probe_t_zone = zinit(sizeof(dtrace_probe_t),
18515                                             1024 * sizeof(dtrace_probe_t),
18516                                             sizeof(dtrace_probe_t),
18517                                             "dtrace.dtrace_probe_t");
18518
18519                 /*
18520                  * Create the dtrace lock group and attrs.
18521                  */
18522                 dtrace_lck_attr = lck_attr_alloc_init();
18523                 dtrace_lck_grp_attr= lck_grp_attr_alloc_init();
18524                 dtrace_lck_grp = lck_grp_alloc_init("dtrace",  dtrace_lck_grp_attr);
18525
18526                 /*
18527                  * We have to initialize all locks explicitly
18528                  */
18529                 lck_mtx_init(&dtrace_lock, dtrace_lck_grp, dtrace_lck_attr);
18530                 lck_mtx_init(&dtrace_provider_lock, dtrace_lck_grp, dtrace_lck_attr);
18531                 lck_mtx_init(&dtrace_meta_lock, dtrace_lck_grp, dtrace_lck_attr);
18532                 lck_mtx_init(&dtrace_procwaitfor_lock, dtrace_lck_grp, dtrace_lck_attr);
18533 #if DEBUG
18534                 lck_mtx_init(&dtrace_errlock, dtrace_lck_grp, dtrace_lck_attr);
18535 #endif
18536                 lck_rw_init(&dtrace_dof_mode_lock, dtrace_lck_grp, dtrace_lck_attr);
18537
18538                 /*
18539                  * The cpu_core structure consists of per-CPU state available in any context.
18540                  * On some architectures, this may mean that the page(s) containing the
18541                  * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
18542                  * is up to the platform to assure that this is performed properly.  Note that
18543                  * the structure is sized to avoid false sharing.
18544                  */
18545                 lck_mtx_init(&cpu_lock, dtrace_lck_grp, dtrace_lck_attr);
18546                 lck_mtx_init(&cyc_lock, dtrace_lck_grp, dtrace_lck_attr);
18547                 lck_mtx_init(&mod_lock, dtrace_lck_grp, dtrace_lck_attr);
18548
18549                 /*
18550                  * Initialize the CPU offline/online hooks.
18551                  */
18552                 dtrace_install_cpu_hooks();
18553
18554                 dtrace_modctl_list = NULL;
18555
18556                 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
18557                 for (i = 0; i < ncpu; ++i) {
18558                         lck_mtx_init(&cpu_core[i].cpuc_pid_lock, dtrace_lck_grp, dtrace_lck_attr);
18559                 }
18560
18561                 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
18562                 for (i = 0; i < ncpu; ++i) {
18563                         cpu_list[i].cpu_id = (processorid_t)i;
18564                         cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
18565                         LIST_INIT(&cpu_list[i].cpu_cyc_list);
18566                         lck_rw_init(&cpu_list[i].cpu_ft_lock, dtrace_lck_grp, dtrace_lck_attr);
18567                 }
18568
18569                 lck_mtx_lock(&cpu_lock);
18570                 for (i = 0; i < ncpu; ++i)
18571                         /* FIXME: track CPU configuration */
18572                         dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
18573                 lck_mtx_unlock(&cpu_lock);
18574
18575                 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
18576
18577                 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
18578                     offsetof(dtrace_string_t, dtst_str),
18579                     offsetof(dtrace_string_t, dtst_next),
18580                     offsetof(dtrace_string_t, dtst_prev));
18581
18582                 dtrace_isa_init();
18583                 /*
18584                  * See dtrace_impl.h for a description of dof modes.
18585                  * The default is lazy dof.
18586                  *
18587                  * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
18588                  * makes no sense...
18589                  */
18590                 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
18591 #if CONFIG_EMBEDDED
18592                         /* Disable DOF mode by default for performance reasons */
18593                         dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
18594 #else
18595                         dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
18596 #endif
18597                 }
18598
18599                 /*
18600                  * Sanity check of dof mode value.
18601                  */
18602                 switch (dtrace_dof_mode) {
18603                         case DTRACE_DOF_MODE_NEVER:
18604                         case DTRACE_DOF_MODE_LAZY_ON:
18605                                 /* valid modes, but nothing else we need to do */
18606                                 break;
18607
18608                         case DTRACE_DOF_MODE_LAZY_OFF:
18609                         case DTRACE_DOF_MODE_NON_LAZY:
18610                                 /* Cannot wait for a dtrace_open to init fasttrap */
18611                                 fasttrap_init();
18612                                 break;
18613
18614                         default:
18615                                 /* Invalid, clamp to non lazy */
18616                                 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
18617                                 fasttrap_init();
18618                                 break;
18619                 }
18620
18621                 gDTraceInited = 1;
18622
18623         } else
18624                 panic("dtrace_init: called twice!\n");
18625 }
18626
18627 void
18628 dtrace_postinit(void)
18629 {
18630         /*
18631          * Called from bsd_init after all provider's *_init() routines have been
18632          * run. That way, anonymous DOF enabled under dtrace_attach() is safe
18633          * to go.
18634          */
18635         dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
18636
18637         /*
18638          * Add the mach_kernel to the module list for lazy processing
18639          */
18640         struct kmod_info fake_kernel_kmod;
18641         memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
18642
18643         strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
18644         fake_kernel_kmod.id = 1;
18645         fake_kernel_kmod.address = g_kernel_kmod_info.address;
18646         fake_kernel_kmod.size = g_kernel_kmod_info.size;
18647
18648         if (dtrace_module_loaded(&fake_kernel_kmod, 0) != 0) {
18649                 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
18650         }
18651
18652         if (!PE_parse_boot_argn("dtrace_provide_private_probes", &dtrace_provide_private_probes, sizeof (dtrace_provide_private_probes))) {
18653                         dtrace_provide_private_probes = 0;
18654         }
18655
18656         (void)OSKextRegisterKextsWithDTrace();
18657 }
18658 #undef DTRACE_MAJOR
18659
/*
 * Routines used to register interest in CPUs being added to or removed
 * from the system.  Both are empty stubs here: they ignore their
 * arguments and do nothing.
 */
18664 void
18665 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
18666 {
18667 #pragma unused(ignore1,ignore2)
18668 }
18669
18670 void
18671 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
18672 {
18673 #pragma unused(ignore1,ignore2)
18674 }